「Pytorch」解析 nn.Embedding 词嵌入 及反推输入

在这里插入图片描述

在 PyTorch 里面有三种编码方式,Embedding 就是其中之一。不同于 one_hot 需要用与字典同长度的张量来存储,Embedding 将单词映射为稠密向量,并且可以指定 embedding_dim 维度。通过 Embedding层 转换的数据并不是经过词典映射的,而是原始数据。Embedding 的输出可以作为神经网络的输入,进一步进行训练和处理。在深度学习中,Embedding 是将离散的输入映射为连续的向量表示的一种技术。它在自然语言处理和推荐系统等任务中广泛使用。

在这里插入图片描述

One-hot 的缺点:

  • one-hot 没有任何的语义信息
  • one-hot 编码的维度等于字典的长度,转换得到的张量是一个低稠密度(稀疏)的向量

其实一般就是用前两个参数

  1. num_embeddings 就是生成 num_embeddings 个嵌入向量,
  2. embedding_dim 就是嵌入向量的维度,即使用一个 [1, embedding_dim] 向量表示一个单词
import torch
from torch import nn

# Size of the lookup table; valid indices are 0 .. k - 1.
k = 49408
embedding = nn.Embedding(num_embeddings=k, embedding_dim=16)

# Look up four rows of the table; every index must be smaller than k.
indices = torch.tensor([1, 4, 5, 7])
emb = embedding(indices).detach()  # detach() drops the gradient information
print('emb', emb.data)

# 输出是一个形状为 [4, 16] 的张量:
tensor([[-2.0561, -1.0517,  1.1487,  1.2165, -0.7428,  0.0255, -2.3840, -0.6686,
         -0.1522, -0.6380,  0.8078,  2.3382, -0.3678, -0.5860, -0.1160,  0.4851],
        [-0.0821,  0.6428, -0.7851, -0.6309, -0.1035, -0.0277,  0.4757,  0.0314,
          0.2354,  0.0902,  0.6876, -0.6741, -0.7009,  0.6107, -0.5739, -0.7009],
        [ 0.2205, -0.5750, -0.4575,  0.2809,  0.1322, -0.4874,  0.0903, -0.9487,
          0.4847,  0.5165,  0.0463,  0.2832,  0.1155, -1.0731, -0.4711, -0.4850],
        [-1.3978,  0.8461,  0.4533,  0.6129,  0.4035,  0.1373, -1.1821,  1.2093,
         -0.0140,  1.4716,  0.2694,  1.7981,  0.4558,  0.6396,  0.4042, -1.9535]])

注意:nn.Embedding()是可以训练的,参考 torch.nn.Embedding是否有梯度,是否会被训练

Embedding 反推输入

Embedding 在创建的时候会创建一个 $[k, H]$ 维度的张量,可以通过 embedding.weight 获取

# k is the number of embedding vectors and H the embedding dimension
# (H is not defined in this snippet), so the weight table is [k, H].
embedding = nn.Embedding(k, embedding_dim=H)

原本还想证明 embedding 是可以反推输入的,但是直到下面这几行输出,我发现了什么 (ΩДΩ) (ΩДΩ) (ΩДΩ)

在这里插入图片描述

AxialAttention和AxialBlock的模块代码如下: def conv1x1(in_planes, out_planes, stride=1): """1x1 convolution""" return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) class qkv_transform(nn.Conv1d): """Conv1d for qkv_transform""" pass #AxialAttention:通道数可以自适应,无需特意指定 class AxialAttention(nn.Module): def __init__(self, in_planes, out_planes, groups=8, kernel_size=56, stride=1, bias=False, width=False): assert (in_planes % groups == 0) and (out_planes % groups == 0) super(AxialAttention, self).__init__() self.in_planes = in_planes self.out_planes = out_planes self.groups = groups self.group_planes = out_planes // groups self.kernel_size = kernel_size self.stride = stride self.bias = bias self.width = width # Multi-head self attention self.qkv_transform = qkv_transform(in_planes, out_planes * 2, kernel_size=1, stride=1, padding=0, bias=False) self.bn_qkv = nn.BatchNorm1d(out_planes * 2) self.bn_similarity = nn.BatchNorm2d(groups * 3) #self.bn_qk = nn.BatchNorm2d(groups) #self.bn_qr = nn.BatchNorm2d(groups) #self.bn_kr = nn.BatchNorm2d(groups) self.bn_output = nn.BatchNorm1d(out_planes * 2) # Position embedding self.relative = nn.Parameter(torch.randn(self.group_planes * 2, kernel_size * 2 - 1), requires_grad=True) query_index = torch.arange(kernel_size).unsqueeze(0) key_index = torch.arange(kernel_size).unsqueeze(1) relative_index = key_index - query_index + kernel_size - 1 self.register_buffer('flatten_index', relative_index.view(-1)) if stride > 1: self.pooling = nn.AvgPool2d(stride, stride=stride) self.reset_parameters() def forward(self, x): if self.width: x = x.permute(0, 2, 1, 3) else: x = x.permute(0, 3, 1, 2) # N, W, C, H N, W, C, H = x.shape x = x.contiguous().view(N * W, C, H) # Transformations qkv = self.bn_qkv(self.qkv_transform(x)) q, k, v = torch.split(qkv.reshape(N * W, self.groups, self.group_planes * 2, H), [self.group_planes // 2, self.group_planes // 2, self.group_planes], dim=2) # Calculate position embedding all_embeddings = 
torch.index_select(self.relative, 1, self.flatten_index).view(self.group_planes * 2, self.kernel_size, self.kernel_size) q_embedding, k_embedding, v_embedding = torch.split(all_embeddings, [self.group_planes // 2, self.group_planes // 2, self.group_planes], dim=0) qr = torch.einsum('bgci,cij->bgij', q, q_embedding) kr = torch.einsum('bgci,cij->bgij', k, k_embedding).transpose(2, 3) qk = torch.einsum('bgci, bgcj->bgij', q, k) stacked_similarity = torch.cat([qk, qr, kr], dim=1) stacked_similarity = self.bn_similarity(stacked_similarity).view(N * W, 3, self.groups, H, H).sum(dim=1) #stacked_similarity = self.bn_qr(qr) + self.bn_kr(kr) + self.bn_qk(qk) # (N, groups, H, H, W) similarity = F.softmax(stacked_similarity, dim=3) sv = torch.einsum('bgij,bgcj->bgci', similarity, v) sve = torch.einsum('bgij,cij->bgci', similarity, v_embedding) stacked_output = torch.cat([sv, sve], dim=-1).view(N * W, self.out_planes * 2, H) output = self.bn_output(stacked_output).view(N, W, self.out_planes, 2, H).sum(dim=-2) if self.width: output = output.permute(0, 2, 1, 3) else: output = output.permute(0, 2, 3, 1) if self.stride > 1: output = self.pooling(output) return output def reset_parameters(self): self.qkv_transform.weight.data.normal_(0, math.sqrt(1. / self.in_planes)) #nn.init.uniform_(self.relative, -0.1, 0.1) nn.init.normal_(self.relative, 0., math.sqrt(1. 
/ self.group_planes)) #AxialBlock:中间通道plane×2=输出通道。inc=outc时,plane=0.5×inc。 class AxialBlock(nn.Module): expansion = 2 def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None, kernel_size=56): super(AxialBlock, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2d width = int(planes * (base_width / 64.)) # Both self.conv2 and self.downsample layers downsample the input when stride != 1 self.conv_down = conv1x1(inplanes, width) self.bn1 = norm_layer(width) self.hight_block = AxialAttention(width, width, groups=groups, kernel_size=kernel_size) self.width_block = AxialAttention(width, width, groups=groups, kernel_size=kernel_size, stride=stride, width=True) self.conv_up = conv1x1(width, planes * self.expansion) self.bn2 = norm_layer(planes * self.expansion) self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride def forward(self, x): identity = x out = self.conv_down(x) out = self.bn1(out) out = self.relu(out) out = self.hight_block(out) out = self.width_block(out) out = self.relu(out) out = self.conv_up(out) out = self.bn2(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out ultralytics.nn.tasks.py中的parse_model函数中,AxialAttention和AxialBlock的elif分支如下: elif m is AxialAttention: c1 = ch[f] # 输入通道 (自动适应) # 解析参数并设置默认值 (与类定义严格匹配) # 参数顺序: [out_planes, groups, kernel_size, stride, bias, width] c2 = args[0] # 必需: 输出通道 groups = args[1] if len(args) > 1 else 8 kernel_size = args[2] if len(args) > 2 else 56 stride = args[3] if len(args) > 3 else 1 bias = args[4] if len(args) > 4 else False width_flag = args[5] if len(args) > 5 else False # 避免与width变量冲突 # 重建参数列表 new_args = [c1, c2, groups, kernel_size, stride, bias, width_flag] args = new_args elif m is AxialBlock: c1 = ch[f] # 输入通道 (自动适应) # 解析参数并设置默认值 (与类定义严格匹配) # 参数顺序: [planes, stride, groups, base_width, dilation, kernel_size] planes = args[0] # 必需: 基础通道数 stride = 
args[1] if len(args) > 1 else 1 groups = args[2] if len(args) > 2 else 1 base_width = args[3] if len(args) > 3 else 64 dilation = args[4] if len(args) > 4 else 1 kernel_size = args[5] if len(args) > 5 else 56 # 计算实际输出通道 (考虑expansion系数) c2 = planes * AxialBlock.expansion # 重建参数列表 new_args = [ c1, # inplanes planes, # planes stride, # stride None, # downsample (由框架自动处理) groups, # groups base_width, # base_width dilation, # dilation None, # norm_layer (使用默认) kernel_size # kernel_size ] args = new_args Yaml结构如下: # Ultralytics YOLO 🚀, AGPL-3.0 license # YOLOv8-seg instance segmentation model. For Usage examples see https://docs.ultralytics.com/tasks/segment # Parameters nc: 80 # number of classes scales: # model compound scaling constants, i.e. 'model=yolov8n-seg.yaml' will call yolov8-seg.yaml with scale 'n' # [depth, width, max_channels] n: [0.33, 0.25, 1024] s: [0.33, 0.50, 1024] m: [0.67, 0.75, 768] l: [1.00, 1.00, 512] x: [1.00, 1.25, 512] # YOLOv8.0n backbone backbone: # [from, repeats, module, args] - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2 - [-1, 1, AxialAttention, [32, 8, 128, 1, False, False]] # 1-P2/4 - [-1, 3, C2f, [128, True]] - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8 - [-1, 6, C2f, [256, True]] - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16 - [-1, 6, C2f, [512, True]] - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32 - [-1, 3, C2f, [1024, True]] - [-1, 1, SPPF, [1024, 5]] # 9 # YOLOv8.0n head head: - [-1, 1, nn.Upsample, [None, 2, "nearest"]] - [[-1, 6], 1, Concat, [1]] # cat backbone P4 - [-1, 3, C2f, [512]] # 12 - [-1, 1, nn.Upsample, [None, 2, "nearest"]] - [[-1, 4], 1, Concat, [1]] # cat backbone P3 - [-1, 3, C2f, [256]] # 15 (P3/8-small) - [-1, 1, Conv, [256, 3, 2]] - [[-1, 12], 1, Concat, [1]] # cat head P4 - [-1, 3, C2f, [512]] # 18 (P4/16-medium) - [-1, 1, Conv, [512, 3, 2]] - [[-1, 9], 1, Concat, [1]] # cat head P5 - [-1, 3, C2f, [1024]] # 21 (P5/32-large) - [[15, 18, 21], 1, Segment, [nc, 32, 256]] # Segment(P3, P4, P5) 当第1层- [-1, 
1, AxialAttention, [32, 8, 128, 1, False, False]] # 1-P2/4 的kernel_size取128时,网络正常加载,训练时报错: Traceback (most recent call last): File "E:\42yolo_model_change\yolov8-42\42_demo\start_train.py", line 27, in <module> results = model.train(data='A_my_data.yaml', epochs=100, imgsz=640, batch=8, workers=8, deterministic=True, seed=42, conf=0.25) # 开始训练 File "E:\42yolo_model_change\yolov8-42\ultralytics\engine\model.py", line 813, in train self.trainer.train() File "E:\42yolo_model_change\yolov8-42\ultralytics\engine\trainer.py", line 208, in train self._do_train(world_size) File "E:\42yolo_model_change\yolov8-42\ultralytics\engine\trainer.py", line 385, in _do_train self.loss, self.loss_items = self.model(batch) File "D:\Anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "D:\Anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl return forward_call(*args, **kwargs) File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 108, in forward return self.loss(x, *args, **kwargs) File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 289, in loss preds = self.forward(batch["img"]) if preds is None else preds File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 109, in forward return self.predict(x, *args, **kwargs) File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 127, in predict return self._predict_once(x, profile, visualize, embed) File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 148, in _predict_once x = m(x) # run File "D:\Anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "D:\Anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl return forward_call(*args, **kwargs) File 
"E:\42yolo_model_change\yolov8-42\ultralytics\nn\modules\block.py", line 1299, in forward qr = torch.einsum('bgci,cij->bgij', q, q_embedding) File "D:\Anaconda3\envs\pytorch\lib\site-packages\torch\functional.py", line 377, in einsum return _VF.einsum(equation, operands) # type: ignore[attr-defined] RuntimeError: einsum(): subscript i has size 128 for operand 1 which does not broadcast with previously seen size 320 当第1层- [-1, 1, AxialAttention, [32, 8, 128, 1, False, False]] # 1-P2/4 的kernel_size取320时,网络不能正常加载,报错: Traceback (most recent call last): File "E:\42yolo_model_change\yolov8-42\42_demo\start_train.py", line 25, in <module> model = YOLO(r'E:\42yolo_model_change\yolov8-42\ultralytics\cfg\models\v8\yolov8-seg-AxialAttention-H1.yaml') # load a pretrained model (recommended for training) File "E:\42yolo_model_change\yolov8-42\ultralytics\models\yolo\model.py", line 23, in __init__ super().__init__(model=model, task=task, verbose=verbose) File "E:\42yolo_model_change\yolov8-42\ultralytics\engine\model.py", line 140, in __init__ self._new(model, task=task, verbose=verbose) File "E:\42yolo_model_change\yolov8-42\ultralytics\engine\model.py", line 260, in _new self.model = (model or self._smart_load("model"))(cfg_dict, verbose=verbose and RANK == -1) # build model File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 409, in __init__ super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose) File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 333, in __init__ m.stride = torch.tensor([s / x.shape[-2] for x in _forward(torch.zeros(1, ch, s, s))]) # forward File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 331, in _forward return self.forward(x)[0] if isinstance(m, (Segment, Pose, OBB)) else self.forward(x) File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 109, in forward return self.predict(x, *args, **kwargs) File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 127, in predict 
return self._predict_once(x, profile, visualize, embed) File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 148, in _predict_once x = m(x) # run File "D:\Anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "D:\Anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl return forward_call(*args, **kwargs) File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\modules\block.py", line 1299, in forward qr = torch.einsum('bgci,cij->bgij', q, q_embedding) File "D:\Anaconda3\envs\pytorch\lib\site-packages\torch\functional.py", line 377, in einsum return _VF.einsum(equation, operands) # type: ignore[attr-defined] RuntimeError: einsum(): subscript i has size 320 for operand 1 which does not broadcast with previously seen size 128 两次报错运行的是相同的一个脚本: from ultralytics import YOLO import random import numpy as np import torch if __name__ == "__main__": random.seed(42) np.random.seed(42) torch.manual_seed(42) torch.cuda.manual_seed_all(42) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False model = YOLO(r'E:\42yolo_model_change\yolov8-42\ultralytics\cfg\models\v8\yolov8-seg-AxialAttention-H1.yaml') # load a pretrained model (recommended for training) model.load('yolov8n-seg.pt') results = model.train(data='A_my_data.yaml', epochs=100, imgsz=640, batch=8, workers=8, deterministic=True, seed=42, conf=0.25) # 开始训练 #time.sleep(10) # 睡眠10s,主要是用于服务器多次训练的过程中使用 分析报错原因。注意,前后两次报错对应的yaml只有第1层的一个参数发生变化,分析为什么会取128时报错320,取320时报错128。
最新发布
07-06
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

ViatorSun

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值