The module code for AxialAttention and AxialBlock is as follows:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class qkv_transform(nn.Conv1d):
    """Conv1d for qkv_transform"""
    pass
# AxialAttention: the input channel count adapts automatically and need not be specified
class AxialAttention(nn.Module):
    def __init__(self, in_planes, out_planes, groups=8, kernel_size=56,
                 stride=1, bias=False, width=False):
        assert (in_planes % groups == 0) and (out_planes % groups == 0)
        super(AxialAttention, self).__init__()
        self.in_planes = in_planes
        self.out_planes = out_planes
        self.groups = groups
        self.group_planes = out_planes // groups
        self.kernel_size = kernel_size
        self.stride = stride
        self.bias = bias
        self.width = width

        # Multi-head self attention
        self.qkv_transform = qkv_transform(in_planes, out_planes * 2, kernel_size=1, stride=1,
                                           padding=0, bias=False)
        self.bn_qkv = nn.BatchNorm1d(out_planes * 2)
        self.bn_similarity = nn.BatchNorm2d(groups * 3)
        #self.bn_qk = nn.BatchNorm2d(groups)
        #self.bn_qr = nn.BatchNorm2d(groups)
        #self.bn_kr = nn.BatchNorm2d(groups)
        self.bn_output = nn.BatchNorm1d(out_planes * 2)

        # Position embedding
        self.relative = nn.Parameter(torch.randn(self.group_planes * 2, kernel_size * 2 - 1), requires_grad=True)
        query_index = torch.arange(kernel_size).unsqueeze(0)
        key_index = torch.arange(kernel_size).unsqueeze(1)
        relative_index = key_index - query_index + kernel_size - 1
        self.register_buffer('flatten_index', relative_index.view(-1))
        if stride > 1:
            self.pooling = nn.AvgPool2d(stride, stride=stride)

        self.reset_parameters()

    def forward(self, x):
        if self.width:
            x = x.permute(0, 2, 1, 3)
        else:
            x = x.permute(0, 3, 1, 2)  # N, W, C, H
        N, W, C, H = x.shape
        x = x.contiguous().view(N * W, C, H)

        # Transformations
        qkv = self.bn_qkv(self.qkv_transform(x))
        q, k, v = torch.split(qkv.reshape(N * W, self.groups, self.group_planes * 2, H),
                              [self.group_planes // 2, self.group_planes // 2, self.group_planes], dim=2)

        # Calculate position embedding
        all_embeddings = torch.index_select(self.relative, 1, self.flatten_index).view(self.group_planes * 2, self.kernel_size, self.kernel_size)
        q_embedding, k_embedding, v_embedding = torch.split(all_embeddings,
                                                            [self.group_planes // 2, self.group_planes // 2, self.group_planes], dim=0)
        qr = torch.einsum('bgci,cij->bgij', q, q_embedding)
        kr = torch.einsum('bgci,cij->bgij', k, k_embedding).transpose(2, 3)
        qk = torch.einsum('bgci, bgcj->bgij', q, k)
        stacked_similarity = torch.cat([qk, qr, kr], dim=1)
        stacked_similarity = self.bn_similarity(stacked_similarity).view(N * W, 3, self.groups, H, H).sum(dim=1)
        #stacked_similarity = self.bn_qr(qr) + self.bn_kr(kr) + self.bn_qk(qk)
        # (N, groups, H, H, W)
        similarity = F.softmax(stacked_similarity, dim=3)
        sv = torch.einsum('bgij,bgcj->bgci', similarity, v)
        sve = torch.einsum('bgij,cij->bgci', similarity, v_embedding)
        stacked_output = torch.cat([sv, sve], dim=-1).view(N * W, self.out_planes * 2, H)
        output = self.bn_output(stacked_output).view(N, W, self.out_planes, 2, H).sum(dim=-2)

        if self.width:
            output = output.permute(0, 2, 1, 3)
        else:
            output = output.permute(0, 2, 3, 1)

        if self.stride > 1:
            output = self.pooling(output)

        return output

    def reset_parameters(self):
        self.qkv_transform.weight.data.normal_(0, math.sqrt(1. / self.in_planes))
        #nn.init.uniform_(self.relative, -0.1, 0.1)
        nn.init.normal_(self.relative, 0., math.sqrt(1. / self.group_planes))
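Note the hard constraint the position-embedding einsums impose: in 'bgci,cij->bgij', operand 0 contributes i = H (the length of the attended axis) while operand 1 contributes i = kernel_size, so AxialAttention only accepts feature maps whose attended axis has exactly kernel_size elements. A minimal sketch (hypothetical check code, assuming the class as defined above):

# Hypothetical shape check: the attended axis must equal kernel_size.
attn = AxialAttention(in_planes=32, out_planes=32, groups=8, kernel_size=128)
x = torch.randn(1, 32, 128, 128)      # H == kernel_size -> OK
print(attn(x).shape)                  # torch.Size([1, 32, 128, 128])

x_bad = torch.randn(1, 32, 320, 320)  # H != kernel_size
# attn(x_bad) raises RuntimeError: einsum(): subscript i has size 128
# for operand 1 which does not broadcast with previously seen size 320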
# AxialBlock: intermediate channels planes × 2 = output channels; for in_channels == out_channels, set planes = 0.5 × in_channels
class AxialBlock(nn.Module):
    expansion = 2

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None, kernel_size=56):
        super(AxialBlock, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.))
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv_down = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.hight_block = AxialAttention(width, width, groups=groups, kernel_size=kernel_size)
        self.width_block = AxialAttention(width, width, groups=groups, kernel_size=kernel_size, stride=stride, width=True)
        self.conv_up = conv1x1(width, planes * self.expansion)
        self.bn2 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv_down(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.hight_block(out)
        out = self.width_block(out)
        out = self.relu(out)

        out = self.conv_up(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out
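As a usage sketch (hypothetical values, assuming the definitions above): because expansion = 2, a channel-preserving block needs planes equal to half the input channels, and the feature map must match kernel_size on both axes, since the block attends along height and then width:

block = AxialBlock(inplanes=64, planes=32, kernel_size=56)  # output: 32 * 2 = 64 channels
y = block(torch.randn(1, 64, 56, 56))                       # torch.Size([1, 64, 56, 56])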
In the parse_model function of ultralytics/nn/tasks.py, the elif branches for AxialAttention and AxialBlock are as follows:
elif m is AxialAttention:
    c1 = ch[f]  # input channels (adapted automatically)
    # Parse args and fill in defaults (kept in strict agreement with the class signature)
    # Arg order: [out_planes, groups, kernel_size, stride, bias, width]
    c2 = args[0]  # required: output channels
    groups = args[1] if len(args) > 1 else 8
    kernel_size = args[2] if len(args) > 2 else 56
    stride = args[3] if len(args) > 3 else 1
    bias = args[4] if len(args) > 4 else False
    width_flag = args[5] if len(args) > 5 else False  # avoid clashing with the local variable width
    # Rebuild the args list
    new_args = [c1, c2, groups, kernel_size, stride, bias, width_flag]
    args = new_args
elif m is AxialBlock:
    c1 = ch[f]  # input channels (adapted automatically)
    # Parse args and fill in defaults (kept in strict agreement with the class signature)
    # Arg order: [planes, stride, groups, base_width, dilation, kernel_size]
    planes = args[0]  # required: base channel count
    stride = args[1] if len(args) > 1 else 1
    groups = args[2] if len(args) > 2 else 1
    base_width = args[3] if len(args) > 3 else 64
    dilation = args[4] if len(args) > 4 else 1
    kernel_size = args[5] if len(args) > 5 else 56
    # Compute the actual output channels (accounting for the expansion factor)
    c2 = planes * AxialBlock.expansion
    # Rebuild the args list
    new_args = [
        c1,           # inplanes
        planes,       # planes
        stride,       # stride
        None,         # downsample (handled automatically by the framework)
        groups,       # groups
        base_width,   # base_width
        dilation,     # dilation
        None,         # norm_layer (use the default)
        kernel_size,  # kernel_size
    ]
    args = new_args
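To make the mapping concrete, here is a hypothetical trace of the AxialAttention branch for layer 1 of the YAML below, assuming scale 'n' (width multiple 0.25), so that layer 0's Conv outputs ch[f] = 16 channels:

args = [32, 8, 128, 1, False, False]  # as written in the YAML entry for layer 1
c1 = 16                               # ch[f] from the previous layer
# After the branch runs:
# args == [16, 32, 8, 128, 1, False, False]
# -> AxialAttention(16, 32, groups=8, kernel_size=128, stride=1, bias=False, width=False)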
The YAML structure is as follows:
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLOv8-seg instance segmentation model. For Usage examples see https://docs.ultralytics.com/tasks/segment

# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov8n-seg.yaml' will call yolov8-seg.yaml with scale 'n'
  # [depth, width, max_channels]
  n: [0.33, 0.25, 1024]
  s: [0.33, 0.50, 1024]
  m: [0.67, 0.75, 768]
  l: [1.00, 1.00, 512]
  x: [1.00, 1.25, 512]

# YOLOv8.0n backbone
backbone:
  # [from, repeats, module, args]
  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
  - [-1, 1, AxialAttention, [32, 8, 128, 1, False, False]] # 1-P2/4
  - [-1, 3, C2f, [128, True]]
  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
  - [-1, 6, C2f, [256, True]]
  - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
  - [-1, 6, C2f, [512, True]]
  - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
  - [-1, 3, C2f, [1024, True]]
  - [-1, 1, SPPF, [1024, 5]] # 9

# YOLOv8.0n head
head:
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 6], 1, Concat, [1]] # cat backbone P4
  - [-1, 3, C2f, [512]] # 12
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 4], 1, Concat, [1]] # cat backbone P3
  - [-1, 3, C2f, [256]] # 15 (P3/8-small)
  - [-1, 1, Conv, [256, 3, 2]]
  - [[-1, 12], 1, Concat, [1]] # cat head P4
  - [-1, 3, C2f, [512]] # 18 (P4/16-medium)
  - [-1, 1, Conv, [512, 3, 2]]
  - [[-1, 9], 1, Concat, [1]] # cat head P5
  - [-1, 3, C2f, [1024]] # 21 (P5/32-large)
  - [[15, 18, 21], 1, Segment, [nc, 32, 256]] # Segment(P3, P4, P5)
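One observation about this backbone (a hedged note, not part of the original config): layer 0 is a stride-2 Conv, so the feature map entering the AxialAttention layer has spatial size imgsz / 2, and, as the module code above shows, that size must equal kernel_size exactly. A hypothetical back-of-the-envelope check, assuming the build-time stride probe uses a 256×256 dummy input (consistent with torch.zeros(1, ch, s, s) in the traceback below):

# Hypothetical arithmetic for the spatial size reaching layer 1:
for imgsz, phase in [(256, "build-time dummy forward in tasks.py"),
                     (640, "training forward with imgsz=640")]:
    print(f"{phase}: H = W = {imgsz // 2}")
# build-time dummy forward in tasks.py: H = W = 128
# training forward with imgsz=640: H = W = 320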
When kernel_size in layer 1, - [-1, 1, AxialAttention, [32, 8, 128, 1, False, False]] # 1-P2/4, is set to 128, the network loads normally, but training fails with:
Traceback (most recent call last):
  File "E:\42yolo_model_change\yolov8-42\42_demo\start_train.py", line 27, in <module>
    results = model.train(data='A_my_data.yaml', epochs=100, imgsz=640, batch=8, workers=8, deterministic=True, seed=42, conf=0.25) # start training
  File "E:\42yolo_model_change\yolov8-42\ultralytics\engine\model.py", line 813, in train
    self.trainer.train()
  File "E:\42yolo_model_change\yolov8-42\ultralytics\engine\trainer.py", line 208, in train
    self._do_train(world_size)
  File "E:\42yolo_model_change\yolov8-42\ultralytics\engine\trainer.py", line 385, in _do_train
    self.loss, self.loss_items = self.model(batch)
  File "D:\Anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "D:\Anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 108, in forward
    return self.loss(x, *args, **kwargs)
  File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 289, in loss
    preds = self.forward(batch["img"]) if preds is None else preds
  File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 109, in forward
    return self.predict(x, *args, **kwargs)
  File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 127, in predict
    return self._predict_once(x, profile, visualize, embed)
  File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 148, in _predict_once
    x = m(x) # run
  File "D:\Anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "D:\Anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\modules\block.py", line 1299, in forward
    qr = torch.einsum('bgci,cij->bgij', q, q_embedding)
  File "D:\Anaconda3\envs\pytorch\lib\site-packages\torch\functional.py", line 377, in einsum
    return _VF.einsum(equation, operands) # type: ignore[attr-defined]
RuntimeError: einsum(): subscript i has size 128 for operand 1 which does not broadcast with previously seen size 320
When kernel_size in the same layer-1 entry is set to 320 instead, the network fails to load, with:
Traceback (most recent call last):
  File "E:\42yolo_model_change\yolov8-42\42_demo\start_train.py", line 25, in <module>
    model = YOLO(r'E:\42yolo_model_change\yolov8-42\ultralytics\cfg\models\v8\yolov8-seg-AxialAttention-H1.yaml') # load a pretrained model (recommended for training)
  File "E:\42yolo_model_change\yolov8-42\ultralytics\models\yolo\model.py", line 23, in __init__
    super().__init__(model=model, task=task, verbose=verbose)
  File "E:\42yolo_model_change\yolov8-42\ultralytics\engine\model.py", line 140, in __init__
    self._new(model, task=task, verbose=verbose)
  File "E:\42yolo_model_change\yolov8-42\ultralytics\engine\model.py", line 260, in _new
    self.model = (model or self._smart_load("model"))(cfg_dict, verbose=verbose and RANK == -1) # build model
  File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 409, in __init__
    super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
  File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 333, in __init__
    m.stride = torch.tensor([s / x.shape[-2] for x in _forward(torch.zeros(1, ch, s, s))]) # forward
  File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 331, in _forward
    return self.forward(x)[0] if isinstance(m, (Segment, Pose, OBB)) else self.forward(x)
  File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 109, in forward
    return self.predict(x, *args, **kwargs)
  File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 127, in predict
    return self._predict_once(x, profile, visualize, embed)
  File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\tasks.py", line 148, in _predict_once
    x = m(x) # run
  File "D:\Anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "D:\Anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "E:\42yolo_model_change\yolov8-42\ultralytics\nn\modules\block.py", line 1299, in forward
    qr = torch.einsum('bgci,cij->bgij', q, q_embedding)
  File "D:\Anaconda3\envs\pytorch\lib\site-packages\torch\functional.py", line 377, in einsum
    return _VF.einsum(equation, operands) # type: ignore[attr-defined]
RuntimeError: einsum(): subscript i has size 320 for operand 1 which does not broadcast with previously seen size 128
Both errors come from running the same script:
from ultralytics import YOLO
import random
import numpy as np
import torch

if __name__ == "__main__":
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    model = YOLO(r'E:\42yolo_model_change\yolov8-42\ultralytics\cfg\models\v8\yolov8-seg-AxialAttention-H1.yaml') # load a pretrained model (recommended for training)
    model.load('yolov8n-seg.pt')
    results = model.train(data='A_my_data.yaml', epochs=100, imgsz=640, batch=8, workers=8, deterministic=True, seed=42, conf=0.25) # start training
    # time.sleep(10)  # sleep for 10 s; mainly used between repeated training runs on a server
Analyze the cause of these errors. Note that the two failing runs differ only in this single layer-1 parameter in the YAML; explain why kernel_size=128 triggers an error mentioning size 320, while kernel_size=320 triggers an error mentioning size 128.