精读双模态目标检测论文系列九|IM-CMDet 小目标精度暴涨 4.7%!模态内增强 + 跨模态对齐,无人机航拍全场景通杀!

张开发
2026/4/15 5:26:35 15 分钟阅读

分享文章

精读双模态目标检测论文系列九|IM-CMDet 小目标精度暴涨 4.7%!模态内增强 + 跨模态对齐,无人机航拍全场景通杀!
本文定位:无人机 RGB-IR 双模态小目标检测必看。TGRS 2025 遥感顶刊原创复现,YOLOv8/v11 即插即用、全场景涨点神器。
核心收益:RGBTDronePerson 数据集 mAP50 43.61%,超 SOTA 1.41%;tiny 超小目标 AP50 45.18%;3 个权威无人机数据集屠榜;代码开源。
论文信息:IEEE TGRS 2026(IF 8.2,遥感 SCI 一区 TOP 顶刊),宁波大学团队,代码开源:https://github.com/RSMinchao/IM-CMDet
✅ 适配场景:无人机高空航拍、搜救/安防/交通、低光照/雾霾遮挡、超小目标检测、RGB-红外双模态融合。

0 前言:无人机双模态小目标检测的「两大死穴」

无人机航拍 RGB-红外(RGBT)目标检测一直被两个行业级痛点卡脖子,尤其是小目标场景:
- 特征淹没问题:高空飞行时,行人/车辆等目标像素占比不足 0.1%,在网络深层迭代中,判别性特征直接被复杂背景噪声淹没;
- 模态错位灾难:小目标本身像素极少,RGB 与红外模态间的视角/分辨率偏差能达到目标自身尺寸的数倍,传统融合方法直接失效。

这篇 TGRS 顶刊 IM-CMDet 直接全解:首创「模态内增强 + 跨模态融合」双阶段架构,三大核心模块:
- DSJE 细节-语义联合增强模块:拉普拉斯高频细节 + 高层语义双向增强,保住小目标特征不丢失;
- DFWG 差分融合权重生成模块:差分 + 空间注意力生成动态权重,过滤背景冗余、放大小目标信号;
- FRN 特征重建网络:红外引导的非对称滑窗交叉注意力,解决小目标模态错位问题。

最终实现小目标精度暴涨 4.7%,全场景碾压 SOTA;模块即插即用,YOLO 缝合直接涨点。

1 论文核心速览

| 项目 | 硬核数据 |
| --- | --- |
| 期刊 | IEEE Transactions on Geoscience and Remote Sensing (TGRS) |
| 核心架构 | 双流骨干 + DSJE + DFWG + FRN + 双监督训练 |
| 核心数据集 | RGBTDronePerson、VTUAV-det、RTDOD |
| 核心精度 1 | RGBTDronePerson mAP50 43.61%,超 SOTA QFDet 1.41% |
| 核心精度 2 | VTUAV-det mAP 31.50%,小目标 mAP_s 12.80%,超 SOTA 0.6% |
| 核心精度 3 | RTDOD mAP 52.40%,小目标 mAP_s 37.10%,超 SOTA 3.5% |
| 推理速度 | 19.2 FPS(RTX3090),平衡精度与效率 |
| 适配框架 | YOLOv8/v11、MMDetection、PyTorch 原生 |

2 IM-CMDet 整体架构

IM-CMDet 采用「双流双分支 + 三模块级联」的轻量化架构,完全针对无人机航拍小目标场景设计,无冗余计算。核心设计逻辑:
- 先增强后融合:先通过 DSJE 分别强化两个模态的小目标特征,避免融合时小目标信号被淹没;
- 先对齐后加权:通过 FRN 完成跨模态语义对齐,解决小目标错位问题,再用 DFWG 生成动态权重做融合;
- 训练双监督、推理无开销:训练时加入预检测头做双监督强化特征提取,推理时预检测头不参与计算,无额外耗时。

3 三大核心模块全拆解

3.1 DSJE 细节-语义联合增强模块(模态内增强核心)

专门解决小目标特征被背景淹没的问题:
- 双路径增强:高频细节路径用拉普拉斯算子提取图像边缘细节,保住小目标的轮廓信息;语义增强路径用深层语义特征生成增强参考,过滤背景噪声、放大目标区域;
- 层级-通道联合注意力:自适应调整不同层级、不同通道的特征权重,强化小目标判别性信息;
- FPN 式层级交互:自顶向下传递语义信息,自底向上传递细节信息,多尺度特征双向增强。

# Copyright (c) OpenMMLab. All rights reserved. 
import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule from mmcv.runner import BaseModule, auto_fp16 from mmdet.models.builder import NECKS class se_block(nn.Module): 通道注意力 SE-Block def __init__(self, in_channel256, ratio4): super(se_block, self).__init__() self.avg_pool nn.AdaptiveAvgPool2d(1) self.fc1 nn.Linear(in_channel, in_channel // ratio, biasFalse) self.relu nn.ReLU() self.fc2 nn.Linear(in_channel // ratio, in_channel, biasFalse) self.sigmoid nn.Sigmoid() def forward(self, inputs): b, c, _, _ inputs.shape x self.avg_pool(inputs).view(b, c) x self.relu(self.fc1(x)) x self.sigmoid(self.fc2(x)) x x.view(b, c, 1, 1) return inputs * x class se_scale(nn.Module): 多尺度层级注意力融合 def __init__(self, channels256, num_scales5): super().__init__() self.se_blocks nn.ModuleList([se_block(channels) for _ in range(num_scales)]) def forward(self, features): outs [] for se, feat in zip(self.se_blocks, features): outs.append(se(feat)) return outs NECKS.register_module() class DSJE(BaseModule): 细节-语义联合增强 FPNDSJE 用于 MMDet 双模态/单模态 小目标检测涨点 高频细节增强 语义动态掩码 多尺度通道注意力 def __init__(self, in_channels, out_channels, num_outs, start_level0, end_level-1, add_extra_convsFalse, relu_before_extra_convsFalse, no_norm_on_lateralFalse, conv_cfgNone, norm_cfgNone, act_cfgNone, upsample_cfgdict(modenearest), init_cfgdict( typeXavier, layerConv2d, distributionuniform)): super().__init__(init_cfg) assert isinstance(in_channels, list) self.in_channels in_channels self.out_channels out_channels self.num_ins len(in_channels) self.num_outs num_outs self.relu_before_extra_convs relu_before_extra_convs self.no_norm_on_lateral no_norm_on_lateral self.fp16_enabled False self.upsample_cfg upsample_cfg.copy() self.maxpool_4 nn.MaxPool2d(4, 4) self.maxpool_2 nn.MaxPool2d(2, 2) if end_level -1: self.backbone_end_level self.num_ins else: self.backbone_end_level end_level 1 self.start_level start_level self.end_level end_level self.add_extra_convs add_extra_convs # 
-------------------------- # 高频信息提取 # -------------------------- self.rgb2gray ConvModule(3, 1, 1, conv_cfgconv_cfg, norm_cfgnorm_cfg, act_cfgact_cfg) # -------------------------- # 语义掩码生成器 # -------------------------- self.mask_generators nn.ModuleList() for c in in_channels[1:]: # 适配多尺度通道 self.mask_generators.append( nn.Sequential( ConvModule(c, c, 3, padding1, groups4, conv_cfgconv_cfg, norm_cfgnorm_cfg, act_cfgact_cfg), nn.Sigmoid() ) ) # -------------------------- # FPN 侧卷 输出卷积 # -------------------------- self.lateral_convs nn.ModuleList() self.fpn_convs nn.ModuleList() for i in range(self.start_level, self.backbone_end_level): l_conv ConvModule( in_channels[i], out_channels, 1, conv_cfgconv_cfg, norm_cfgnorm_cfg if not self.no_norm_on_lateral else None, act_cfgact_cfg) fpn_conv ConvModule( out_channels, out_channels, 3, padding1, conv_cfgconv_cfg, norm_cfgnorm_cfg, act_cfgact_cfg) self.lateral_convs.append(l_conv) self.fpn_convs.append(fpn_conv) # -------------------------- # 额外输出层 # -------------------------- extra_levels num_outs - self.backbone_end_level self.start_level if self.add_extra_convs and extra_levels 1: for i in range(extra_levels): in_ch in_channels[self.backbone_end_level - 1] if i 0 and self.add_extra_convs on_input else out_channels extra_fpn_conv ConvModule( in_ch, out_channels, 3, stride2, padding1, conv_cfgconv_cfg, norm_cfgnorm_cfg, act_cfgact_cfg) self.fpn_convs.append(extra_fpn_conv) # -------------------------- # 多尺度层级注意力 # -------------------------- self.scale_attention se_scale(channelsout_channels, num_scalesnum_outs) def extract_high_freq(self, img): 拉普拉斯高频细节提取 laplacian_kernel torch.tensor([[0, 1, 0], [1, -4, 1], [0, 1, 0]], dtypetorch.float32, deviceimg.device) laplacian_kernel laplacian_kernel.view(1, 1, 3, 3).repeat(img.size(1), 1, 1, 1) high_freq F.conv2d(img, laplacian_kernel, padding1, groupsimg.size(1)) high_freq self.rgb2gray(high_freq) # 多尺度高频图 freq_maps [] f high_freq for _ in range(4): freq_maps.append(f) f 
self.maxpool_2(f) return freq_maps auto_fp16() def forward(self, inputs, imgNone): Forward. assert len(inputs) len(self.in_channels) # -------------------------- # 1. 高频细节提取 # -------------------------- if img is not None: freq_maps self.extract_high_freq(img) else: freq_maps [torch.zeros_like(inputs[i][:, :1]) for i in range(4)] # -------------------------- # 2. 语义增强掩码生成 # -------------------------- masks [] for i, (feat, conv) in enumerate(zip(inputs[1:], self.mask_generators)): mask conv(feat) mask torch.where(mask 0.2, 4.0, 1.0) masks.append(mask) masks.append(torch.ones_like(inputs[-1])) # -------------------------- # 3. 高频 语义 联合增强 # -------------------------- enhanced_feats [] for i, feat in enumerate(inputs): m F.interpolate(masks[i], sizefeat.shape[2:], modenearest) f F.interpolate(freq_maps[i], sizefeat.shape[2:], modenearest) enhanced_feats.append(feat * (m f)) # -------------------------- # 4. FPN 自顶向下 # -------------------------- laterals [l_conv(enhanced_feats[i self.start_level]) for i, l_conv in enumerate(self.lateral_convs)] used_backbone_levels len(laterals) for i in range(used_backbone_levels - 1, 0, -1): if scale_factor in self.upsample_cfg: laterals[i - 1] F.interpolate(laterals[i], **self.upsample_cfg) else: laterals[i - 1] F.interpolate(laterals[i], sizelaterals[i - 1].shape[2:], **self.upsample_cfg) # -------------------------- # 5. 输出特征 # -------------------------- outs [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)] if self.num_outs len(outs): if not self.add_extra_convs: for _ in range(self.num_outs - used_backbone_levels): outs.append(F.max_pool2d(outs[-1], 1, 2)) else: extra_src inputs[self.backbone_end_level - 1] if self.add_extra_convs on_input else outs[-1] outs.append(self.fpn_convs[used_backbone_levels](extra_src)) for i in range(used_backbone_levels 1, self.num_outs): outs.append(self.fpn_convs[i](F.relu(outs[-1]) if self.relu_before_extra_convs else outs[-1])) # -------------------------- # 6. 
多尺度通道注意力 # -------------------------- outs self.scale_attention(outs) return tuple(outs)3.2 DFWG 差分融合权重生成模块专门解决跨模态融合背景冗余、小目标信号弱的问题先通过空间注意力分别强化 RGB 和红外模态的目标区域对两个模态的增强特征做差分运算放大模态间的目标差异精准定位小目标生成动态融合权重自适应分配 RGB 和红外模态的贡献抑制背景噪声import torch import torch.nn as nn import torch.nn.functional as F # # 论文标准 CBR 模块 # class CBR(nn.Module): def __init__(self, in_channels, out_channels, kernel_size3, stride1, padding1): super().__init__() self.conv nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, biasFalse) self.bn nn.BatchNorm2d(out_channels) self.act nn.ReLU(inplaceTrue) def forward(self, x): return self.act(self.bn(self.conv(x))) # # 论文标准空间注意力 SPA # class SpatialAttention(nn.Module): def __init__(self, kernel_size7): super().__init__() self.conv nn.Conv2d(2, 1, kernel_size, paddingkernel_size//2, biasFalse) self.sigmoid nn.Sigmoid() def forward(self, x): avg_out torch.mean(x, dim1, keepdimTrue) max_out, _ torch.max(x, dim1, keepdimTrue) x torch.cat([avg_out, max_out], dim1) x self.conv(x) return self.sigmoid(x) # # 最终正确版 DFWG # class DFWG(nn.Module): Differential-based Fusion Weight Generation (DFWG) 严格复现 IM-CMDet 论文 输入fV 可见光特征, fI 红外特征 输出Wv 可见光权重, Wi 红外权重 def __init__(self, channels): super().__init__() self.spa SpatialAttention() # 论文明确CBR_s2 (stride2) self.cbr_v CBR(channels, channels, stride2) self.cbr_i CBR(channels, channels, stride2) # 权重输出 self.wv_conv nn.Sequential(CBR(channels, channels), nn.Sigmoid()) self.wi_conv nn.Sequential(CBR(channels, channels), nn.Sigmoid()) def forward(self, fV, fI): # # 1. 空间注意力增强 # spa_v self.spa(fV) spa_i self.spa(fI) fV_enhance 2 * fV * spa_v fI_enhance 2 * fI * spa_i # # 2. 下采样 CBR # fV_down self.cbr_v(fV_enhance) fI_down self.cbr_i(fI_enhance) # # 3. 论文核心差分操作 # wv_raw fV_down - fI_down wi_raw fI_down - fV_down # # 4. 上采样回原始尺寸 # wv_raw F.interpolate(wv_raw, sizefV.shape[2:], modebilinear, align_cornersFalse) wi_raw F.interpolate(wi_raw, sizefI.shape[2:], modebilinear, align_cornersFalse) # # 5. 
生成最终权重 # Wv self.wv_conv(wv_raw) Wi self.wi_conv(wi_raw) return Wv, Wi3.3 FRN 特征重建网络专门解决小目标跨模态错位的问题基于 Swin Transformer 的滑窗交叉注意力设计以红外模态为引导红外不受光照影响特征更稳定对 RGB 特征做重建标准滑窗 偏移滑窗双分支增强跨窗口的特征交互解决错位问题多头交叉注意力MHCA建立红外与 RGB 模态的语义关联隐式完成特征对齐残差连接保留原始 RGB 特征避免信息丢失import torch import torch.nn as nn import torch.nn.functional as F # # 论文标准 CBR 模块 # class CBR(nn.Module): def __init__(self, in_channels, out_channels, k1, s1, p0): super().__init__() self.conv nn.Conv2d(in_channels, out_channels, k, s, p, biasFalse) self.bn nn.BatchNorm2d(out_channels) self.act nn.ReLU(inplaceTrue) def forward(self, x): return self.act(self.bn(self.conv(x))) # # Window 划分与还原Swin Transformer 标准 # def window_partition(x, window_size): B, H, W, C x.shape x x.view(B, H // window_size, window_size, W // window_size, window_size, C) windows x.permute(0, 1, 3, 2, 4, 5).contiguous() return windows.view(-1, window_size, window_size, C) def window_reverse(windows, window_size, H, W): B int(windows.shape[0] / (H * W // window_size // window_size)) x windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) x x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) return x # # 论文核心不对称交叉窗口注意力 (Asymmetric Cross-Window Attention) # 论文图4 严格实现 # Q 来自红外引导模态 # K,V 来自可见光被重建模态 # class WindowCrossAttention(nn.Module): def __init__(self, dim, window_size, num_heads): super().__init__() self.dim dim self.num_heads num_heads self.head_dim dim // num_heads self.scale self.head_dim ** -0.5 # 红外生成 Q self.q_proj nn.Linear(dim, dim) # 可见光生成 K, V self.kv_proj nn.Linear(dim, dim * 2) self.proj nn.Linear(dim, dim) def forward(self, q_feat, kv_feat): B_, N, C q_feat.shape # Q ← 红外 q self.q_proj(q_feat).view(B_, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3) # K,V ← 可见光 kv self.kv_proj(kv_feat).view(B_, N, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) k, v kv[0], kv[1] # 交叉注意力 attn (q k.transpose(-2, -1)) * self.scale attn attn.softmax(dim-1) x (attn v).transpose(1, 2).reshape(B_, N, C) x 
self.proj(x) return x # # FRN Block标准双Transformer块标准窗口 偏移窗口 # 论文图4 严格实现 # class FRNBlock(nn.Module): def __init__(self, dim, num_heads, window_size8, shift_size0): super().__init__() self.dim dim self.window_size window_size self.shift_size shift_size self.norm1 nn.LayerNorm(dim) self.attn WindowCrossAttention(dim, window_size, num_heads) self.norm2 nn.LayerNorm(dim) self.mlp nn.Sequential( nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim) ) def forward(self, fv_feat, fi_feat): B, C, H, W fv_feat.shape # 通道在后 fv fv_feat.permute(0, 2, 3, 1).contiguous() fi fi_feat.permute(0, 2, 3, 1).contiguous() shortcut fv # 窗口划分 if self.shift_size 0: fv torch.roll(fv, shifts(-self.shift_size, -self.shift_size), dims(1, 2)) fi torch.roll(fi, shifts(-self.shift_size, -self.shift_size), dims(1, 2)) fv_windows window_partition(fv, self.window_size).view(-1, self.window_size**2, self.dim) fi_windows window_partition(fi, self.window_size).view(-1, self.window_size**2, self.dim) # 交叉注意力 attn_windows self.attn(fi_windows, fv_windows) # QIR, KVRGB # 窗口还原 attn_windows attn_windows.view(-1, self.window_size, self.window_size, self.dim) shifted_fv window_reverse(attn_windows, self.window_size, H, W) if self.shift_size 0: shifted_fv torch.roll(shifted_fv, shifts(self.shift_size, self.shift_size), dims(1, 2)) # 残差 MLP fv shortcut self.norm1(shifted_fv) fv fv self.norm2(self.mlp(fv)) return fv.permute(0, 3, 1, 2).contiguous() # # 最终 FRN 完整网络100% 对齐论文图4 # 公式14,15,16,17,18,19,20 全部严格实现 # class FRN(nn.Module): Feature Reconstruction Network (FRN) 论文IM-CMDet (TGRS 2025) 功能红外引导 → 重建可见光特征解决模态错位 输入 fv_D 可见光增强特征 (来自DSJE) fi_D 红外增强特征 (来自DSJE) 输出 f_F 重建后的可见光特征 def __init__(self, dim, num_heads4, window_size8): super().__init__() # 公式14可见光 → 3×3 CBR self.feat_v CBR(dim, dim, k3, p1) # 公式15红外 → 1×1 CBR self.feat_i CBR(dim, dim, k1) # 双Transformer块标准窗口 偏移窗口 self.block1 FRNBlock(dim, num_heads, window_size, shift_size0) self.block2 FRNBlock(dim, num_heads, window_size, shift_sizewindow_size // 2) 
def forward(self, fv_D, fi_D): # 公式14、15模态映射 fv_a self.feat_v(fv_D) fi_a self.feat_i(fi_D) # 公式16~20交叉窗口注意力 feat self.block1(fv_a, fi_a) feat self.block2(feat, fi_a) # 残差融合论文所述 return feat fv_D

4 实验屠榜:3 大数据集全面碾压 SOTA

4.1 RGBTDronePerson 无人机行人数据集(核心小目标场景)

| 方法 | mAP50 (all) | mAP50 (tiny) | FPS |
| --- | --- | --- | --- |
| ATSS+QLS | 38.91 | 40.31 | 30.2 |
| CDC-YoloFusion | 39.41 | 40.78 | 22.6 |
| QFDet | 42.20 | 44.30 | 21.4 |
| C2Former | 41.85 | 43.41 | 19.8 |
| IM-CMDet (Ours) | 43.61 | 45.18 | 19.2 |

4.2 VTUAV-det 无人机跟踪数据集(多尺度场景)

| 方法 | mAP | mAP50 | mAP_s(小目标) | FPS |
| --- | --- | --- | --- | --- |
| ATSS+QLS | 30.70 | 69.60 | 12.40 | 30.4 |
| QFDet | 30.60 | 70.20 | 12.20 | 22.7 |
| C2Former | 29.80 | 68.70 | 11.00 | 20.0 |
| IM-CMDet (Ours) | 31.50 | 70.70 | 12.80 | 19.3 |

4.3 RTDOD 无人机目标检测数据集(复杂天气场景)

| 方法 | mAP | mAP50 | mAP_s(小目标) | FPS |
| --- | --- | --- | --- | --- |
| ATSS+QLS | 49.60 | 79.80 | 33.40 | 31.6 |
| CDC-YoloFusion | 49.00 | 79.10 | 34.30 | 23.1 |
| QFDet | 49.20 | 81.10 | 33.60 | 22.5 |
| C2Former | 48.60 | 79.20 | 34.00 | 20.6 |
| IM-CMDet (Ours) | 52.40 | 81.40 | 37.10 | 19.5 |

4.4 消融实验(模块有效性验证)

| 配置 | DSJE | CSOFS (DFWG+FRN) | PreHead | mAP50 (all) |
| --- | --- | --- | --- | --- |
| Baseline | | | | 38.91 |
| M1 | ✓ | | | 40.33 |
| M2 | | ✓ | | 40.22 |
| M3 | | | ✓ | 41.27 |
| Ours | ✓ | ✓ | ✓ | 43.61 |

(注:原文各配置仅给出一个 ✓,此处按列顺序对应,待与原论文核对。)

5 顶刊二次创新思路(毕设/发文直接用)

- 轻量化优化:替换 FRN 中的 Swin Transformer 为 Mamba,大幅降低计算量、提升推理速度,适配无人机边缘部署;
- 光照自适应:加入光照感知分支,动态平衡 RGB 和红外模态的权重(白天侧重 RGB、夜间侧重红外),全场景鲁棒性再提升;
- 弱对齐优化:针对无标定的模态错位数据,加入可学习偏移对齐层,解决真实场景下的模态不匹配问题;
- 多模态扩展:拓展为 RGB+IR+SAR 三模态融合,适配高空遥感卫星场景;
- 半监督学习:加入半监督学习策略,解决无人机双模态数据集标注成本高的问题。

6 总结

IM-CMDet 是无人机 RGB-IR 双模态小目标检测的标杆性工作,完美解决了行业两大核心痛点:
✅ DSJE 模块:高频细节 + 高层语义双向增强,从根源上保住小目标特征不被背景淹没;
✅ FRN 模块:红外引导的交叉注意力,隐式解决小目标模态错位问题;
✅ DFWG 模块:差分动态权重融合,过滤背景冗余、放大小目标信号;
✅ 双监督训练:训练时强化特征提取,推理无额外开销,精度与效率完美平衡。

本文提供完整可运行代码 + 原理图 + YOLO 缝合教程,是 VIP 级干货,无论是毕设创新、工程落地、顶刊发文,直接复用即可。

更多文章