
OD-DenseBox Notes

1. DenseBox

paper: https://arxiv.org/pdf/1509.04874.pdf

code: https://github.com/CaptainEven/DenseBox

I have recently been reading the text detection algorithm EAST, whose core idea is very similar to DenseBox combined with a U-Net-style structure. DenseBox is a fairly early anchor-free object detection algorithm, and the method has a lot worth borrowing; many of its ideas were ahead of their time, such as end-to-end training and detection, multi-scale feature fusion, and using landmark detection to strengthen the detector.

[Figure 1: the DenseBox detection pipeline]

The overall pipeline is shown in Figure 1. At test time, for an input image of size m×n×3, the output is an m/4 × n/4 × 5 map: the first channel s is the classification confidence, and the remaining four channels are the distances from each pixel location to the four boundaries of the target box. The dense predictions are converted to bboxes and post-processed with NMS to obtain the final detections.
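As a minimal sketch of that decoding step (the function name, threshold, and channel layout below are my own assumptions, not code from the repo), each output pixel whose score exceeds a threshold is mapped back to input coordinates, and its four distance channels recover the box corners:

import torch

def decode_densebox_output(scores, locs, score_thresh=0.7, stride=4):
    """Convert DenseBox dense outputs into boxes in input-image coordinates.

    scores: (1, 1, H, W) score map, locs: (1, 4, H, W) distance map, where,
    following init_loc further below, channel k holds
    (x - left, y - top, x - right, y - bottom) at each output pixel.
    Returns an (N, 5) tensor of (x1, y1, x2, y2, score).
    """
    ys, xs = torch.where(scores[0, 0] > score_thresh)
    s = scores[0, 0, ys, xs]
    d = locs[0, :, ys, xs]                # (4, N) distances at kept pixels
    x1 = (xs.float() - d[0]) * stride     # left   = x - dist_xt
    y1 = (ys.float() - d[1]) * stride     # top    = y - dist_yt
    x2 = (xs.float() - d[2]) * stride     # right  = x - dist_xb
    y2 = (ys.float() - d[3]) * stride     # bottom = y - dist_yb
    return torch.stack([x1, y1, x2, y2, s], dim=1)

The returned (N, 5) boxes can then be fed to a standard NMS such as torchvision.ops.nms.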

2. Network Structure

[Figure: the DenseBox network structure]

The backbone is based on VGG-19 but keeps only its first 12 convolution layers; the layers after that are redesigned. conv4_4 is upsampled to conv3_4's resolution and the two are concatenated for multi-scale feature fusion. The head then splits into two branches: a score branch (1-channel map for the class score) and a bounding-box regression branch (the relative position of the bounding box as a 4-channel map).

import copy

import torch
import torch.nn as nn


class DenseBox(torch.nn.Module):
    """
    Implementation of the DenseBox network with PyTorch.
    """

    def __init__(self, vgg19):
        """
        Init the first 12 layers with pre-trained weights.
        :param vgg19: pre-trained torchvision VGG-19 net
        """
        super(DenseBox, self).__init__()

        feats = vgg19.features._modules

        # ----------------- Conv1
        self.conv1_1_1 = copy.deepcopy(feats['0'])  # (0) Conv2d
        self.conv1_1_2 = copy.deepcopy(feats['1'])  # (1) ReLU
        self.conv1_1 = nn.Sequential(
            self.conv1_1_1,
            self.conv1_1_2
        )  # conv_layer1

        self.conv1_2_1 = copy.deepcopy(feats['2'])  # (2)
        self.conv1_2_2 = copy.deepcopy(feats['3'])  # (3)
        self.conv1_2 = nn.Sequential(
            self.conv1_2_1,
            self.conv1_2_2
        )  # conv_layer2

        self.pool1 = copy.deepcopy(feats['4'])  # (4) MaxPool2d

        # ----------------- Conv2
        self.conv2_1_1 = copy.deepcopy(feats['5'])  # (5)
        self.conv2_1_2 = copy.deepcopy(feats['6'])  # (6)
        self.conv2_1 = nn.Sequential(
            self.conv2_1_1,
            self.conv2_1_2
        )  # conv_layer3

        self.conv2_2_1 = copy.deepcopy(feats['7'])  # (7)
        self.conv2_2_2 = copy.deepcopy(feats['8'])  # (8)
        self.conv2_2 = nn.Sequential(
            self.conv2_2_1,
            self.conv2_2_2
        )  # conv_layer4

        self.pool2 = copy.deepcopy(feats['9'])  # (9) MaxPool2d

        # ----------------- Conv3
        self.conv3_1_1 = copy.deepcopy(feats['10'])  # (10)
        self.conv3_1_2 = copy.deepcopy(feats['11'])  # (11)
        self.conv3_1 = nn.Sequential(
            self.conv3_1_1,
            self.conv3_1_2
        )  # conv_layer5

        self.conv3_2_1 = copy.deepcopy(feats['12'])  # (12)
        self.conv3_2_2 = copy.deepcopy(feats['13'])  # (13)
        self.conv3_2 = nn.Sequential(
            self.conv3_2_1,
            self.conv3_2_2
        )  # conv_layer6

        self.conv3_3_1 = copy.deepcopy(feats['14'])  # (14)
        self.conv3_3_2 = copy.deepcopy(feats['15'])  # (15)
        self.conv3_3 = nn.Sequential(
            self.conv3_3_1,
            self.conv3_3_2
        )  # conv_layer7

        self.conv3_4_1 = copy.deepcopy(feats['16'])  # (16)
        self.conv3_4_2 = copy.deepcopy(feats['17'])  # (17)
        self.conv3_4 = nn.Sequential(
            self.conv3_4_1,
            self.conv3_4_2
        )  # conv_layer8

        self.pool3 = copy.deepcopy(feats['18'])  # (18) MaxPool2d

        # ----------------- Conv4
        self.conv4_1_1 = copy.deepcopy(feats['19'])  # (19)
        self.conv4_1_2 = copy.deepcopy(feats['20'])  # (20)
        self.conv4_1 = nn.Sequential(
            self.conv4_1_1,
            self.conv4_1_2
        )  # conv_layer9

        self.conv4_2_1 = copy.deepcopy(feats['21'])  # (21)
        self.conv4_2_2 = copy.deepcopy(feats['22'])  # (22)
        self.conv4_2 = nn.Sequential(
            self.conv4_2_1,
            self.conv4_2_2
        )  # conv_layer10

        self.conv4_3_1 = copy.deepcopy(feats['23'])  # (23)
        self.conv4_3_2 = copy.deepcopy(feats['24'])  # (24)
        self.conv4_3 = nn.Sequential(
            self.conv4_3_1,
            self.conv4_3_2
        )  # conv_layer11

        self.conv4_4_1 = copy.deepcopy(feats['25'])  # (25)
        self.conv4_4_2 = copy.deepcopy(feats['26'])  # (26)
        self.conv4_4 = nn.Sequential(
            self.conv4_4_1,
            self.conv4_4_2
        )  # conv_layer12

        # -------------------------------------- output layers
        # scores output: two 1×1 convs on the fused feature (768 channels)
        self.conv5_1_det = nn.Conv2d(in_channels=768,
                                     out_channels=512,
                                     kernel_size=(1, 1))
        self.conv5_2_det = nn.Conv2d(in_channels=512,
                                     out_channels=1,
                                     kernel_size=(1, 1))
        torch.nn.init.xavier_normal_(self.conv5_1_det.weight.data)
        torch.nn.init.xavier_normal_(self.conv5_2_det.weight.data)

        self.output_score = nn.Sequential(
            self.conv5_1_det,
            nn.Dropout(),
            self.conv5_2_det
        )

        # locs output: distances to the 4 box boundaries
        self.conv5_1_loc = nn.Conv2d(in_channels=768,
                                     out_channels=512,
                                     kernel_size=(1, 1))
        self.conv5_2_loc = nn.Conv2d(in_channels=512,
                                     out_channels=4,
                                     kernel_size=(1, 1))
        torch.nn.init.xavier_normal_(self.conv5_1_loc.weight.data)
        torch.nn.init.xavier_normal_(self.conv5_2_loc.weight.data)

        self.output_loc = nn.Sequential(
            self.conv5_1_loc,
            nn.Dropout(),
            self.conv5_2_loc
        )

    def forward(self, X):
        """
        :param X: N×3×H×W input batch
        :return: (scores, locs) of shapes N×1×(H/4)×(W/4) and N×4×(H/4)×(W/4)
        """
        X = self.conv1_1(X)
        X = self.conv1_2(X)
        X = self.pool1(X)

        X = self.conv2_1(X)
        X = self.conv2_2(X)
        X = self.pool2(X)

        X = self.conv3_1(X)
        X = self.conv3_2(X)
        X = self.conv3_3(X)
        X = self.conv3_4(X)

        # keep the conv3_4 feature for the skip connection
        conv3_4_X = X.clone()

        X = self.pool3(X)

        X = self.conv4_1(X)
        X = self.conv4_2(X)
        X = self.conv4_3(X)
        conv4_4_X = self.conv4_4(X)

        # upsample conv4_4 to the spatial size of conv3_4
        conv4_4_X_us = nn.Upsample(size=(conv3_4_X.size(2),
                                         conv3_4_X.size(3)),
                                   mode='bilinear',
                                   align_corners=True)(conv4_4_X)

        # feature fusion: concatenate along the channel axis (512 + 256 = 768)
        fusion = torch.cat((conv4_4_X_us, conv3_4_X), dim=1)

        # output heads
        scores = self.output_score(fusion)
        locs = self.output_loc(fusion)

        return scores, locs
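A quick smoke test, assuming torchvision is available (on newer torchvision the weights argument replaces pretrained=True):

from torchvision import models

# build the model from a pre-trained VGG-19 and run one dummy 240×240 patch
vgg19 = models.vgg19(pretrained=True)
net = DenseBox(vgg19)

x = torch.randn(1, 3, 240, 240)
scores, locs = net(x)
print(scores.shape, locs.shape)  # -> (1, 1, 60, 60), (1, 4, 60, 60)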

3. Loss Design

Both the classification confidence and the bounding-box regression use an L2 loss. In addition, a balanced sampling strategy is applied:

  1. Ignore gray (ambiguous) zones

    The gray zone consists of pixels on the border between positive and negative regions. Labels there are ambiguous, and letting such pixels participate in training actually hurts accuracy, so they are excluded from the loss: in the output coordinate space, for every pixel not labeled positive, its ignore flag f_ign is set to 1 if any positively-labeled pixel lies within a radius of 2.

  2. Hard Negative Mining

    • Compute the classification loss of all 3600 output pixels in the patch and sort them by loss;
    • take the top 1%, i.e. 36 pixels, as hard negatives;
    • randomly sample another 36 negatives, so hard negatives plus random negatives give 72 negative samples;
    • randomly sample 72 positive samples.

    The 144 samples selected by this strategy participate in training: their mask values are set to 1 and all others to 0 (a sketch of the mask construction follows below).
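A possible implementation for a single patch (my own illustration of the strategy above, not code from the repo; the radius-2 ignore check is approximated with a 5×5 max-pool, and the name make_sample_mask is hypothetical):

import torch
import torch.nn.functional as F

def make_sample_mask(pos_map, cls_loss, n_keep=72):
    """Build the 60×60 training mask for one patch.

    pos_map:  (60, 60) binary ground-truth positive map
    cls_loss: (60, 60) per-pixel classification loss, used for mining
    Returns a (60, 60) float mask: 1 for selected pixels, 0 otherwise.
    """
    # f_ign: a non-positive pixel is ignored if any positive pixel
    # lies within radius 2 (approximated by a 5×5 max-pool dilation)
    dilated = F.max_pool2d(pos_map[None, None], kernel_size=5,
                           stride=1, padding=2)[0, 0]
    gray = (dilated > 0) & (pos_map == 0)

    neg = (pos_map == 0) & ~gray
    neg_losses = cls_loss.clone()
    neg_losses[~neg] = -1.0                      # exclude non-negatives

    # top 1% of the 3600 pixels (36) as hard negatives ...
    hard_idx = neg_losses.flatten().topk(36).indices
    # ... plus 36 random negatives -> 72 negatives in total
    neg_pool = torch.where(neg.flatten())[0]
    rand_idx = neg_pool[torch.randperm(neg_pool.numel())[:36]]

    mask = torch.zeros(60 * 60)
    mask[hard_idx] = 1.0
    mask[rand_idx] = 1.0

    # 72 random positives
    pos_pool = torch.where(pos_map.flatten() > 0)[0]
    pos_idx = pos_pool[torch.randperm(pos_pool.numel())[:n_keep]]
    mask[pos_idx] = 1.0
    return mask.view(60, 60)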

Finally, the mask at each pixel location is defined as:

$$M(\hat{t}_i) = \begin{cases} 0, & f_{ign}(\hat{t}_i) = 1 \ \text{or} \ f_{sel}(\hat{t}_i) = 0 \\ 1, & \text{otherwise} \end{cases}$$

The overall detection loss is:

$$L_{det}(\theta) = \sum_i \Big( M(\hat{t}_i)\, L_{cls}(\hat{y}_i, y_i^*) + \lambda_{loc}\, [y_i^* > 0]\, M(\hat{t}_i)\, L_{loc}(\hat{d}_i, d_i^*) \Big)$$

where $\theta$ denotes the parameters of the convolutional network, and the indicator $[y_i^* > 0]$ means that only positive samples take part in the bounding-box regression.
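In code, the masked L2 detection loss takes only a few lines (a hedged sketch of the formula above; the repo's actual loss function may differ, and lambda_loc=3.0 mirrors the paper's setting):

def densebox_loss(scores, locs, score_gt, loc_gt, mask, lambda_loc=3.0):
    """Masked L2 detection loss, following the formula above.

    scores/score_gt: (N, 1, 60, 60), locs/loc_gt: (N, 4, 60, 60),
    mask: (N, 1, 60, 60) produced by the balanced sampling strategy.
    """
    cls_loss = (mask * (scores - score_gt) ** 2).sum()
    # [y* > 0]: only positive pixels contribute to the regression term
    pos = (score_gt > 0).float()
    loc_loss = (pos * mask * (locs - loc_gt) ** 2).sum()
    return cls_loss + lambda_loc * loc_loss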

4. Label Design

Only the key parts of the code are shown here; please refer to the source repository for the full details.

  • For each sampled positive/negative patch, DenseBox assigns the labels: 1 score, the 2 box corners (left-top and right-bottom coordinates), and 4 vertex (landmark) coordinates.
# from class DenseBoxDataset(data.Dataset):
# assign positive / negative labels for each sampled patch
# data_1 .. data_4: bbox corners, data_5 .. data_12: 4 vertices,
# all parsed from the annotation in the 240×240 patch coordinate space
if data_1 == data_2 == data_3 == data_4 \
        == data_5 == data_6 == data_7 == data_8 \
        == data_9 == data_10 == data_11 == data_12 == 0.0:
    # negative patch: all-zero annotation
    self.labels.append(torch.FloatTensor([0.0]))
    self.bboxes.append(torch.FloatTensor([0.0, 0.0, 0.0, 0.0]))
    self.vertices.append(torch.FloatTensor([0.0, 0.0, 0.0, 0.0,
                                            0.0, 0.0, 0.0, 0.0]))
else:
    # positive patch
    self.labels.append(torch.FloatTensor([1.0]))

    # 2 bbox corners: map to the 60×60 output coordinate space, float
    bbox_leftup_x = data_1 / 4.0
    bbox_leftup_y = data_2 / 4.0
    bbox_rightdown_x = data_3 / 4.0
    bbox_rightdown_y = data_4 / 4.0

    self.bboxes.append(torch.FloatTensor(np.array([
        bbox_leftup_x,
        bbox_leftup_y,
        bbox_rightdown_x,
        bbox_rightdown_y
    ])))

    # 4 vertices: from the 240×240 patch space
    # to the 60×60 output coordinate space, float
    leftup_x = data_5 / 4.0
    leftup_y = data_6 / 4.0

    rightup_x = data_7 / 4.0
    rightup_y = data_8 / 4.0

    rightdown_x = data_9 / 4.0
    rightdown_y = data_10 / 4.0

    leftdown_x = data_11 / 4.0
    leftdown_y = data_12 / 4.0

    self.vertices.append(torch.FloatTensor(np.array([
        leftup_x, leftup_y,          # leftup corner
        rightup_x, rightup_y,        # rightup corner
        rightdown_x, rightdown_y,    # rightdown corner
        leftdown_x, leftdown_y])))   # leftdown corner


  • Initialize the score_map. The way positive values are set differs slightly from the paper: the paper marks a filled circle at the bounding-box center with a radius proportional to the box size, whereas the code below marks a centered rectangle scaled by `ratio`.
# init score map: N×1×60×60
# usage: cls_map_gt = init_score(bboxes=bbox, labels=labels, ratio=0.3)

def init_score(bboxes, labels, ratio=0.3):
    """
    Init the ground-truth score map for both positive and negative patches.
    :param bboxes: batch_size×4 boxes in the 60×60 output space
    :param labels: batch_size×1 patch labels (1.0 positive, 0.0 negative)
    :param ratio: side length of the positive region relative to the box
    :return: N×1×60×60 score map
    """
    assert bboxes.size(0) == labels.size(0) and \
           bboxes.size() == torch.Size([bboxes.size(0), 4]) and \
           labels.size() == torch.Size([labels.size(0), 1])

    score_map = torch.zeros([bboxes.size(0), 1, 60, 60], dtype=torch.float32)

    for item_i, (coord, lb) in enumerate(zip(bboxes.numpy(), labels.numpy())):
        # process each item in the batch
        if lb == 0.0:  # negative patch: score map stays all-zero
            continue

        bbox_center_x = float(coord[0] + coord[2]) * 0.5
        bbox_center_y = float(coord[1] + coord[3]) * 0.5

        bbox_w = coord[2] - coord[0]
        bbox_h = coord[3] - coord[1]

        # a centered rectangle scaled by `ratio` is marked positive
        org_x = int(bbox_center_x - float(ratio * bbox_w * 0.5) + 0.5)
        org_y = int(bbox_center_y - float(ratio * bbox_h * 0.5) + 0.5)
        end_x = int(float(org_x) + float(ratio * bbox_w) + 0.5)
        end_y = int(float(org_y) + float(ratio * bbox_h) + 0.5)

        try:
            score_map[item_i, :, org_y: end_y + 1, org_x: end_x + 1] = 1.0
        except Exception as e:
            print(e)
            continue

    return score_map
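For comparison, a circle-based variant closer to the paper's description might look like this (my own sketch, reusing the same inputs as init_score; deriving the radius from the box height is an assumption):

def init_score_circle(bboxes, labels, ratio=0.3):
    """Variant of init_score using a filled-circle positive region."""
    score_map = torch.zeros([bboxes.size(0), 1, 60, 60], dtype=torch.float32)
    ys, xs = torch.meshgrid(torch.arange(60.0), torch.arange(60.0),
                            indexing='ij')
    for i, (coord, lb) in enumerate(zip(bboxes.numpy(), labels.numpy())):
        if lb == 0.0:
            continue
        cx = (coord[0] + coord[2]) * 0.5
        cy = (coord[1] + coord[3]) * 0.5
        r = ratio * (coord[3] - coord[1]) * 0.5   # radius from box height
        inside = (xs - cx) ** 2 + (ys - cy) ** 2 <= r ** 2
        score_map[i, 0][inside] = 1.0             # fill the circle
    return score_map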
  • Initialize the loc_map.
def init_loc(bboxes, labels):
    """
    Init the ground-truth loc map for positive and negative patches.
    :param bboxes: batch_size×4: leftup_x, leftup_y, rightdown_x, rightdown_y
                   in the 60×60 output space
    :param labels: batch_size×1 patch labels
    :return: N×4×60×60 map of distances to the box boundaries
    """
    assert bboxes.size(0) == labels.size(0) and \
           bboxes.size() == torch.Size([bboxes.size(0), 4]) and \
           labels.size() == torch.Size([labels.size(0), 1])

    loc_map = torch.zeros([bboxes.size(0), 4, 60, 60], dtype=torch.float32)

    for item_i, (coord, lb) in enumerate(zip(bboxes.numpy(), labels.numpy())):
        # process each item in the batch
        if lb == 0.0:
            continue

        for y in range(60):  # dim H
            for x in range(60):  # dim W
                loc_map[item_i, 0, y, x] = float(x) - coord[0]  # dist_xt
                loc_map[item_i, 1, y, x] = float(y) - coord[1]  # dist_yt
                loc_map[item_i, 2, y, x] = float(x) - coord[2]  # dist_xb
                loc_map[item_i, 3, y, x] = float(y) - coord[3]  # dist_yb

    return loc_map
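A quick sanity check tying the two initializers together (dummy values, assuming the imports above; just to confirm shapes):

bboxes = torch.FloatTensor([[10.0, 12.0, 40.0, 50.0]])  # one positive patch
labels = torch.FloatTensor([[1.0]])

cls_map_gt = init_score(bboxes, labels, ratio=0.3)
loc_map_gt = init_loc(bboxes, labels)
print(cls_map_gt.shape, loc_map_gt.shape)  # -> (1, 1, 60, 60), (1, 4, 60, 60)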

5. Enhancing Detection with Landmark Localization

[Figure 4: the landmark-detection branch and the refine network]

  • Adding a landmark-detection task branch further improves DenseBox's accuracy. It only requires attaching one more branch to the fused conv3_4/conv4_4 features of Figure 3; the detailed structure of the branch is shown in Figure 4. Suppose each sample has N landmarks (72 in MALF); the landmark output is then N heatmaps, where each pixel value is the confidence that the corresponding landmark lies at that location.
  • With the landmark branch in place, DenseBox builds a new detection score, and thus a new detection loss, from the landmark confidence maps together with the bounding-box confidence map, in what the paper names the refine network: the Conv5_2_landmark layer of the landmark branch and the Conv5_2_det layer of the detection branch in Figure 2 are fused by concatenation, followed by a max-pooling layer, convolution layers, and an upsampling layer that produce the new prediction, which can be understood as a refined score_map (a rough sketch follows below).
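The paper does not pin down every layer of this branch, so the following is only a rough sketch (channel counts, kernel sizes, and the class name RefineBranch are my assumptions):

import torch
import torch.nn as nn

class RefineBranch(nn.Module):
    """Rough sketch of the refine network: fuse the detection score map
    (Conv5_2_det) with the landmark heatmaps (Conv5_2_landmark), then
    max-pool, convolve, and upsample to a refined 1-channel score map."""

    def __init__(self, num_landmarks=72):
        super().__init__()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv1 = nn.Conv2d(1 + num_landmarks, 64,
                               kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(64, 1, kernel_size=1)

    def forward(self, det_score, landmark_maps):
        x = torch.cat((det_score, landmark_maps), dim=1)  # channel concat
        x = self.pool(x)                                  # max pooling
        x = torch.relu(self.conv1(x))                     # conv layers
        x = self.conv2(x)
        # upsample back to the original score-map resolution
        return nn.functional.interpolate(x, size=det_score.shape[2:],
                                         mode='bilinear', align_corners=True)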

With landmark localization and the refine network added, the final loss becomes:

$$L_{full}(\theta) = \lambda_{det}\, L_{det}(\theta) + \lambda_{lm}\, L_{lm}(\theta) + L_{rf}(\theta)$$

where $L_{lm}$ is an L2 loss over the landmark heatmaps, $L_{rf}$ is the classification loss recomputed on the refined score map, and $\lambda_{det}$, $\lambda_{lm}$ balance the terms.

The paper's ablation study on adding landmark localization:

[Table: ablation results from the paper with and without landmark localization / the refine branch]
