# 一. 算法概述

Fig.1 SSD 框架

# 二. Default box

Fig.2 default boxes

$S_k=S_{min} + \frac{S_{max} - S_{min}}{m-1}(k-1),\quad k\in[1, m]$

$a_r = \{1, 2, 3, 1/2, 1/3\}$

$w_k^a=s_k\sqrt{a_r}$

$h_k^a=s_k/\sqrt{a_r}$

$s_k^{'}=\sqrt{s_{k}s_{k+1}}$

（训练自己的样本的时候可以在 FindMatch() 之后检查是否覆盖了所有的 ground truth box，实际上是全覆盖了，因为会至少找一个最大匹配）

具体到代码 ssd_pascal.py 中是这样设计的：这里与论文中的公式有细微变化，自己体会。。。

# Default-box (prior-box) scale setup from ssd_pascal.py (SSD300).
# Produces one (min_size, max_size) pair per source feature map; the values
# reproduce the table in the text: [30, 60, 111, 162, 213, 264] / [60, 111,
# 162, 213, 264, 315].
import math

min_dim = 300  # input image size (SSD300) — TODO confirm against the full ssd_pascal.py
mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2']
# Scales are expressed in percent of min_dim.  conv4_3 gets a dedicated 10%
# scale prepended below, so the evenly spaced range only has to cover the
# remaining len(mbox_source_layers) - 1 layers (hence the "- 2" divisor).
min_ratio = 20
max_ratio = 90
step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2)))
min_sizes = []
max_sizes = []
for ratio in range(min_ratio, max_ratio + 1, step):  # 20, 37, 54, 71, 88
    min_sizes.append(min_dim * ratio / 100.)
    max_sizes.append(min_dim * (ratio + step) / 100.)
# Prepend the special small scale for conv4_3 (s = 0.1).
min_sizes = [min_dim * 10 / 100.] + min_sizes
max_sizes = [min_dim * 20 / 100.] + max_sizes
steps = [8, 16, 32, 64, 100, 300]  # pixel stride of each source layer on the input image
aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]

caffe 源码 prior_box_layer.cpp 中是这样提取 prior box 的：

 // Excerpt from caffe's PriorBoxLayer::Forward_cpu: generate prior boxes
 // for every cell of the layer_width x layer_height feature map.  `idx`
 // (declared by the enclosing function) walks through top_data, writing
 // 4 corner coordinates per prior, each normalized by the input image size.
 for (int h = 0; h < layer_height; ++h) {
for (int w = 0; w < layer_width; ++w) {
// Map the cell center back to input-image pixels (offset_ is 0.5 in the
// SSD configs, i.e. the middle of the cell).
float center_x = (w + offset_) * step_w;
float center_y = (h + offset_) * step_h;
float box_width, box_height;
for (int s = 0; s < min_sizes_.size(); ++s) {
// NOTE(review): the float size is truncated to int here — matches the
// upstream caffe code.
int min_size_ = min_sizes_[s];
// first prior: aspect_ratio = 1, size = min_size
box_width = box_height = min_size_;
// xmin
top_data[idx++] = (center_x - box_width / 2.) / img_width;
// ymin
top_data[idx++] = (center_y - box_height / 2.) / img_height;
// xmax
top_data[idx++] = (center_x + box_width / 2.) / img_width;
// ymax
top_data[idx++] = (center_y + box_height / 2.) / img_height;

if (max_sizes_.size() > 0) {
CHECK_EQ(min_sizes_.size(), max_sizes_.size());
int max_size_ = max_sizes_[s];
// second prior: aspect_ratio = 1, size = sqrt(min_size * max_size)
// (this is the s'_k = sqrt(s_k * s_{k+1}) scale from the paper)
box_width = box_height = sqrt(min_size_ * max_size_);
// xmin
top_data[idx++] = (center_x - box_width / 2.) / img_width;
// ymin
top_data[idx++] = (center_y - box_height / 2.) / img_height;
// xmax
top_data[idx++] = (center_x + box_width / 2.) / img_width;
// ymax
top_data[idx++] = (center_y + box_height / 2.) / img_height;
}

// rest of priors: w = min_size * sqrt(ar), h = min_size / sqrt(ar)
// for every aspect ratio other than 1 (ar = 1 was emitted above).
for (int r = 0; r < aspect_ratios_.size(); ++r) {
float ar = aspect_ratios_[r];
if (fabs(ar - 1.) < 1e-6) {
continue;
}
box_width = min_size_ * sqrt(ar);
box_height = min_size_ / sqrt(ar);
// xmin
top_data[idx++] = (center_x - box_width / 2.) / img_width;
// ymin
top_data[idx++] = (center_y - box_height / 2.) / img_height;
// xmax
top_data[idx++] = (center_x + box_width / 2.) / img_width;
// ymax
top_data[idx++] = (center_y + box_height / 2.) / img_height;
}
}
}
}
View Code

| feature map | size | min_size ($s_k$) | max_size ($s_{k+1}$) | aspect_ratio | step |
| --- | --- | --- | --- | --- | --- |
| conv4_3 | 38×38 | 30 | 60 | 1,2 | 8 |
| fc7 | 19×19 | 60 | 111 | 1,2,3 | 16 |
| conv6_2 | 10×10 | 111 | 162 | 1,2,3 | 32 |
| conv7_2 | 5×5 | 162 | 213 | 1,2,3 | 64 |
| conv8_2 | 3×3 | 213 | 264 | 1,2 | 100 |
| conv9_2 | 1×1 | 264 | 315 | 1,2 | 300 |

所有层共用 offset = 0.5，variance = [0.1, 0.1, 0.2, 0.2]。（原表中的 fc6 应为 fc7。）

# 三. 正负样本

Fig.3 positive and negtive sample VS ground_truth box

## 1.正样本获得

// Match prior (default) boxes to ground-truth boxes for every image in the
// batch.  For each image the result is a map keyed by loc class (-1 when
// share_location is true) whose value is a vector over all priors: the
// matched gt index goes to *all_match_indices and the corresponding
// Jaccard overlap to *all_match_overlaps.
void FindMatches(const vector<LabelBBox>& all_loc_preds,                // loc predictions, one LabelBBox per image
const map<int, vector<NormalizedBBox> >& all_gt_bboxes,        // all ground-truth boxes, keyed by image index
const vector<NormalizedBBox>& prior_bboxes,                                // all default boxes (8732 for SSD300)
const vector<vector<float> >& prior_variances,
const MultiBoxLossParameter& multibox_loss_param,
vector<map<int, vector<float> > >* all_match_overlaps,        // Jaccard overlap of every matched default box
vector<map<int, vector<int> > >* all_match_indices) {            // ground-truth index of every matched default box

const int num_classes = multibox_loss_param.num_classes(); // total number of classes (21 for VOC)
const bool share_location = multibox_loss_param.share_location(); // share loc predictions across classes? true
const int loc_classes = share_location ? 1 : num_classes; // 1
const MatchType match_type = multibox_loss_param.match_type(); // MultiBoxLossParameter_MatchType_PER_PREDICTION
const float overlap_threshold = multibox_loss_param.overlap_threshold(); // jaccard overlap = 0.5
const bool use_prior_for_matching =multibox_loss_param.use_prior_for_matching(); // true
const int background_label_id = multibox_loss_param.background_label_id();
const CodeType code_type = multibox_loss_param.code_type();
const bool encode_variance_in_target =
multibox_loss_param.encode_variance_in_target();
const bool ignore_cross_boundary_bbox =
multibox_loss_param.ignore_cross_boundary_bbox();
// Find the matches.
int num = all_loc_preds.size();
for (int i = 0; i < num; ++i) {
map<int, vector<int> > match_indices; // indices of the matched default boxes
map<int, vector<float> > match_overlaps; // Jaccard overlaps of the matched default boxes
// Check if there is ground truth for current image.
if (all_gt_bboxes.find(i) == all_gt_bboxes.end()) {
// There is no gt for current image. All predictions are negative.
all_match_indices->push_back(match_indices);
all_match_overlaps->push_back(match_overlaps);
continue;
}
// Find match between predictions and ground truth.
const vector<NormalizedBBox>& gt_bboxes = all_gt_bboxes.find(i)->second; // the N ground-truth boxes of image i
if (!use_prior_for_matching) {
// Match against the decoded predictions instead of the priors.
for (int c = 0; c < loc_classes; ++c) {
int label = share_location ? -1 : c;
if (!share_location && label == background_label_id) {
// Ignore background loc predictions.
continue;
}
// Decode the prediction into bbox first.
vector<NormalizedBBox> loc_bboxes;
bool clip_bbox = false;
DecodeBBoxes(prior_bboxes, prior_variances,
code_type, encode_variance_in_target, clip_bbox,
all_loc_preds[i].find(label)->second, &loc_bboxes);
MatchBBox(gt_bboxes, loc_bboxes, label, match_type,
overlap_threshold, ignore_cross_boundary_bbox,
&match_indices[label], &match_overlaps[label]);
}
} else {
// Use prior bboxes to match against all ground truth.
vector<int> temp_match_indices;
vector<float> temp_match_overlaps;
const int label = -1;
MatchBBox(gt_bboxes, prior_bboxes, label, match_type, overlap_threshold,
ignore_cross_boundary_bbox, &temp_match_indices,
&temp_match_overlaps);
if (share_location) {
match_indices[label] = temp_match_indices;
match_overlaps[label] = temp_match_overlaps;
} else {
// Get ground truth label for each ground truth bbox.
vector<int> gt_labels;
for (int g = 0; g < gt_bboxes.size(); ++g) {
gt_labels.push_back(gt_bboxes[g].label());
}
// Distribute the matching results to different loc_class.
for (int c = 0; c < loc_classes; ++c) {
if (c == background_label_id) {
// Ignore background loc predictions.
continue;
}
match_indices[c].resize(temp_match_indices.size(), -1);
match_overlaps[c] = temp_match_overlaps;
for (int m = 0; m < temp_match_indices.size(); ++m) {
if (temp_match_indices[m] > -1) {
const int gt_idx = temp_match_indices[m];
CHECK_LT(gt_idx, gt_labels.size());
// Keep the match only under the loc class of the gt's own label.
if (c == gt_labels[gt_idx]) {
match_indices[c][m] = gt_idx;
}
}
}
}
}
}
all_match_indices->push_back(match_indices);
all_match_overlaps->push_back(match_overlaps);
}
}
View Code
// Match pred_bboxes (priors, or decoded predictions) against gt_bboxes.
// On return, (*match_indices)[i] is the index into gt_bboxes matched to
// prediction i, -1 if unmatched, or -2 if the prediction was ignored as a
// cross-boundary box; (*match_overlaps)[i] is the best Jaccard overlap
// observed for prediction i.
// Stage 1 (always): greedy bipartite matching, so every ground truth gets
// its single best prediction regardless of the threshold.
// Stage 2 (PER_PREDICTION only): each remaining prediction is matched to
// its best ground truth if that overlap reaches overlap_threshold.
void MatchBBox(const vector<NormalizedBBox>& gt_bboxes,
const vector<NormalizedBBox>& pred_bboxes, const int label,
const MatchType match_type, const float overlap_threshold,
const bool ignore_cross_boundary_bbox,
vector<int>* match_indices, vector<float>* match_overlaps) {
int num_pred = pred_bboxes.size();
match_indices->clear();
match_indices->resize(num_pred, -1);
match_overlaps->clear();
match_overlaps->resize(num_pred, 0.);

int num_gt = 0;
vector<int> gt_indices;
if (label == -1) {
// label -1 means comparing against all ground truth.
num_gt = gt_bboxes.size();
for (int i = 0; i < num_gt; ++i) {
gt_indices.push_back(i);
}
} else {
// Count number of ground truth boxes which has the desired label.
for (int i = 0; i < gt_bboxes.size(); ++i) {
if (gt_bboxes[i].label() == label) {
num_gt++;
gt_indices.push_back(i);
}
}
}
if (num_gt == 0) {
return;
}

// Store the positive overlap between predictions and ground truth.
// overlaps[i][j] = Jaccard overlap of prediction i with the j-th selected
// ground truth; pairs with (near-)zero overlap are simply not stored.
map<int, map<int, float> > overlaps;
for (int i = 0; i < num_pred; ++i) {
if (ignore_cross_boundary_bbox && IsCrossBoundaryBBox(pred_bboxes[i])) {
// Mark as ignored; it takes part in no further matching.
(*match_indices)[i] = -2;
continue;
}
for (int j = 0; j < num_gt; ++j) {
float overlap = JaccardOverlap(pred_bboxes[i], gt_bboxes[gt_indices[j]]);
if (overlap > 1e-6) {
(*match_overlaps)[i] = std::max((*match_overlaps)[i], overlap);
overlaps[i][j] = overlap;
}
}
}

// Bipartite matching.
vector<int> gt_pool;
for (int i = 0; i < num_gt; ++i) {
gt_pool.push_back(i);
}
// Greedily pick the globally best (prediction, gt) pair, commit it, remove
// the gt from the pool, and repeat until no pair with positive overlap is
// left.  This guarantees every gt gets at least its best prediction.
while (gt_pool.size() > 0) {
// Find the most overlapped gt and corresponding predictions.
int max_idx = -1;
int max_gt_idx = -1;
float max_overlap = -1;
for (map<int, map<int, float> >::iterator it = overlaps.begin();
it != overlaps.end(); ++it) {
int i = it->first;
if ((*match_indices)[i] != -1) {
// The prediction already has matched ground truth or is ignored.
continue;
}
for (int p = 0; p < gt_pool.size(); ++p) {
int j = gt_pool[p];
if (it->second.find(j) == it->second.end()) {
// No overlap between the i-th prediction and j-th ground truth.
continue;
}
// Find the maximum overlapped pair.
if (it->second[j] > max_overlap) {
// If the prediction has not been matched to any ground truth,
// and the overlap is larger than maximum overlap, update.
max_idx = i;
max_gt_idx = j;
max_overlap = it->second[j];
}
}
}
if (max_idx == -1) {
// Cannot find good match.
break;
} else {
CHECK_EQ((*match_indices)[max_idx], -1);
(*match_indices)[max_idx] = gt_indices[max_gt_idx];
(*match_overlaps)[max_idx] = max_overlap;
// Erase the ground truth.
gt_pool.erase(std::find(gt_pool.begin(), gt_pool.end(), max_gt_idx));
}
}

switch (match_type) {
case MultiBoxLossParameter_MatchType_BIPARTITE:
break;
case MultiBoxLossParameter_MatchType_PER_PREDICTION:
// Get most overlaped for the rest prediction bboxes.
for (map<int, map<int, float> >::iterator it = overlaps.begin();
it != overlaps.end(); ++it) {
int i = it->first;
if ((*match_indices)[i] != -1) {
// The prediction already has matched ground truth or is ignored.
continue;
}
int max_gt_idx = -1;
float max_overlap = -1;
for (int j = 0; j < num_gt; ++j) {
if (it->second.find(j) == it->second.end()) {
// No overlap between the i-th prediction and j-th ground truth.
continue;
}
// Find the maximum overlapped pair.
float overlap = it->second[j];
// Only overlaps at or above the threshold (0.5 in SSD) qualify here.
if (overlap >= overlap_threshold && overlap > max_overlap) {
// If the prediction has not been matched to any ground truth,
// and the overlap is larger than maximum overlap, update.
max_gt_idx = j;
max_overlap = overlap;
}
}
if (max_gt_idx != -1) {
// Found a matched ground truth.
CHECK_EQ((*match_indices)[i], -1);
(*match_indices)[i] = gt_indices[max_gt_idx];
(*match_overlaps)[i] = max_overlap;
}
}
break;
default:
LOG(FATAL) << "Unknown matching type.";
break;
}

return;
}
View Code
• 将每一个prior box 与 每一个 groundtruth box 进行匹配，获得待处理匹配 map<int, map<int, float> > overlaps（小于8732），JaccardOverlap > 0 的 prior box 才保留，其他舍去。一个 ground truth box 可能和多个 prior box 能匹配上
• 从待处理匹配中为 ground truth box 找到最匹配的一对放入候选正样本集  vector<int>* match_indices, vector<float>* match_overlaps
• 剩下的每个待处理匹配中一个 ground truth box 可能匹配多个 prior box，因此我们为剩下的每个 prior box 寻找满足与 groundtruth box 的 JaccardOverlap > 0.5 的一个最大匹配放入候选正样本集  vector<int>* match_indices, vector<float>* match_overlaps

## 2.负样本获得

// Negative-mining strategies used by MultiBoxLoss when selecting negatives.
enum MultiBoxLossParameter_MiningType {
// No mining: use all negatives.
MultiBoxLossParameter_MiningType_NONE = 0,
// Keep the highest-scoring negatives, bounded by neg_pos_ratio (SSD default).
MultiBoxLossParameter_MiningType_MAX_NEGATIVE = 1,
// Online hard example mining — select examples by loss value.
MultiBoxLossParameter_MiningType_HARD_EXAMPLE = 2
};

## 3.回归

预测输出 predict box = [$\hat{p}^x, \hat{p}^y, \hat{p}^w, \hat{p}^h$]

编码系数 prior_variance = [0.1, 0.1, 0.2, 0.2]

\label{decode}
\begin{split}
& p^x = 0.1 * \hat{p}^x * d^w + d^x \\
& p^y = 0.1 * \hat{p}^y * d^h + d^y \\
& p^w = e^{(0.2*\hat{p}^w)} * d^w \\
& p^h = e^{(0.2*\hat{p}^h)} * d^h \\
\end{split}

原始 groundtruth box = [$g^x, g^y, g^w, g^h$]

编码系数 prior_variance = [0.1, 0.1, 0.2, 0.2]

\label{encode}
\begin{split}
& \hat{g}^x = \frac{g^x - d^x}{d^w * 0.1} \\
& \hat{g}^y = \frac{g^y - d^y}{d^h * 0.1} \\
& \hat{g}^w = \frac{log(\frac{g^w}{d^w})}{0.2} \\
& \hat{g}^h = \frac{log(\frac{g^h}{d^h})}{0.2} \\
\end{split}

## 4.Data augmentation（数据增广）

• 使用原始的图像
• 随机采样多个 patch(CropImage)，与物体之间最小的 jaccard overlap 为：

#### 一个sampler的参数说明

// Sample a bbox in the normalized space [0, 1] with provided constraints.
message Sampler {
// Minimum/maximum scale of the sampled bbox relative to the image.
optional float min_scale = 1 [default = 1.];
optional float max_scale = 2 [default = 1.];
// Minimum/maximum aspect ratio of the sampled bbox; the actual aspect
// ratio is drawn between these two values.
optional float min_aspect_ratio = 3 [default = 1.];
optional float max_aspect_ratio = 4 [default = 1.];
}

#### 对于选择的sample_box的限制条件

// Constraints for selecting sampled bbox.
// A sampled bbox is accepted if it satisfies any of the constraints set
// below; in SSD's default configs usually only the jaccard-overlap fields
// are used.
message SampleConstraint {
// Minimum Jaccard overlap between sampled bbox and all bboxes in
// AnnotationGroup.
optional float min_jaccard_overlap = 1;
// Maximum Jaccard overlap between sampled bbox and all bboxes in
// AnnotationGroup.
optional float max_jaccard_overlap = 2;
// Minimum coverage of sampled bbox by all bboxes in AnnotationGroup.
optional float min_sample_coverage = 3;
// Maximum coverage of sampled bbox by all bboxes in AnnotationGroup.
optional float max_sample_coverage = 4;
// Minimum coverage of all bboxes in AnnotationGroup by sampled bbox.
optional float min_object_coverage = 5;
// Maximum coverage of all bboxes in AnnotationGroup by sampled bbox.
optional float max_object_coverage = 6;
}

（在 SSD 的默认配置中，我们往往只用 max_jaccard_overlap）

#### 对于一个batch进行采样的参数设置

// Sample a batch of bboxes with provided constraints.
message BatchSampler {
// If true, also use the original (uncropped) image as a sample.
optional bool use_original_image = 1 [default = true];
// Parameters that control how a candidate bbox is drawn.
optional Sampler sampler = 2;
// Constraints on the sampled bbox; they decide whether a sample is kept
// (and thereby whether it counts as positive or negative).
optional SampleConstraint sample_constraint = 3;
// Stop sampling as soon as this many samples satisfy the constraints.
optional uint32 max_sample = 4;
// Maximum number of trials, to avoid an endless sampling loop.
optional uint32 max_trials = 5 [default = 100];
}

#### 转存datalayer数据的参数

// Parameters applied by the data layer when transforming input images.
message TransformationParameter {
// For data pre-processing we can do simple scaling and subtracting the
// provided data mean; note that mean subtraction is carried out before
// scaling.
optional float scale = 1 [default = 1];
// Specify whether to randomly mirror the data.
optional bool mirror = 2 [default = false];
// Specify whether to randomly crop the data (square crop of crop_size).
optional uint32 crop_size = 3 [default = 0];
optional uint32 crop_h = 11 [default = 0];
optional uint32 crop_w = 12 [default = 0];
// Path to the mean file; must not be given together with mean_value.
// if specified can be repeated once (would subtract it from all the
// channels) or can be repeated the same number of times as channels
// (would subtract them from the corresponding channel)
optional string mean_file = 4;
repeated float mean_value = 5;
// Force the decoded image to have 3 color channels.
optional bool force_color = 6 [default = false];
// Force the decoded image to have 1 color channels.
optional bool force_gray = 7 [default = false];
// Resize policy
optional ResizeParameter resize_param = 8;
// Noise policy
optional NoiseParameter noise_param = 9;
// Distortion policy
optional DistortionParameter distort_param = 13;
// Expand policy
optional ExpansionParameter expand_param = 14;
// Constraint for emitting the annotation after transformation.
optional EmitConstraint emit_constraint = 10;
}

#### SSD中的数据转换和采样参数设置

transform_param {
mirror: true
mean_value: 104
mean_value: 117
mean_value: 123
resize_param {
prob: 1
resize_mode: WARP
height: 300
width: 300
interp_mode: LINEAR
interp_mode: AREA
interp_mode: NEAREST
interp_mode: CUBIC
interp_mode: LANCZOS4
}
emit_constraint {
emit_type: CENTER
}
distort_param {
brightness_prob: 0.5
brightness_delta: 32
contrast_prob: 0.5
contrast_lower: 0.5
contrast_upper: 1.5
hue_prob: 0.5
hue_delta: 18
saturation_prob: 0.5
saturation_lower: 0.5
saturation_upper: 1.5
random_order_prob: 0.0
}
expand_param {
prob: 0.5
max_expand_ratio: 4.0
}
}

annotated_data_param {
batch_sampler {
max_sample: 1
max_trials: 1
}
batch_sampler {
sampler {
min_scale: 0.3
max_scale: 1.0
min_aspect_ratio: 0.5
max_aspect_ratio: 2.0
}
sample_constraint {
min_jaccard_overlap: 0.1
}
max_sample: 1
max_trials: 50
}
batch_sampler {
sampler {
min_scale: 0.3
max_scale: 1.0
min_aspect_ratio: 0.5
max_aspect_ratio: 2.0
}
sample_constraint {
min_jaccard_overlap: 0.3
}
max_sample: 1
max_trials: 50
}
batch_sampler {
sampler {
min_scale: 0.3
max_scale: 1.0
min_aspect_ratio: 0.5
max_aspect_ratio: 2.0
}
sample_constraint {
min_jaccard_overlap: 0.5
}
max_sample: 1
max_trials: 50
}
batch_sampler {
sampler {
min_scale: 0.3
max_scale: 1.0
min_aspect_ratio: 0.5
max_aspect_ratio: 2.0
}
sample_constraint {
min_jaccard_overlap: 0.7
}
max_sample: 1
max_trials: 50
}
batch_sampler {
sampler {
min_scale: 0.3
max_scale: 1.0
min_aspect_ratio: 0.5
max_aspect_ratio: 2.0
}
sample_constraint {
min_jaccard_overlap: 0.9
}
max_sample: 1
max_trials: 50
}
batch_sampler {
sampler {
min_scale: 0.3
max_scale: 1.0
min_aspect_ratio: 0.5
max_aspect_ratio: 2.0
}
sample_constraint {
max_jaccard_overlap: 1.0
}
max_sample: 1
max_trials: 50
}
label_map_file: "E:/tyang/caffe-master_/data/VOC0712/labelmap_voc.prototxt"
}
View Code

Fig.4 SSD data augmentation

# 四. 网络结构

Fig.5 SSD 流程

SSD 网络中输入图片尺寸是3×300×300，经过pool5层后输出为512×19×19，接下来经过fc6（改成卷积层）

layer {
name: "fc6"
type: "Convolution"
bottom: "pool5"
top: "fc6"
param {
lr_mult: 1.0
decay_mult: 1.0
}
param {
lr_mult: 2.0
decay_mult: 0.0
}
convolution_param {
num_output: 1024
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.0
}
dilation: 6
}
}

$output = \frac{(input + 2*pad - kernel\_extern)}{stride} + 1$
$kernel\_extern = dilation * (kernel - 1) + 1$

| feature map | conv4_3 | fc7 | conv6_2 | conv7_2 | conv8_2 | conv9_2 |
| --- | --- | --- | --- | --- | --- | --- |
| size | 512×38×38 | 1024×19×19 | 512×10×10 | 256×5×5 | 256×3×3 | 256×1×1 |

layer {
name: "conv6_2_mbox_conf"
type: "Convolution"
bottom: "conv6_2"
top: "conv6_2_mbox_conf"
param {
lr_mult: 1.0
decay_mult: 1.0
}
param {
lr_mult: 2.0
decay_mult: 0.0
}
convolution_param {
num_output: 126
kernel_size: 3
stride: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.0
}
}
}

 layer conv4_3_norm_mbox_conf fc7_mbox_conf conv6_2_mbox_conf conv7_2_mbox_conf conv8_2_mbox_conf conv9_2_mbox_conf size 84 38 38 126 19 19 126 10 10 126 5 5 84 3 3 84 1 1

 layer conv4_3_norm_mbox_conf_perm fc7_mbox_conf_perm conv6_2_mbox_conf_perm conv7_2_mbox_conf_perm conv8_2_mbox_conf_perm conv9_2_mbox_conf_perm size 38 38 84 19 19 126 10 10 126 5 5 126 3 3 84 1 1 84

 layer conv4_3_norm_mbox_conf_flat fc7_mbox_conf_flat conv6_2_mbox_conf_flat conv7_2_mbox_conf_flat conv8_2_mbox_conf_flat conv9_2_mbox_conf_flat size 121296 45486 12600 3150 756 84

layer {
name: "conv6_2_mbox_loc"
type: "Convolution"
bottom: "conv6_2"
top: "conv6_2_mbox_loc"
param {
lr_mult: 1.0
decay_mult: 1.0
}
param {
lr_mult: 2.0
decay_mult: 0.0
}
convolution_param {
num_output: 24
kernel_size: 3
stride: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.0
}
}
}

 layer conv4_3_norm_mbox_loc fc7_mbox_loc conv6_2_mbox_loc conv7_2_mbox_loc conv8_2_mbox_loc conv9_2_mbox_loc size 16 38 38 24 19 19 24 10 10 24 5 5 16 3 3 16 1 1

 layer conv4_3_norm_mbox_loc_perm fc7_mbox_loc_perm conv6_2_mbox_loc_perm conv7_2_mbox_loc_perm conv8_2_mbox_loc_perm conv9_2_mbox_loc_perm size 38 38 16 19 19 24 10 10 24 5 5 24 3 3 16 1 1 16

 layer conv4_3_norm_mbox_loc_flat fc7_mbox_loc_flat conv6_2_mbox_loc_flat conv7_2_mbox_loc_flat conv8_2_mbox_loc_flat conv9_2_mbox_loc_flat size 23104 8664 2400 600 144 16

layer {
name: "conv6_2_mbox_priorbox"
type: "PriorBox"
bottom: "conv6_2"
bottom: "data"
top: "conv6_2_mbox_priorbox"
prior_box_param {
min_size: 111.0
max_size: 162.0
aspect_ratio: 2.0
aspect_ratio: 3.0
flip: true
clip: false
variance: 0.10000000149
variance: 0.10000000149
variance: 0.20000000298
variance: 0.20000000298
step: 32.0
offset: 0.5
}
}

 layer conv4_3_norm_mbox_priorbox fc7_mbox_priorbox conv6_2_mbox_priorbox conv7_2_mbox_priorbox conv8_2_mbox_priorbox conv9_2_mbox_priorbox size 2 23104 2 8664 2 2400 2 600 2 144 2 16

layer {
name: "mbox_loc"
type: "Concat"
bottom: "conv4_3_norm_mbox_loc_flat"
bottom: "fc7_mbox_loc_flat"
bottom: "conv6_2_mbox_loc_flat"
bottom: "conv7_2_mbox_loc_flat"
bottom: "conv8_2_mbox_loc_flat"
bottom: "conv9_2_mbox_loc_flat"
top: "mbox_loc"
concat_param {
axis: 1
}
}
layer {
name: "mbox_conf"
type: "Concat"
bottom: "conv4_3_norm_mbox_conf_flat"
bottom: "fc7_mbox_conf_flat"
bottom: "conv6_2_mbox_conf_flat"
bottom: "conv7_2_mbox_conf_flat"
bottom: "conv8_2_mbox_conf_flat"
bottom: "conv9_2_mbox_conf_flat"
top: "mbox_conf"
concat_param {
axis: 1
}
}
layer {
name: "mbox_priorbox"
type: "Concat"
bottom: "conv4_3_norm_mbox_priorbox"
bottom: "fc7_mbox_priorbox"
bottom: "conv6_2_mbox_priorbox"
bottom: "conv7_2_mbox_priorbox"
bottom: "conv8_2_mbox_priorbox"
bottom: "conv9_2_mbox_priorbox"
top: "mbox_priorbox"
concat_param {
axis: 2
}
}

| layer | mbox_loc | mbox_conf | mbox_priorbox |
| --- | --- | --- | --- |
| size | 34928 (8732×4) | 183372 (8732×21) | 2×34928 (2×8732×4) |

layer {
name: "mbox_loss"
type: "MultiBoxLoss"
bottom: "mbox_loc"
bottom: "mbox_conf"
bottom: "mbox_priorbox"
bottom: "label"
top: "mbox_loss"
include {
phase: TRAIN
}
propagate_down: true
propagate_down: true
propagate_down: false
propagate_down: false
loss_param {
normalization: VALID
}
multibox_loss_param {
loc_loss_type: SMOOTH_L1
conf_loss_type: SOFTMAX
loc_weight: 1.0
num_classes: 21
share_location: true
match_type: PER_PREDICTION
overlap_threshold: 0.5
use_prior_for_matching: true
background_label_id: 0
use_difficult_gt: true
neg_pos_ratio: 3.0
neg_overlap: 0.5
code_type: CENTER_SIZE
ignore_cross_boundary_bbox: false
mining_type: MAX_NEGATIVE
}
}

# 五. 损失函数

SSD 训练的目标函数（training objective）源自于 MultiBox 的目标函数，但是本文将其拓展，使其可以处理多个目标类别。具体过程是我们会让每一个 prior box 经过 Jaccard 系数计算和真实框的相似度，阈值只有大于 0.5 的才可以列为候选名单；假设选择出来的是 N 个匹配度高于百分之五十的框。我们令 i 表示第 i 个默认框（prior box），j 表示第 j 个真实框（ground truth box），p 表示第 p 个类。那么 $x_{ij}^p$ 是匹配指示变量：当第 i 个 prior box 与类别 p 的第 j 个 ground truth box 相匹配时 $x_{ij}^p=1$，否则 $x_{ij}^p=0$。总的目标损失函数（objective loss function）就由 localization loss（loc）与 confidence loss（conf）的加权求和：

• localization loss（loc） 是 Fast R-CNN 中的 Smooth L1 Loss，作用在 predict box（$l$）与 ground truth box（$g$）的参数（即中心坐标位置，width、height）之间，回归 bounding boxes 的中心位置，以及 width、height
• confidence loss（conf） 是 Softmax Loss，输入为每一类的置信度
• 权重项

## Softmax Loss

\label{SoftmaxLoss}
\begin{split}
& L_{softmax} = - \sum_{j=1}^{T} y_j \log{s_j} \\
& s_j = \frac{e_{p^j}}{\sum e_{p^i}} \\
\end{split}

## Smooth L1 loss

\label{L1}
L_1(x) = \lvert{x}\rvert ; \ \frac{d{L_1(x)}}{d{x}} = \begin{cases}
\ 1 & if\ x \ge 0 \\
\ -1 & \ otherwise
\end{cases}

\label{L2}
L_2(x) = x^2; \ \frac{d{L_2(x)}}{d{x}} = 2x

\label{Smooth_L1}
smooth_{L_1}(x) = \begin{cases}
\ 0.5x^2 & if \  \lvert{x}\rvert < 1 \\
\ \lvert{x}\rvert - 0.5 & \ otherwise
\end{cases}
; \ \frac{d{smooth_{L_1}}}{d{x}} = \begin{cases}
\ x & if \  \lvert{x}\rvert < 1 \\
\ \pm{1} & \ otherwise
\end{cases}

# 七.使用注意

## 6. 尝试使用 GIoU、DIoU、CIoU Loss 替换 Smooth L1 Loss。

posted @ 2017-07-24 14:53  xuanyuyt  阅读(15711)  评论(4编辑  收藏  举报