Detectron 基于 Caffe2,添加了三个自定义的网络层(Python Ops).
"""
网络层作用:
- 将 multiple FPN levels 生成的 RPN proposals 进行合并,并分发 FPN proposals 到对应的 FPN levels.
- 一个 FPN level 的 anchor 预测的一个 RoI 可能映射到另一个 FPN level,因此需要重新分发 FPN proposals.
网络层的输入和输出 - Input Blobs & Output Blobs:
- Input Blobs:
[rpn_rois_fpn<min>, ..., rpn_rois_fpn<max>, rpn_roi_probs_fpn<min>, ..., rpn_roi_probs_fpn<max>]
其中,
rpn_rois_fpn<i> - FPN level i 的 RPN proposals.
rpn_roi_probs_fpn<i> - FPN level i 的 RPN objectness 概率.
训练阶段使用时,Input Blobs 还包括:[roidb, im_info].
- Output blobs:
[rois_fpn<min>, ..., rois_fpn<max>, rois, rois_idx_restore]
其中,
rois_fpn<i> - FPN level i 的 RPN proposals.
rois_idx_restore - 所有 rois_fpn<i>, i=min...max 组合的排列序列,用于将 RPN RoIs 恢复到 Input Blobs 原来的顺序.
训练阶段使用时,Output Blobs 还包括: [labels, bbox_targets, bbox_inside_weights, bbox_outside_weights].
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
from core.config import cfg
from datasets import json_dataset
import modeling.FPN as fpn
import roi_data.fast_rcnn
import utils.blob as blob_utils
class CollectAndDistributeFpnRpnProposalsOp(object):
    """Python op that merges the RPN proposals of several FPN levels.

    At inference time the merged proposals are additionally distributed over
    the FPN levels by distribute(); at training time the Fast R-CNN data
    loader code is reused to build the output blobs.
    """

    def __init__(self, train):
        """Remember whether the op runs in training mode (`train`)."""
        self._train = train

    def forward(self, inputs, outputs):
        """Fill `outputs` from the per-level RPN blobs in `inputs`.

        See modeling.detector.CollectAndDistributeFpnRpnProposals.

        `inputs` is laid out as
            [rpn_rois_fpn2, ..., rpn_rois_fpn6,
             rpn_roi_probs_fpn2, ..., rpn_roi_probs_fpn6]
        and, when training together with Faster R-CNN, is followed by
            [roidb, im_info].
        """
        rois = collect(inputs, self._train)

        if not self._train:
            # Inference uses a dedicated code path that avoids possible data
            # loader overhead.
            distribute(rois, None, outputs, self._train)
            return

        # Training reuses the data loader code: the roidb entries are
        # populated on the fly with the rois generated by the RPN.
        # im_info: [[im_height, im_width, im_scale], ...]
        im_info = inputs[-1].data
        im_scales = im_info[:, 2]
        roidb = blob_utils.deserialize(inputs[-2].data)

        # json_dataset.add_proposals(roidb, rois, scales, crowd_thresh) adds
        # proposal boxes (rois) to a roidb that has ground-truth annotations
        # but no proposals; if the proposals are not at the original image
        # scale, the scale has to be given.
        # For consistency with the original Faster R-CNN implementation,
        # crowd proposals are *not* filtered here (crowd_thresh=0). This is
        # worth exploring in the future.
        json_dataset.add_proposals(roidb, rois, im_scales, crowd_thresh=0)

        # Compute the training labels for the RPN proposals; this also
        # covers distributing the proposals over the FPN levels.
        # The blob names are those of the Fast R-CNN data loader.
        blob_names = roi_data.fast_rcnn.get_fast_rcnn_blob_names()
        blobs = {name: [] for name in blob_names}
        # Add the blobs needed to train Faster R-CNN style models.
        roi_data.fast_rcnn.add_fast_rcnn_blobs(blobs, im_scales, roidb)
        for out_idx, name in enumerate(blob_names):
            blob_utils.py_op_copy_blob(blobs[name], outputs[out_idx])
def collect(inputs, is_training):
    """Merge the RPN proposals of all FPN levels and keep the top scoring ones.

    `inputs` holds one rois blob per RPN level followed by one score blob per
    RPN level; in training mode the last two entries are skipped when the
    score blobs are gathered. Returns the rois of the
    cfg.<TRAIN|TEST>.RPN_POST_NMS_TOP_N highest scores, best first.
    """
    mode = 'TRAIN' if is_training else 'TEST'
    # e.g. RPN_POST_NMS_TOP_N=2000
    max_rois = cfg[mode].RPN_POST_NMS_TOP_N
    level_max = cfg.FPN.RPN_MAX_LEVEL
    level_min = cfg.FPN.RPN_MIN_LEVEL
    num_levels = level_max - level_min + 1

    roi_blobs = inputs[:num_levels]
    trailing = inputs[num_levels:]
    # In training mode the two trailing inputs are not score blobs.
    score_blobs = trailing[:-2] if is_training else trailing

    # rois are stored as [[batch_idx, x0, y0, x1, y1], ...]
    # Combine the predictions of all levels and retain the top scoring ones.
    all_rois = np.concatenate([b.data for b in roi_blobs])
    all_scores = np.concatenate([b.data for b in score_blobs]).squeeze()
    best_first = np.argsort(-all_scores)[:max_rois]
    return all_rois[best_first, :]
def distribute(rois, label_blobs, outputs, train):
    """Write `rois` and their per-FPN-level split into `outputs`.

    The order of the output blobs is given by the return value of
    roi_data.fast_rcnn.get_fast_rcnn_blob_names(is_training=False).

    outputs[0] receives all rois, outputs[1..] receive the rois of each level
    from cfg.FPN.ROI_MIN_LEVEL to cfg.FPN.ROI_MAX_LEVEL, and outputs[-1]
    receives the permutation that restores the original roi order.
    `label_blobs` and `train` are not used by this function.
    """
    min_level = cfg.FPN.ROI_MIN_LEVEL
    max_level = cfg.FPN.ROI_MAX_LEVEL
    # fpn.map_rois_to_fpn_levels(rois, k_min, k_max) determines, for every
    # RoI, the FPN level it should be mapped to (based on the FPN paper).
    roi_levels = fpn.map_rois_to_fpn_levels(rois[:, 1:5], min_level, max_level)

    outputs[0].reshape(rois.shape)
    outputs[0].data[...] = rois

    # Create a new roi blob for every FPN level.
    # (See modeling.FPN.add_multilevel_roi_blobs, which is similar but
    # awkward to generalize to this particular case.)
    # The list is seeded with the float-typed empty array so that the
    # concatenated order has the same dtype as when it is built step by step.
    idx_chunks = [np.empty((0, ))]
    for slot, level in enumerate(range(min_level, max_level + 1), 1):
        level_idx = np.where(roi_levels == level)[0]
        level_rois = rois[level_idx, :]
        outputs[slot].reshape(level_rois.shape)
        outputs[slot].data[...] = level_rois
        idx_chunks.append(level_idx)

    level_major_order = np.concatenate(idx_chunks)
    restore = np.argsort(level_major_order)
    blob_utils.py_op_copy_blob(restore.astype(np.int32), outputs[-1])
"""
网络层(op)作用:
- 生成 RPN proposals 的训练 labels.
- 用于与 Fast/Mask R-CNN 联合训练时的 RPN 训练(如end-to-end Faster R-CNN 训练)
网络层(op)的输入和输出 blobs:
- 输入blobs - blobs_in:
- rpn_rois: GenerateProposals 输出的 RPN proposals,2D tensor.
- roidb: 待 labeled 的 roidb entries.
- im_info: 参考 GenerateProposals 文档.
- 输出blobs - blobs_out:
- (blobs 的 variable set): 返回模型训练需要的 blobs.
通过查询 data loader 来返回需要的 blobs 列表list.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
from datasets import json_dataset
from utils import blob as blob_utils
import roi_data.fast_rcnn
logger = logging.getLogger(__name__)
class GenerateProposalLabelsOp(object):
    """Python op that produces the training blobs for RPN proposals."""

    def forward(self, inputs, outputs):
        """Label the RPN proposals and copy the training blobs to `outputs`.

        See modeling.detector.GenerateProposalLabels.

        The data loader code is reused during training: the roidb entries
        are populated on the fly with the rois generated by the RPN.

        inputs: [rpn_rois, roidb, im_info], where
            im_info is [[im_height, im_width, im_scale], ...].
        outputs: one blob per name returned by
            roi_data.fast_rcnn.get_fast_rcnn_blob_names(), in that order.
        """
        rpn_rois = inputs[0].data
        roidb = blob_utils.deserialize(inputs[1].data)
        # The third im_info column is the image scale.
        im_scales = inputs[2].data[:, 2]
        blob_names = roi_data.fast_rcnn.get_fast_rcnn_blob_names()

        # For consistency with the original Faster R-CNN implementation,
        # crowd proposals are not filtered here either, i.e. crowd_thresh=0.
        json_dataset.add_proposals(roidb, rpn_rois, im_scales, crowd_thresh=0)

        blobs = {name: [] for name in blob_names}
        roi_data.fast_rcnn.add_fast_rcnn_blobs(blobs, im_scales, roidb)
        for out_idx, name in enumerate(blob_names):
            blob_utils.py_op_copy_blob(blobs[name], outputs[out_idx])
"""
网络层(op)作用:
- 生成 RPN proposals.
网络层输入和输出 blobs - blobs_in 和 blobs_out
- 输入 blobs_in:
- rpn_cls_probs: 4D tensor, shape (N, A, H, W)
其中, N - minibatch images 数
A - 每个位置(per locations)的 anchors 数
(H, W) - 预测网格(grid) 的空间尺寸
每个值表示一个物体的概率(probability of object), [0, 1]
- rpn_bbox_pred: 4D tensor, shape (N, 4*A, H, W)
将 anchor boxes 变换为 RPN proposals 的预测的 deltas 值.
- im_info: 2D tensor, shape (N, 3)
其中,3 列值分别是输入图片的 [height, width, scale].
height 和 width 是网络的输入.
scale 是将原始图片缩放到网络输入尺寸的缩放因子 scale factor.
- 输出 blobs_out:
- rpn_rois: 2D tensor, shape (R, 5)
对于 R 个 RPN proposals, 5 列值分别为 [batch ind, x1, y1, x2, y2].
boxes 是关于网络输入尺寸的,即:原始图片的 scaled 版本.
这些 proposals 必须缩放scaled: 1/scale (其中,scale 来自 im_info) 以变换到原始输入图片的坐标系统.
- rpn_rois_probs: 1D tensor, objectness 概率分数(probability scores). (从 rpn_cls_probs 提取得到.)
"""
import numpy as np
from core.config import cfg
import utils.boxes as box_utils
class GenerateProposalsOp(object):
    """Python op that turns RPN outputs into object detection proposals.

    Applies the estimated bounding-box transformations to a set of regular
    boxes (the anchors) and outputs the resulting proposals.
    """

    def __init__(self, anchors, spatial_scale, train):
        """Store the anchors and derive the feature stride.

        anchors: array with one row per anchor; forward() broadcasts it as
            (1, A, 4) against the grid shifts.
        spatial_scale: its reciprocal is used as the feature stride.
        train: selects the cfg.TRAIN (truthy) or cfg.TEST (falsy) RPN
            settings in proposals_for_one_image().
        """
        self._anchors = anchors
        self._num_anchors = self._anchors.shape[0]
        self._feat_stride = 1. / spatial_scale
        self._train = train

    def forward(self, inputs, outputs):
        """Generate the proposals of every image of the minibatch.

        See modeling.detector.GenerateProposals.

        inputs: [scores, bbox_deltas, im_info] blobs.
        outputs: outputs[0] receives the rois as rows of
            [batch_ind, <box columns>]; outputs[1], if present, receives the
            matching scores as a column vector.

        Main steps:
        1. for every location i of the (H, W) grid:
             generate A anchor boxes centered on cell i
             apply the predicted bbox deltas to each of these A anchors
        2. clip the predicted boxes to the image
        3. remove predicted boxes with height or width < threshold
        4. sort all (proposal, score) pairs by score from highest to lowest
        5. take the top pre_nms_topN proposals before NMS
        6. apply NMS with threshold 0.7 to the remaining proposals
        7. take the top after_nms_topN proposals after NMS
        8. return the top proposals
        """
        # Predicted probability of a foreground object for every RPN anchor.
        scores = inputs[0].data
        # Predicted anchor transformations.
        bbox_deltas = inputs[1].data
        # Input image (height, width, scale), where scale is the factor that
        # was applied to the original dataset image to get the network input.
        im_info = inputs[2].data

        # 1. Generate proposals from the bbox deltas and the shifted anchors.
        height, width = scores.shape[-2:]
        # Enumerate all shifted positions on the (H, W) grid.
        shift_x = np.arange(0, width) * self._feat_stride
        shift_y = np.arange(0, height) * self._feat_stride
        shift_x, shift_y = np.meshgrid(shift_x, shift_y, copy=False)
        # Convert to (K, 4) with K=H*W, where each row is the
        # (dx, dy, dx, dy) shift pointing to one grid location.
        shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                            shift_x.ravel(), shift_y.ravel())).transpose()

        # Broadcast the anchors over all shifts to enumerate every anchor at
        # every location of the (H, W) grid:
        #   - add A anchors of shape (1, A, 4) to
        #   - K shifts of shape (K, 1, 4) to get
        #   - all shifted anchors of shape (K, A, 4)
        #   - reshape to (K*A, 4) shifted anchors
        num_images = inputs[0].shape[0]
        A = self._num_anchors
        K = shifts.shape[0]
        all_anchors = self._anchors[np.newaxis, :, :] + shifts[:, np.newaxis, :]
        all_anchors = all_anchors.reshape((K * A, 4))

        # Gather the per-image results and concatenate them once at the end
        # instead of re-copying the accumulated array with np.append on every
        # iteration. The float32 empties keep the result well-formed (and of
        # the same dtype as before) when there are no images.
        rois_chunks = [np.empty((0, 5), dtype=np.float32)]
        prob_chunks = [np.empty((0, 1), dtype=np.float32)]
        for im_i in range(num_images):
            im_i_boxes, im_i_probs = self.proposals_for_one_image(
                im_info[im_i, :],
                all_anchors,
                bbox_deltas[im_i, :, :, :],
                scores[im_i, :, :, :],
            )
            batch_inds = im_i * np.ones(
                (im_i_boxes.shape[0], 1), dtype=np.float32
            )
            rois_chunks.append(np.hstack((batch_inds, im_i_boxes)))
            prob_chunks.append(im_i_probs)
        rois = np.concatenate(rois_chunks, axis=0)
        roi_probs = np.concatenate(prob_chunks, axis=0)

        outputs[0].reshape(rois.shape)
        outputs[0].data[...] = rois
        if len(outputs) > 1:
            outputs[1].reshape(roi_probs.shape)
            outputs[1].data[...] = roi_probs

    def proposals_for_one_image(self, im_info, all_anchors, bbox_deltas, scores):
        """Return (proposals, scores) for a single image.

        im_info: the image's (height, width, scale) row.
        all_anchors: (K*A, 4) shifted anchors.
        bbox_deltas: (4 * A, H, W) predicted deltas of this image.
        scores: (A, H, W) objectness scores of this image.

        The returned scores are a column vector whose rows match the rows of
        the returned proposals.
        """
        # Mode-dependent configuration.
        cfg_key = 'TRAIN' if self._train else 'TEST'
        pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
        post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
        nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
        min_size = cfg[cfg_key].RPN_MIN_SIZE

        # Transpose and reshape the predicted bbox transformations so that
        # they are in the same order as the anchors:
        #   - bbox deltas will be (4 * A, H, W) format from conv output
        #   - transpose to (H, W, 4 * A)
        #   - reshape to (H * W * A, 4) where rows are ordered by (H, W, A)
        #     in slowest to fastest order to match the enumerated anchors
        bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape((-1, 4))

        # Same story for the scores:
        #   - scores are (A, H, W) format from conv output
        #   - transpose to (H, W, A)
        #   - reshape to (H * W * A, 1) where rows are ordered by (H, W, A)
        #     to match the order of anchors and bbox_deltas
        scores = scores.transpose((1, 2, 0)).reshape((-1, 1))

        # 4. sort all (proposal, score) pairs by score from highest to lowest
        # 5. take the top pre_nms_topN (e.g. 6000)
        # ravel() (rather than squeeze()) keeps the vector 1-D even when it
        # holds a single score; negating once lets both branches share it.
        neg_scores = -scores.ravel()
        if pre_nms_topN <= 0 or pre_nms_topN >= len(scores):
            order = np.argsort(neg_scores)
        else:
            # Avoid sorting a possibly large array in full: first partition
            # to get the top K unsorted, then sort just those
            # (~20x faster for 200k scores).
            inds = np.argpartition(neg_scores, pre_nms_topN)[:pre_nms_topN]
            order = inds[np.argsort(neg_scores[inds])]
        bbox_deltas = bbox_deltas[order, :]
        all_anchors = all_anchors[order, :]
        scores = scores[order]

        # Transform the anchors into proposals via the bbox transformations.
        proposals = box_utils.bbox_transform(
            all_anchors, bbox_deltas, (1.0, 1.0, 1.0, 1.0)
        )

        # 2. clip the proposals to the image (this may produce proposals of
        #    zero area, which are removed in the next step)
        proposals = box_utils.clip_tiled_boxes(proposals, im_info[:2])

        # 3. remove predicted boxes with height or width < min_size
        keep = _filter_boxes(proposals, min_size, im_info)
        proposals = proposals[keep, :]
        scores = scores[keep]

        # 6. apply loose NMS (e.g. threshold = 0.7)
        # 7. take after_nms_topN (e.g. 300)
        # 8. return the top proposals (-> RoIs top)
        # Note that the post-NMS cut is only applied when NMS itself runs.
        if nms_thresh > 0:
            keep = box_utils.nms(np.hstack((proposals, scores)), nms_thresh)
            if post_nms_topN > 0:
                keep = keep[:post_nms_topN]
            proposals = proposals[keep, :]
            scores = scores[keep]
        return proposals, scores
def _filter_boxes(boxes, min_size, im_info):
    """Return the indices of the boxes that pass the size and center checks.

    A box is kept when both its width and its height are >= min_size (after
    min_size has been scaled by the image scale im_info[2]) and its center
    satisfies x_ctr < im_info[1] and y_ctr < im_info[0].

    boxes: 2-D array whose first four columns are read as (x1, y1, x2, y2).
    min_size: minimum side length, expressed before scaling by im_info[2].
    im_info: (height, width, scale) of the image.
    """
    # Scale min_size to match the image scale. The result is bound to a new
    # name (instead of `min_size *= ...`) so that a caller-supplied NumPy
    # array is never modified in place.
    scaled_min_size = min_size * im_info[2]
    ws = boxes[:, 2] - boxes[:, 0] + 1
    hs = boxes[:, 3] - boxes[:, 1] + 1
    x_ctr = boxes[:, 0] + ws / 2.
    y_ctr = boxes[:, 1] + hs / 2.
    keep = np.where(
        (ws >= scaled_min_size) & (hs >= scaled_min_size) &
        (x_ctr < im_info[1]) & (y_ctr < im_info[0])
    )[0]
    return keep