roi_data 包中的各个模块分别负责处理对应模型的 minibatch blobs.
构建用于 Fast R-CNN 训练的 minibatches.
"""
处理 Fast R-CNN 所涉及的 minibatch blobs.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import numpy as np
import numpy.random as npr
from core.config import cfg
import modeling.FPN as fpn
import roi_data.keypoint_rcnn
import roi_data.mask_rcnn
import utils.blob as blob_utils
import utils.boxes as box_utils
logger = logging.getLogger(__name__)
def get_fast_rcnn_blob_names(is_training=True):
    """Return the names of the blobs used by Fast R-CNN style models.

    Args:
        is_training: when true, the training-only blobs are included
            (labels, box regression targets and weights, plus the mask and
            keypoint blobs if those heads are enabled in ``cfg``).

    Returns:
        list of str: the blob names, in the order they are appended here.
    """
    # 'rois': R regions of interest, each a 5-tuple
    # (batch_idx, x1, y1, x2, y2) = image index in the batch + rectangle.
    names = ['rois']

    if is_training:
        names.extend([
            # R categorical labels in [0, ..., K]: K foreground classes plus
            # one background class.
            'labels_int32',
            # R bounding-box regression targets with 4 targets per class.
            'bbox_targets',
            # At most 4 targets per roi are active; this binary vector
            # selects the active subset.
            'bbox_inside_weights',
            'bbox_outside_weights',
        ])

        if cfg.MODEL.MASK_ON:
            names.extend([
                # RoIs sampled for training the mask prediction branch.
                # Shape is (#masks, 5) in format (batch_idx, x1, y1, x2, y2).
                'mask_rois',
                # Binary label per RoI in 'rois' telling whether it has a
                # mask. Note: when there are no fg RoIs to sample, one *bg*
                # RoI gets a mask filled with -1 (ignore).
                # Shape is (batchsize).
                'roi_has_mask_int32',
                # Binary masks for the RoIs listed in 'mask_rois'.
                # Shape is (#fg, M * M), M being the ground truth mask size.
                'masks_int32',
            ])

        if cfg.MODEL.KEYPOINTS_ON:
            names.extend([
                # RoIs sampled for training the keypoint prediction branch.
                # Shape is (#instances, 5) in format
                # (batch_idx, x1, y1, x2, y2).
                'keypoint_rois',
                # Index of each keypoint inside a KRCNN.HEATMAP_SIZE**2 sized
                # array. Shape is (#instances). Used in SoftmaxWithLoss.
                'keypoint_locations_int32',
                # Weight of each target in 'keypoint_locations_int32'.
                # Shape is (#instances). Used in SoftmaxWithLoss.
                'keypoint_weights',
                # Optional normalization factor, used when
                # cfg.KRCNN.NORMALIZE_BY_VISIBLE_KEYPOINTS is False.
                'keypoint_loss_normalizer',
            ])

    if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS:
        # Support for FPN multi-level rois without bbox reg isn't
        # implemented (... and may never be implemented).
        k_max = cfg.FPN.ROI_MAX_LEVEL
        k_min = cfg.FPN.ROI_MIN_LEVEL

        # Same format as the corresponding rois blob, but one per FPN level,
        # followed by the index blob used to restore the original order.
        prefixes = ['rois']
        if is_training:
            if cfg.MODEL.MASK_ON:
                prefixes.append('mask_rois')
            if cfg.MODEL.KEYPOINTS_ON:
                prefixes.append('keypoint_rois')
        for prefix in prefixes:
            names.extend(
                prefix + '_fpn' + str(lvl) for lvl in range(k_min, k_max + 1)
            )
            names.append(prefix + '_idx_restore_int32')

    return names
def add_fast_rcnn_blobs(blobs, im_scales, roidb):
    """Add the blobs needed for training Fast R-CNN style models.

    Args:
        blobs: dict mapping blob name -> list; filled in place. Lists are
            replaced by concatenated arrays before returning.
        im_scales: per-image scale factors, indexed like ``roidb``.
        roidb: sequence of per-image roidb entries.

    Returns:
        Whether the minibatch is valid. Always True unless the keypoint head
        is enabled, in which case the keypoint finalizer decides.
    """
    # Sample training RoIs image by image and collect them per blob name.
    for im_i, entry in enumerate(roidb):
        sampled = _sample_rois(entry, im_scales[im_i], im_i)
        for name in sampled:
            blobs[name].append(sampled[name])

    # Turn every non-empty blob list into a single array.
    for name, value in blobs.items():
        if isinstance(value, list) and value:
            blobs[name] = np.concatenate(value)

    # Distribute the training RoIs over FPN levels, if configured.
    if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS:
        _add_multilevel_rois(blobs)

    # Sanity checks that can only run once the whole minibatch is assembled.
    if not cfg.MODEL.KEYPOINTS_ON:
        return True
    return roi_data.keypoint_rcnn.finalize_keypoint_minibatch(blobs, True)
def _sample_rois(roidb, im_scale, batch_idx):
    """Draw a random sample of foreground and background RoIs for one image.

    Args:
        roidb: a single image's roidb entry (dict); despite the name this is
            one entry, not the whole database.
        im_scale: scale factor applied to the sampled boxes.
        batch_idx: index of the image within the minibatch.

    Returns:
        dict mapping blob name -> array for this image.
    """
    rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM)
    fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))
    overlaps = roidb['max_overlaps']

    # Foreground: overlap >= FG_THRESH, capped by what the image offers,
    # sampled without replacement.
    fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
    num_fg = np.minimum(fg_rois_per_image, fg_inds.size)
    if fg_inds.size > 0:
        fg_inds = npr.choice(fg_inds, size=num_fg, replace=False)

    # Background: overlap in [BG_THRESH_LO, BG_THRESH_HI), filling the rest
    # of the per-image budget, sampled without replacement.
    is_bg = ((overlaps < cfg.TRAIN.BG_THRESH_HI) &
             (overlaps >= cfg.TRAIN.BG_THRESH_LO))
    bg_inds = np.where(is_bg)[0]
    num_bg = np.minimum(rois_per_image - num_fg, bg_inds.size)
    if bg_inds.size > 0:
        bg_inds = npr.choice(bg_inds, size=num_bg, replace=False)

    # Selected indices: foreground first, then background.
    keep_inds = np.append(fg_inds, bg_inds)

    # Each RoI is labelled with its max-overlap class; everything after the
    # foreground prefix is background (class 0).
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[num_fg:] = 0
    sampled_boxes = roidb['boxes'][keep_inds]

    if 'bbox_targets' in roidb:
        compact_targets = roidb['bbox_targets'][keep_inds, :]
    else:
        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        gt_boxes = roidb['boxes'][gt_inds, :]
        gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
        compact_targets = _compute_targets(
            sampled_boxes, gt_boxes[gt_assignments, :], sampled_labels
        )
    bbox_targets, bbox_inside_weights = _expand_bbox_targets(compact_targets)

    bbox_outside_weights = np.array(
        bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype
    )

    # Scale the rois and prepend the batch index column:
    # (batch_idx, x1, y1, x2, y2).
    scaled_boxes = sampled_boxes * im_scale
    batch_col = batch_idx * blob_utils.ones((scaled_boxes.shape[0], 1))
    sampled_rois = np.hstack((batch_col, scaled_boxes))

    # Base Fast R-CNN blobs.
    blob_dict = {
        'labels_int32': sampled_labels.astype(np.int32, copy=False),
        'rois': sampled_rois,
        'bbox_targets': bbox_targets,
        'bbox_inside_weights': bbox_inside_weights,
        'bbox_outside_weights': bbox_outside_weights,
    }

    # Optional Mask R-CNN blobs.
    if cfg.MODEL.MASK_ON:
        roi_data.mask_rcnn.add_mask_rcnn_blobs(
            blob_dict, sampled_boxes, roidb, im_scale, batch_idx
        )

    # Optional Keypoint R-CNN blobs.
    if cfg.MODEL.KEYPOINTS_ON:
        roi_data.keypoint_rcnn.add_keypoint_rcnn_blobs(
            blob_dict, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx
        )

    return blob_dict
def _compute_targets(ex_rois, gt_rois, labels):
    """Compute bounding-box regression targets for an image.

    Args:
        ex_rois: (N, 4) example boxes.
        gt_rois: (N, 4) ground-truth boxes matched row-by-row to ``ex_rois``.
        labels: length-N class labels, stored as the first output column.

    Returns:
        (N, 5) float32 array: label followed by the 4 regression targets.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4

    deltas = box_utils.bbox_transform_inv(
        ex_rois, gt_rois, cfg.MODEL.BBOX_REG_WEIGHTS
    )
    label_column = labels[:, np.newaxis]
    stacked = np.hstack((label_column, deltas))
    return stacked.astype(np.float32, copy=False)
def _expand_bbox_targets(bbox_target_data):
    """Expand compact regression targets into the 4-of-4*K network layout.

    Bounding-box regression targets are stored in a compact form in the
    roidb: one row per RoI holding (class, 4 targets). This function expands
    them so that only the 4 columns belonging to the RoI's class carry
    non-zero targets. The loss weights are expanded the same way.

    With ``cfg.MODEL.CLS_AGNOSTIC_BBOX_REG`` only two slots exist (background
    and foreground), so every foreground class is written to slot 1.

    Args:
        bbox_target_data: (N, 5) array; column 0 is the class label and
            columns 1..4 are the regression targets.

    Returns:
        bbox_targets (ndarray): N x 4K blob of regression targets.
        bbox_inside_weights (ndarray): N x 4K blob of loss weights.
    """
    class_agnostic = cfg.MODEL.CLS_AGNOSTIC_BBOX_REG
    # bg and fg only when class agnostic, otherwise one slot per class.
    num_bbox_reg_classes = 2 if class_agnostic else cfg.MODEL.NUM_CLASSES

    clss = bbox_target_data[:, 0]
    bbox_targets = blob_utils.zeros((clss.size, 4 * num_bbox_reg_classes))
    bbox_inside_weights = blob_utils.zeros(bbox_targets.shape)

    # Background rows (class 0) keep all-zero targets and weights.
    for ind in np.where(clss > 0)[0]:
        # Using the raw class id in class-agnostic mode would index past the
        # 8 allocated columns for any class >= 2, so collapse to slot 1.
        cls = 1 if class_agnostic else int(clss[ind])
        start = 4 * cls
        end = start + 4
        bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
        bbox_inside_weights[ind, start:end] = (1.0, 1.0, 1.0, 1.0)
    return bbox_targets, bbox_inside_weights
def _add_multilevel_rois(blobs):
    """Distribute the training RoIs over FPN levels.

    By default training RoIs are added for a single feature map level only.
    When using FPN, the RoIs must be assigned to FPN levels according to the
    level assignment heuristic (see modeling.FPN.map_rois_to_fpn_levels).

    Args:
        blobs: dict of minibatch blobs; the per-level blobs are added to it
            by ``fpn.add_multilevel_roi_blobs``.
    """
    lvl_min = cfg.FPN.ROI_MIN_LEVEL
    lvl_max = cfg.FPN.ROI_MAX_LEVEL

    # The plain rois are always distributed; mask / keypoint rois only when
    # the corresponding head is enabled.
    rois_blob_names = ['rois']
    if cfg.MODEL.MASK_ON:
        rois_blob_names.append('mask_rois')
    if cfg.MODEL.KEYPOINTS_ON:
        rois_blob_names.append('keypoint_rois')

    for rois_blob_name in rois_blob_names:
        rois = blobs[rois_blob_name]
        # Rois are (batch_idx, x1, y1, x2, y2): columns 1:5 hold the box
        # coordinates used to pick the target level of each roi.
        target_lvls = fpn.map_rois_to_fpn_levels(
            rois[:, 1:5], lvl_min, lvl_max
        )
        # Adds per-level roi blobs named like <rois_blob_name>_fpn<lvl>.
        fpn.add_multilevel_roi_blobs(
            blobs, rois_blob_name, rois, target_lvls, lvl_min, lvl_max
        )
构建用于 Mask R-CNN 训练的 minibatches.
"""
处理 Mask R-CNN 的 minibatch blobs.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import numpy as np
from core.config import cfg
import utils.blob as blob_utils
import utils.boxes as box_utils
import utils.segms as segm_utils
logger = logging.getLogger(__name__)
def add_mask_rcnn_blobs(blobs, sampled_boxes, roidb, im_scale, batch_idx):
    """Add Mask R-CNN specific blobs to the input blob dictionary.

    Prepares the mask targets by associating one ground-truth mask with each
    training RoI that has a foreground (non-bg) class label.

    Args:
        blobs: per-image blob dict; must already hold 'labels_int32'. The
            keys 'mask_rois', 'roi_has_mask_int32' and 'masks_int32' are
            written.
        sampled_boxes: boxes of the sampled RoIs (unscaled), aligned with
            ``blobs['labels_int32']``. Not modified.
        roidb: a single image's roidb entry (dict).
        im_scale: scale factor applied to the mask RoIs.
        batch_idx: index of the image within the minibatch.
    """
    M = cfg.MRCNN.RESOLUTION
    polys_gt_inds = np.where(
        (roidb['gt_classes'] > 0) & (roidb['is_crowd'] == 0)
    )[0]
    polys_gt = [roidb['segms'][i] for i in polys_gt_inds]
    boxes_from_polys = segm_utils.polys_to_boxes(polys_gt)
    fg_inds = np.where(blobs['labels_int32'] > 0)[0]
    roi_has_mask = blobs['labels_int32'].copy()
    roi_has_mask[roi_has_mask > 0] = 1

    if fg_inds.shape[0] > 0:
        # Class labels of the foreground rois.
        mask_class_labels = blobs['labels_int32'][fg_inds]
        masks = blob_utils.zeros((fg_inds.shape[0], M**2), int32=True)

        # Overlap between every foreground roi and the boxes enclosing the
        # ground-truth polygons. Indexing with an index array copies, so
        # rois_fg can safely be scaled in place further down.
        rois_fg = sampled_boxes[fg_inds]
        overlaps_bbfg_bbpolys = box_utils.bbox_overlaps(
            rois_fg.astype(np.float32, copy=False),
            boxes_from_polys.astype(np.float32, copy=False)
        )
        # Map each fg roi to the mask with the highest overlap
        # (measured by bbox overlap).
        fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1)

        # Add the fg targets.
        for i, (roi_fg, fg_polys_ind) in enumerate(zip(rois_fg, fg_polys_inds)):
            poly_gt = polys_gt[fg_polys_ind]
            # Rasterize the polygon mask within the given fg roi to an
            # M x M binary image.
            mask = segm_utils.polys_to_mask_wrt_box(poly_gt, roi_fg, M)
            # Make sure the mask is binary.
            mask = np.array(mask > 0, dtype=np.int32)
            masks[i, :] = np.reshape(mask, M**2)
    else:
        # No foreground masks: the network cannot handle empty blobs, so
        # provide one anyway. Take the first bg roi, give it an all -1
        # (ignore label) mask and label it with class 0 (bg).
        bg_inds = np.where(blobs['labels_int32'] == 0)[0]
        # rois_fg is actually a background roi here, which is fine because
        # its mask is entirely ignored. Selecting a single row yields a view
        # into sampled_boxes, so copy it: the in-place scaling below must not
        # write through to the caller's array.
        rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1)).copy()
        # A blob of -1's (ignore label).
        masks = -blob_utils.ones((1, M**2), int32=True)
        # Class label 0 (background).
        mask_class_labels = blob_utils.zeros((1, ))
        # Mark that the first roi has a mask.
        roi_has_mask[0] = 1

    if cfg.MRCNN.CLS_SPECIFIC_MASK:
        masks = _expand_to_class_specific_mask_targets(
            masks, mask_class_labels
        )

    # Scale rois_fg and format as (batch_idx, x1, y1, x2, y2).
    rois_fg *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((rois_fg.shape[0], 1))
    rois_fg = np.hstack((repeated_batch_idx, rois_fg))

    # Update the blobs dict with the Mask R-CNN blobs.
    blobs['mask_rois'] = rois_fg
    blobs['roi_has_mask_int32'] = roi_has_mask
    blobs['masks_int32'] = masks
def _expand_to_class_specific_mask_targets(masks, mask_class_labels):
    """Expand masks from (#masks, M ** 2) to (#masks, #classes * M ** 2).

    The expanded layout encodes class-specific mask targets: each mask is
    written into the column range of its class and everything else stays at
    -1 ("don't care" / ignore label).

    Args:
        masks: (#masks, M ** 2) mask targets.
        mask_class_labels: class label for each mask.

    Returns:
        (#masks, cfg.MODEL.NUM_CLASSES * M ** 2) array of mask targets.
    """
    assert masks.shape[0] == mask_class_labels.shape[0]
    M = cfg.MRCNN.RESOLUTION
    mask_len = M**2

    # Start from all "ignore" and fill in one class slot per mask.
    expanded = -blob_utils.ones(
        (masks.shape[0], cfg.MODEL.NUM_CLASSES * mask_len), int32=True
    )

    for row, label in enumerate(mask_class_labels):
        cls = int(label)
        # Background instances are skipped (they only occur when the image
        # has no fg samples).
        if cls <= 0:
            continue
        offset = mask_len * cls
        expanded[row, offset:offset + mask_len] = masks[row, :]

    return expanded
构建用于 Mask R-CNN keypoint 检测分支训练的 minibatches.
"""
处理 Mask R-CNN 中关于 keypoint 检测分支训练的 minibatch blobs.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import numpy as np
from core.config import cfg
import utils.blob as blob_utils
import utils.keypoints as keypoint_utils
logger = logging.getLogger(__name__)
def add_keypoint_rcnn_blobs(blobs, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx):
    """Add Mask R-CNN keypoint specific blobs to the given blobs dictionary.

    Args:
        blobs: per-image blob dict; 'keypoint_rois',
            'keypoint_locations_int32' and 'keypoint_weights' are written.
        roidb: a single image's roidb entry (dict).
        fg_rois_per_image: upper bound on the number of keypoint RoIs.
        fg_inds: accepted for interface compatibility; not used here.
        im_scale: scale factor applied to the keypoint RoIs.
        batch_idx: index of the image within the minibatch.
    """
    # Note: gt_inds must match how they are computed in
    # datasets.json_dataset._merge_proposal_boxes_into_roidb.
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    max_overlaps = roidb['max_overlaps']
    gt_keypoints = roidb['gt_keypoints']

    # A box qualifies when at least one keypoint of its matched ground truth
    # is both visible and located inside the box.
    ind_kp = gt_inds[roidb['box_to_gt_ind_map']]
    inside = _within_box(gt_keypoints[ind_kp, :, :], roidb['boxes'])
    visible = gt_keypoints[ind_kp, 2, :] > 0
    has_visible_kp = np.sum(np.logical_and(visible, inside), axis=1) > 0

    is_kp_fg = np.logical_and(
        max_overlaps >= cfg.TRAIN.FG_THRESH, has_visible_kp
    )
    kp_fg_inds = np.where(is_kp_fg)[0]

    # Subsample without replacement when there are more candidates than the
    # per-image budget.
    num_kp_fg = np.minimum(fg_rois_per_image, kp_fg_inds.size)
    if kp_fg_inds.size > num_kp_fg:
        kp_fg_inds = np.random.choice(
            kp_fg_inds, size=num_kp_fg, replace=False
        )

    sampled_fg_rois = roidb['boxes'][kp_fg_inds]
    box_to_gt_ind_map = roidb['box_to_gt_ind_map'][kp_fg_inds]

    num_keypoints = gt_keypoints.shape[2]
    sampled_keypoints = -np.ones(
        (len(sampled_fg_rois), gt_keypoints.shape[1], num_keypoints),
        dtype=gt_keypoints.dtype
    )
    for row, gt_ind in enumerate(box_to_gt_ind_map):
        if gt_ind >= 0:
            sampled_keypoints[row, :, :] = gt_keypoints[gt_inds[gt_ind], :, :]
            assert np.sum(sampled_keypoints[row, 2, :]) > 0

    heats, weights = keypoint_utils.keypoints_to_heatmap_labels(
        sampled_keypoints, sampled_fg_rois
    )

    # One row per (roi, keypoint) pair.
    flat_shape = (sampled_fg_rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS, 1)
    heats = heats.reshape(flat_shape)
    weights = weights.reshape(flat_shape)

    # Scale the rois and format as (batch_idx, x1, y1, x2, y2).
    sampled_fg_rois *= im_scale
    batch_col = batch_idx * blob_utils.ones((sampled_fg_rois.shape[0], 1))
    sampled_fg_rois = np.hstack((batch_col, sampled_fg_rois))

    blobs['keypoint_rois'] = sampled_fg_rois
    blobs['keypoint_locations_int32'] = heats.astype(np.int32, copy=False)
    blobs['keypoint_weights'] = weights
def finalize_keypoint_minibatch(blobs, valid):
    """Finalize the minibatch once the blobs of all its images are built.

    Writes ``blobs['keypoint_loss_normalizer']`` and decides whether the
    minibatch contains enough visible keypoints to be used.

    Args:
        blobs: minibatch blob dict holding 'keypoint_weights'.
        valid: validity of the minibatch so far.

    Returns:
        Updated validity: still valid only if there is at least one keypoint
        weight and the visible keypoint count exceeds
        cfg.KRCNN.MIN_KEYPOINT_COUNT_FOR_VALID_MINIBATCH.
    """
    min_count = cfg.KRCNN.MIN_KEYPOINT_COUNT_FOR_VALID_MINIBATCH
    kp_weights = blobs['keypoint_weights']
    num_visible_keypoints = np.sum(kp_weights)

    # Normalizer to use if cfg.KRCNN.NORMALIZE_BY_VISIBLE_KEYPOINTS is False.
    # See modeling.model_builder.add_keypoint_losses.
    max_keypoints = (
        cfg.TRAIN.IMS_PER_BATCH * cfg.TRAIN.BATCH_SIZE_PER_IM *
        cfg.TRAIN.FG_FRACTION * cfg.KRCNN.NUM_KEYPOINTS
    )
    blobs['keypoint_loss_normalizer'] = np.array(
        num_visible_keypoints / max_keypoints, dtype=np.float32
    )

    return (
        valid and len(kp_weights) > 0 and num_visible_keypoints > min_count
    )
def _within_box(points, boxes):
"""
确认在给定 box 中的 keypoints.
points: Nx2xK
boxes: Nx4
output: NxK
"""
x_within = np.logical_and(
points[:, 0, :] >= np.expand_dims(boxes[:, 0], axis=1),
points[:, 0, :] <= np.expand_dims(boxes[:, 2], axis=1) )
y_within = np.logical_and(
points[:, 1, :] >= np.expand_dims(boxes[:, 1], axis=1),
points[:, 1, :] <= np.expand_dims(boxes[:, 3], axis=1) )
return np.logical_and(x_within, y_within)
"""
RPN - Region Proposal Networks 构建 minibatch.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import numpy as np
import numpy.random as npr
from core.config import cfg
import roi_data.data_utils as data_utils
import utils.blob as blob_utils
import utils.boxes as box_utils
logger = logging.getLogger(__name__)
def get_rpn_blob_names(is_training=True):
    """Return the names of the blobs used by RPN.

    Args:
        is_training: when true, the 'roidb' blob and the RPN target blobs
            are included.

    Returns:
        list of str: blob names.
    """
    # im_info: (height, width, image scale)
    blob_names = ['im_info']
    if not is_training:
        return blob_names

    # gt boxes: (batch_idx, x1, y1, x2, y2, cls)
    blob_names.append('roidb')

    rpn_target_names = [
        'rpn_labels_int32_wide',
        'rpn_bbox_targets_wide',
        'rpn_bbox_inside_weights_wide',
        'rpn_bbox_outside_weights_wide',
    ]
    if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN:
        # Same format as the single level RPN blobs, but one per FPN level.
        for lvl in range(cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL + 1):
            suffix = '_fpn' + str(lvl)
            blob_names.extend(name + suffix for name in rpn_target_names)
    else:
        # Single level RPN blobs.
        blob_names.extend(rpn_target_names)
    return blob_names
def add_rpn_blobs(blobs, im_scales, roidb):
    """Add the blobs needed to train RPN-only and end-to-end Faster R-CNN.

    Args:
        blobs: dict mapping blob name -> list; filled in place. Lists are
            replaced by concatenated arrays, and 'roidb' is set to a
            serialized, trimmed copy of ``roidb``.
        im_scales: per-image scale factors, indexed like ``roidb``.
        roidb: sequence of per-image roidb entries.

    Returns:
        Always True, since RPN minibatches are valid by design.
    """
    multilevel = cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN
    if multilevel:
        # RPN applied to many feature levels, as in the FPN paper.
        k_max = cfg.FPN.RPN_MAX_LEVEL
        k_min = cfg.FPN.RPN_MIN_LEVEL
        foas = []
        for lvl in range(k_min, k_max + 1):
            field_stride = 2.**lvl
            anchor_sizes = (cfg.FPN.RPN_ANCHOR_START_SIZE * 2.**(lvl - k_min), )
            anchor_aspect_ratios = cfg.FPN.RPN_ASPECT_RATIOS
            foas.append(data_utils.get_field_of_anchors(
                field_stride, anchor_sizes, anchor_aspect_ratios
            ))
        all_anchors = np.concatenate([f.field_of_anchors for f in foas])
    else:
        foa = data_utils.get_field_of_anchors(
            cfg.RPN.STRIDE, cfg.RPN.SIZES, cfg.RPN.ASPECT_RATIOS
        )
        all_anchors = foa.field_of_anchors

    for im_i, entry in enumerate(roidb):
        scale = im_scales[im_i]
        im_height = np.round(entry['height'] * scale)
        im_width = np.round(entry['width'] * scale)
        # Non-crowd ground-truth boxes, scaled to the network input size.
        gt_inds = np.where(
            (entry['gt_classes'] > 0) & (entry['is_crowd'] == 0)
        )[0]
        gt_rois = entry['boxes'][gt_inds, :] * scale

        im_info = np.array([[im_height, im_width, scale]], dtype=np.float32)
        blobs['im_info'].append(im_info)

        # Add the RPN targets.
        if multilevel:
            rpn_blobs = _get_rpn_blobs(
                im_height, im_width, foas, all_anchors, gt_rois
            )
            # _get_rpn_blobs hands back a bare dict instead of a list when
            # there is a single field of anchors (RPN_MIN_LEVEL equal to
            # RPN_MAX_LEVEL); normalize so per-level indexing still works.
            if isinstance(rpn_blobs, dict):
                rpn_blobs = [rpn_blobs]
            for i, lvl in enumerate(range(k_min, k_max + 1)):
                for k, v in rpn_blobs[i].items():
                    blobs[k + '_fpn' + str(lvl)].append(v)
        else:
            # Classical RPN, applied to a single feature level.
            rpn_blobs = _get_rpn_blobs(
                im_height, im_width, [foa], all_anchors, gt_rois
            )
            for k, v in rpn_blobs.items():
                blobs[k].append(v)

    # Turn every non-empty blob list into a single array.
    for k, v in blobs.items():
        if isinstance(v, list) and len(v) > 0:
            blobs[k] = np.concatenate(v)

    # Only these roidb fields are passed on through the 'roidb' blob.
    valid_keys = ['has_visible_keypoints', 'boxes', 'segms', 'seg_areas', 'gt_classes',
                  'gt_overlaps', 'is_crowd', 'box_to_gt_ind_map', 'gt_keypoints']
    minimal_roidb = [
        {k: e[k] for k in valid_keys if k in e} for e in roidb
    ]
    blobs['roidb'] = blob_utils.serialize(minimal_roidb)

    # Always return valid=True, since RPN minibatches are valid by design.
    return True
def _get_rpn_blobs(im_height, im_width, foas, all_anchors, gt_boxes):
    """Compute RPN labels, regression targets and loss weights for one image.

    Args:
        im_height: scaled image height.
        im_width: scaled image width.
        foas: fields of anchors, one per output group; each must expose
            ``field_size`` and ``num_cell_anchors``.
        all_anchors: anchors of all ``foas`` concatenated, one box per row.
        gt_boxes: ground-truth boxes, one per row. May be empty, in which
            case every anchor is treated as having zero overlap.

    Returns:
        A dict of RPN blobs if ``foas`` has exactly one element, otherwise a
        list with one such dict per field of anchors.
    """
    total_anchors = all_anchors.shape[0]
    straddle_thresh = cfg.TRAIN.RPN_STRADDLE_THRESH

    if straddle_thresh >= 0:
        # Only keep anchors inside the image by a margin of straddle_thresh.
        # Set TRAIN.RPN_STRADDLE_THRESH to -1 (or a large value) to keep all
        # anchors.
        inds_inside = np.where(
            (all_anchors[:, 0] >= -straddle_thresh) &
            (all_anchors[:, 1] >= -straddle_thresh) &
            (all_anchors[:, 2] < im_width + straddle_thresh) &
            (all_anchors[:, 3] < im_height + straddle_thresh)
        )[0]
        # Keep only the inside anchors.
        anchors = all_anchors[inds_inside, :]
    else:
        inds_inside = np.arange(all_anchors.shape[0])
        anchors = all_anchors
    num_inside = len(inds_inside)

    logger.debug('total_anchors: {}'.format(total_anchors))
    logger.debug('inds_inside: {}'.format(num_inside))
    logger.debug('anchors.shape: {}'.format(anchors.shape))

    # Compute anchor labels:
    # label=1 is positive, 0 is negative, -1 is don't care (ignore).
    labels = np.empty((num_inside, ), dtype=np.int32)
    labels.fill(-1)
    if len(gt_boxes) > 0:
        # Overlaps between the anchors and the gt boxes.
        anchor_by_gt_overlap = box_utils.bbox_overlaps(anchors, gt_boxes)
        # Map from anchor to the gt box that has the highest overlap.
        anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
        # For each anchor, amount of overlap with its best gt box.
        anchor_to_gt_max = anchor_by_gt_overlap[
            np.arange(num_inside), anchor_to_gt_argmax
        ]

        # Map from gt box to the anchor that has the highest overlap.
        gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
        # For each gt box, amount of overlap with its best anchor.
        gt_to_anchor_max = anchor_by_gt_overlap[
            gt_to_anchor_argmax, np.arange(anchor_by_gt_overlap.shape[1])
        ]
        # All anchors that share the max overlap amount
        # (this includes many ties).
        anchors_with_max_overlap = np.where(
            anchor_by_gt_overlap == gt_to_anchor_max
        )[0]

        # Fg label: for each gt use the anchors with the highest overlap
        # (including ties).
        labels[anchors_with_max_overlap] = 1
        # Fg label: above the IOU threshold.
        labels[anchor_to_gt_max >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1
    else:
        # No ground truth in this image: every anchor overlaps nothing.
        # Binding these keeps the background sampling below working instead
        # of failing on an unbound name.
        anchor_to_gt_argmax = np.zeros((num_inside, ), dtype=np.int64)
        anchor_to_gt_max = np.zeros((num_inside, ), dtype=np.float32)

    # Subsample the positive labels if there are too many.
    num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCH_SIZE_PER_IM)
    fg_inds = np.where(labels == 1)[0]
    if len(fg_inds) > num_fg:
        disable_inds = npr.choice(
            fg_inds, size=(len(fg_inds) - num_fg), replace=False
        )
        labels[disable_inds] = -1
    fg_inds = np.where(labels == 1)[0]

    # Subsample the negative labels if there are too many
    # (samples with replacement, but since the set of bg inds is large most
    # samples will not have repeats).
    num_bg = cfg.TRAIN.RPN_BATCH_SIZE_PER_IM - np.sum(labels == 1)
    bg_inds = np.where(anchor_to_gt_max < cfg.TRAIN.RPN_NEGATIVE_OVERLAP)[0]
    if len(bg_inds) > num_bg:
        enable_inds = bg_inds[npr.randint(len(bg_inds), size=num_bg)]
        labels[enable_inds] = 0
    bg_inds = np.where(labels == 0)[0]

    bbox_targets = np.zeros((num_inside, 4), dtype=np.float32)
    if len(fg_inds) > 0:
        bbox_targets[fg_inds, :] = data_utils.compute_targets(
            anchors[fg_inds, :], gt_boxes[anchor_to_gt_argmax[fg_inds], :]
        )

    # Bbox regression loss has the form:
    #   loss(x) = weight_outside * L(weight_inside * x)
    # Inside weights allow us to set zero loss on an element-wise basis.
    # Bbox regression is only trained on positive examples, so their weights
    # are set to 1.0 and all others to 0.0: inside weights act as a switch.
    bbox_inside_weights = np.zeros((num_inside, 4), dtype=np.float32)
    bbox_inside_weights[labels == 1, :] = (1.0, 1.0, 1.0, 1.0)

    # The bbox regression loss only averages by the number of images in the
    # minibatch, whereas we need to average by the total number of sampled
    # anchors. Outside weights scale each element-wise loss so the final
    # average over the minibatch is correct: outside weights act as weights.
    bbox_outside_weights = np.zeros((num_inside, 4), dtype=np.float32)
    # Uniform weighting of examples (given non-uniform sampling).
    num_examples = np.sum(labels >= 0)
    bbox_outside_weights[labels == 1, :] = 1.0 / num_examples
    bbox_outside_weights[labels == 0, :] = 1.0 / num_examples

    # Map up to the original set of anchors.
    labels = data_utils.unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = data_utils.unmap(
        bbox_targets, total_anchors, inds_inside, fill=0
    )
    bbox_inside_weights = data_utils.unmap(
        bbox_inside_weights, total_anchors, inds_inside, fill=0
    )
    bbox_outside_weights = data_utils.unmap(
        bbox_outside_weights, total_anchors, inds_inside, fill=0
    )

    # Split the generated labels, etc. into one set per field of anchors.
    blobs_out = []
    start_idx = 0
    for foa in foas:
        H = foa.field_size
        W = foa.field_size
        A = foa.num_cell_anchors
        end_idx = start_idx + H * W * A
        _labels = labels[start_idx:end_idx]
        _bbox_targets = bbox_targets[start_idx:end_idx, :]
        _bbox_inside_weights = bbox_inside_weights[start_idx:end_idx, :]
        _bbox_outside_weights = bbox_outside_weights[start_idx:end_idx, :]
        start_idx = end_idx

        # labels output with shape (1, A, height, width)
        _labels = _labels.reshape((1, H, W, A)).transpose(0, 3, 1, 2)
        # bbox_targets output with shape (1, 4 * A, height, width)
        _bbox_targets = _bbox_targets.reshape(
            (1, H, W, A * 4)).transpose(0, 3, 1, 2)
        # bbox_inside_weights output with shape (1, 4 * A, height, width)
        _bbox_inside_weights = _bbox_inside_weights.reshape(
            (1, H, W, A * 4)).transpose(0, 3, 1, 2)
        # bbox_outside_weights output with shape (1, 4 * A, height, width)
        _bbox_outside_weights = _bbox_outside_weights.reshape(
            (1, H, W, A * 4)).transpose(0, 3, 1, 2)
        blobs_out.append(dict(
            rpn_labels_int32_wide=_labels,
            rpn_bbox_targets_wide=_bbox_targets,
            rpn_bbox_inside_weights_wide=_bbox_inside_weights,
            rpn_bbox_outside_weights_wide=_bbox_outside_weights
        ))
    return blobs_out[0] if len(blobs_out) == 1 else blobs_out
"""
计算训练 RetinaNet 网络的 minibatch blobs.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
import logging
import utils.boxes as box_utils
import roi_data.data_utils as data_utils
from core.config import cfg
logger = logging.getLogger(__name__)
def get_retinanet_blob_names(is_training=True):
    """Return the blob names in the order the data loader reads them.

    Notation:
        N = number of images per minibatch
        A = number of anchors = num_scales * num_aspect_ratios
            (for example 9 used in the RetinaNet paper)
        H, W = spatial dimensions (different for each FPN level)
        M = number of positive anchors, i.e. out of all generated anchors
            the ones selected by the positive/negative IoU overlap
            thresholds. These are the anchors the bounding box branch
            regresses on.

    Blobs per FPN level:
        retnet_cls_labels -> labels for the cls branch.
            Shape: N x A x H x W
        retnet_roi_bbox_targets -> targets for the bbox regression branch.
            Shape: M x 4
        retnet_roi_fg_bbox_locs -> bbox regression is done only on fg
            bboxes, while the network prediction has shape
            N x (A * 4) x H x W, so the locations of the positive boxes are
            stored in this blob. Shape: M x 4, each row being
            [img_id, anchor_id, x_loc, y_loc].

    Args:
        is_training: when true, the training target blobs are included.

    Returns:
        list of str: blob names.
    """
    # im_info: (height, width, image scale)
    blob_names = ['im_info']
    assert cfg.FPN.FPN_ON, "RetinaNet uses FPN for dense detection"

    if not is_training:
        return blob_names

    blob_names.extend(['retnet_fg_num', 'retnet_bg_num'])
    # Same format as the RPN blobs, but one per FPN level.
    per_level_prefixes = (
        'retnet_cls_labels_',
        'retnet_roi_bbox_targets_',
        'retnet_roi_fg_bbox_locs_',
    )
    for lvl in range(cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL + 1):
        suffix = 'fpn{}'.format(lvl)
        blob_names.extend(prefix + suffix for prefix in per_level_prefixes)
    return blob_names
def add_retinanet_blobs(blobs, im_scales, roidb, image_width, image_height):
    """Add the RetinaNet training blobs.

    Args:
        blobs: dict mapping blob name -> list; filled in place. Lists are
            replaced by concatenated arrays before returning.
        im_scales: per-image scale factors, indexed like ``roidb``.
        roidb: sequence of per-image roidb entries; every image must have at
            least one non-crowd ground-truth box.
        image_width: width forwarded to ``_get_retinanet_blobs``.
        image_height: height forwarded to ``_get_retinanet_blobs``.

    Returns:
        Always True.
    """
    # RetinaNet is applied to many feature levels, as in the FPN paper.
    k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL
    scales_per_octave = cfg.RETINANET.SCALES_PER_OCTAVE
    aspect_ratios = cfg.RETINANET.ASPECT_RATIOS
    num_aspect_ratios = len(aspect_ratios)
    anchor_scale = cfg.RETINANET.ANCHOR_SCALE

    # Get the anchors from all levels for all scales / aspect ratios.
    foas = []
    for lvl in range(k_min, k_max + 1):
        stride = 2. ** lvl
        for octave in range(scales_per_octave):
            octave_scale = 2 ** (octave / float(scales_per_octave))
            for idx, ratio in enumerate(aspect_ratios):
                foas.append(data_utils.get_field_of_anchors(
                    stride,
                    (stride * octave_scale * anchor_scale, ),
                    (ratio, ),
                    octave,
                    idx
                ))
    all_anchors = np.concatenate([f.field_of_anchors for f in foas])

    # loc_stride: 80 * 4 if cls_specific else 4
    # (4 coordinates corresponding to one bbox prediction).
    loc_stride = 4
    if cfg.RETINANET.CLASS_SPECIFIC_BBOX:
        loc_stride *= (cfg.MODEL.NUM_CLASSES - 1)

    blobs['retnet_fg_num'], blobs['retnet_bg_num'] = 0.0, 0.0
    for im_i, entry in enumerate(roidb):
        scale = im_scales[im_i]
        im_height = np.round(entry['height'] * scale)
        im_width = np.round(entry['width'] * scale)
        gt_inds = np.where(
            (entry['gt_classes'] > 0) & (entry['is_crowd'] == 0)
        )[0]
        assert len(gt_inds) > 0, 'Empty ground truth empty for image is not allowed. Please check.'

        gt_rois = entry['boxes'][gt_inds, :] * scale
        gt_classes = entry['gt_classes'][gt_inds]

        blobs['im_info'].append(
            np.array([[im_height, im_width, scale]], dtype=np.float32)
        )

        retinanet_blobs, fg_num, bg_num = _get_retinanet_blobs(
            foas, all_anchors, gt_rois, gt_classes, image_width, image_height)
        for foa, foa_blobs in zip(foas, retinanet_blobs):
            # The way it stacks is:
            # [[anchors for image1] + [anchors for images 2]]
            level = int(np.log2(foa.stride))
            for k, v in foa_blobs.items():
                if k == 'retnet_roi_fg_bbox_locs':
                    v[:, 0] = im_i
                    anchor_ind = foa.octave * num_aspect_ratios + foa.aspect
                    # v[:, 1] is the class label [range 0-80] if we do
                    # class-specific bbox, otherwise it is 0. In the class
                    # specific case the location of the current anchor is
                    # class_label * 4, offset by the anchor index.
                    v[:, 1] *= 4
                    v[:, 1] += loc_stride * anchor_ind
                blobs['{}_fpn{}'.format(k, level)].append(v)
        blobs['retnet_fg_num'] += fg_num
        blobs['retnet_bg_num'] += bg_num

    blobs['retnet_fg_num'] = blobs['retnet_fg_num'].astype(np.float32)
    blobs['retnet_bg_num'] = blobs['retnet_bg_num'].astype(np.float32)

    N = len(roidb)
    for k, v in blobs.items():
        if not (isinstance(v, list) and len(v) > 0):
            continue
        if 'retnet_cls_labels' in k:
            # For the cls branch labels [per fpn level] the blob is still a
            # list of length N x A at this point, where N = num_images and
            # A = num_anchors (for example N = 2, A = 9). Each element has
            # shape 1 x 1 x H x W, H and W being the spatial dimensions of
            # the current fpn level, ordered as
            # [[a0, ..., a9], [a0, ..., a9]]. The network predicts something
            # like 2 x (9 * 80) x H x W, so first concatenate the anchors of
            # each image and then the images, giving 2 x 9 x H x W.
            A = len(v) // N
            per_image = [
                np.concatenate(v[i: i + A], axis=1)
                for i in range(0, len(v), A)
            ]
            blobs[k] = np.concatenate(per_image, axis=0)
        else:
            # For the bbox branch elements [per FPN level] the targets and
            # the fg box locations have shape M x 4, M being the number of
            # fg locations of a given image at the current FPN level. The
            # list is ordered as [[a0, ..., a9], [a0, ..., a9]]; concatenate
            # everything to form M x 4.
            blobs[k] = np.concatenate(v, axis=0)
    return True
def _get_retinanet_blobs(
        foas, all_anchors, gt_boxes, gt_classes, im_width, im_height):
    """Compute RetinaNet labels and regression targets for one image.

    Args:
        foas: fields of anchors; each must expose ``field_size`` and
            ``stride`` and contributes field_size ** 2 anchors.
        all_anchors: anchors of all ``foas`` concatenated, one box per row.
        gt_boxes: ground-truth boxes, one per row. May be empty, in which
            case every anchor is treated as having zero overlap.
        gt_classes: class label of each ground-truth box.
        im_width: image width used to crop the label maps.
        im_height: image height used to crop the label maps.

    Returns:
        blobs_out: list with one dict per field of anchors holding
            'retnet_cls_labels', 'retnet_roi_bbox_targets' and
            'retnet_roi_fg_bbox_locs'.
        out_num_fg: float32 array of shape (1,), foreground count + 1.
        out_num_bg: array of shape (1,), background normalizer.
    """
    total_anchors = all_anchors.shape[0]
    logger.debug('Getting mad blobs: im_height {} im_width: {}'.format(
        im_height, im_width))

    # All anchors are kept (no "inside the image" filtering here).
    inds_inside = np.arange(all_anchors.shape[0])
    anchors = all_anchors
    num_inside = len(inds_inside)

    logger.debug('total_anchors: {}'.format(total_anchors))
    logger.debug('inds_inside: {}'.format(num_inside))
    logger.debug('anchors.shape: {}'.format(anchors.shape))

    # Compute anchor labels:
    # label >= 1 is a positive class, 0 is negative, -1 is don't care.
    labels = np.empty((num_inside, ), dtype=np.float32)
    labels.fill(-1)
    if len(gt_boxes) > 0:
        # Overlaps between the anchors and the gt boxes.
        anchor_by_gt_overlap = box_utils.bbox_overlaps(anchors, gt_boxes)
        # Map from anchor to the gt box that has the highest overlap.
        anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
        # For each anchor, amount of overlap with its best gt box.
        anchor_to_gt_max = anchor_by_gt_overlap[
            np.arange(num_inside), anchor_to_gt_argmax]

        # Map from gt box to the anchor that has the highest overlap.
        gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
        # For each gt box, amount of overlap with its best anchor.
        gt_to_anchor_max = anchor_by_gt_overlap[
            gt_to_anchor_argmax, np.arange(anchor_by_gt_overlap.shape[1])]
        # All anchors that share the max overlap amount
        # (this includes many ties).
        anchors_with_max_overlap = np.where(
            anchor_by_gt_overlap == gt_to_anchor_max)[0]

        # Fg label: for each gt use the anchors with the highest overlap
        # (including ties).
        gt_inds = anchor_to_gt_argmax[anchors_with_max_overlap]
        labels[anchors_with_max_overlap] = gt_classes[gt_inds]
        # Fg label: above the IOU threshold.
        inds = anchor_to_gt_max >= cfg.RETINANET.POSITIVE_OVERLAP
        gt_inds = anchor_to_gt_argmax[inds]
        labels[inds] = gt_classes[gt_inds]
    else:
        # No ground truth: every anchor overlaps nothing. Binding these keeps
        # the code below from failing on an unbound name.
        anchor_to_gt_argmax = np.zeros((num_inside, ), dtype=np.int64)
        anchor_to_gt_max = np.zeros((num_inside, ), dtype=np.float32)

    fg_inds = np.where(labels >= 1)[0]
    bg_inds = np.where(anchor_to_gt_max < cfg.RETINANET.NEGATIVE_OVERLAP)[0]
    labels[bg_inds] = 0
    num_fg, num_bg = len(fg_inds), len(bg_inds)

    bbox_targets = np.zeros((num_inside, 4), dtype=np.float32)
    if num_fg > 0:
        bbox_targets[fg_inds, :] = data_utils.compute_targets(
            anchors[fg_inds, :], gt_boxes[anchor_to_gt_argmax[fg_inds], :])

    # Map up to the original set of anchors.
    labels = data_utils.unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = data_utils.unmap(
        bbox_targets, total_anchors, inds_inside, fill=0)

    # Number of foreground classes; loop invariant.
    num_classes = cfg.MODEL.NUM_CLASSES - 1

    # Split the generated labels, etc. into one set per field of anchors.
    blobs_out = []
    start_idx = 0
    for foa in foas:
        H = foa.field_size
        W = foa.field_size
        end_idx = start_idx + H * W
        _labels = labels[start_idx:end_idx]
        _bbox_targets = bbox_targets[start_idx:end_idx, :]
        start_idx = end_idx

        # labels output with shape (1, 1, height, width)
        _labels = _labels.reshape((1, 1, H, W))
        # bbox_targets output with shape (1, 4, height, width)
        _bbox_targets = _bbox_targets.reshape(
            (1, H, W, 4)).transpose(0, 3, 1, 2)
        stride = foa.stride
        w = int(im_width / stride)
        h = int(im_height / stride)

        # Data for the select_smooth_l1 loss: one row per positive location.
        inds_4d = np.where(_labels > 0)
        # np.where returns one index array per dimension, so the number of
        # positives is the length of any one of them, not of the tuple.
        M = len(inds_4d[0])
        _roi_bbox_targets = np.zeros((0, 4))
        _roi_fg_bbox_locs = np.zeros((0, 4))
        if M > 0:
            im_inds, y, x = inds_4d[0], inds_4d[2], inds_4d[3]
            _roi_bbox_targets = np.zeros((M, 4))
            _roi_fg_bbox_locs = np.zeros((M, 4))
            lbls = _labels[im_inds, :, y, x]
            for i, lbl in enumerate(lbls):
                # Zero-based class index, or 0 when the box regression is
                # shared between classes.
                cls_offset = lbl[0] - 1
                if not cfg.RETINANET.CLASS_SPECIFIC_BBOX:
                    cls_offset = 0
                assert cls_offset >= 0 and cls_offset < num_classes, 'label out of the range'
                _roi_bbox_targets[i, :] = _bbox_targets[:, :, y[i], x[i]]
                # Column 0 (image index) is filled in by the caller.
                _roi_fg_bbox_locs[i, :] = np.array(
                    [[0, cls_offset, y[i], x[i]]])
        blobs_out.append(dict(
            retnet_cls_labels=_labels[:, :, 0:h, 0:w].astype(np.int32),
            retnet_roi_bbox_targets=_roi_bbox_targets.astype(np.float32),
            retnet_roi_fg_bbox_locs=_roi_fg_bbox_locs.astype(np.float32),
        ))

    out_num_fg = np.array([num_fg + 1.0], dtype=np.float32)
    out_num_bg = (np.array([num_bg + 1.0]) * (cfg.MODEL.NUM_CLASSES - 1) +
                  out_num_fg * (cfg.MODEL.NUM_CLASSES - 2))
    return blobs_out, out_num_fg, out_num_bg