发布2018-05-17 10:19:34
roi_data 包中的各个模块分别负责构建并处理对应模型训练所需的 minibatch blobs:

  • fast_rcnn.py
  • mask_rcnn.py
  • keypoint_rcnn.py
  • rpn.py
  • retinanet.py

1. fast_rcnn.py

构建用于 Fast R-CNN 训练的 minibatches.

处理 Fast R-CNN 所涉及的 minibatch blobs.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging
import numpy as np
import numpy.random as npr

from core.config import cfg
import modeling.FPN as fpn
import roi_data.keypoint_rcnn
import roi_data.mask_rcnn
import utils.blob as blob_utils
import utils.boxes as box_utils

logger = logging.getLogger(__name__)

def get_fast_rcnn_blob_names(is_training=True):
    """Return the names of the blobs consumed by Fast R-CNN style models.

    Args:
        is_training (bool): when True, the training-only target blobs
            (labels, bbox targets/weights and, if enabled in cfg, the mask
            and keypoint blobs) are included.

    Returns:
        list of str: blob names in the order they are appended below.
    """
    # rois blob: R RoIs (regions of interest). Each RoI is a 5-tuple
    # (batch_idx, x1, y1, x2, y2):
    #   - batch_idx: index of the image within the batch
    #   - (x1, y1, x2, y2): the box rectangle
    blob_names = ['rois']
    if is_training:
        # labels_int32 blob:
        #   R categorical labels in [0, ..., K] for K foreground classes plus
        #   background (K foreground classes + 1 background class).
        blob_names += ['labels_int32']
        # bbox_targets blob:
        #   R bounding-box regression targets with 4 targets per class
        blob_names += ['bbox_targets']
        # bbox_inside_weights blob:
        #   At most 4 targets per roi are active; this binary vector
        #   specifies the subset of active targets.
        blob_names += ['bbox_inside_weights']
        blob_names += ['bbox_outside_weights']
    if is_training and cfg.MODEL.MASK_ON:
        # 'mask_rois':
        #   RoIs sampled for training the mask prediction branch.
        #   Shape is (#masks, 5) in format (batch_idx, x1, y1, x2, y2).
        blob_names += ['mask_rois']
        # 'roi_has_mask':
        #   Binary labels for the RoIs specified in 'rois' indicating whether
        #   each RoI has a mask.
        #   Note: in some cases a *bg* RoI gets an all -1 (ignore) mask; this
        #   happens when there are no fg RoIs to sample.
        #   Shape is (batchsize).
        blob_names += ['roi_has_mask_int32']
        # 'masks_int32':
        #   Binary masks for the RoIs specified in 'mask_rois'.
        #   Shape is (#fg, M * M) where M is the ground truth mask size.
        blob_names += ['masks_int32']
    if is_training and cfg.MODEL.KEYPOINTS_ON:
        # 'keypoint_rois':
        #   RoIs sampled for training the keypoint prediction branch.
        #   Shape is (#instances, 5) in format (batch_idx, x1, y1, x2, y2).
        blob_names += ['keypoint_rois']
        # 'keypoint_locations_int32':
        #   Index of each keypoint in a KRCNN.HEATMAP_SIZE**2 sized array.
        #   Shape is (#instances). Used in SoftmaxWithLoss.
        blob_names += ['keypoint_locations_int32']
        # 'keypoint_weights':
        #   Weight assigned to each target in 'keypoint_locations_int32'.
        #   Shape is (#instances). Used in SoftmaxWithLoss.
        blob_names += ['keypoint_weights']
        # 'keypoint_loss_normalizer':
        #   Optional normalization factor, used when
        #   cfg.KRCNN.NORMALIZE_BY_VISIBLE_KEYPOINTS is False.
        blob_names += ['keypoint_loss_normalizer']
    if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS:
        # Support for FPN multi-level rois without bbox reg isn't implemented
        # (... and may never be implemented)
        k_max = cfg.FPN.ROI_MAX_LEVEL
        k_min = cfg.FPN.ROI_MIN_LEVEL
        # Same format as rois blob, but one per FPN level
        for lvl in range(k_min, k_max + 1):
            blob_names += ['rois_fpn' + str(lvl)]
        blob_names += ['rois_idx_restore_int32']
        if is_training:
            if cfg.MODEL.MASK_ON:
                for lvl in range(k_min, k_max + 1):
                    blob_names += ['mask_rois_fpn' + str(lvl)]
                blob_names += ['mask_rois_idx_restore_int32']
            if cfg.MODEL.KEYPOINTS_ON:
                for lvl in range(k_min, k_max + 1):
                    blob_names += ['keypoint_rois_fpn' + str(lvl)]
                blob_names += ['keypoint_rois_idx_restore_int32']
    return blob_names

def add_fast_rcnn_blobs(blobs, im_scales, roidb):
    """Add the blobs needed for training Fast R-CNN style models.

    Args:
        blobs (dict): blob name -> list; the per-image arrays are appended to
            these lists and each non-empty list is then replaced by its
            concatenation.
        im_scales: per-image scale factors, indexed like ``roidb``.
        roidb (list): one roidb entry per minibatch image.

    Returns:
        bool: True unless the keypoint finalization step reports the
        minibatch as invalid.
    """
    # Sample training RoIs from each image and append them to the blob lists
    for im_i, entry in enumerate(roidb):
        frcn_blobs = _sample_rois(entry, im_scales[im_i], im_i)
        for k, v in frcn_blobs.items():
            blobs[k].append(v)
    # Concatenate the blob lists into tensors
    for k, v in blobs.items():
        if isinstance(v, list) and len(v) > 0:
            blobs[k] = np.concatenate(v)
    # Add FPN multilevel training RoIs, if configured
    if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS:
        _add_multilevel_rois(blobs)

    # Sanity checks that can only run once all minibatch images are collated
    valid = True
    if cfg.MODEL.KEYPOINTS_ON:
        valid = roi_data.keypoint_rcnn.finalize_keypoint_minibatch(blobs, valid)

    return valid

def _sample_rois(roidb, im_scale, batch_idx):
    """Randomly sample foreground and background RoIs for one image.

    Args:
        roidb (dict): roidb entry of a single image. Keys read here:
            'max_overlaps', 'max_classes', 'boxes', and either 'bbox_targets'
            or 'gt_classes' together with 'box_to_gt_ind_map'.
        im_scale: factor applied to the sampled boxes.
        batch_idx: index of the image in the minibatch; written to column 0
            of the rois blob.

    Returns:
        dict: blob name -> array holding this image's contribution.
    """
    rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM)
    fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))
    max_overlaps = roidb['max_overlaps']

    # Foreground RoIs are those with overlap >= FG_THRESH
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
    # Guard against the case where the image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size)
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False)

    # Background RoIs are those with overlap in [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
                       (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
    # Number of background RoIs to take from this image (guarding against
    # there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size)
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False)

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Label is the class each RoI has max overlap with
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[fg_rois_per_this_image:] = 0  # Label bg RoIs with class 0
    sampled_boxes = roidb['boxes'][keep_inds]

    if 'bbox_targets' not in roidb:
        # No precomputed targets: derive them from the assigned gt boxes
        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        gt_boxes = roidb['boxes'][gt_inds, :]
        gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
        bbox_targets = _compute_targets(sampled_boxes,
                                        gt_boxes[gt_assignments, :],
                                        sampled_labels)
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(bbox_targets)
    else:
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(
            roidb['bbox_targets'][keep_inds, :])

    bbox_outside_weights = np.array(bbox_inside_weights > 0,
                                    dtype=bbox_inside_weights.dtype)

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_rois = sampled_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((sampled_rois.shape[0], 1))
    sampled_rois = np.hstack((repeated_batch_idx, sampled_rois))

    # Base Fast R-CNN blobs
    blob_dict = dict(labels_int32=sampled_labels.astype(np.int32, copy=False),
                     rois=sampled_rois,
                     bbox_targets=bbox_targets,
                     bbox_inside_weights=bbox_inside_weights,
                     bbox_outside_weights=bbox_outside_weights)

    # Optionally add Mask R-CNN blobs
    if cfg.MODEL.MASK_ON:
        roi_data.mask_rcnn.add_mask_rcnn_blobs(
            blob_dict, sampled_boxes, roidb, im_scale, batch_idx)

    # Optionally add Keypoint R-CNN blobs
    if cfg.MODEL.KEYPOINTS_ON:
        roi_data.keypoint_rcnn.add_keypoint_rcnn_blobs(
            blob_dict, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx)

    return blob_dict

def _compute_targets(ex_rois, gt_rois, labels):
    """Compute bounding-box regression targets for an image.

    Args:
        ex_rois (ndarray): N x 4 example boxes.
        gt_rois (ndarray): N x 4 ground-truth boxes, one per example box.
        labels (ndarray): length-N labels, prepended as column 0 of the result.

    Returns:
        ndarray: float32 array with the label in column 0 followed by the
        regression targets returned by ``box_utils.bbox_transform_inv``.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4

    targets = box_utils.bbox_transform_inv(ex_rois, gt_rois,
                                           cfg.MODEL.BBOX_REG_WEIGHTS)
    labeled_targets = np.hstack((labels[:, np.newaxis], targets))
    return labeled_targets.astype(np.float32, copy=False)

def _expand_bbox_targets(bbox_target_data):
    """Expand compact bounding-box regression targets.

    Regression targets are stored in a compact form in the roidb. This
    function expands them into the 4-of-4*K representation used by the
    network (i.e. only one class has non-zero targets). The loss weights are
    expanded in the same way.

    Args:
        bbox_target_data (ndarray): compact targets; column 0 holds the class
            and columns 1: hold the 4 regression targets.

    Returns:
        bbox_targets (ndarray): N x 4K blob of regression targets
        bbox_inside_weights (ndarray): N x 4K blob of loss weights
    """
    num_bbox_reg_classes = cfg.MODEL.NUM_CLASSES
    # NOTE(review): the guard was missing from the extracted listing; upstream
    # Detectron uses cfg.MODEL.CLS_AGNOSTIC_BBOX_REG here -- confirm.
    if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG:
        num_bbox_reg_classes = 2  # bg and fg

    clss = bbox_target_data[:, 0]
    bbox_targets = blob_utils.zeros((clss.size, 4 * num_bbox_reg_classes))
    bbox_inside_weights = blob_utils.zeros(bbox_targets.shape)
    # Only rows with a foreground class (> 0) receive targets and weights
    inds = np.where(clss > 0)[0]
    for ind in inds:
        cls = int(clss[ind])
        start = 4 * cls
        end = start + 4
        bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
        bbox_inside_weights[ind, start:end] = (1.0, 1.0, 1.0, 1.0)
    return bbox_targets, bbox_inside_weights

def _add_multilevel_rois(blobs):
    """Distribute the training RoIs over the FPN levels.

    By default training RoIs are added for a single feature map level only.
    When using FPN, the RoIs must be assigned to the different FPN levels
    according to the level-assignment heuristic
    (see: modeling.FPN.map_rois_to_fpn_levels).

    Args:
        blobs (dict): minibatch blobs holding the 'rois' blob and, when the
            mask / keypoint branches are enabled, 'mask_rois' /
            'keypoint_rois'.
    """
    lvl_min = cfg.FPN.ROI_MIN_LEVEL
    lvl_max = cfg.FPN.ROI_MAX_LEVEL

    def _distribute_rois_over_fpn_levels(rois_blob_name):
        """Distribute the rois of one blob over the different FPN levels."""
        # Get the target level for each roi.
        # Blob rois are in (batch_idx, x1, y1, x2, y2) format, hence the box
        # coordinates are taken from columns 1:5.
        target_lvls = fpn.map_rois_to_fpn_levels(blobs[rois_blob_name][:, 1:5],
                                                 lvl_min, lvl_max)
        # Add per FPN level roi blobs named like: <rois_blob_name>_fpn<lvl>
        # NOTE(review): this call was missing from the extracted listing;
        # upstream Detectron uses fpn.add_multilevel_roi_blobs -- confirm the
        # helper name and argument order against modeling/FPN.py.
        fpn.add_multilevel_roi_blobs(blobs, rois_blob_name,
                                     blobs[rois_blob_name], target_lvls,
                                     lvl_min, lvl_max)

    _distribute_rois_over_fpn_levels('rois')
    if cfg.MODEL.MASK_ON:
        _distribute_rois_over_fpn_levels('mask_rois')
    if cfg.MODEL.KEYPOINTS_ON:
        _distribute_rois_over_fpn_levels('keypoint_rois')

2. mask_rcnn.py

构建 Mask R-CNN 训练的 minibatches.

处理 Mask R-CNN 的 minibatch blobs.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging
import numpy as np

from core.config import cfg
import utils.blob as blob_utils
import utils.boxes as box_utils
import utils.segms as segm_utils

logger = logging.getLogger(__name__)

def add_mask_rcnn_blobs(blobs, sampled_boxes, roidb, im_scale, batch_idx):
    """Add Mask R-CNN specific blobs to the input blob dictionary.

    Prepares the mask targets by associating one gt mask with each training
    roi that has a fg (non-bg) class label.

    Args:
        blobs (dict): blobs of the current image; 'labels_int32' is read and
            'mask_rois', 'roi_has_mask_int32' and 'masks_int32' are written.
        sampled_boxes (ndarray): sampled boxes, aligned with 'labels_int32'.
        roidb (dict): roidb entry; 'gt_classes', 'is_crowd' and 'segms' are
            read.
        im_scale: factor applied to the mask rois.
        batch_idx: index of the image in the minibatch.
    """
    # NOTE(review): the definition of M was missing from the extracted
    # listing; upstream Detectron reads cfg.MRCNN.RESOLUTION -- confirm.
    M = cfg.MRCNN.RESOLUTION
    polys_gt_inds = np.where((roidb['gt_classes'] > 0)
                             & (roidb['is_crowd'] == 0))[0]
    polys_gt = [roidb['segms'][i] for i in polys_gt_inds]
    boxes_from_polys = segm_utils.polys_to_boxes(polys_gt)
    fg_inds = np.where(blobs['labels_int32'] > 0)[0]
    roi_has_mask = blobs['labels_int32'].copy()
    roi_has_mask[roi_has_mask > 0] = 1

    if fg_inds.shape[0] > 0:
        # Class labels for the foreground rois
        mask_class_labels = blobs['labels_int32'][fg_inds]
        masks = blob_utils.zeros((fg_inds.shape[0], M**2), int32=True)

        # Overlap between every foreground roi and the bounding boxes
        # enclosing each segmentation
        rois_fg = sampled_boxes[fg_inds]
        overlaps_bbfg_bbpolys = box_utils.bbox_overlaps(
            rois_fg.astype(np.float32, copy=False),
            boxes_from_polys.astype(np.float32, copy=False))

        # Map each fg roi to the index of the mask with the highest overlap
        # (measured by bbox overlap)
        fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1)

        # Add fg targets
        for i in range(rois_fg.shape[0]):
            fg_polys_ind = fg_polys_inds[i]
            poly_gt = polys_gt[fg_polys_ind]
            roi_fg = rois_fg[i]

            # Rasterize the part of the polygon mask inside the given fg roi
            # to an M x M binary image
            mask = segm_utils.polys_to_mask_wrt_box(poly_gt, roi_fg, M)
            # Ensure the mask is binary
            mask = np.array(mask > 0, dtype=np.int32)
            masks[i, :] = np.reshape(mask, M**2)
    else:  # There are no fg masks
        # The network cannot handle empty blobs, so a mask must be provided.
        # Take the first bg roi, give it an all -1's mask (ignore label) and
        # label it with class 0 (bg).
        bg_inds = np.where(blobs['labels_int32'] == 0)[0]
        # rois_fg is actually one background roi, but that's ok because ...
        # Copy it: integer indexing returns a view, and the in-place scaling
        # below would otherwise modify the caller's sampled_boxes.
        rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1)).copy()
        # Give it a -1's blob (ignore label)
        masks = -blob_utils.ones((1, M**2), int32=True)
        # Label it with class = 0 (background)
        mask_class_labels = blob_utils.zeros((1, ))
        # Mark that the first roi has a mask
        roi_has_mask[0] = 1

    # NOTE(review): the guard was missing from the extracted listing; upstream
    # Detectron uses cfg.MRCNN.CLS_SPECIFIC_MASK -- confirm.
    if cfg.MRCNN.CLS_SPECIFIC_MASK:
        masks = _expand_to_class_specific_mask_targets(masks,
                                                       mask_class_labels)

    # Scale rois_fg and format as (batch_idx, x1, y1, x2, y2)
    rois_fg *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((rois_fg.shape[0], 1))
    rois_fg = np.hstack((repeated_batch_idx, rois_fg))

    # Update blobs dict with Mask R-CNN blobs
    blobs['mask_rois'] = rois_fg
    blobs['roi_has_mask_int32'] = roi_has_mask
    blobs['masks_int32'] = masks

def _expand_to_class_specific_mask_targets(masks, mask_class_labels):
    """Expand masks from shape (#masks, M ** 2) to (#masks, #classes * M ** 2)
    to encode class-specific mask targets.

    Args:
        masks (ndarray): (#masks, M ** 2) mask targets.
        mask_class_labels (ndarray): class label of each mask.

    Returns:
        ndarray: int32 targets filled with -1 except for the M ** 2 slot of
        each foreground mask's own class.
    """
    assert masks.shape[0] == mask_class_labels.shape[0]
    # NOTE(review): the definition of M was missing from the extracted
    # listing; upstream Detectron reads cfg.MRCNN.RESOLUTION -- confirm.
    M = cfg.MRCNN.RESOLUTION

    # Target values of -1 are "don't care" / ignore labels
    mask_targets = -blob_utils.ones((masks.shape[0],
                                     cfg.MODEL.NUM_CLASSES * M**2),
                                    int32=True)

    for i in range(masks.shape[0]):
        cls = int(mask_class_labels[i])
        start = M**2 * cls
        end = start + M**2
        # Ignore background instances
        # (only happens when there are no fg samples in an image)
        if cls > 0:
            mask_targets[i, start:end] = masks[i, :]

    return mask_targets

3. keypoint_rcnn.py

构建 Mask R-CNN 关于 keypoints 训练的 minibatches.

处理 Mask R-CNN 中关于 keypoint 检测分支训练的 minibatch blobs.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging
import numpy as np

from core.config import cfg
import utils.blob as blob_utils
import utils.keypoints as keypoint_utils

logger = logging.getLogger(__name__)

def add_keypoint_rcnn_blobs(blobs, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx):
    """Add Mask R-CNN keypoint specific blobs to the given blobs dictionary.

    Args:
        blobs (dict): blobs of the current image; 'keypoint_rois',
            'keypoint_locations_int32' and 'keypoint_weights' are written.
        roidb (dict): roidb entry; 'gt_classes', 'max_overlaps',
            'gt_keypoints', 'box_to_gt_ind_map' and 'boxes' are read.
        fg_rois_per_image (int): upper bound on the number of sampled rois.
        fg_inds: not read by this function.
        im_scale: factor applied to the sampled rois.
        batch_idx: index of the image in the minibatch.
    """
    # Note: gt_inds must match how they're computed in
    # datasets.json_dataset._merge_proposal_boxes_into_roidb
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    max_overlaps = roidb['max_overlaps']
    gt_keypoints = roidb['gt_keypoints']

    # A roi qualifies when it overlaps a gt box enough and at least one
    # visible keypoint of its assigned gt lies inside the roi.
    ind_kp = gt_inds[roidb['box_to_gt_ind_map']]
    within_box = _within_box(gt_keypoints[ind_kp, :, :], roidb['boxes'])
    vis_kp = gt_keypoints[ind_kp, 2, :] > 0
    is_visible = np.sum(np.logical_and(vis_kp, within_box), axis=1) > 0
    kp_fg_inds = np.where(np.logical_and(max_overlaps >= cfg.TRAIN.FG_THRESH,
                                         is_visible))[0]

    kp_fg_rois_per_this_image = np.minimum(fg_rois_per_image, kp_fg_inds.size)
    if kp_fg_inds.size > kp_fg_rois_per_this_image:
        # size= is required: without it np.random.choice returns one scalar
        # instead of a capped sample of indices.
        kp_fg_inds = np.random.choice(kp_fg_inds,
                                      size=kp_fg_rois_per_this_image,
                                      replace=False)

    sampled_fg_rois = roidb['boxes'][kp_fg_inds]
    box_to_gt_ind_map = roidb['box_to_gt_ind_map'][kp_fg_inds]

    num_keypoints = gt_keypoints.shape[2]
    sampled_keypoints = -np.ones((len(sampled_fg_rois),
                                  gt_keypoints.shape[1], num_keypoints),
                                 dtype=gt_keypoints.dtype)
    for ii, ind in enumerate(box_to_gt_ind_map):
        if ind >= 0:
            sampled_keypoints[ii, :, :] = gt_keypoints[gt_inds[ind], :, :]
            assert np.sum(sampled_keypoints[ii, 2, :]) > 0

    heats, weights = keypoint_utils.keypoints_to_heatmap_labels(
        sampled_keypoints, sampled_fg_rois)

    # One row per (roi, keypoint) pair
    shape = (sampled_fg_rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS, 1)
    heats = heats.reshape(shape)
    weights = weights.reshape(shape)

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_fg_rois *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_fg_rois.shape[0], 1))
    sampled_fg_rois = np.hstack((repeated_batch_idx, sampled_fg_rois))

    blobs['keypoint_rois'] = sampled_fg_rois
    blobs['keypoint_locations_int32'] = heats.astype(np.int32, copy=False)
    blobs['keypoint_weights'] = weights

def finalize_keypoint_minibatch(blobs, valid):
    """Finalize the minibatch after the blobs of all minibatch images have
    been collated.

    Args:
        blobs (dict): collated blobs; 'keypoint_weights' is read and
            'keypoint_loss_normalizer' is written.
        valid (bool): validity of the minibatch so far.

    Returns:
        bool: ``valid`` and-ed with the keypoint checks below.
    """
    # NOTE(review): the definition of min_count was missing from the extracted
    # listing; upstream Detectron reads
    # cfg.KRCNN.MIN_KEYPOINT_COUNT_FOR_VALID_MINIBATCH -- confirm.
    min_count = cfg.KRCNN.MIN_KEYPOINT_COUNT_FOR_VALID_MINIBATCH
    num_visible_keypoints = np.sum(blobs['keypoint_weights'])
    valid = (valid and len(blobs['keypoint_weights']) > 0 and
             num_visible_keypoints > min_count)
    # Normalizer to use if cfg.KRCNN.NORMALIZE_BY_VISIBLE_KEYPOINTS is False.
    # See modeling.model_builder.add_keypoint_losses
    # NOTE(review): the denominator was truncated in the extracted listing;
    # the factors below follow upstream Detectron -- confirm.
    norm = num_visible_keypoints / (
        cfg.TRAIN.IMS_PER_BATCH * cfg.TRAIN.BATCH_SIZE_PER_IM *
        cfg.TRAIN.FG_FRACTION * cfg.KRCNN.NUM_KEYPOINTS)
    blobs['keypoint_loss_normalizer'] = np.array(norm, dtype=np.float32)
    return valid

def _within_box(points, boxes):
    """Determine which keypoints lie inside their box (borders inclusive).

    Args:
        points (ndarray): N x C x K array; row 0 of axis 1 holds the x
            coordinates and row 1 the y coordinates (further rows are not
            read).
        boxes (ndarray): N x 4 array in (x1, y1, x2, y2) order.

    Returns:
        ndarray: N x K boolean array, True where the keypoint is within the
        corresponding box.
    """
    # expand_dims turns each box column into N x 1 so it broadcasts over K
    x_within = np.logical_and(
        points[:, 0, :] >= np.expand_dims(boxes[:, 0], axis=1),
        points[:, 0, :] <= np.expand_dims(boxes[:, 2], axis=1))
    y_within = np.logical_and(
        points[:, 1, :] >= np.expand_dims(boxes[:, 1], axis=1),
        points[:, 1, :] <= np.expand_dims(boxes[:, 3], axis=1))
    return np.logical_and(x_within, y_within)

4. rpn.py

为 RPN(Region Proposal Networks)构建训练所需的 minibatch blobs.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging
import numpy as np
import numpy.random as npr

from core.config import cfg
import roi_data.data_utils as data_utils
import utils.blob as blob_utils
import utils.boxes as box_utils

logger = logging.getLogger(__name__)

def get_rpn_blob_names(is_training=True):
    """Return the names of the blobs used by RPN.

    Args:
        is_training (bool): when True, the roidb blob and the RPN target
            blobs are included.

    Returns:
        list of str: blob names.
    """
    # im_info: (height, width, image scale)
    blob_names = ['im_info']
    if is_training:
        # gt boxes: (batch_idx, x1, y1, x2, y2, cls)
        blob_names += ['roidb']
        if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN:
            # Same format as RPN blobs, but one per FPN level
            for lvl in range(cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL + 1):
                blob_names += ['rpn_labels_int32_wide_fpn' + str(lvl),
                               'rpn_bbox_targets_wide_fpn' + str(lvl),
                               'rpn_bbox_inside_weights_wide_fpn' + str(lvl),
                               'rpn_bbox_outside_weights_wide_fpn' + str(lvl)]
        else:
            # Single level RPN blobs: the same four names as the per-level
            # blobs above, without the _fpn<lvl> suffix
            blob_names += ['rpn_labels_int32_wide',
                           'rpn_bbox_targets_wide',
                           'rpn_bbox_inside_weights_wide',
                           'rpn_bbox_outside_weights_wide']
    return blob_names

def add_rpn_blobs(blobs, im_scales, roidb):
    """Add the blobs needed for training RPN-only and end-to-end Faster R-CNN
    models.

    Args:
        blobs (dict): blob name -> list; per-image arrays are appended and
            each non-empty list is then replaced by its concatenation.
        im_scales: per-image scale factors, indexed like ``roidb``.
        roidb (list): one roidb entry per minibatch image.

    Returns:
        bool: always True.
    """
    if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN:
        # RPN applied to many feature levels, as in the FPN paper
        k_max = cfg.FPN.RPN_MAX_LEVEL
        k_min = cfg.FPN.RPN_MIN_LEVEL
        foas = []
        for lvl in range(k_min, k_max + 1):
            field_stride = 2.**lvl
            anchor_sizes = (cfg.FPN.RPN_ANCHOR_START_SIZE * 2.**(lvl - k_min), )
            anchor_aspect_ratios = cfg.FPN.RPN_ASPECT_RATIOS
            foa = data_utils.get_field_of_anchors(field_stride, anchor_sizes,
                                                  anchor_aspect_ratios)
            foas.append(foa)
        all_anchors = np.concatenate([f.field_of_anchors for f in foas])
    else:
        # NOTE(review): the anchor-sizes argument was missing from the
        # extracted listing; upstream Detectron passes cfg.RPN.SIZES --
        # confirm.
        foa = data_utils.get_field_of_anchors(cfg.RPN.STRIDE,
                                              cfg.RPN.SIZES,
                                              cfg.RPN.ASPECT_RATIOS)
        all_anchors = foa.field_of_anchors

    for im_i, entry in enumerate(roidb):
        scale = im_scales[im_i]
        im_height = np.round(entry['height'] * scale)
        im_width = np.round(entry['width'] * scale)
        gt_inds = np.where((entry['gt_classes'] > 0) & (entry['is_crowd'] == 0))[0]
        gt_rois = entry['boxes'][gt_inds, :] * scale
        # TODO(rbg): gt_boxes is poorly named;
        # should be something like 'gt_rois_info'
        # NOTE(review): gt_boxes is built but not read in the visible code.
        gt_boxes = blob_utils.zeros((len(gt_inds), 6))
        gt_boxes[:, 0] = im_i  # batch inds
        gt_boxes[:, 1:5] = gt_rois
        gt_boxes[:, 5] = entry['gt_classes'][gt_inds]
        im_info = np.array([[im_height, im_width, scale]], dtype=np.float32)
        blobs['im_info'].append(im_info)

        # Add RPN targets
        if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN:
            # RPN applied to many feature levels, as in the FPN paper
            rpn_blobs = _get_rpn_blobs(im_height, im_width, foas, all_anchors, gt_rois)
            for i, lvl in enumerate(range(k_min, k_max + 1)):
                for k, v in rpn_blobs[i].items():
                    blobs[k + '_fpn' + str(lvl)].append(v)
        else:
            # Classical RPN, applied to a single feature level
            rpn_blobs = _get_rpn_blobs(im_height, im_width, [foa], all_anchors, gt_rois)
            for k, v in rpn_blobs.items():
                blobs[k].append(v)

    for k, v in blobs.items():
        if isinstance(v, list) and len(v) > 0:
            blobs[k] = np.concatenate(v)

    # Keep only the listed keys of each roidb entry before serializing
    valid_keys = ['has_visible_keypoints', 'boxes', 'segms', 'seg_areas', 'gt_classes',
                  'gt_overlaps', 'is_crowd', 'box_to_gt_ind_map', 'gt_keypoints']
    minimal_roidb = [{} for _ in range(len(roidb))]
    for i, e in enumerate(roidb):
        for k in valid_keys:
            if k in e:
                minimal_roidb[i][k] = e[k]
    blobs['roidb'] = blob_utils.serialize(minimal_roidb)

    # Always return valid=True, since RPN minibatches are valid by design
    return True

def _get_rpn_blobs(im_height, im_width, foas, all_anchors, gt_boxes):
    """Compute the RPN training targets of one image.

    Args:
        im_height, im_width: image size used to filter out anchors that
            straddle the image border.
        foas (list): fields of anchors; all_anchors is split back into one
            output dict per field, in this order.
        all_anchors (ndarray): anchors of all fields; columns 0..3 are used
            as (x1, y1, x2, y2).
        gt_boxes (ndarray): ground-truth boxes of the image.

    Returns:
        dict or list of dict: the label / bbox-target / weight blobs, a single
        dict when ``foas`` has exactly one field, otherwise one per field.
    """
    total_anchors = all_anchors.shape[0]
    straddle_thresh = cfg.TRAIN.RPN_STRADDLE_THRESH

    if straddle_thresh >= 0:
        # Only keep anchors inside the image by a margin of straddle_thresh.
        # Set TRAIN.RPN_STRADDLE_THRESH to -1 (or a large value) to keep all
        # anchors.
        inds_inside = np.where((all_anchors[:, 0] >= -straddle_thresh) &
                               (all_anchors[:, 1] >= -straddle_thresh) &
                               (all_anchors[:, 2] < im_width + straddle_thresh) &
                               (all_anchors[:, 3] < im_height + straddle_thresh))[0]
        # keep only inside anchors
        anchors = all_anchors[inds_inside, :]
    else:
        inds_inside = np.arange(all_anchors.shape[0])
        anchors = all_anchors
    num_inside = len(inds_inside)

    logger.debug('total_anchors: {}'.format(total_anchors))
    logger.debug('inds_inside: {}'.format(num_inside))
    logger.debug('anchors.shape: {}'.format(anchors.shape))

    # Compute anchor labels:
    # label=1 is positive, 0 is negative, -1 is don't care (ignore)
    labels = np.empty((num_inside, ), dtype=np.int32)
    # np.empty leaves the memory uninitialized; start every anchor as ignore
    labels.fill(-1)
    if len(gt_boxes) > 0:
        # Compute overlaps between the anchors and the gt boxes
        anchor_by_gt_overlap = box_utils.bbox_overlaps(anchors, gt_boxes)
        # Map from anchor to the gt box that has the highest overlap
        anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
        # For each anchor, amount of overlap with the most overlapping gt box
        anchor_to_gt_max = anchor_by_gt_overlap[np.arange(num_inside),
                                                anchor_to_gt_argmax]

        # Map from gt box to the anchor that has the highest overlap
        gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
        # For each gt box, amount of overlap with the most overlapping anchor
        gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax,
                                                np.arange(anchor_by_gt_overlap.shape[1])]
        # Find all anchors that share the max overlap amount
        # (this includes many ties)
        anchors_with_max_overlap = np.where(anchor_by_gt_overlap == gt_to_anchor_max)[0]

        # Fg label: for each gt use anchors with highest overlap
        # (including ties)
        labels[anchors_with_max_overlap] = 1
        # Fg label: above the IOU threshold
        labels[anchor_to_gt_max >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1

    # Subsample positive labels if there are too many
    # NOTE(review): the definition of num_fg was missing from the extracted
    # listing; upstream Detectron derives it from cfg.TRAIN.RPN_FG_FRACTION --
    # confirm.
    num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCH_SIZE_PER_IM)
    fg_inds = np.where(labels == 1)[0]
    if len(fg_inds) > num_fg:
        disable_inds = npr.choice(fg_inds, size=(len(fg_inds) - num_fg), replace=False)
        labels[disable_inds] = -1
    fg_inds = np.where(labels == 1)[0]

    # Subsample negative labels if there are too many
    # (samples with replacement, but since the set of bg inds is large most
    # samples will not have repeats)
    # NOTE(review): anchor_to_gt_max / anchor_to_gt_argmax are only bound when
    # gt_boxes is non-empty; the lines below assume that -- verify callers.
    num_bg = cfg.TRAIN.RPN_BATCH_SIZE_PER_IM - np.sum(labels == 1)
    bg_inds = np.where(anchor_to_gt_max < cfg.TRAIN.RPN_NEGATIVE_OVERLAP)[0]
    if len(bg_inds) > num_bg:
        enable_inds = bg_inds[npr.randint(len(bg_inds), size=num_bg)]
        labels[enable_inds] = 0
    bg_inds = np.where(labels == 0)[0]

    bbox_targets = np.zeros((num_inside, 4), dtype=np.float32)
    bbox_targets[fg_inds, :] = data_utils.compute_targets(anchors[fg_inds, :],
                                                          gt_boxes[anchor_to_gt_argmax[fg_inds], :])

    # Bbox regression loss has the form:
    #   loss(x) = weight_outside * L(weight_inside * x)
    # Inside weights allow us to set zero loss on an element-wise basis.
    # Bbox regression is only trained on positive examples, so their weight is
    # set to 1.0 and to 0.0 otherwise.
    # Inside weights act as an on/off "switch".
    bbox_inside_weights = np.zeros((num_inside, 4), dtype=np.float32)
    bbox_inside_weights[labels == 1, :] = (1.0, 1.0, 1.0, 1.0)

    # The bbox regression loss only averages by the number of images in the
    # minibatch, whereas we need to average by the total number of example
    # anchors selected.
    # Outside weights are used to scale each element-wise loss so that the
    # final average over the minibatch is correct.
    # Outside weights act as the "weights".
    bbox_outside_weights = np.zeros((num_inside, 4), dtype=np.float32)
    # uniform weighting of examples (given non-uniform sampling)
    num_examples = np.sum(labels >= 0)
    bbox_outside_weights[labels == 1, :] = 1.0 / num_examples
    bbox_outside_weights[labels == 0, :] = 1.0 / num_examples

    # Map up to original set of anchors
    labels = data_utils.unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = data_utils.unmap(bbox_targets, total_anchors, inds_inside, fill=0)
    bbox_inside_weights = data_utils.unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
    bbox_outside_weights = data_utils.unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)

    # Split the generated labels, etc. into labels per each field of anchors
    blobs_out = []
    start_idx = 0
    for foa in foas:
        H = foa.field_size
        W = foa.field_size
        A = foa.num_cell_anchors
        end_idx = start_idx + H * W * A
        _labels = labels[start_idx:end_idx]
        _bbox_targets = bbox_targets[start_idx:end_idx, :]
        _bbox_inside_weights = bbox_inside_weights[start_idx:end_idx, :]
        _bbox_outside_weights = bbox_outside_weights[start_idx:end_idx, :]
        start_idx = end_idx

        # labels output with shape (1, A, height, width)
        _labels = _labels.reshape((1, H, W, A)).transpose(0, 3, 1, 2)
        # bbox_targets output with shape (1, 4 * A, height, width)
        _bbox_targets = _bbox_targets.reshape((1, H, W, A * 4)).transpose(0, 3, 1, 2)
        # bbox_inside_weights output with shape (1, 4 * A, height, width)
        _bbox_inside_weights = _bbox_inside_weights.reshape((1, H, W, A * 4)).transpose(0, 3, 1, 2)
        # bbox_outside_weights output with shape (1, 4 * A, height, width)
        _bbox_outside_weights = _bbox_outside_weights.reshape((1, H, W, A * 4)).transpose(0, 3, 1, 2)
        blobs_out.append(dict(rpn_labels_int32_wide=_labels,
                              rpn_bbox_targets_wide=_bbox_targets,
                              rpn_bbox_inside_weights_wide=_bbox_inside_weights,
                              rpn_bbox_outside_weights_wide=_bbox_outside_weights))

    return blobs_out[0] if len(blobs_out) == 1 else blobs_out

5. retinanet.py

计算训练 RetinaNet 网络的 minibatch blobs.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
import logging

import utils.boxes as box_utils
import roi_data.data_utils as data_utils
from core.config import cfg

logger = logging.getLogger(__name__)

def get_retinanet_blob_names(is_training=True):
    """Return the blob names in the order in which they are read by the data
    loader.

    N = number of images per minibatch
    A = number of anchors = num_scales * num_aspect_ratios
        (for example 9 used in RetinaNet paper)
    H, W = spatial dimensions (different for each FPN level)
    M = Out of all the anchors generated, depending on the positive/negative
        IoU overlap thresholds, we will have M positive anchors. These are
        the anchors that the bounding box branch regresses on.

    retnet_cls_labels -> labels for the cls branch for each FPN level
                         Shape: N x A x H x W

    retnet_roi_bbox_targets -> targets for the bbox regression branch
                               Shape: M x 4

    retnet_roi_fg_bbox_locs -> for the bbox regression, since only the fg
                               bboxes are regressed and the network
                               prediction has shape N x (A * 4) x H x W, the
                               locations of the positive boxes are stored in
                               this blob. Its shape is M x 4 where each row
                               is [img_id, anchor_id, x_loc, y_loc]

    Args:
        is_training (bool): when True, the training target blobs are included.

    Returns:
        list of str: blob names.
    """
    # im_info: (height, width, image scale)
    blob_names = ['im_info']
    assert cfg.FPN.FPN_ON, "RetinaNet uses FPN for dense detection"
    # Same format as RPN blobs, but one per FPN level
    if is_training:
        blob_names += ['retnet_fg_num', 'retnet_bg_num']
        for lvl in range(cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL + 1):
            suffix = 'fpn{}'.format(lvl)
            blob_names += ['retnet_cls_labels_' + suffix,
                           'retnet_roi_bbox_targets_' + suffix,
                           'retnet_roi_fg_bbox_locs_' + suffix, ]
    return blob_names

def add_retinanet_blobs(blobs, im_scales, roidb, image_width, image_height):
    """Add the RetinaNet training blobs for a minibatch.

    Args:
        blobs (dict): blob name -> list; per-image arrays are appended and the
            lists are then concatenated. 'retnet_fg_num' and 'retnet_bg_num'
            are accumulated as numbers.
        im_scales: per-image scale factors, indexed like ``roidb``.
        roidb (list): one roidb entry per minibatch image.
        image_width, image_height: forwarded to ``_get_retinanet_blobs``.

    Returns:
        bool: always True.
    """
    # RetinaNet is applied to many feature levels, as in the FPN paper
    k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL
    scales_per_octave = cfg.RETINANET.SCALES_PER_OCTAVE
    num_aspect_ratios = len(cfg.RETINANET.ASPECT_RATIOS)
    aspect_ratios = cfg.RETINANET.ASPECT_RATIOS
    anchor_scale = cfg.RETINANET.ANCHOR_SCALE

    # get anchors from all levels for all scales/aspect ratios
    foas = []
    for lvl in range(k_min, k_max + 1):
        stride = 2. ** lvl
        for octave in range(scales_per_octave):
            octave_scale = 2 ** (octave / float(scales_per_octave))
            for idx in range(num_aspect_ratios):
                anchor_sizes = (stride * octave_scale * anchor_scale, )
                anchor_aspect_ratios = (aspect_ratios[idx], )
                foa = data_utils.get_field_of_anchors(
                    stride, anchor_sizes, anchor_aspect_ratios, octave, idx)
                foas.append(foa)
    all_anchors = np.concatenate([f.field_of_anchors for f in foas])

    blobs['retnet_fg_num'], blobs['retnet_bg_num'] = 0.0, 0.0
    for im_i, entry in enumerate(roidb):
        scale = im_scales[im_i]
        im_height = np.round(entry['height'] * scale)
        im_width = np.round(entry['width'] * scale)
        gt_inds = np.where((entry['gt_classes'] > 0) & (entry['is_crowd'] == 0))[0]
        assert len(gt_inds) > 0, 'Empty ground truth empty for image is not allowed. Please check.'

        gt_rois = entry['boxes'][gt_inds, :] * scale
        gt_classes = entry['gt_classes'][gt_inds]

        im_info = np.array([[im_height, im_width, scale]], dtype=np.float32)
        blobs['im_info'].append(im_info)

        retinanet_blobs, fg_num, bg_num = _get_retinanet_blobs(
            foas, all_anchors, gt_rois, gt_classes, image_width, image_height)
        for i, foa in enumerate(foas):
            for k, v in retinanet_blobs[i].items():
                # the way it stacks is:
                # [[anchors for image1] + [anchors for images 2]]
                level = int(np.log2(foa.stride))
                key = '{}_fpn{}'.format(k, level)
                if k == 'retnet_roi_fg_bbox_locs':
                    v[:, 0] = im_i
                    # loc_stride: 80 * 4 if cls_specific else 4
                    loc_stride = 4  # 4 coordinate corresponding to bbox prediction
                    if cfg.RETINANET.CLASS_SPECIFIC_BBOX:
                        loc_stride *= (cfg.MODEL.NUM_CLASSES - 1)
                    anchor_ind = foa.octave * num_aspect_ratios + foa.aspect
                    # v[:, 1] is the class label [range 0-80] if we do
                    # class-specific bbox otherwise it is 0. In case of class
                    # specific, based on the label, the location of current
                    # anchor is class_label * 4 and then we take into account
                    # the anchor_ind if the anchors
                    v[:, 1] *= 4
                    v[:, 1] += loc_stride * anchor_ind
                blobs[key].append(v)
        blobs['retnet_fg_num'] += fg_num
        blobs['retnet_bg_num'] += bg_num

    blobs['retnet_fg_num'] = blobs['retnet_fg_num'].astype(np.float32)
    blobs['retnet_bg_num'] = blobs['retnet_bg_num'].astype(np.float32)

    N = len(roidb)
    for k, v in blobs.items():
        if isinstance(v, list) and len(v) > 0:
            # compute number of anchors
            A = int(len(v) / N)
            # for the cls branch labels [per fpn level],
            # we have blobs['retnet_cls_labels_fpn{}'] as a list until this step
            # and length of this list is N x A where
            # N = num_images, A = num_anchors for example, N = 2, A = 9
            # Each element of the list has the shape 1 x 1 x H x W where H, W are
            # spatial dimension of current fpn lvl. Let a{i} denote the element
            # corresponding to anchor i [9 anchors total] in the list.
            # The elements in the list are in order [[a0, ..., a9], [a0, ..., a9]]
            # however the network will make predictions like 2 x (9 * 80) x H x W
            # so we first concatenate the elements of each image to a numpy array
            # and then concatenate the two images to get the 2 x 9 x H x W

            if k.find('retnet_cls_labels') >= 0:
                tmp = []
                # concat anchors within an image
                for i in range(0, len(v), A):
                    tmp.append(np.concatenate(v[i: i + A], axis=1))
                # concat images
                blobs[k] = np.concatenate(tmp, axis=0)
            else:
                # for the bbox branch elements [per FPN level],
                #  we have the targets and the fg boxes locations
                # in the shape: M x 4 where M is the number of fg locations in a
                # given image at the current FPN level. For the given level,
                # the bbox predictions will be. The elements in the list are in
                # order [[a0, ..., a9], [a0, ..., a9]]
                # Concatenate them to form M x 4
                blobs[k] = np.concatenate(v, axis=0)
    return True

def _get_retinanet_blobs(
        foas, all_anchors, gt_boxes, gt_classes, im_width, im_height):
    """Build the RetinaNet training targets of one image, split per field of anchors.

    Every anchor is labelled with the class of a ground-truth box (foreground),
    0 (background) or -1 (ignore). The labels and the box regression targets
    are then cut into one chunk per entry of ``foas``.

    Args:
        foas: sequence of field-of-anchors objects. Only ``field_size`` and
            ``stride`` are read; each entry consumes the next
            ``field_size * field_size`` rows of ``all_anchors``.
        all_anchors: array whose rows are the anchors of all fields, in the
            order of ``foas`` (presumably (num_anchors, 4) boxes -- confirm
            against the caller).
        gt_boxes: ground-truth boxes, one per row. May be empty, in which case
            no anchor is foreground.
        gt_classes: array holding the class label of each ground-truth box;
            labels >= 1 count as foreground.
        im_width: image width, used to crop the label map to
            ``int(im_width / stride)`` columns.
        im_height: image height, used to crop the label map to
            ``int(im_height / stride)`` rows.

    Returns:
        Tuple ``(blobs_out, out_num_fg, out_num_bg)``:
          * ``blobs_out``: list with one dict per entry of ``foas`` holding
            ``retnet_cls_labels`` (int32, 1 x 1 x h x w),
            ``retnet_roi_bbox_targets`` (float32, M x 4) and
            ``retnet_roi_fg_bbox_locs`` (float32, M x 4 rows of
            ``[0, class_index, y, x]``), M being the number of foreground
            locations of that field.
          * ``out_num_fg``: float32 array ``[num_fg + 1]``.
          * ``out_num_bg``: array combining the background and foreground
            counts weighted by the number of classes.
    """
    total_anchors = all_anchors.shape[0]
    logger.debug('Getting mad blobs: im_height {} im_width: {}'.format(
        im_height, im_width))

    # All anchors are kept (no cross-boundary filtering here).
    inds_inside = np.arange(total_anchors)
    anchors = all_anchors
    num_inside = len(inds_inside)

    logger.debug('total_anchors: {}'.format(total_anchors))
    logger.debug('inds_inside: {}'.format(num_inside))
    logger.debug('anchors.shape: {}'.format(anchors.shape))

    # Compute anchor labels:
    # label>=1 is positive, 0 is negative, -1 is don't care (ignore).
    # Start from "ignore" everywhere: np.empty would leave uninitialised
    # memory in every anchor that is neither foreground nor background.
    labels = np.full((num_inside, ), -1, dtype=np.float32)
    if len(gt_boxes) > 0:
        # Overlaps between the anchors and the gt boxes
        anchor_by_gt_overlap = box_utils.bbox_overlaps(anchors, gt_boxes)
        # Map from anchor to gt box that has highest overlap
        anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
        # For each anchor, amount of overlap with most overlapping gt box
        anchor_to_gt_max = anchor_by_gt_overlap[
            np.arange(num_inside), anchor_to_gt_argmax]

        # Map from gt box to an anchor that has highest overlap
        gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
        # For each gt box, amount of overlap with most overlapping anchor
        gt_to_anchor_max = anchor_by_gt_overlap[
            gt_to_anchor_argmax, np.arange(anchor_by_gt_overlap.shape[1])]
        # Find all anchors that share the max overlap amount
        # (this includes many ties)
        anchors_with_max_overlap = np.where(
            anchor_by_gt_overlap == gt_to_anchor_max)[0]

        # Fg label: for each gt use anchors with highest overlap
        # (including ties)
        gt_inds = anchor_to_gt_argmax[anchors_with_max_overlap]
        labels[anchors_with_max_overlap] = gt_classes[gt_inds]
        # Fg label: above threshold IOU
        inds = anchor_to_gt_max >= cfg.RETINANET.POSITIVE_OVERLAP
        gt_inds = anchor_to_gt_argmax[inds]
        labels[inds] = gt_classes[gt_inds]
    else:
        # No ground truth: every anchor overlaps nothing. Defining these keeps
        # the background selection below from hitting an unbound name.
        anchor_to_gt_argmax = np.zeros((num_inside, ), dtype=np.int64)
        anchor_to_gt_max = np.zeros((num_inside, ), dtype=np.float32)

    # Foreground indices are captured before the background overwrite, as in
    # the original statement order.
    fg_inds = np.where(labels >= 1)[0]
    bg_inds = np.where(anchor_to_gt_max < cfg.RETINANET.NEGATIVE_OVERLAP)[0]
    labels[bg_inds] = 0
    num_fg, num_bg = len(fg_inds), len(bg_inds)

    bbox_targets = np.zeros((num_inside, 4), dtype=np.float32)
    if num_fg > 0:
        bbox_targets[fg_inds, :] = data_utils.compute_targets(
            anchors[fg_inds, :], gt_boxes[anchor_to_gt_argmax[fg_inds], :])

    # Map up to original set of anchors
    labels = data_utils.unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = data_utils.unmap(
        bbox_targets, total_anchors, inds_inside, fill=0)

    num_classes = cfg.MODEL.NUM_CLASSES - 1
    class_specific = cfg.RETINANET.CLASS_SPECIFIC_BBOX

    # Split the generated labels, etc. into one chunk per field of anchors
    blobs_out = []
    start_idx = 0
    for foa in foas:
        H = W = foa.field_size
        end_idx = start_idx + H * W
        level_labels = labels[start_idx:end_idx].reshape((H, W))
        level_targets = bbox_targets[start_idx:end_idx, :].reshape((H, W, 4))
        start_idx = end_idx

        stride = foa.stride
        w = int(im_width / stride)
        h = int(im_height / stride)

        # Data for the select_smooth_l1 loss: foreground locations in
        # row-major order. np.where returns one index array per dimension, so
        # the number of foreground cells is len(fg_y), not the tuple length.
        fg_y, fg_x = np.where(level_labels > 0)
        if class_specific:
            cls_idx = level_labels[fg_y, fg_x] - 1
        else:
            cls_idx = np.zeros(len(fg_y), dtype=np.float32)
        assert np.all((cls_idx >= 0) & (cls_idx < num_classes)), \
            'label out of the range'

        # (M, 4) regression targets of the foreground locations.
        _roi_bbox_targets = level_targets[fg_y, fg_x].astype(np.float32)
        # (M, 4) rows of [0, class_index, y, x].
        _roi_fg_bbox_locs = np.zeros((len(fg_y), 4), dtype=np.float32)
        _roi_fg_bbox_locs[:, 1] = cls_idx
        _roi_fg_bbox_locs[:, 2] = fg_y
        _roi_fg_bbox_locs[:, 3] = fg_x

        blobs_out.append(
            dict(
                # labels with shape (1, 1, h, w), cropped to the image extent
                retnet_cls_labels=level_labels.reshape(
                    (1, 1, H, W))[:, :, 0:h, 0:w].astype(np.int32),
                # NOTE(review): the targets used to be computed and then
                # dropped; confirm the caller registers this blob name.
                retnet_roi_bbox_targets=_roi_bbox_targets,
                retnet_roi_fg_bbox_locs=_roi_fg_bbox_locs,
            ))

    out_num_fg = np.array([num_fg + 1.0], dtype=np.float32)
    out_num_bg = (np.array([num_bg + 1.0]) * (cfg.MODEL.NUM_CLASSES - 1) +
                  out_num_fg * (cfg.MODEL.NUM_CLASSES - 2))

    return blobs_out, out_num_fg, out_num_bg
