# 源码下载

## 一、预测部分

### 1、主干网络介绍

ResNet101有两个基本的块，分别名为Conv Block和Identity Block。其中Conv Block的输入维度和输出维度不同，所以不能连续串联，它的作用是改变网络的维度；Identity Block的输入维度和输出维度相同，可以连续串联，用于加深网络。
Conv Block的结构如下：

Identity Block的结构如下：

``````
from tensorflow.keras.layers import (Activation, Add, BatchNormalization,
                                     Conv2D, MaxPooling2D, ZeroPadding2D)
from tensorflow.keras.regularizers import l2

#----------------------------------------------#
#   conv_block和identity_block的区别主要就是：
#   conv_block会压缩输入进来的特征层的宽高
#   identity_block用于加深网络
#----------------------------------------------#
def identity_block(input_tensor, kernel_size, filters, stage, block, use_bias=True, weight_decay=0, train_bn=True):
    """Build a ResNet bottleneck block with an identity (unmodified) shortcut.

    The main branch is 1x1 conv -> kxk conv -> 1x1 conv, each followed by
    batch normalization. Its output is added to ``input_tensor`` and passed
    through a final ReLU, so ``input_tensor`` must already have as many
    channels as the last entry of ``filters``.

    Args:
        input_tensor: Feature map entering the block.
        kernel_size: Side length of the square kernel of the middle conv.
        filters: Three output-channel counts, one per conv of the main branch.
        stage: Stage number; only used to build layer names.
        block: Block label inside the stage (e.g. ``'b'``); only used in names.
        use_bias: Whether the conv layers carry a bias term.
        weight_decay: L2 regularization factor applied to the conv kernels.
        train_bn: Forwarded as ``training`` to every BatchNormalization call.

    Returns:
        The output tensor of the block (same channel count as the input).
    """
    nb_filter1, nb_filter2, nb_filter3 = filters
    conv_name_base = 'res' + str(stage) + block + '_branch'
    bn_name_base = 'bn' + str(stage) + block + '_branch'

    # 1x1 conv: reduce the channel count.
    x = Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a', use_bias=use_bias, kernel_regularizer=l2(weight_decay))(input_tensor)
    x = BatchNormalization(name=bn_name_base + '2a')(x, training=train_bn)
    x = Activation('relu')(x)

    # kxk conv with 'same' padding: spatial size is unchanged.
    x = Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same', name=conv_name_base + '2b', use_bias=use_bias, kernel_regularizer=l2(weight_decay))(x)
    x = BatchNormalization(name=bn_name_base + '2b')(x, training=train_bn)
    x = Activation('relu')(x)

    # 1x1 conv: expand the channel count (no ReLU before the residual sum).
    x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=use_bias, kernel_regularizer=l2(weight_decay))(x)
    x = BatchNormalization(name=bn_name_base + '2c')(x, training=train_bn)

    # Residual connection: without this sum the block is a plain conv stack.
    x = Add()([x, input_tensor])
    x = Activation('relu', name='res' + str(stage) + block + '_out')(x)
    return x

def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2), use_bias=True, weight_decay=0, train_bn=True):
    """Build a ResNet bottleneck block with a projection (conv) shortcut.

    The main branch is 1x1 conv (strided) -> kxk conv -> 1x1 conv, each
    followed by batch normalization. The shortcut branch is a strided 1x1 conv
    plus batch normalization, so it matches the main branch in both spatial
    size and channel count. The two branches are summed and passed through a
    final ReLU.

    Args:
        input_tensor: Feature map entering the block.
        kernel_size: Side length of the square kernel of the middle conv.
        filters: Three output-channel counts, one per conv of the main branch;
            the last one is also the shortcut's channel count.
        stage: Stage number; only used to build layer names.
        block: Block label inside the stage (e.g. ``'a'``); only used in names.
        strides: Strides of the first main-branch conv and of the shortcut conv.
        use_bias: Whether the conv layers carry a bias term.
        weight_decay: L2 regularization factor applied to the conv kernels.
        train_bn: Forwarded as ``training`` to every BatchNormalization call.

    Returns:
        The output tensor of the block.
    """
    nb_filter1, nb_filter2, nb_filter3 = filters
    conv_name_base = 'res' + str(stage) + block + '_branch'
    bn_name_base = 'bn' + str(stage) + block + '_branch'

    # Main branch: strided 1x1 conv, kxk conv, 1x1 conv.
    x = Conv2D(nb_filter1, (1, 1), strides=strides, name=conv_name_base + '2a', use_bias=use_bias, kernel_regularizer=l2(weight_decay))(input_tensor)
    x = BatchNormalization(name=bn_name_base + '2a')(x, training=train_bn)
    x = Activation('relu')(x)

    x = Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same', name=conv_name_base + '2b', use_bias=use_bias, kernel_regularizer=l2(weight_decay))(x)
    x = BatchNormalization(name=bn_name_base + '2b')(x, training=train_bn)
    x = Activation('relu')(x)

    x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=use_bias, kernel_regularizer=l2(weight_decay))(x)
    x = BatchNormalization(name=bn_name_base + '2c')(x, training=train_bn)

    # Shortcut branch: project the input to the main branch's output shape.
    shortcut = Conv2D(nb_filter3, (1, 1), strides=strides, name=conv_name_base + '1', use_bias=use_bias, kernel_regularizer=l2(weight_decay))(input_tensor)
    shortcut = BatchNormalization(name=bn_name_base + '1')(shortcut, training=train_bn)

    # Residual connection: previously the shortcut was computed but discarded.
    x = Add()([x, shortcut])
    x = Activation('relu', name='res' + str(stage) + block + '_out')(x)
    return x

#----------------------------------------------#
#   获得resnet的主干部分
#----------------------------------------------#
def get_resnet(input_image, train_bn=True, weight_decay=0):
    """Build the ResNet101 backbone and return the feature map of each stage.

    The shape comments below assume a 1024x1024x3 input, as in the original
    walkthrough; other input sizes scale accordingly.

    Args:
        input_image: Input image tensor.
        train_bn: Forwarded as ``training`` to every BatchNormalization call.
        weight_decay: L2 regularization factor applied to the conv kernels.

    Returns:
        ``[C1, C2, C3, C4, C5]``: the outputs of stages 1 to 5.
    """
    # Stage 1.
    # Pad 3 pixels per side so the unpadded 7x7 / stride-2 conv exactly halves
    # the resolution: 1024,1024,3 -> 512,512,64.
    # (Previously ``x`` was read here before ever being assigned.)
    x = ZeroPadding2D((3, 3))(input_image)
    x = Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=True, kernel_regularizer=l2(weight_decay))(x)
    x = BatchNormalization(name='bn_conv1')(x, training=train_bn)
    x = Activation('relu')(x)

    # 512,512,64 -> 256,256,64
    x = MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x)
    C1 = x

    # Stage 2: 256,256,64 -> 256,256,256 (stride 1, only the channels change).
    x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1), weight_decay=weight_decay, train_bn=train_bn)
    x = identity_block(x, 3, [64, 64, 256], stage=2, block='b', weight_decay=weight_decay, train_bn=train_bn)
    x = identity_block(x, 3, [64, 64, 256], stage=2, block='c', weight_decay=weight_decay, train_bn=train_bn)
    C2 = x

    # Stage 3: 256,256,256 -> 128,128,512
    x = conv_block(x, 3, [128, 128, 512], stage=3, block='a', weight_decay=weight_decay, train_bn=train_bn)
    x = identity_block(x, 3, [128, 128, 512], stage=3, block='b', weight_decay=weight_decay, train_bn=train_bn)
    x = identity_block(x, 3, [128, 128, 512], stage=3, block='c', weight_decay=weight_decay, train_bn=train_bn)
    x = identity_block(x, 3, [128, 128, 512], stage=3, block='d', weight_decay=weight_decay, train_bn=train_bn)
    C3 = x

    # Stage 4: 128,128,512 -> 64,64,1024
    x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a', weight_decay=weight_decay, train_bn=train_bn)
    # 22 identity blocks, labelled 'b', 'c', ... (chr(98) == 'b').
    block_count = 22
    for i in range(block_count):
        x = identity_block(x, 3, [256, 256, 1024], stage=4, block=chr(98 + i), weight_decay=weight_decay, train_bn=train_bn)
    C4 = x

    # Stage 5: 64,64,1024 -> 32,32,2048
    x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a', weight_decay=weight_decay, train_bn=train_bn)
    x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b', weight_decay=weight_decay, train_bn=train_bn)
    x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c', weight_decay=weight_decay, train_bn=train_bn)
    C5 = x
    return [C1, C2, C3, C4, C5]
``````

### 2、特征金字塔FPN的构建

``````#----------------------------------------------#
#   组合成特征金字塔的结构
#   P5长宽共压缩了5次
#   P5为32,32,256
#----------------------------------------------#
P5 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c5p5')(C5)
#----------------------------------------------#
#   Upsample P5 and add it to the 1x1-projected C4.
#   P4 has been downsampled 4 times.
#   P4 is 64,64,256
#----------------------------------------------#
P4 = Add(name="fpn_p4add")([
    UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
    Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c4p4')(C4)])
#----------------------------------------------#
#   Upsample P4 and add it to the 1x1-projected C3.
#   P3 has been downsampled 3 times.
#   P3 is 128,128,256
#----------------------------------------------#
P3 = Add(name="fpn_p3add")([
    UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
    Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c3p3')(C3)])
#----------------------------------------------#
#   Upsample P3 and add it to the 1x1-projected C2.
#   P2 has been downsampled 2 times.
#   P2 is 256,256,256
#----------------------------------------------#
P2 = Add(name="fpn_p2add")([
    UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
    Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c2p2')(C2)])

#-----------------------------------------------------------#
#   Apply one 3x3 convolution to each level; afterwards
#   P2, P3, P4 and P5 all have the same channel count.
#   P2 is 256,256,256
#   P3 is 128,128,256
#   P4 is 64,64,256
#   P5 is 32,32,256
#-----------------------------------------------------------#
P2 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p2")(P2)
P3 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p3")(P3)
P4 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p4")(P4)
P5 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p5")(P5)
#----------------------------------------------#
#   The proposal network additionally uses P6,
#   obtained by subsampling P5 with stride 2.
#   P6 is 16,16,256
#----------------------------------------------#
P6 = MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)

#----------------------------------------------#
#   P2, P3, P4, P5, P6 are used to obtain proposals.
#----------------------------------------------#
rpn_feature_maps    = [P2, P3, P4, P5, P6]
#----------------------------------------------#
#   P2, P3, P4, P5 are the maps kept for the
#   classifier / mask heads (P6 is RPN-only).
#----------------------------------------------#
mrcnn_feature_maps  = [P2, P3, P4, P5]
``````

### 3、获得Proposal建议框

anchors_per_location x 4的卷积 用于预测 公用特征层上 每一个网格点上 每一个先验框的变化情况。（为什么说是变化情况呢，这是因为Faster-RCNN的预测结果需要结合先验框获得预测框，预测结果就是先验框的变化情况。）

anchors_per_location x 2的卷积 用于预测 公用特征层上 每一个网格点上 每一个预测框内部是否包含了物体。

anchors_per_location x 4的卷积的结果会对这些先验框进行调整，获得一个新的框。
anchors_per_location x 2的卷积会判断上述获得的新框是否包含物体。

``````#------------------------------------#
#   五个不同大小的特征层会传入到
#   RPN当中，获得建议框
#------------------------------------#
def rpn_graph(feature_map, anchors_per_location, weight_decay=0):
    """Build the RPN head applied to a single feature map.

    Args:
        feature_map: Feature map the head is applied to.
        anchors_per_location: Number of anchors at every grid position.
        weight_decay: L2 regularization factor applied to the conv kernels.

    Returns:
        ``[rpn_class_logits, rpn_probs, rpn_bbox]`` where the first two are
        reshaped to ``[batch, num_anchors, 2]`` and the last one to
        ``[batch, num_anchors, 4]``.
    """
    # A 3x3 convolution shared by both prediction branches.
    shared_conv = Conv2D(512, (3, 3), padding='same', activation='relu',
                         name='rpn_conv_shared', kernel_regularizer=l2(weight_decay))
    shared = shared_conv(feature_map)

    # Objectness branch: two values per anchor -> [batch, num_anchors, 2].
    class_conv = Conv2D(anchors_per_location * 2, (1, 1), padding='valid', activation='linear',
                        name='rpn_class_raw', kernel_regularizer=l2(weight_decay))
    rpn_class_logits = Reshape([-1, 2])(class_conv(shared))
    rpn_probs = Activation("softmax", name="rpn_class_xxx")(rpn_class_logits)

    # Box branch: four adjustment values per anchor -> [batch, num_anchors, 4].
    bbox_conv = Conv2D(anchors_per_location * 4, (1, 1), padding="valid", activation='linear',
                       name='rpn_bbox_pred', kernel_regularizer=l2(weight_decay))
    rpn_bbox = Reshape([-1, 4])(bbox_conv(shared))

    return [rpn_class_logits, rpn_probs, rpn_bbox]

#------------------------------------#
#   建立建议框网络模型
#   RPN模型
#------------------------------------#
def build_rpn_model(anchors_per_location, depth, weight_decay=0):
    """Wrap ``rpn_graph`` into a Keras model named ``rpn_model``.

    Args:
        anchors_per_location: Number of anchors at every grid position.
        depth: Channel count of the input feature map.
        weight_decay: L2 regularization factor forwarded to ``rpn_graph``.

    Returns:
        A ``Model`` mapping a feature map of arbitrary height and width to the
        three outputs of ``rpn_graph``.
    """
    # Height and width are left unspecified so the model accepts any map size.
    feature_input = Input(shape=[None, None, depth], name="input_rpn_feature_map")
    rpn_outputs = rpn_graph(feature_input, anchors_per_location, weight_decay=weight_decay)
    return Model([feature_input], rpn_outputs, name="rpn_model")
``````

### 4、Proposal建议框的解码

anchors_per_location x 4的卷积 用于预测 有效特征层上 每一个网格点上 每一个先验框的变化情况。

anchors_per_location x 2的卷积 用于预测 有效特征层上 每一个网格点上 每一个预测框内部是否包含了物体。

anchors_per_location x 4中的anchors_per_location 表示了这个网格点所包含的先验框数量，其中的4表示了框的中心与长宽的调整情况。

``````#------------------------------------------------------------------#
#   利用先验框调整参数调整先验框，获得建议框的坐标
#------------------------------------------------------------------#
def apply_box_deltas_graph(boxes, deltas):
    """Decode boxes by applying predicted deltas to reference boxes.

    Args:
        boxes: ``[N, (y1, x1, y2, x2)]`` reference boxes.
        deltas: ``[N, (dy, dx, log(dh), log(dw))]`` adjustments.

    Returns:
        ``[N, (y1, x1, y2, x2)]`` adjusted boxes.
    """
    # Reference boxes in center / size form.
    box_h = boxes[:, 2] - boxes[:, 0]
    box_w = boxes[:, 3] - boxes[:, 1]
    box_cy = boxes[:, 0] + 0.5 * box_h
    box_cx = boxes[:, 1] + 0.5 * box_w

    # Centers shift proportionally to the box size; sizes scale exponentially.
    new_cy = box_cy + deltas[:, 0] * box_h
    new_cx = box_cx + deltas[:, 1] * box_w
    new_h = box_h * tf.math.exp(deltas[:, 2])
    new_w = box_w * tf.math.exp(deltas[:, 3])

    # Back to corner form.
    top = new_cy - 0.5 * new_h
    left = new_cx - 0.5 * new_w
    bottom = top + new_h
    right = left + new_w
    return tf.stack([top, left, bottom, right], axis=1, name="apply_box_deltas_out")

def clip_boxes_graph(boxes, window):
    """Clamp box coordinates so they lie inside ``window``.

    Args:
        boxes: ``[N, (y1, x1, y2, x2)]`` boxes.
        window: ``(y1, x1, y2, x2)`` bounds to clamp to.

    Returns:
        ``[N, 4]`` clamped boxes.
    """
    win_y1, win_x1, win_y2, win_x2 = tf.split(window, 4)
    coords = tf.split(boxes, 4, axis=1)
    # (lower, upper) bound for each of y1, x1, y2, x2, in that order.
    limits = [(win_y1, win_y2), (win_x1, win_x2), (win_y1, win_y2), (win_x1, win_x2)]

    clamped = [tf.maximum(tf.minimum(coord, upper), lower)
               for coord, (lower, upper) in zip(coords, limits)]
    clipped = tf.concat(clamped, axis=1, name="clipped_boxes")
    # Restore the static shape of the last axis.
    clipped.set_shape((clipped.shape[0], 4))
    return clipped

#----------------------------------------------------------#
#   Proposal Layer
#   该部分代码用于将先验框转化成建议框
#----------------------------------------------------------#
class ProposalLayer(Layer):
    """Turn RPN outputs and anchors into a fixed number of proposals.

    Per image: keep the highest-scoring anchors, decode them with the predicted
    deltas, clip them to the normalized image window, run non-maximum
    suppression, and zero-pad the survivors up to ``proposal_count``.
    """

    def __init__(self, proposal_count, nms_threshold, config=None, **kwargs):
        """Store the layer settings.

        Args:
            proposal_count: Number of proposals returned per image.
            nms_threshold: IoU threshold used by non-maximum suppression.
            config: Object providing ``RPN_BBOX_STD_DEV``, ``PRE_NMS_LIMIT``
                and ``IMAGES_PER_GPU``.
            **kwargs: Forwarded to the base ``Layer``.
        """
        super(ProposalLayer, self).__init__(**kwargs)
        self.config = config
        self.proposal_count = proposal_count
        self.nms_threshold = nms_threshold

    def call(self, inputs):
        """Compute proposals.

        Args:
            inputs: A list of three tensors:
                ``inputs[0]`` rpn_class : [batch, num_anchors, 2]
                ``inputs[1]`` rpn_bbox  : [batch, num_anchors, 4]
                ``inputs[2]`` anchors   : [batch, num_anchors, 4]

        Returns:
            Proposals reshaped to ``[batch, proposal_count, 4]``.
        """
        # Channel 1 of rpn_class is taken as the "contains an object" score.
        # Resulting shape: [batch, num_anchors].
        scores = inputs[0][:, :, 1]

        # Anchor adjustment parameters: [batch, num_anchors, 4].
        deltas = inputs[1]

        # Anchor coordinates.
        anchors = inputs[2]

        # Undo the normalization of the deltas by RPN_BBOX_STD_DEV.
        deltas = deltas * np.reshape(self.config.RPN_BBOX_STD_DEV, [1, 1, 4])

        # Keep at most PRE_NMS_LIMIT anchors (never more than are available).
        pre_nms_limit = tf.minimum(self.config.PRE_NMS_LIMIT, tf.shape(anchors)[1])

        # Indices of the top-scoring anchors.
        ix = tf.nn.top_k(scores, pre_nms_limit, sorted=True,
                         name="top_anchors").indices

        # Gather the scores, deltas and anchors of those anchors, per image.
        scores = batch_slice([scores, ix], lambda x, y: tf.gather(x, y),
                             self.config.IMAGES_PER_GPU)
        deltas = batch_slice([deltas, ix], lambda x, y: tf.gather(x, y),
                             self.config.IMAGES_PER_GPU)
        pre_nms_anchors = batch_slice([anchors, ix], lambda a, x: tf.gather(a, x),
                                      self.config.IMAGES_PER_GPU,
                                      names=["pre_nms_anchors"])

        # Decode the anchors: [batch, pre_nms_limit, (y1, x1, y2, x2)].
        boxes = batch_slice([pre_nms_anchors, deltas],
                            lambda x, y: apply_box_deltas_graph(x, y),
                            self.config.IMAGES_PER_GPU,
                            names=["refined_anchors"])

        # Clip to the unit window so no box leaves the (normalized) image.
        window = np.array([0, 0, 1, 1], dtype=np.float32)
        boxes = batch_slice(boxes,
                            lambda x: clip_boxes_graph(x, window),
                            self.config.IMAGES_PER_GPU,
                            names=["refined_anchors_clipped"])

        def nms(boxes, scores):
            """Run NMS on one image and zero-pad to ``proposal_count`` rows."""
            indices = tf.image.non_max_suppression(
                boxes, scores, self.proposal_count,
                self.nms_threshold, name="rpn_non_max_suppression")
            proposals = tf.gather(boxes, indices)
            # NMS may keep fewer than proposal_count boxes; pad with zero rows
            # so every image yields the same shape and the reshape below is
            # valid. (The padding amount used to be computed but never applied.)
            padding = tf.maximum(self.proposal_count - tf.shape(proposals)[0], 0)
            proposals = tf.pad(proposals, [(0, padding), (0, 0)])
            return proposals

        # After NMS: [batch, proposal_count, 4].
        proposals = batch_slice([boxes, scores], nms, self.config.IMAGES_PER_GPU)

        return tf.reshape(proposals, (-1, self.proposal_count, 4))

    def compute_output_shape(self, input_shape):
        """Return the static output shape ``(None, proposal_count, 4)``."""
        return (None, self.proposal_count, 4)
``````

### 5、对Proposal建议框加以利用（Roi Align）

``````def log2_graph(x):
return tf.math.log(x) / tf.math.log(2.0)

def parse_image_meta_graph(meta):
    """Split a batched image-meta tensor into its named components.

    Args:
        meta: 2-D tensor indexed as ``[batch, meta_length]``.

    Returns:
        A dict mapping each field name to the matching column(s) of ``meta``.
    """
    # Column layout of the meta vector, in order.
    layout = (
        ("image_id", 0),
        ("original_image_shape", slice(1, 4)),
        ("image_shape", slice(4, 7)),
        ("window", slice(7, 11)),  # (y1, x1, y2, x2) window of the image, in pixels
        ("scale", 11),
        ("active_class_ids", slice(12, None)),
    )
    return {field: meta[:, columns] for field, columns in layout}

#----------------------------------------------------------#
#   ROIAlign Layer
#   利用建议框在特征层上截取内容
#----------------------------------------------------------#
class PyramidROIAlign(Layer):
    """ROIAlign over a feature pyramid.

    Each box is assigned to one pyramid level according to its size, cropped
    from that level's feature map, and resized to ``pool_shape``.
    """

    def __init__(self, pool_shape, **kwargs):
        """Store the pooled output size.

        Args:
            pool_shape: ``(height, width)`` of each pooled region.
            **kwargs: Forwarded to the base ``Layer``.
        """
        super(PyramidROIAlign, self).__init__(**kwargs)
        self.pool_shape = tuple(pool_shape)

    def call(self, inputs):
        """Crop and resize every box from its pyramid level.

        Args:
            inputs: ``[boxes, image_meta, feature_map_1, ..., feature_map_4]``.
                ``boxes`` is ``[batch, num_boxes, (y1, x1, y2, x2)]``; the four
                feature maps are matched, in order, to levels 2 to 5.

        Returns:
            ``[batch, num_boxes, pool_height, pool_width, channels]``.
        """
        # Box coordinates.
        boxes = inputs[0]
        # Image meta data (only the image shape is read here).
        image_meta = inputs[1]
        # All feature maps, each [batch, height, width, channels].
        feature_maps = inputs[2:]

        # Height and width of every box.
        y1, x1, y2, x2 = tf.split(boxes, 4, axis=2)
        h = y2 - y1
        w = x2 - x1

        # Shape of the input image; the first image's shape is used for the
        # whole batch.
        image_shape = parse_image_meta_graph(image_meta)['image_shape'][0]

        # Pick the pyramid level from the box size: level 4 plus the rounded
        # log2 of the box scale relative to 224 / sqrt(image_area), clamped to
        # the range [2, 5].
        image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)
        roi_level = log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))
        roi_level = tf.minimum(5, tf.maximum(2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
        roi_level = tf.squeeze(roi_level, 2)

        pooled = []
        box_to_level = []
        # Crop from each of the levels 2..5 in turn.
        for i, level in enumerate(range(2, 6)):
            # (batch index, box index) of every box assigned to this level.
            ix = tf.where(tf.equal(roi_level, level))
            level_boxes = tf.gather_nd(boxes, ix)
            box_to_level.append(ix)

            # Which image of the batch each of those boxes belongs to.
            box_indices = tf.cast(ix[:, 0], tf.int32)

            # Do not propagate gradients back into the box coordinates.
            # (These two calls were missing below the original comment.)
            level_boxes = tf.stop_gradient(level_boxes)
            box_indices = tf.stop_gradient(box_indices)

            # Crop and resize:
            # [num_level_boxes, pool_height, pool_width, channels].
            pooled.append(tf.image.crop_and_resize(
                feature_maps[i], level_boxes, box_indices, self.pool_shape,
                method="bilinear"))

        pooled = tf.concat(pooled, axis=0)

        # The crops are now grouped by level. Append a running index so the
        # original (image, box) order can be restored.
        box_to_level = tf.concat(box_to_level, axis=0)
        box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)
        box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range], axis=1)

        # box_to_level[:, 0] is the image index, box_to_level[:, 1] the box
        # index within that image. Merge both into one sortable key; this
        # relies on fewer than 100000 boxes per image.
        sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]
        # top_k sorts in descending order; reversing gives ascending order.
        ix = tf.nn.top_k(sorting_tensor, k=tf.shape(
            box_to_level)[0]).indices[::-1]

        # Map the sorted positions back to rows of ``pooled``.
        ix = tf.gather(box_to_level[:, 2], ix)
        pooled = tf.gather(pooled, ix)

        # Reshape to [batch, num_boxes, pool_height, pool_width, channels].
        shape = tf.concat([tf.shape(boxes)[:2], tf.shape(pooled)[1:]], axis=0)
        pooled = tf.reshape(pooled, shape)
        return pooled

    def compute_output_shape(self, input_shape):
        """Return ``boxes.shape[:2] + pool_shape + (channels,)``."""
        return input_shape[0][:2] + self.pool_shape + (input_shape[2][-1], )
``````

``````#------------------------------------#
#   建立classifier模型
#   这个模型的预测结果会调整建议框
#   获得最终的预测框
#------------------------------------#
def fpn_classifier_graph(rois, feature_maps, image_meta,
                         pool_size, num_classes, train_bn=True,
                         fc_layers_size=1024, weight_decay=0):
    """Build the classification / box-regression head.

    Args:
        rois: Proposal boxes fed to ROIAlign.
        feature_maps: List of pyramid feature maps.
        image_meta: Image meta tensor forwarded to ROIAlign.
        pool_size: Side length of the square ROIAlign output.
        num_classes: Number of classes predicted per ROI.
        train_bn: Forwarded as ``training`` to the BatchNormalization layers.
        fc_layers_size: Channel count of the two hidden layers.
        weight_decay: L2 regularization factor applied to the conv kernels.

    Returns:
        ``(mrcnn_class_logits, mrcnn_probs, mrcnn_bbox)`` with shapes
        ``[batch, num_rois, num_classes]`` for the first two and
        ``[batch, num_rois, num_classes, 4]`` for the last.
    """
    # ROIAlign: [batch, num_rois, pool_size, pool_size, channels].
    roi_align = PyramidROIAlign([pool_size, pool_size], name="roi_align_classifier")
    pooled = roi_align([rois, image_meta] + feature_maps)

    # A pool_size x pool_size 'valid' conv collapses each ROI to 1x1:
    # [batch, num_rois, 1, 1, fc_layers_size].
    first_conv = Conv2D(fc_layers_size, (pool_size, pool_size), padding="valid",
                        kernel_regularizer=l2(weight_decay))
    hidden = TimeDistributed(first_conv, name="mrcnn_class_conv1")(pooled)
    hidden = TimeDistributed(BatchNormalization(), name='mrcnn_class_bn1')(hidden, training=train_bn)
    hidden = Activation('relu')(hidden)

    # 1x1 conv, shape unchanged: [batch, num_rois, 1, 1, fc_layers_size].
    second_conv = Conv2D(fc_layers_size, (1, 1), kernel_regularizer=l2(weight_decay))
    hidden = TimeDistributed(second_conv, name="mrcnn_class_conv2")(hidden)
    hidden = TimeDistributed(BatchNormalization(), name='mrcnn_class_bn2')(hidden, training=train_bn)
    hidden = Activation('relu')(hidden)

    # Drop the two singleton spatial axes: [batch, num_rois, fc_layers_size].
    shared = Lambda(lambda t: K.squeeze(K.squeeze(t, 3), 2), name="pool_squeeze")(hidden)

    # Class scores per ROI: [batch, num_rois, num_classes].
    mrcnn_class_logits = TimeDistributed(Dense(num_classes), name='mrcnn_class_logits')(shared)
    mrcnn_probs = TimeDistributed(Activation("softmax"), name="mrcnn_class")(mrcnn_class_logits)

    # Per-class box adjustments: [batch, num_rois, num_classes, 4].
    bbox_flat = TimeDistributed(Dense(num_classes * 4, activation='linear'), name='mrcnn_bbox_fc')(shared)
    mrcnn_bbox = Reshape((-1, num_classes, 4), name="mrcnn_bbox")(bbox_flat)

    return mrcnn_class_logits, mrcnn_probs, mrcnn_bbox

#----------------------------------------------#
#   这个模型会利用预测框对特征层进行ROIAlign
#   根据截取下来的特征层进行语义分割
#----------------------------------------------#
def build_fpn_mask_graph(rois, feature_maps, image_meta,
                         pool_size, num_classes, train_bn=True, weight_decay=0):
    """Build the mask head: ROIAlign, four conv stages, deconv, per-class mask.

    NOTE(review): the ``def`` header and the conv / batch-norm layers in front
    of each ReLU were missing from this listing. They are restored here after
    the reference Mask R-CNN mask head, in the style of the classifier head in
    this file - confirm names and kernel sizes against the real source.

    Args:
        rois: Boxes fed to ROIAlign.
        feature_maps: List of pyramid feature maps.
        image_meta: Image meta tensor forwarded to ROIAlign.
        pool_size: Side length of the square ROIAlign output.
        num_classes: Number of classes; one mask channel is predicted per class.
        train_bn: Forwarded as ``training`` to the BatchNormalization layers.
        weight_decay: L2 regularization factor applied to the conv kernels.

    Returns:
        The per-ROI, per-class sigmoid mask tensor.
    """
    # ROIAlign: crop the boxes from the feature pyramid.
    x = PyramidROIAlign([pool_size, pool_size], name="roi_align_mask")([rois, image_meta] + feature_maps)

    # Four identical stages: 3x3 'same' conv (256 channels) -> BN -> ReLU.
    for stage_index in range(1, 5):
        x = TimeDistributed(Conv2D(256, (3, 3), padding="same", kernel_regularizer=l2(weight_decay)),
                            name="mrcnn_mask_conv" + str(stage_index))(x)
        x = TimeDistributed(BatchNormalization(), name="mrcnn_mask_bn" + str(stage_index))(x, training=train_bn)
        x = Activation('relu')(x)

    # Stride-2 transposed conv doubles the spatial resolution.
    x = TimeDistributed(Conv2DTranspose(256, (2, 2), strides=2, activation="relu", kernel_regularizer=l2(weight_decay)), name="mrcnn_mask_deconv")(x)
    # A final 1x1 conv sets the channel count to num_classes, one mask per
    # class, with a sigmoid on every pixel.
    x = TimeDistributed(Conv2D(num_classes, (1, 1), strides=1, activation="sigmoid", kernel_regularizer=l2(weight_decay)), name="mrcnn_mask")(x)
    return x
``````

### 6、预测框的解码

1、取出不属于背景，并且得分大于config.DETECTION_MIN_CONFIDENCE的建议框。
2、然后利用建议框和classifier模型的预测结果进行解码，获得最终预测框的位置。
3、利用得分和最终预测框的位置进行非极大抑制，防止重复检测。

``````#----------------------------------------------------------#
#   利用classifier的预测结果对建议框进行调整获得预测框
#   获得每一个预测框的种类
#----------------------------------------------------------#
def refine_detections_graph(rois, probs, deltas, window, config):
    """Decode, filter and NMS the classifier output of one image.

    Args:
        rois: ``[N, 4]`` proposal boxes.
        probs: ``[N, num_classes]`` class probabilities.
        deltas: ``[N, num_classes, 4]`` class-specific box adjustments.
        window: ``(y1, x1, y2, x2)`` window the boxes are clipped to.
        config: Object providing ``BBOX_STD_DEV``, ``DETECTION_MIN_CONFIDENCE``,
            ``DETECTION_MAX_INSTANCES`` and ``DETECTION_NMS_THRESHOLD``.

    Returns:
        ``[DETECTION_MAX_INSTANCES, (y1, x1, y2, x2, class_id, score)]``,
        zero-padded when fewer detections survive.
    """
    # Best class of every ROI.
    class_ids = tf.argmax(probs, axis=1, output_type=tf.int32)
    # (row, class) pairs used to pick the score and deltas of that class.
    indices = tf.stack([tf.range(probs.shape[0]), class_ids], axis=1)
    class_scores = tf.gather_nd(probs, indices)
    deltas_specific = tf.gather_nd(deltas, indices)

    # Decode the boxes and clip them to the window: [N, 4].
    refined_rois = apply_box_deltas_graph(rois, deltas_specific * config.BBOX_STD_DEV)
    refined_rois = clip_boxes_graph(refined_rois, window)

    # Drop background (class 0) and, if configured, low-confidence boxes.
    keep = tf.where(class_ids > 0)[:, 0]
    if config.DETECTION_MIN_CONFIDENCE:
        conf_keep = tf.where(class_scores >= config.DETECTION_MIN_CONFIDENCE)[:, 0]
        keep = tf.compat.v1.sets.set_intersection(tf.expand_dims(keep, 0),
                                                  tf.expand_dims(conf_keep, 0))
        keep = tf.compat.v1.sparse_tensor_to_dense(keep)[0]

    # Class ids, scores and boxes of the surviving ROIs.
    pre_nms_class_ids = tf.gather(class_ids, keep)
    pre_nms_scores = tf.gather(class_scores, keep)
    pre_nms_rois = tf.gather(refined_rois, keep)
    unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0]

    def nms_keep_map(class_id):
        """Run NMS for one class; return kept ROI indices padded with -1."""
        # Positions (within the pre-NMS arrays) of the boxes of this class.
        ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0]

        class_keep = tf.image.non_max_suppression(
            tf.gather(pre_nms_rois, ixs),
            tf.gather(pre_nms_scores, ixs),
            max_output_size=config.DETECTION_MAX_INSTANCES,
            iou_threshold=config.DETECTION_NMS_THRESHOLD)

        # Map back to indices into the original ROI list.
        class_keep = tf.gather(keep, tf.gather(ixs, class_keep))

        # tf.map_fn needs every call to return the same shape, so pad with -1
        # up to DETECTION_MAX_INSTANCES. (The opening of this tf.pad call was
        # missing, leaving a dangling argument list.)
        gap = config.DETECTION_MAX_INSTANCES - tf.shape(class_keep)[0]
        class_keep = tf.pad(class_keep, [(0, gap)],
                            mode='CONSTANT', constant_values=-1)

        class_keep.set_shape([config.DETECTION_MAX_INSTANCES])
        return class_keep

    # Per-class NMS over the boxes that passed the filters above; the -1
    # padding entries are removed again afterwards.
    nms_keep = tf.map_fn(nms_keep_map, unique_pre_nms_class_ids, dtype=tf.int64)
    nms_keep = tf.reshape(nms_keep, [-1])
    nms_keep = tf.gather(nms_keep, tf.where(nms_keep > -1)[:, 0])

    keep = tf.compat.v1.sets.set_intersection(tf.expand_dims(keep, 0), tf.expand_dims(nms_keep, 0))
    keep = tf.compat.v1.sparse_tensor_to_dense(keep)[0]

    # Keep the num_keep highest-scoring boxes.
    roi_count = config.DETECTION_MAX_INSTANCES
    class_scores_keep = tf.gather(class_scores, keep)
    num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count)
    top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1]
    keep = tf.gather(keep, top_ids)

    # Stack the results: [num_keep, (y1, x1, y2, x2, class_id, score)].
    detections = tf.concat([
        tf.gather(refined_rois, keep),
        tf.cast(tf.gather(class_ids, keep), tf.float32)[..., tf.newaxis],
        tf.gather(class_scores, keep)[..., tf.newaxis]
    ], axis=1)

    # Zero-pad to a fixed DETECTION_MAX_INSTANCES rows.
    gap = config.DETECTION_MAX_INSTANCES - tf.shape(detections)[0]
    detections = tf.pad(detections, [(0, gap), (0, 0)], "CONSTANT")
    return detections

def norm_boxes_graph(boxes, shape):
    """Convert pixel box coordinates to normalized coordinates.

    Args:
        boxes: ``[..., (y1, x1, y2, x2)]`` boxes in pixel coordinates.
        shape: ``(height, width)`` of the image in pixels.

    Returns:
        ``(boxes - [0, 0, 1, 1]) / ([h, w, h, w] - 1)``.
    """
    height, width = tf.split(tf.cast(shape, tf.float32), 2)
    divisor = tf.concat([height, width, height, width], axis=-1) - tf.constant(1.0)
    # (y2, x2) are shifted by one pixel before scaling.
    offset = tf.constant([0., 0., 1., 1.])
    return tf.divide(boxes - offset, divisor)

#----------------------------------------------------------#
#   Detection Layer
#   利用classifier的预测结果对建议框进行调整获得预测框
#----------------------------------------------------------#
class DetectionLayer(Layer):
    """Turn proposals plus classifier outputs into final detections."""

    def __init__(self, config=None, **kwargs):
        """Store the configuration.

        Args:
            config: Object providing ``IMAGES_PER_GPU``, ``BATCH_SIZE``,
                ``DETECTION_MAX_INSTANCES`` and the settings read by
                ``refine_detections_graph``.
            **kwargs: Forwarded to the base ``Layer``.
        """
        super(DetectionLayer, self).__init__(**kwargs)
        self.config = config

    def call(self, inputs):
        """Decode the detections of every image in the batch.

        Args:
            inputs: A list of four tensors:
                ``inputs[0]`` rpn_rois    : [batch, proposal_count, 4]
                ``inputs[1]`` mrcnn_class : [batch, num_rois, num_classes]
                ``inputs[2]`` mrcnn_bbox  : [batch, num_rois, num_classes, 4]
                ``inputs[3]`` image_meta

        Returns:
            ``[BATCH_SIZE, DETECTION_MAX_INSTANCES, 6]`` detections.
        """
        proposals = inputs[0]
        class_probs = inputs[1]
        bbox_deltas = inputs[2]
        image_meta = inputs[3]

        # Express each image's window in normalized coordinates; the first
        # image's shape is used for the whole batch.
        parsed_meta = parse_image_meta_graph(image_meta)
        image_shape = parsed_meta['image_shape'][0]
        window = norm_boxes_graph(parsed_meta['window'], image_shape[:2])

        # Decode every image of the batch separately.
        def refine_one_image(x, y, w, z):
            return refine_detections_graph(x, y, w, z, self.config)

        detections_batch = batch_slice(
            [proposals, class_probs, bbox_deltas, window],
            refine_one_image,
            self.config.IMAGES_PER_GPU)

        # Final shape: [batch_size, num_detections, 6].
        output_shape = [self.config.BATCH_SIZE, self.config.DETECTION_MAX_INSTANCES, 6]
        return tf.reshape(detections_batch, output_shape)

    def compute_output_shape(self, input_shape):
        """Return the static output shape ``(None, DETECTION_MAX_INSTANCES, 6)``."""
        return (None, self.config.DETECTION_MAX_INSTANCES, 6)
``````

## 二、训练部分

### 1、建议框网络的训练

anchors_per_location x 4的卷积 用于预测 有效特征层上 每一个网格点上 每一个先验框的变化情况。

anchors_per_location x 2的卷积 用于预测 有效特征层上 每一个网格点上 每一个建议框内部是否包含了物体。

``````def build_rpn_targets(image_shape, anchors, gt_class_ids, gt_boxes, config):
#------------------------------#
#   rpn_match中
#   1代表正样本、-1代表负样本
#   0代表忽略
#------------------------------#
rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32)
#-----------------------------------------------#
#   创建该部分内容利用先验框和真实框进行编码
#-----------------------------------------------#
rpn_bbox = np.zeros((config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4))

'''
iscrowd=0的时候，表示这是一个单独的物体，轮廓用Polygon(多边形的点)表示，
iscrowd=1的时候表示两个没有分开的物体，轮廓用RLE编码表示，比如说一张图片里面有三个人，
一个人单独站一边，另外两个搂在一起（标注的时候距离太近分不开了），这个时候，
单独的那个人的注释里面的iscrowing=0,segmentation用Polygon表示，
而另外两个用放在同一个anatation的数组里面用一个segmention的RLE编码形式表示
'''
crowd_ix = np.where(gt_class_ids < 0)[0]
if crowd_ix.shape[0] > 0:
non_crowd_ix    = np.where(gt_class_ids > 0)[0]
crowd_boxes     = gt_boxes[crowd_ix]
gt_class_ids    = gt_class_ids[non_crowd_ix]
gt_boxes        = gt_boxes[non_crowd_ix]
crowd_overlaps  = compute_overlaps(anchors, crowd_boxes)
crowd_iou_max   = np.amax(crowd_overlaps, axis=1)
no_crowd_bool   = (crowd_iou_max < 0.001)
else:
no_crowd_bool   = np.ones([anchors.shape[0]], dtype=bool)

#-----------------------------------------------#
#   计算先验框和真实框的重合程度
#   [num_anchors, num_gt_boxes]
#-----------------------------------------------#
overlaps = compute_overlaps(anchors, gt_boxes)

#-----------------------------------------------#
#   1. 重合程度小于0.3则代表为负样本
#-----------------------------------------------#
anchor_iou_argmax = np.argmax(overlaps, axis=1)
anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax]
rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1
#-----------------------------------------------#
#   2. 每个真实框重合度最大的先验框是正样本
#-----------------------------------------------#
gt_iou_argmax = np.argwhere(overlaps == np.max(overlaps, axis=0))[:,0]
rpn_match[gt_iou_argmax] = 1
#-----------------------------------------------#
#   3. 重合度大于0.7则代表为正样本
#-----------------------------------------------#
rpn_match[anchor_iou_max >= 0.7] = 1

#-----------------------------------------------#
#   正负样本平衡
#   找到正样本的索引
#-----------------------------------------------#
ids = np.where(rpn_match == 1)[0]

#-----------------------------------------------#
#   如果大于(config.RPN_TRAIN_ANCHORS_PER_IMAGE // 2)则删掉一些
#-----------------------------------------------#
extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE // 2)
if extra > 0:
ids = np.random.choice(ids, extra, replace=False)
rpn_match[ids] = 0

#-----------------------------------------------#
#   找到负样本的索引
#-----------------------------------------------#
ids = np.where(rpn_match == -1)[0]

#-----------------------------------------------#
#   使得总数为config.RPN_TRAIN_ANCHORS_PER_IMAGE
#-----------------------------------------------#
extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE -
np.sum(rpn_match == 1))
if extra > 0:
# Rest the extra ones to neutral
ids = np.random.choice(ids, extra, replace=False)
rpn_match[ids] = 0

#-----------------------------------------------#
#   找到内部真实存在物体的先验框，进行编码
#-----------------------------------------------#
ids = np.where(rpn_match == 1)[0]
ix = 0
for i, a in zip(ids, anchors[ids]):
gt = gt_boxes[anchor_iou_argmax[i]]
#-----------------------------------------------#
#   计算真实框的中心，高宽
#-----------------------------------------------#
gt_h = gt[2] - gt[0]
gt_w = gt[3] - gt[1]
gt_center_y = gt[0] + 0.5 * gt_h
gt_center_x = gt[1] + 0.5 * gt_w
#-----------------------------------------------#
#   计算先验框中心，高宽
#-----------------------------------------------#
a_h = a[2] - a[0]
a_w = a[3] - a[1]
a_center_y = a[0] + 0.5 * a_h
a_center_x = a[1] + 0.5 * a_w
#-----------------------------------------------#
#   编码运算
#-----------------------------------------------#
rpn_bbox[ix] = [
(gt_center_y - a_center_y) / np.maximum(a_h, 1),
(gt_center_x - a_center_x) / np.maximum(a_w, 1),
np.log(np.maximum(gt_h / np.maximum(a_h, 1), 1e-5)),
np.log(np.maximum(gt_w / np.maximum(a_w, 1), 1e-5)),
]
#-----------------------------------------------#
#   改变数量级
#-----------------------------------------------#
rpn_bbox[ix] /= config.RPN_BBOX_STD_DEV
ix += 1
return rpn_match, rpn_bbox
``````

### 2、Classifier模型的训练

``````#----------------------------------------------------------#
#   Detection Target Layer
#   该部分代码会输入建议框
#   判断建议框和真实框的重合情况
#   筛选出内部包含物体的建议框
#   利用建议框和真实框编码
#----------------------------------------------------------#
#----------------------------------------------------------#
#   对输入进来的真实框进行编码
#----------------------------------------------------------#
def box_refinement_graph(box, gt_box):
    """Encode how `box` must move and scale to become `gt_box`.

    Both arguments are cast to float32 and indexed row-wise as
    (y1, x1, y2, x2). The result stacks, along axis 1, the centre offsets
    divided by the size of `box` and the log of the size ratios:
    (dy, dx, dh, dw).
    """
    def _centre_and_size(corners):
        # Convert corner coordinates into (centre_y, centre_x, height, width).
        h = corners[:, 2] - corners[:, 0]
        w = corners[:, 3] - corners[:, 1]
        cy = corners[:, 0] + 0.5 * h
        cx = corners[:, 1] + 0.5 * w
        return cy, cx, h, w

    src_cy, src_cx, src_h, src_w = _centre_and_size(tf.cast(box, tf.float32))
    dst_cy, dst_cx, dst_h, dst_w = _centre_and_size(tf.cast(gt_box, tf.float32))

    # Offsets are expressed in units of the source box size; scales in log space.
    shift_y = (dst_cy - src_cy) / src_h
    shift_x = (dst_cx - src_cx) / src_w
    log_scale_h = tf.math.log(dst_h / src_h)
    log_scale_w = tf.math.log(dst_w / src_w)

    return tf.stack([shift_y, shift_x, log_scale_h, log_scale_w], axis=1)

#----------------------------------------------------------#
#   Detection Target Layer
#   该部分代码会输入建议框
#   判断建议框和真实框的重合情况
#   筛选出内部包含物体的建议框
#   利用建议框和真实框编码
#----------------------------------------------------------#
def detection_targets_graph(proposals, gt_class_ids, gt_boxes, gt_masks, config):
    """Build classifier, box and mask training targets for one image.

    Args:
        proposals: proposal boxes, one (y1, x1, y2, x2) row per proposal.
            Passed through `trim_zeros_graph` (presumably dropping
            zero-padded rows).
        gt_class_ids: class id of each ground-truth box. Ids < 0 are handled
            as crowd boxes, ids > 0 as regular instances.
        gt_boxes: ground-truth boxes, one (y1, x1, y2, x2) row per instance,
            also passed through `trim_zeros_graph`.
        gt_masks: ground-truth masks.
            NOTE(review): assumed to be [height, width, num_instances], as in
            the reference Mask R-CNN implementation — confirm against the
            data generator.
        config: object providing TRAIN_ROIS_PER_IMAGE, ROI_POSITIVE_RATIO and
            BBOX_STD_DEV.
            NOTE(review): MASK_SHAPE and USE_MINI_MASK are also read here;
            they follow the reference implementation — confirm they exist
            on this project's config.

    Returns:
        Tuple `(rois, roi_gt_class_ids, deltas, masks)`. `rois` holds the
        sampled positive proposals followed by the sampled negatives,
        zero-padded up to TRAIN_ROIS_PER_IMAGE rows. The other three hold
        targets for the positive rows and zeros for negative/padding rows.
    """
    # Abort early if no proposals were supplied for this image.
    asserts = [
        tf.Assert(tf.greater(tf.shape(proposals)[0], 0), [proposals],
                  name="roi_assertion"),
    ]
    with tf.control_dependencies(asserts):
        proposals = tf.identity(proposals)

    # Trim proposals and ground truth. Class ids and masks must be filtered
    # with the same selector as the boxes so that all three stay row-aligned.
    proposals, _ = trim_zeros_graph(proposals, name="trim_proposals")
    gt_boxes, non_zeros = trim_zeros_graph(gt_boxes, name="trim_gt_boxes")
    gt_class_ids = tf.boolean_mask(gt_class_ids, non_zeros,
                                   name="trim_gt_class_ids")
    gt_masks = tf.gather(gt_masks, tf.where(non_zeros)[:, 0], axis=2,
                         name="trim_gt_masks")

    # COCO "crowd" annotations (negative class ids) are hard to tell apart,
    # so they are excluded from the targets and only used to veto negatives.
    crowd_ix = tf.where(gt_class_ids < 0)[:, 0]
    non_crowd_ix = tf.where(gt_class_ids > 0)[:, 0]
    crowd_boxes = tf.gather(gt_boxes, crowd_ix)
    gt_class_ids = tf.gather(gt_class_ids, non_crowd_ix)
    gt_boxes = tf.gather(gt_boxes, non_crowd_ix)
    gt_masks = tf.gather(gt_masks, non_crowd_ix, axis=2)

    # IoU of every proposal against every remaining ground-truth box:
    # overlaps is [num_proposals, num_gt_boxes].
    overlaps = overlaps_graph(proposals, gt_boxes)

    # IoU of every proposal against every crowd box.
    crowd_overlaps = overlaps_graph(proposals, crowd_boxes)
    crowd_iou_max = tf.reduce_max(crowd_overlaps, axis=1)
    no_crowd_bool = (crowd_iou_max < 0.001)

    # Best IoU each proposal achieves with any ground-truth box.
    roi_iou_max = tf.reduce_max(overlaps, axis=1)
    # 1. Positives: IoU with some ground-truth box is at least 0.5.
    positive_roi_bool = (roi_iou_max >= 0.5)
    positive_indices = tf.where(positive_roi_bool)[:, 0]
    # 2. Negatives: IoU below 0.5 and no meaningful overlap with a crowd box.
    negative_indices = tf.where(
        tf.logical_and(roi_iou_max < 0.5, no_crowd_bool))[:, 0]

    # Balance the sample: keep at most
    # TRAIN_ROIS_PER_IMAGE * ROI_POSITIVE_RATIO positives.
    positive_count = int(config.TRAIN_ROIS_PER_IMAGE * config.ROI_POSITIVE_RATIO)
    positive_indices = tf.random.shuffle(positive_indices)[:positive_count]
    positive_count = tf.shape(positive_indices)[0]
    # Pick negatives so that the positive:negative ratio is preserved.
    ratio = 1.0 / config.ROI_POSITIVE_RATIO
    negative_count = tf.cast(
        ratio * tf.cast(positive_count, tf.float32), tf.int32) - positive_count
    negative_indices = tf.random.shuffle(negative_indices)[:negative_count]

    # Gather the sampled proposals.
    positive_rois = tf.gather(proposals, positive_indices)
    negative_rois = tf.gather(proposals, negative_indices)

    # IoU rows of the positive proposals only.
    positive_overlaps = tf.gather(overlaps, positive_indices)

    # Match each positive to its best ground-truth box; argmax over an empty
    # axis is invalid, so fall back to an empty index list when there are no
    # ground-truth boxes.
    roi_gt_box_assignment = tf.cond(
        tf.greater(tf.shape(positive_overlaps)[1], 0),
        true_fn=lambda: tf.argmax(positive_overlaps, axis=1),
        false_fn=lambda: tf.cast(tf.constant([]), tf.int64)
    )
    roi_gt_boxes = tf.gather(gt_boxes, roi_gt_box_assignment)
    roi_gt_class_ids = tf.gather(gt_class_ids, roi_gt_box_assignment)

    # Box regression targets, rescaled by the configured std-dev.
    deltas = box_refinement_graph(positive_rois, roi_gt_boxes)
    deltas /= config.BBOX_STD_DEV

    # Pick the mask of the matched ground-truth instance for every positive.
    # Masks are permuted to [num_instances, height, width, 1] so they can be
    # gathered by instance and fed to crop_and_resize as a batch of images.
    transposed_masks = tf.expand_dims(tf.transpose(gt_masks, [2, 0, 1]), -1)
    roi_masks = tf.gather(transposed_masks, roi_gt_box_assignment)

    # Region of each mask to crop.
    boxes = positive_rois
    # NOTE(review): the reference implementation applies the rescaling below
    # only when config.USE_MINI_MASK is set (masks stored cropped to their
    # ground-truth box). Defaulting to True keeps the rescaling always on
    # when the attribute is absent — confirm.
    if getattr(config, "USE_MINI_MASK", True):
        # Express the ROI corners relative to the matched ground-truth box.
        y1, x1, y2, x2 = tf.split(positive_rois, 4, axis=1)
        gt_y1, gt_x1, gt_y2, gt_x2 = tf.split(roi_gt_boxes, 4, axis=1)
        gt_h = gt_y2 - gt_y1
        gt_w = gt_x2 - gt_x1
        y1 = (y1 - gt_y1) / gt_h
        x1 = (x1 - gt_x1) / gt_w
        y2 = (y2 - gt_y1) / gt_h
        x2 = (x2 - gt_x1) / gt_w
        boxes = tf.concat([y1, x1, y2, x2], 1)
    # Box i crops from mask i.
    box_ids = tf.range(0, tf.shape(roi_masks)[0])
    masks = tf.image.crop_and_resize(tf.cast(roi_masks, tf.float32), boxes,
                                     box_ids, config.MASK_SHAPE)
    # Drop the channel axis, then round the interpolated values back to 0/1.
    masks = tf.squeeze(masks, axis=3)
    masks = tf.round(masks)

    # Append the negatives and zero-pad every output so that each has
    # at least config.TRAIN_ROIS_PER_IMAGE rows.
    rois = tf.concat([positive_rois, negative_rois], axis=0)
    num_negative = tf.shape(negative_rois)[0]
    num_padding = tf.maximum(config.TRAIN_ROIS_PER_IMAGE - tf.shape(rois)[0], 0)
    rois = tf.pad(rois, [(0, num_padding), (0, 0)])
    roi_gt_boxes = tf.pad(roi_gt_boxes, [(0, num_negative + num_padding), (0, 0)])
    roi_gt_class_ids = tf.pad(roi_gt_class_ids, [(0, num_negative + num_padding)])
    deltas = tf.pad(deltas, [(0, num_negative + num_padding), (0, 0)])
    masks = tf.pad(masks, [(0, num_negative + num_padding), (0, 0), (0, 0)])

    return rois, roi_gt_class_ids, deltas, masks

class DetectionTargetLayer(Layer):
    """Keras layer that derives the training targets for each proposal.

    Inputs (list, in this order):
        proposals       : [batch, N, (y1, x1, y2, x2)]                     proposal boxes
        gt_class_ids    : [batch, MAX_GT_INSTANCES]                        class id of each ground-truth box
        gt_boxes        : [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)]      ground-truth box coordinates
        gt_masks        : ground-truth instance masks
                          NOTE(review): layout is not visible here; the
                          reference implementation uses
                          [batch, height, width, MAX_GT_INSTANCES] — confirm.

    Returns:
        rois            : [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)]              sampled proposals
        target_class_ids: [batch, TRAIN_ROIS_PER_IMAGE]                                class id assigned to each proposal
        target_deltas   : [batch, TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw))]    box refinement each proposal should predict
        target_mask     : [batch, TRAIN_ROIS_PER_IMAGE, height, width]                 mask target of each proposal
    """

    def __init__(self, config, **kwargs):
        super(DetectionTargetLayer, self).__init__(**kwargs)
        self.config = config

    def call(self, inputs):
        proposals = inputs[0]
        gt_class_ids = inputs[1]
        gt_boxes = inputs[2]
        gt_masks = inputs[3]

        # Run the per-image target computation on every image of the batch
        # and name the four resulting outputs.
        names = ["rois", "target_class_ids", "target_bbox", "target_mask"]
        outputs = batch_slice(
            [proposals, gt_class_ids, gt_boxes, gt_masks],
            lambda w, x, y, z: detection_targets_graph(w, x, y, z, self.config),
            self.config.IMAGES_PER_GPU, names=names)
        return outputs

    def compute_output_shape(self, input_shape):
        # One shape per output, in the same order as `names` in call().
        # NOTE(review): the mask entry assumes config.MASK_SHAPE is
        # (height, width), as in the reference Mask R-CNN config — confirm.
        return [
            (None, self.config.TRAIN_ROIS_PER_IMAGE, 4),  # rois
            (None, self.config.TRAIN_ROIS_PER_IMAGE),  # class_ids
            (None, self.config.TRAIN_ROIS_PER_IMAGE, 4),  # deltas
            (None, self.config.TRAIN_ROIS_PER_IMAGE,
             self.config.MASK_SHAPE[0], self.config.MASK_SHAPE[1]),  # masks
        ]

    def compute_mask(self, inputs, mask=None):
        # None of the four outputs carries a Keras mask.
        return [None, None, None, None]
``````

``````# Compute mask targets
# Excerpt only: `positive_rois` and `roi_gt_boxes` are not defined in this
# snippet; they are computed earlier in detection_targets_graph.
# Start from the positive ROIs; overwritten by the rescaled boxes below.
boxes = positive_rois
# Transform ROI coordinates from normalized image space into coordinates
# relative to each ROI's matched ground-truth box (offset by the box's
# top-left corner, divided by its height/width).
y1, x1, y2, x2 = tf.split(positive_rois, 4, axis=1)
gt_y1, gt_x1, gt_y2, gt_x2 = tf.split(roi_gt_boxes, 4, axis=1)
gt_h = gt_y2 - gt_y1
gt_w = gt_x2 - gt_x1
y1 = (y1 - gt_y1) / gt_h
x1 = (x1 - gt_x1) / gt_w
y2 = (y2 - gt_y1) / gt_h
x2 = (x2 - gt_x1) / gt_w
boxes = tf.concat([y1, x1, y2, x2], 1)
# NOTE(review): the bare `box_ids,` below looks like the leftover argument of
# a truncated call; `box_ids` is never defined in this snippet — confirm
# against the full source.
box_ids,
``````

## 一、数据集的准备

``````triangle_1
triangle_2
``````

## 二、数据集的处理

model_data/cls_classes.txt文件内容为：

``````cat
dog
...
``````

## 三、开始训练网络

classes_path用于指向检测类别所对应的txt，这个txt和coco_annotation.py里面的txt一样！训练自己的数据集必须要修改！

## 四、模型预测

model_path指向训练好的权值文件，在logs文件夹里。
classes_path指向检测类别所对应的txt。

THE END