# anime-face-detector/faster_rcnn_wrapper.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204

import tensorflow as tf
from tensorflow.contrib.slim.python.slim.nets.resnet_v1 import resnet_v1_block, resnet_v1
import tensorflow.contrib.slim as slim
from tensorflow.contrib.slim.python.slim.nets.resnet_utils import arg_scope, conv2d_same
import numpy as np


class FasterRCNNSlim:
    """Inference-only Faster R-CNN graph built with TF 1.x ``tf.contrib.slim``.

    Constructing an instance adds the complete detection graph to the current
    default TensorFlow graph:

    * a ResNet-v1 backbone with 3/4/23/3 bottleneck units under the variable
      scope ``'resnet_v1_101'`` (blocks 1-3 produce the shared feature map,
      block 4 is applied to every pooled region),
    * a region proposal network (RPN) with 4 square anchors per feature cell,
    * proposal decoding, clipping and non-max suppression,
    * 7x7 RoI cropping, and
    * a 2-way classification layer plus a ``2 * 4`` box-regression layer.

    Every layer is created with ``trainable=False`` and batch norm runs with
    ``is_training=False``, so the graph is only usable for inference; weights
    are expected to be restored from a checkpoint by the caller (not done
    here).

    The ``# in _xxx`` comments inside ``__init__`` label the logical stage each
    section implements (presumably the method names of the implementation this
    flattened graph was derived from -- confirm if it matters).
    """

    def __init__(self):
        """Create the placeholders and build the whole detection graph.

        The statements below are order- and scope-sensitive: variable names are
        determined by the nesting of ``variable_scope``/``arg_scope`` blocks, so
        restructuring them may break checkpoint compatibility.

        After construction the following tensors are available:

        * ``self._cls_score`` -- raw 2-way class scores, one row per RoI.
        * ``self._cls_prob``  -- softmax of ``_cls_score``.
        * ``self._bbox_pred`` -- ``2 * 4`` regression values per RoI, already
          multiplied by the ``[0.1, 0.1, 0.2, 0.2]`` stds (and shifted by the
          all-zero means).
        * ``self._rois``      -- ``[batch_index, x1, y1, x2, y2]`` per proposal.
        """
        # Backbone layout. conv1 (stride 2), pool1 (stride 2), block1 (stride 2)
        # and block2 (stride 2) give an overall feature stride of 16, which is
        # the constant 16 / 16.0 used throughout the rest of this method.
        self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
                        resnet_v1_block('block2', base_depth=128, num_units=4, stride=2),
                        resnet_v1_block('block3', base_depth=256, num_units=23, stride=1),
                        resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)]
        # Input image: a single image (batch of 1), any height/width, 3 channels.
        self._image = tf.placeholder(tf.float32, shape=[1, None, None, 3])
        # im_info[0] is used as the image height and im_info[1] as the width.
        # The third element is not read anywhere in this class (presumably the
        # resize scale factor -- confirm with the caller).
        self._im_info = tf.placeholder(tf.float32, shape=[3])

        # Anchor configuration: 4 scales x 1 aspect ratio = 4 anchors per cell.
        # Only the resulting count (_num_anchors) is used below; the anchor
        # boxes themselves are hard-coded in the 'ANCHOR-default' section.
        self._anchor_scales = [4, 8, 16, 32]
        self._num_scales = len(self._anchor_scales)

        self._anchor_ratios = [1]
        self._num_ratios = len(self._anchor_ratios)

        self._num_anchors = self._num_scales * self._num_ratios
        self._scope = 'resnet_v1_101'

        # Default arguments for every slim layer created inside this block:
        # L2 weight regularizer, no bias regularizer, zero-initialised biases.
        with arg_scope([slim.conv2d, slim.conv2d_in_plane, slim.conv2d_transpose, slim.separable_conv2d,
                        slim.fully_connected],
                       weights_regularizer=tf.contrib.layers.l2_regularizer(0.0001),
                       biases_regularizer=tf.no_regularizer,
                       biases_initializer=tf.constant_initializer(0.0)):
            # in _build_network
            # Weight initialisers for the RPN / classification layers and for
            # the final box-regression layer respectively.
            initializer = tf.random_normal_initializer(stddev=0.01)
            initializer_bbox = tf.random_normal_initializer(stddev=0.001)
            # in _image_to_head
            # Shared feature extractor: root conv + pool, then blocks 1-3.
            with slim.arg_scope(self._resnet_arg_scope()):
                # in _build_base
                with tf.variable_scope(self._scope, self._scope):
                    net_conv = conv2d_same(self._image, 64, 7, stride=2, scope='conv1')
                    # Explicit 1-pixel zero padding followed by a VALID 3x3 pool.
                    net_conv = tf.pad(net_conv, [[0, 0], [1, 1], [1, 1], [0, 0]])
                    net_conv = slim.max_pool2d(net_conv, [3, 3], stride=2, padding='VALID', scope='pool1')
                # The root block was built by hand above, hence include_root_block=False.
                net_conv, _ = resnet_v1(net_conv, self._blocks[:-1], global_pool=False, include_root_block=False,
                                        scope=self._scope)
            # The anchor, RPN, proposal and RoI-crop stages are created under
            # the same 'resnet_v1_101' variable scope as the backbone.
            with tf.variable_scope(self._scope, self._scope):
                # in _anchor_component
                with tf.variable_scope('ANCHOR-default'):
                    # Feature-map size implied by the image size at stride 16.
                    height = tf.to_int32(tf.ceil(self._im_info[0] / 16.0))
                    width = tf.to_int32(tf.ceil(self._im_info[1] / 16.0))

                    # Image-space offset of every feature cell, as
                    # (x, y, x, y) rows so it can be added to box corners.
                    shift_x = tf.range(width) * 16
                    shift_y = tf.range(height) * 16
                    shift_x, shift_y = tf.meshgrid(shift_x, shift_y)
                    sx = tf.reshape(shift_x, [-1])
                    sy = tf.reshape(shift_y, [-1])
                    shifts = tf.transpose(tf.stack([sx, sy, sx, sy]))
                    k = width * height
                    # [k, 4] -> [1, k, 4] -> [k, 1, 4] so that it broadcasts
                    # against the [1, a, 4] anchor constant below.
                    shifts = tf.transpose(tf.reshape(shifts, [1, k, 4]), perm=[1, 0, 2])

                    # Four square base anchors (x1, y1, x2, y2). With the
                    # inclusive "+1" size convention used in the proposal
                    # stage their sides are 64, 128, 256 and 512 pixels, i.e.
                    # 16 times each entry of self._anchor_scales.
                    anchors = np.array([[-24, -24, 39, 39], [-56, -56, 71, 71],
                                        [-120, -120, 135, 135], [-248, -248, 263, 263]], dtype=np.int32)

                    a = anchors.shape[0]
                    anchor_constant = tf.constant(anchors.reshape([1, a, 4]), dtype=tf.int32)
                    length = k * a
                    # Broadcast add gives [k, a, 4]; flatten to one row per
                    # (cell, anchor) pair.
                    anchors_tf = tf.reshape(anchor_constant + shifts, shape=[length, 4])
                    anchors = tf.cast(anchors_tf, dtype=tf.float32)
                    self._anchors = anchors
                    self._anchor_length = length

                # in _region_proposal
                # RPN head: a 3x3 conv followed by two sibling 1x1 convs, one
                # for 2 scores per anchor and one for 4 box deltas per anchor.
                rpn = slim.conv2d(net_conv, 512, [3, 3], trainable=False, weights_initializer=initializer,
                                  scope='rpn_conv/3x3')
                rpn_cls_score = slim.conv2d(rpn, self._num_anchors * 2, [1, 1], trainable=False,
                                            weights_initializer=initializer, padding='VALID', activation_fn=None,
                                            scope='rpn_cls_score')
                # Regroup the channels into 2, apply softmax over those 2
                # values, then restore the num_anchors * 2 channel layout.
                rpn_cls_score_reshape = self._reshape(rpn_cls_score, 2, 'rpn_cls_score_reshape')
                rpn_cls_prob_reshape = self._softmax(rpn_cls_score_reshape, 'rpn_cls_prob_reshape')
                # rpn_cls_pred = tf.argmax(tf.reshape(rpn_cls_score_reshape, [-1, 2]), axis=1, name='rpn_cls_pred')
                rpn_cls_prob = self._reshape(rpn_cls_prob_reshape, self._num_anchors * 2, 'rpn_cls_prob')
                rpn_bbox_pred = slim.conv2d(rpn, self._num_anchors * 4, [1, 1], trainable=False,
                                            weights_initializer=initializer, padding='VALID', activation_fn=None,
                                            scope='rpn_bbox_pred')

                # in _proposal_layer
                with tf.variable_scope('rois'):
                    # Keep at most 300 proposals after NMS at IoU 0.7.
                    post_nms_topn = 300
                    nms_thresh = 0.7
                    # Use the last num_anchors channels as the per-anchor score
                    # (presumably the foreground probability -- confirm).
                    scores = rpn_cls_prob[:, :, :, self._num_anchors:]
                    scores = tf.reshape(scores, [-1])
                    rpn_bbox_pred = tf.reshape(rpn_bbox_pred, [-1, 4])

                    # Anchor sizes and centres ("+1" = inclusive pixel extents).
                    boxes = tf.cast(self._anchors, rpn_bbox_pred.dtype)
                    widths = boxes[:, 2] - boxes[:, 0] + 1.0
                    heights = boxes[:, 3] - boxes[:, 1] + 1.0
                    ctr_x = boxes[:, 0] + widths * 0.5
                    ctr_y = boxes[:, 1] + heights * 0.5

                    # Predicted deltas: centre offsets (dx, dy) are relative to
                    # the anchor size; (dw, dh) are log-scale size factors.
                    dx = rpn_bbox_pred[:, 0]
                    dy = rpn_bbox_pred[:, 1]
                    dw = rpn_bbox_pred[:, 2]
                    dh = rpn_bbox_pred[:, 3]

                    pred_ctr_x = dx * widths + ctr_x
                    pred_ctr_y = dy * heights + ctr_y
                    pred_w = tf.exp(dw) * widths
                    pred_h = tf.exp(dh) * heights

                    # Back to corner form: x1, y1, x2, y2.
                    pred_boxes0 = pred_ctr_x - pred_w * 0.5
                    pred_boxes1 = pred_ctr_y - pred_h * 0.5
                    pred_boxes2 = pred_ctr_x + pred_w * 0.5
                    pred_boxes3 = pred_ctr_y + pred_h * 0.5

                    # Clip to the image: x against width - 1, y against height - 1.
                    b0 = tf.clip_by_value(pred_boxes0, 0, self._im_info[1] - 1)
                    b1 = tf.clip_by_value(pred_boxes1, 0, self._im_info[0] - 1)
                    b2 = tf.clip_by_value(pred_boxes2, 0, self._im_info[1] - 1)
                    b3 = tf.clip_by_value(pred_boxes3, 0, self._im_info[0] - 1)

                    proposals = tf.stack([b0, b1, b2, b3], axis=1)
                    # NOTE(review): TF documents NMS boxes as [y1, x1, y2, x2]
                    # while these are [x1, y1, x2, y2]. IoU is unchanged when
                    # both axes are swapped consistently, so the selection
                    # should not be affected.
                    indices = tf.image.non_max_suppression(proposals, scores, max_output_size=post_nms_topn,
                                                           iou_threshold=nms_thresh)
                    boxes = tf.to_float(tf.gather(proposals, indices))
                    # rpn_scores = tf.reshape(tf.gather(scores, indices), [-1, 1])

                    # Prepend a batch-index column of zeros (the batch size is 1).
                    batch_inds = tf.zeros([tf.shape(indices)[0], 1], dtype=tf.float32)
                    rois = tf.concat([batch_inds, boxes], 1)

                # in _crop_pool_layer
                with tf.variable_scope('pool5'):
                    # The op name 'bath_id' is spelled as in the original graph;
                    # it is a runtime name and is deliberately left untouched.
                    batch_ids = tf.squeeze(tf.slice(rois, [0, 0], [-1, 1], name='bath_id'), [1])
                    bottom_shape = tf.shape(net_conv)
                    # crop_and_resize takes normalized box coordinates; dividing
                    # image-space coordinates by (feature_size - 1) * 16
                    # converts them to that normalized form.
                    height = (tf.to_float(bottom_shape[1]) - 1) * 16.0
                    width = (tf.to_float(bottom_shape[2]) - 1) * 16.0
                    x1 = tf.slice(rois, [0, 1], [-1, 1], name='x1') / width
                    y1 = tf.slice(rois, [0, 2], [-1, 1], name='y1') / height
                    x2 = tf.slice(rois, [0, 3], [-1, 1], name='x2') / width
                    y2 = tf.slice(rois, [0, 4], [-1, 1], name='y2') / height
                    # Reordered to [y1, x1, y2, x2] as passed to crop_and_resize.
                    bboxes = tf.stop_gradient(tf.concat([y1, x1, y2, x2], 1))
                    # One 7x7 crop of the shared feature map per RoI.
                    pool5 = tf.image.crop_and_resize(net_conv, bboxes, tf.to_int32(batch_ids), [7, 7], name='crops')
            # in _head_to_tail
            # Per-RoI head: run block4 on every 7x7 crop, then average over the
            # two spatial axes to get one feature vector per RoI.
            with slim.arg_scope(self._resnet_arg_scope()):
                fc7, _ = resnet_v1(pool5, self._blocks[-1:], global_pool=False, include_root_block=False,
                                   scope=self._scope)
                fc7 = tf.reduce_mean(fc7, axis=[1, 2])
            with tf.variable_scope(self._scope, self._scope):
                # in _region_classification
                # 2 class scores and 2 * 4 box-regression values per RoI
                # (presumably background/face given the project name -- confirm).
                cls_score = slim.fully_connected(fc7, 2, weights_initializer=initializer, trainable=False,
                                                 activation_fn=None, scope='cls_score')
                cls_prob = self._softmax(cls_score, 'cls_prob')
                # cls_pred = tf.argmax(cls_score, 'cls_pred')
                bbox_pred = slim.fully_connected(fc7, 2*4, weights_initializer=initializer_bbox, trainable=False,
                                                 activation_fn=None, scope='bbox_pred')
        self._cls_score = cls_score
        self._cls_prob = cls_prob
        self._bbox_pred = bbox_pred
        self._rois = rois

        # Rescale the regression output by per-coordinate stds and add the
        # (all-zero) means, repeated once per class. Presumably this undoes
        # the target normalization used at training time -- confirm.
        stds = np.tile(np.array([0.1, 0.1, 0.2, 0.2]), 2)
        means = np.tile(np.array([0.0, 0.0, 0.0, 0.0]), 2)
        self._bbox_pred *= stds
        self._bbox_pred += means

    @staticmethod
    def _resnet_arg_scope():
        """Return the slim arg scope used for the ResNet backbone and head.

        The scope gives ``slim.conv2d`` an L2 weight regularizer, a
        variance-scaling initializer, ReLU activation and batch norm as the
        normalizer, and gives ``slim.batch_norm`` the frozen parameters below
        (``is_training=False``, ``trainable=False``).
        """
        batch_norm_params = {
            'is_training': False,
            'decay': 0.997,
            'epsilon': 1e-5,
            'scale': True,
            'trainable': False,
            'updates_collections': tf.GraphKeys.UPDATE_OPS
        }
        with arg_scope([slim.conv2d],
                       weights_regularizer=slim.l2_regularizer(0.0001),
                       weights_initializer=slim.variance_scaling_initializer(),
                       trainable=False,
                       activation_fn=tf.nn.relu,
                       normalizer_fn=slim.batch_norm,
                       normalizer_params=batch_norm_params):
            # The inner scope is opened while the conv2d defaults are active;
            # the captured scope object is returned for use by the caller.
            with arg_scope([slim.batch_norm], **batch_norm_params) as arg_sc:
                return arg_sc

    @staticmethod
    def _reshape(bottom, num_dim, name):
        """Regroup the channel axis of a 4-D tensor into ``num_dim`` channels.

        Args:
            bottom: 4-D tensor in ``[batch, height, width, channels]`` order;
                the reshape below hard-codes a batch size of 1.
            num_dim: number of channels the result should have.
            name: variable scope name under which the ops are created.

        Returns:
            A tensor of shape ``[1, -1, width, num_dim]``, where the second
            axis absorbs whatever is left of the original channel and height
            axes.
        """
        input_shape = tf.shape(bottom)
        with tf.variable_scope(name):
            # Move channels first so the reshape splits/merges the channel
            # axis while keeping the last (width) axis intact.
            to_caffe = tf.transpose(bottom, [0, 3, 1, 2])
            reshaped = tf.reshape(to_caffe, [1, num_dim, -1, input_shape[2]])
            # Back to channels-last order.
            to_tf = tf.transpose(reshaped, [0, 2, 3, 1])
        return to_tf

    @staticmethod
    def _softmax(bottom, name):
        """Apply softmax to ``bottom`` and name the softmax op ``name``.

        When ``name`` starts with ``'rpn_cls_prob_reshape'`` the input is first
        flattened to 2-D (keeping the last axis), passed through softmax and
        then reshaped back to its original shape. Otherwise ``tf.nn.softmax``
        is applied to ``bottom`` directly.
        """
        if name.startswith('rpn_cls_prob_reshape'):
            input_shape = tf.shape(bottom)
            bottom_reshaped = tf.reshape(bottom, [-1, input_shape[-1]])
            reshaped_score = tf.nn.softmax(bottom_reshaped, name=name)
            return tf.reshape(reshaped_score, input_shape)
        return tf.nn.softmax(bottom, name=name)

    def test_image(self, sess, image, im_info):
        """Run the detector on one image.

        Args:
            sess: session whose ``run`` method is used to evaluate the graph.
            image: value fed to the ``[1, None, None, 3]`` float32 image
                placeholder.
            im_info: value fed to the length-3 float32 ``im_info`` placeholder;
                elements 0 and 1 are used by the graph as image height and
                width.

        Returns:
            The list ``[cls_score, cls_prob, bbox_pred, rois]`` as returned by
            ``sess.run``; ``bbox_pred`` is the std/mean-rescaled tensor.
        """
        return sess.run([self._cls_score, self._cls_prob, self._bbox_pred, self._rois], feed_dict={
            self._image: image,
            self._im_info: im_info
        })