Source code for blocks.models.region_proposal

from typing import Mapping
import logging

import tensorflow as tf
import emloop_tensorflow as eltf
import tensorflow.contrib.slim as slim


class RegionProposalNetwork(eltf.BaseModel):
    """
    Configurable region proposal network (RPN) inspired by the Faster R-CNN architecture.

    RPN predicts regions of interest (ROIs) from an input image.

    RPN starts by encoding the input images into feature maps. For each position of the
    feature maps, a fixed number of anchors corresponding to fixed regions in the original
    image is considered. For each anchor, RPN predicts:

    - whether the anchor matches a ROI in the original image
    - the anchor diff (correction) to the respective ROI

    **Inputs**

    - ``images`` (4-dim tensor NHWC) scaled to 0-255
    - ``anchors_label`` (4-dim tensor NHWA) anchor labels 0/1 determining whether the
      anchors match certain regions
    - ``anchors_mask`` (4-dim tensor NHWA) anchor mask 0/1 determining the valid anchors
      to be trained
    - ``diffs`` (5-dim tensor NHWAD) anchor differences to the respective ROIs; the diff
      dimension D is configurable

    **Outputs**

    - ``classifier_probabilities`` and ``classifier_predictions`` (4-dim tensors NHWA)
      scaled to 0-1 and 0/1 respectively
    - ``regression_predictions`` (5-dim tensor NHWAD) of anchor differences (corrections)
      to the respective ROIs
    - ``classifier_loss``, ``regression_loss`` and ``loss`` (1-dim tensors N)

    RPN is tightly coupled with the dataset used for training. It needs to know the input
    image shape, the number of anchors per feature map position and the dimension of the
    ``diffs`` input. The dataset, on the other hand, has to be configured with the feature
    map shape and the amount of pooling applied to the images.

    **Dataset requirements**

    - ``img_shape()`` function returning a 3-tuple or list with the image shape
    - ``diffs_dim()`` function returning the last dimension of the ``diffs`` input
    - ``n_anchors_per_position`` property
    - ``configure_shape(features_shape, pool_amount)`` function which will be called after
      creating the feature map

    (a minimal sketch of such a dataset follows the class below)

    .. code-block:: yaml
        :caption: example usage in config

        model:
          name: RegionProposal
          class: blocks.models.RegionProposalNetwork

          architecture:
            encoder_config: [14c3, 14c3, 14c3, 14c3, mp2,
                             32c3, 32c3, 32c3, 32c3, mp2,
                             64c3, 64c3, 64c3, 64c3, mp2,
                             128c3, 128c3, 128c3]
            use_ln: true

          optimizer:
            class: AdamOptimizer
            learning_rate: 0.0001

          inputs: [images, anchors_mask, anchors_label, diffs]
          outputs: [loss, regression_loss, classifier_loss, classifier_accuracy]

    **Reference:** `Faster R-CNN <https://arxiv.org/pdf/1506.01497.pdf>`_
    """
    def _create_model(self, architecture: Mapping, shared_dim: int = 512,
                      window_size: int = 5, loss_ratio: float = 0.5) -> None:
        """
        Create a new RPN instance.

        :param architecture: CNN encoder architecture
        :param shared_dim: the dimension of the feature vector shared between the
                           classifier and the regression net
        :param window_size: sliding window size after the CNN encoder
        :param loss_ratio: ratio from the 0-1 interval between the classifier and
                           regression losses; a value of 0.1 means the classifier will be
                           trained 9 times less than the regression net
        """
        diffs_dim = self._dataset.diffs_dim()
        images = tf.placeholder(tf.float32, [None]+list(self._dataset.img_shape()), name='images')/255

        # 1. CNN encoder
        logging.info('Input shape: %s', images.shape)
        net = eltf.models.cnn_encoder(x=images, is_training=self.is_training, **architecture)
        logging.info('Encoded shape: %s', net.shape)
        net = slim.conv2d(net, shared_dim, window_size)
        logging.info('Shared vector shape: %s', net.shape)

        encoded_shape = net.get_shape().as_list()[1:3]
        pool_amount = eltf.models.conv.compute_pool_amount(architecture['encoder_config'])
        logging.info('Pool amount: %s', pool_amount)
        self._dataset.configure_shape(tuple(encoded_shape), pool_amount)
        n_anchors_per_position = self._dataset.n_anchors_per_position

        anchors_label = tf.placeholder(tf.int32, [None]+encoded_shape+[n_anchors_per_position],
                                       name='anchors_label')
        anchors_mask = tf.placeholder(tf.float32, [None]+encoded_shape+[n_anchors_per_position],
                                      name='anchors_mask')
        diffs = tf.placeholder(tf.float32, [None]+encoded_shape+[n_anchors_per_position, diffs_dim],
                               name='diffs')

        # 2. Classifier
        classifier_net = slim.conv2d(net, n_anchors_per_position*2, 1, activation_fn=tf.identity)
        classifier_net = tf.reshape(classifier_net, [-1]+encoded_shape+[n_anchors_per_position, 2])
        logging.info('Classifier shape: %s', classifier_net.shape)
        classifier_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=anchors_label,
                                                                         logits=classifier_net)
        classifier_loss *= anchors_mask  # train only the masked anchors
        classifier_loss = tf.reduce_sum(classifier_loss, axis=[1, 2, 3], name='classifier_loss')

        classifier_predictions = tf.argmax(classifier_net, output_type=tf.int32, axis=-1,
                                           name='classifier_predictions')
        tf.identity(tf.nn.softmax(classifier_net)[:, :, :, :, 1], name='classifier_probabilities')
        correctly_classified = tf.cast(tf.equal(classifier_predictions, anchors_label), tf.float32)
        acc = (tf.reduce_sum(anchors_mask*correctly_classified, axis=[1, 2, 3])) \
            / tf.reduce_sum(anchors_mask, axis=[1, 2, 3])
        tf.identity(acc, 'classifier_accuracy')

        # 3. Regression net
        regression_net = slim.conv2d(net, n_anchors_per_position*diffs_dim, 1, activation_fn=tf.identity)
        regression_net = tf.reshape(regression_net, [-1]+encoded_shape+[n_anchors_per_position, diffs_dim])
        logging.info('Regression shape: %s', regression_net.shape)
        tf.identity(regression_net, 'regression_predictions')
        regression_anchor_loss = tf.reduce_sum(eltf.ops.smooth_l1_loss(regression_net, diffs), axis=4)
        # train only the anchors with both expected and predicted ROIs
        regression_loss = tf.cast(classifier_predictions, tf.float32)*anchors_mask*regression_anchor_loss
        regression_loss = tf.reduce_sum(regression_loss, axis=[1, 2, 3], name='regression_loss')

        # 4. Overall loss
        tf.identity(loss_ratio*classifier_loss+(1-loss_ratio)*regression_loss, name='loss')
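
# ---------------------------------------------------------------------------
# Example (not part of the original module): a minimal sketch of a dataset
# satisfying the interface the RPN requires, as referenced from the class
# docstring above. All names and values below are illustrative assumptions.

class ToyRPNDataset:
    """Hypothetical dataset stub exposing the interface required by the RPN."""

    def img_shape(self):
        # 3-tuple with the input image shape (HWC); images are scaled to 0-255
        return (128, 128, 3)

    def diffs_dim(self):
        # the last dimension D of the ``diffs`` input, e.g. 4 for (dx, dy, dw, dh)
        return 4

    @property
    def n_anchors_per_position(self):
        # fixed number of anchors considered at each feature map position
        return 9

    def configure_shape(self, features_shape, pool_amount):
        # called by the model once the encoder output shape is known, so that the
        # dataset can generate anchor labels, masks and diffs on the feature map grid
        self.features_shape = features_shape  # e.g. (16, 16) for 128x128 inputs
        self.pool_amount = pool_amount        # e.g. 8 after three mp2 layers

# With this stub and the example encoder config above (three mp2 layers, i.e.
# pool amount 2*2*2 = 8), the model's tensors would take the following shapes:
#   images              [N, 128, 128, 3]
#   anchors_label/mask  [N, 16, 16, 9]
#   diffs               [N, 16, 16, 9, 4]
#   classifier_net      [N, 16, 16, 9, 2]
#   regression_net      [N, 16, 16, 9, 4]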
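
# ---------------------------------------------------------------------------
# For reference (not part of the original module): the regression branch above
# delegates to ``eltf.ops.smooth_l1_loss``. Below is a plain numpy sketch of
# the standard smooth L1 loss from the Fast/Faster R-CNN papers; that
# emloop_tensorflow implements exactly this definition is an assumption here.

import numpy as np

def smooth_l1(x):
    """Elementwise smooth L1: 0.5*x**2 where |x| < 1, |x| - 0.5 elsewhere."""
    abs_x = np.abs(x)
    return np.where(abs_x < 1.0, 0.5 * x ** 2, abs_x - 0.5)

# e.g. smooth_l1(predictions - targets) summed over the diff dimension D gives
# a per-anchor regression loss; step 4 then mixes the two branch losses as
#   loss = loss_ratio * classifier_loss + (1 - loss_ratio) * regression_loss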