import logging
from typing import Mapping

import emloop_tensorflow as eltf
import tensorflow as tf
import tensorflow.contrib.slim as slim


class RegionProposalNetwork(eltf.BaseModel):
"""
Configurable region proposal network (RPN) inspired by the Faster R-CNN architecture.
RPN predicts regions of interest (ROIs) from an input image.
RPN starts with encoding the input images into feature maps.
For each position of the feature maps, a fixed number of anchors
corresponding to fixed regions in the original image is considered.
For each anchor, RPN predicts:
- if the anchor matches to a ROI in the original image
- anchor diff (correction) to the respective ROI
**Inputs**
- ``images`` (4-dim tensor NHWC) scaled to 0-255
- ``anchors_label`` (4-dim tensor NHWA) anchors label 0/1 determining if anchors match certain regions
- ``anchors_mask`` (4-dim tensor NHWA) anchors mask 0/1 determining valid anchors to be trained
- ``diffs`` (5-dim tensor NHWAD) anchor differences to the respective ROIs, the diff dimension D is configurable
**Outputs**
- ``classifier_probabilities`` and ``classifier_predictions`` (4-dim tensors NHWA) scaled to 0-1 and 0/1
respectively
- ``regression_predictions`` (5-dim tensor NHWAD) of anchor differences (corrections) to the respective ROIs
- ``classifier_loss``, ``regression_loss`` and ``loss`` (1-dim tensors N)
RPN is tightly connected with datasets used for the training. It needs to learn the input image shape,
number of anchors per feature map position and the dimension of the ``diff`` input. On the other hand, the dataset
has to be configured with the feature map shape and amount of pooling applied to the images.
**Dataset requirements**
- ``img_shape()`` function returning a 3-tuple or list with the image shape
- ``diffs_dim()`` function returning the last dimension of the ``diffs`` input
- ``n_anchors_per_position`` property
- ``configure_shape(features_shape, pool_amount)`` function which will be called after creating the feature map
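
    For illustration, a minimal dataset stub satisfying the interface above might look as follows
    (the class name and the concrete return values are placeholder assumptions, not values required
    by the network):

    .. code-block:: python
        :caption: hypothetical dataset stub

        class ToyRPNDataset:
            # sketch of the dataset interface expected by RegionProposalNetwork

            def img_shape(self):
                return [224, 224, 3]  # assumed input image shape (HWC)

            def diffs_dim(self):
                return 4  # e.g. (dx, dy, dw, dh) box corrections

            @property
            def n_anchors_per_position(self):
                return 9  # e.g. 3 scales x 3 aspect ratios as in Faster R-CNN

            def configure_shape(self, features_shape, pool_amount):
                # called back by the model once the feature map shape is known
                self.features_shape, self.pool_amount = features_shape, pool_amount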

    .. code-block:: yaml
        :caption: example usage in config

        model:
          name: RegionProposal
          class: blocks.models.RegionProposalNetwork

          architecture:
            encoder_config: [14c3, 14c3, 14c3, 14c3, mp2,
                             32c3, 32c3, 32c3, 32c3, mp2,
                             64c3, 64c3, 64c3, 64c3, mp2,
                             128c3, 128c3, 128c3]
            use_ln: true

          optimizer:
            class: AdamOptimizer
            learning_rate: 0.0001

          inputs: [images, anchors_mask, anchors_label, diffs]
          outputs: [loss, regression_loss, classifier_loss, classifier_accuracy]
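
    Assuming the standard emloop workflow, a configuration such as this would typically be
    launched with emloop's CLI (e.g. ``emloop train``) pointed at the config file.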

    **Reference:** `Faster R-CNN <https://arxiv.org/pdf/1506.01497.pdf>`_
    """

    def _create_model(self,
                      architecture: Mapping,
                      shared_dim: int = 512,
                      window_size: int = 5,
                      loss_ratio: float = 0.5) -> None:
        """
        Create a new RPN instance.

        :param architecture: CNN encoder architecture
        :param shared_dim: dimension of the feature vector shared between the classifier and the regression net
        :param window_size: sliding window size applied after the CNN encoder
        :param loss_ratio: ratio from the 0-1 interval weighting the classifier loss against the regression loss;
                           a value of 0.1 means the classifier is trained nine times less than the regression net
        """
        diffs_dim = self._dataset.diffs_dim()
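        # scale the 0-255 input images to the 0-1 range before encoding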
        images = tf.placeholder(tf.float32, [None]+list(self._dataset.img_shape()), name='images')/255

        # 1. CNN encoder
        logging.info('Input shape: %s', images.shape)
        net = eltf.models.cnn_encoder(x=images, is_training=self.is_training, **architecture)
        logging.info('Encoded shape: %s', net.shape)
        net = slim.conv2d(net, shared_dim, window_size)
        logging.info('Shared vector shape: %s', net.shape)
        encoded_shape = net.get_shape().as_list()[1:3]
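        # overall pooling (downscaling) factor of the encoder (e.g. 8 for a config with three mp2 layers)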
        pool_amount = eltf.models.conv.compute_pool_amount(architecture['encoder_config'])
        logging.info('Pool amount: %s', pool_amount)
        self._dataset.configure_shape(tuple(encoded_shape), pool_amount)
        n_anchors_per_position = self._dataset.n_anchors_per_position
        anchors_label = tf.placeholder(tf.int32, [None]+encoded_shape+[n_anchors_per_position], name='anchors_label')
        anchors_mask = tf.placeholder(tf.float32, [None]+encoded_shape+[n_anchors_per_position], name='anchors_mask')
        diffs = tf.placeholder(tf.float32, [None]+encoded_shape+[n_anchors_per_position, diffs_dim], name='diffs')

        # 2. Classifier
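        # 1x1 convolution producing two logits (no match / match) per anchor at each position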
        classifier_net = slim.conv2d(net, n_anchors_per_position*2, 1, activation_fn=tf.identity)
        classifier_net = tf.reshape(classifier_net, [-1]+encoded_shape+[n_anchors_per_position, 2])
        logging.info('Classifier shape: %s', classifier_net.shape)
        classifier_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=anchors_label, logits=classifier_net)
        classifier_loss *= anchors_mask  # train only the masked anchors
        classifier_loss = tf.reduce_sum(classifier_loss, axis=[1, 2, 3], name='classifier_loss')
        classifier_predictions = tf.argmax(classifier_net, output_type=tf.int32, axis=-1, name='classifier_predictions')
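        # channel 1 of the softmax holds the probability that an anchor matches a ROI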
        tf.identity(tf.nn.softmax(classifier_net)[:, :, :, :, 1], name='classifier_probabilities')
        correctly_classified = tf.cast(tf.equal(classifier_predictions, anchors_label), tf.float32)
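        # accuracy is averaged over the valid (masked) anchors only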
        acc = (tf.reduce_sum(anchors_mask*correctly_classified, axis=[1, 2, 3]))\
              / tf.reduce_sum(anchors_mask, axis=[1, 2, 3])
        tf.identity(acc, 'classifier_accuracy')

        # 3. Regression net
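        # 1x1 convolution producing diffs_dim regression outputs per anchor at each position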
        regression_net = slim.conv2d(net, n_anchors_per_position*diffs_dim, 1, activation_fn=tf.identity)
        regression_net = tf.reshape(regression_net, [-1]+encoded_shape+[n_anchors_per_position, diffs_dim])
        logging.info('Regression shape: %s', regression_net.shape)
        tf.identity(regression_net, 'regression_predictions')
        regression_anchor_loss = tf.reduce_sum(eltf.ops.smooth_l1_loss(regression_net, diffs), axis=4)
        # train the regression only on valid (masked) anchors that are predicted to match a ROI
        regression_loss = tf.cast(classifier_predictions, tf.float32)*anchors_mask*regression_anchor_loss
        regression_loss = tf.reduce_sum(regression_loss, axis=[1, 2, 3], name='regression_loss')

        # 4. Overall loss
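        # convex combination: loss_ratio weights the classifier term, (1-loss_ratio) the regression term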
        tf.identity(loss_ratio*classifier_loss+(1-loss_ratio)*regression_loss, name='loss')