import logging
from typing import Mapping

import emloop_tensorflow as eltf
import tensorflow as tf
import tensorflow.contrib.slim as slim


class RegionProposalNetwork(eltf.BaseModel):
"""
Configurable region proposal network (RPN) inspired by the Faster R-CNN architecture.
RPN predicts regions of interest (ROIs) from an input image.
RPN starts with encoding the input images into feature maps.
For each position of the feature maps, a fixed number of anchors
corresponding to fixed regions in the original image is considered.
For each anchor, RPN predicts:
- if the anchor matches to a ROI in the original image
- anchor diff (correction) to the respective ROI
**Inputs**
- ``images`` (4-dim tensor NHWC) scaled to 0-255
- ``anchors_label`` (4-dim tensor NHWA) anchors label 0/1 determining if anchors match certain regions
- ``anchors_mask`` (4-dim tensor NHWA) anchors mask 0/1 determining valid anchors to be trained
- ``diffs`` (5-dim tensor NHWAD) anchor differences to the respective ROIs, the diff dimension D is configurable
**Outputs**
- ``classifier_probabilities`` and ``classifier_predictions`` (4-dim tensors NHWA) scaled to 0-1 and 0/1
respectively
- ``regression_predictions`` (5-dim tensor NHWAD) of anchor differences (corrections) to the respective ROIs
- ``classifier_loss``, ``regression_loss`` and ``loss`` (1-dim tensors N)
RPN is tightly connected with datasets used for the training. It needs to learn the input image shape,
number of anchors per feature map position and the dimension of the ``diff`` input. On the other hand, the dataset
has to be configured with the feature map shape and amount of pooling applied to the images.
**Dataset requirements**
- ``img_shape()`` function returning a 3-tuple or list with the image shape
- ``diffs_dim()`` function returning the last dimension of the ``diffs`` input
- ``n_anchors_per_position`` property
- ``configure_shape(features_shape, pool_amount)`` function which will be called after creating the feature map
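
    For illustration, a minimal dataset stub satisfying the interface above might look as follows
    (the class name and the concrete return values are placeholder assumptions, not values required
    by the network):

    .. code-block:: python
        :caption: hypothetical dataset stub

        class ToyRPNDataset:
            # sketch of the dataset interface expected by RegionProposalNetwork

            def img_shape(self):
                return [224, 224, 3]  # assumed input image shape (HWC)

            def diffs_dim(self):
                return 4  # e.g. (dx, dy, dw, dh) box corrections

            @property
            def n_anchors_per_position(self):
                return 9  # e.g. 3 scales x 3 aspect ratios as in Faster R-CNN

            def configure_shape(self, features_shape, pool_amount):
                # called back by the model once the feature map shape is known
                self.features_shape, self.pool_amount = features_shape, pool_amount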

    .. code-block:: yaml
        :caption: example usage in config

        model:
          name: RegionProposal
          class: blocks.models.RegionProposalNetwork

          architecture:
            encoder_config: [14c3, 14c3, 14c3, 14c3, mp2,
                             32c3, 32c3, 32c3, 32c3, mp2,
                             64c3, 64c3, 64c3, 64c3, mp2,
                             128c3, 128c3, 128c3]
            use_ln: true

          optimizer:
            class: AdamOptimizer
            learning_rate: 0.0001

          inputs: [images, anchors_mask, anchors_label, diffs]
          outputs: [loss, regression_loss, classifier_loss, classifier_accuracy]
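
    Assuming the standard emloop workflow, a configuration such as this would typically be
    launched with emloop's CLI (e.g. ``emloop train``) pointed at the config file.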

    **Reference:** `Faster R-CNN <https://arxiv.org/pdf/1506.01497.pdf>`_
    """

    def _create_model(self,
                      architecture: Mapping,
                      shared_dim: int = 512,
                      window_size: int = 5,
                      loss_ratio: float = 0.5) -> None:
        """
        Create a new RPN instance.

        :param architecture: CNN encoder architecture
        :param shared_dim: dimension of the feature vector shared between the classifier and the regression net
        :param window_size: sliding window size applied after the CNN encoder
        :param loss_ratio: ratio from the 0-1 interval weighting the classifier loss against the regression loss;
                           a value of 0.1 means the classifier is trained nine times less than the regression net
        """
        diffs_dim = self._dataset.diffs_dim()
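        # scale the 0-255 input images to the 0-1 range before encoding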
        images = tf.placeholder(tf.float32, [None]+list(self._dataset.img_shape()), name='images')/255

        # 1. CNN encoder
        logging.info('Input shape: %s', images.shape)
        net = eltf.models.cnn_encoder(x=images, is_training=self.is_training, **architecture)
        logging.info('Encoded shape: %s', net.shape)
        net = slim.conv2d(net, shared_dim, window_size)
        logging.info('Shared vector shape: %s', net.shape)
        encoded_shape = net.get_shape().as_list()[1:3]
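        # overall pooling (downscaling) factor of the encoder (e.g. 8 for a config with three mp2 layers)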
        pool_amount = eltf.models.conv.compute_pool_amount(architecture['encoder_config'])
        logging.info('Pool amount: %s', pool_amount)
        self._dataset.configure_shape(tuple(encoded_shape), pool_amount)
        n_anchors_per_position = self._dataset.n_anchors_per_position
        anchors_label = tf.placeholder(tf.int32, [None]+encoded_shape+[n_anchors_per_position], name='anchors_label')
        anchors_mask = tf.placeholder(tf.float32, [None]+encoded_shape+[n_anchors_per_position], name='anchors_mask')
        diffs = tf.placeholder(tf.float32, [None]+encoded_shape+[n_anchors_per_position, diffs_dim], name='diffs')

        # 2. Classifier
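        # 1x1 convolution producing two logits (no match / match) per anchor at each position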
        classifier_net = slim.conv2d(net, n_anchors_per_position*2, 1, activation_fn=tf.identity)
        classifier_net = tf.reshape(classifier_net, [-1]+encoded_shape+[n_anchors_per_position, 2])
        logging.info('Classifier shape: %s', classifier_net.shape)
        classifier_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=anchors_label, logits=classifier_net)
        classifier_loss *= anchors_mask  # train only the masked anchors
        classifier_loss = tf.reduce_sum(classifier_loss, axis=[1, 2, 3], name='classifier_loss')
        classifier_predictions = tf.argmax(classifier_net, output_type=tf.int32, axis=-1, name='classifier_predictions')
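        # channel 1 of the softmax holds the probability that an anchor matches a ROI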
        tf.identity(tf.nn.softmax(classifier_net)[:, :, :, :, 1], name='classifier_probabilities')
        correctly_classified = tf.cast(tf.equal(classifier_predictions, anchors_label), tf.float32)
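        # accuracy is averaged over the valid (masked) anchors only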
        acc = (tf.reduce_sum(anchors_mask*correctly_classified, axis=[1, 2, 3]))\
              / tf.reduce_sum(anchors_mask, axis=[1, 2, 3])
        tf.identity(acc, 'classifier_accuracy')

        # 3. Regression net
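        # 1x1 convolution producing diffs_dim regression outputs per anchor at each position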
        regression_net = slim.conv2d(net, n_anchors_per_position*diffs_dim, 1, activation_fn=tf.identity)
        regression_net = tf.reshape(regression_net, [-1]+encoded_shape+[n_anchors_per_position, diffs_dim])
        logging.info('Regression shape: %s', regression_net.shape)
        tf.identity(regression_net, 'regression_predictions')
        regression_anchor_loss = tf.reduce_sum(eltf.ops.smooth_l1_loss(regression_net, diffs), axis=4)
        # train the regression only on valid (masked) anchors that are predicted to match a ROI
        regression_loss = tf.cast(classifier_predictions, tf.float32)*anchors_mask*regression_anchor_loss
        regression_loss = tf.reduce_sum(regression_loss, axis=[1, 2, 3], name='regression_loss')

        # 4. Overall loss
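        # convex combination: loss_ratio weights the classifier term, (1-loss_ratio) the regression term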
        tf.identity(loss_ratio*classifier_loss+(1-loss_ratio)*regression_loss, name='loss')