From 956e3438b66fc1fbda7098ec7bf0f84d647ee1c9 Mon Sep 17 00:00:00 2001 From: husseina Date: Tue, 19 Nov 2019 16:54:21 -0800 Subject: [PATCH 1/3] Backbone network --- .../nemo_cv/nemo_cv/modules/retinanet.py | 203 ++++++++++++++++++ 1 file changed, 203 insertions(+) create mode 100644 collections/nemo_cv/nemo_cv/modules/retinanet.py diff --git a/collections/nemo_cv/nemo_cv/modules/retinanet.py b/collections/nemo_cv/nemo_cv/modules/retinanet.py new file mode 100644 index 000000000000..ef3e92a87dd6 --- /dev/null +++ b/collections/nemo_cv/nemo_cv/modules/retinanet.py @@ -0,0 +1,203 @@ +# Copyright (C) husseina, NVIDIA Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__author__ = "Hussein Al-barazanchi" + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.model_zoo as model_zoo + +from torchvision.models import resnet as vrn + +from nemo.backends.pytorch.nm import TrainableNM + +from nemo.core import NeuralType, AxisType, DeviceType,\ + BatchTag, ChannelTag, HeightTag, WidthTag, ListTag, BoundingBoxTag, \ + LogProbabilityTag + + +class FocalLoss(nn.Module): + 'Focal Loss - https://arxiv.org/abs/1708.02002' + + def __init__(self, alpha=0.25, gamma=2): + super().__init__() + self.alpha = alpha + self.gamma = gamma + + def forward(self, pred_logits, target): + pred = pred_logits.sigmoid() + ce = F.binary_cross_entropy_with_logits(pred_logits, target, reduction='none') + alpha = target * self.alpha + (1. - target) * (1. - self.alpha) + pt = torch.where(target == 1, pred, 1 - pred) + return alpha * (1. 
- pt) ** self.gamma * ce + + +class SmoothL1Loss(nn.Module): + 'Smooth L1 Loss' + + def __init__(self, beta=0.11): + super().__init__() + self.beta = beta + + def forward(self, pred, target): + x = (pred - target).abs() + l1 = x - 0.5 * self.beta + l2 = 0.5 * x ** 2 / self.beta + return torch.where(x >= self.beta, l1, l2) + + +class ResNet(vrn.ResNet): + 'Deep Residual Network - https://arxiv.org/abs/1512.03385' + + def __init__(self, layers=[3, 4, 6, 3], bottleneck=vrn.Bottleneck, outputs=[5], url=None): + self.stride = 128 + self.bottleneck = bottleneck + self.outputs = outputs + self.url = url + super().__init__(bottleneck, layers) + + def initialize(self): + if self.url: + self.load_state_dict(model_zoo.load_url(self.url)) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + outputs = [] + for i, layer in enumerate([self.layer1, self.layer2, self.layer3, self.layer4]): + level = i + 2 + if level > max(self.outputs): + break + x = layer(x) + if level in self.outputs: + outputs.append(x) + + return outputs + + +class FPN(nn.Module): + 'Feature Pyramid Network - https://arxiv.org/abs/1612.03144' + + def __init__(self, features): + super().__init__() + + self.stride = 128 + self.features = features + + is_light = features.bottleneck == vrn.BasicBlock + channels = [128, 256, 512] if is_light else [512, 1024, 2048] + + self.lateral3 = nn.Conv2d(channels[0], 256, 1) + self.lateral4 = nn.Conv2d(channels[1], 256, 1) + self.lateral5 = nn.Conv2d(channels[2], 256, 1) + self.pyramid6 = nn.Conv2d(channels[2], 256, 3, stride=2, padding=1) + self.pyramid7 = nn.Conv2d(256, 256, 3, stride=2, padding=1) + self.smooth3 = nn.Conv2d(256, 256, 3, padding=1) + self.smooth4 = nn.Conv2d(256, 256, 3, padding=1) + self.smooth5 = nn.Conv2d(256, 256, 3, padding=1) + + def initialize(self): + def init_layer(layer): + if isinstance(layer, nn.Conv2d): + nn.init.xavier_uniform_(layer.weight) + if layer.bias is not None: + nn.init.constant_(layer.bias, val=0) + self.apply(init_layer) + + self.features.initialize() + + def forward(self, x): + c3, c4, c5 = self.features(x) + + p5 = self.lateral5(c5) + p4 = self.lateral4(c4) + p4 = F.interpolate(p5, scale_factor=2) + p4 + p3 = self.lateral3(c3) + p3 = F.interpolate(p4, scale_factor=2) + p3 + + p6 = self.pyramid6(c5) + p7 = self.pyramid7(F.relu(p6)) + + p3 = self.smooth3(p3) + p4 = self.smooth4(p4) + p5 = self.smooth5(p5) + + return [p3, p4, p5, p6, p7] + + +class RetinaNet(TrainableNM): + """ + Wrapper class around the RetinaNet model. + """ + + @staticmethod + def create_ports(): + input_ports = { + # Batch of images. + "images": NeuralType({0: AxisType(BatchTag), + 1: AxisType(ChannelTag, 3), + 2: AxisType(HeightTag), + 3: AxisType(WidthTag)}), + # Batch of bounding boxes. + "bounding_boxes": NeuralType({0: AxisType(BatchTag), + 1: AxisType(ListTag), + 2: AxisType(BoundingBoxTag)}), + # Batch of targets. + "targets": NeuralType({0: AxisType(BatchTag)}) + } + output_ports = { + "predictions": NeuralType({0: AxisType(BatchTag), + 1: AxisType(LogProbabilityTag) + }) + + } + return input_ports, output_ports + + def __init__(self, num_classes, pretrained=False): + """ + Creates the Faster R-CNN model. + + Args: + num_classes: Number of output classes of the model. + pretrained: use weights of model pretrained on COCO train2017. 
+ """ + + super().__init__() + + # Create + self.model = FPN(ResNet(layers=[2, 2, 2, 2], bottleneck=vrn.BasicBlock, outputs=[3, 4, 5], url=vrn.model_urls['resnet18'])) + + # Get number of input features for the classifier. + in_features = self.model.roi_heads.box_predictor.cls_score.in_features + + self.to(self._device) + + def forward(self, images, bounding_boxes, targets): + """ + Performs the forward step of the model. + + Args: + images: Batch of images to be classified. + """ + + # We need to put this in a tuple again, as OD "framework" assumes it :] + targets_tuple = [{"boxes": b, "labels": t} for b, t + in zip(bounding_boxes, targets)] + + predictions = self.model(images, targets_tuple) + return predictions From 0a6ec6a08ab6bc2656fb16f464dc3615146e65f3 Mon Sep 17 00:00:00 2001 From: husseina Date: Wed, 20 Nov 2019 11:01:32 -0800 Subject: [PATCH 2/3] RetinaNet Initial Implementation --- .../nemo_cv/nemo_cv/modules/retinanet.py | 241 ++++++++++++++++++ 1 file changed, 241 insertions(+) diff --git a/collections/nemo_cv/nemo_cv/modules/retinanet.py b/collections/nemo_cv/nemo_cv/modules/retinanet.py index ef3e92a87dd6..44679c0dc4a3 100644 --- a/collections/nemo_cv/nemo_cv/modules/retinanet.py +++ b/collections/nemo_cv/nemo_cv/modules/retinanet.py @@ -21,12 +21,24 @@ from torchvision.models import resnet as vrn +import os.path +import io +import math + +from . import backbones as backbones_mod +from ._C import Engine +from .box import generate_anchors, snap_to_anchors, decode, nms + from nemo.backends.pytorch.nm import TrainableNM from nemo.core import NeuralType, AxisType, DeviceType,\ BatchTag, ChannelTag, HeightTag, WidthTag, ListTag, BoundingBoxTag, \ LogProbabilityTag +""" +Alot of the code below is heavily borrowed from +https://github.com/NVIDIA/retinanet-examples +""" class FocalLoss(nn.Module): 'Focal Loss - https://arxiv.org/abs/1708.02002' @@ -140,6 +152,235 @@ def forward(self, x): return [p3, p4, p5, p6, p7] +class Model(nn.Module): + 'RetinaNet - https://arxiv.org/abs/1708.02002' + + def __init__(self, backbones='ResNet18FPN', classes=80, config={}): + super().__init__() + + if not isinstance(backbones, list): + backbones = [backbones] + + #self.backbones = nn.ModuleDict({b: getattr(backbones_mod, b)() for b in backbones}) + self.backbones = FPN(ResNet(layers=[2, 2, 2, 2], bottleneck=vrn.BasicBlock, outputs=[3, 4, 5], url=vrn.model_urls['resnet18'])) + self.name = 'RetinaNet' + self.exporting = False + + self.ratios = [1.0, 2.0, 0.5] + self.scales = [4 * 2**(i/3) for i in range(3)] + self.anchors = {} + self.classes = classes + + self.threshold = config.get('threshold', 0.05) + self.top_n = config.get('top_n', 1000) + self.nms = config.get('nms', 0.5) + self.detections = config.get('detections', 100) + + self.stride = max([b.stride for _, b in self.backbones.items()]) + + # classification and box regression heads + def make_head(out_size): + layers = [] + for _ in range(4): + layers += [nn.Conv2d(256, 256, 3, padding=1), nn.ReLU()] + layers += [nn.Conv2d(256, out_size, 3, padding=1)] + return nn.Sequential(*layers) + + anchors = len(self.ratios) * len(self.scales) + self.cls_head = make_head(classes * anchors) + self.box_head = make_head(4 * anchors) + + self.cls_criterion = FocalLoss() + self.box_criterion = SmoothL1Loss(beta=0.11) + + def __repr__(self): + return '\n'.join([ + ' model: {}'.format(self.name), + ' backbone: {}'.format(', '.join([k for k, _ in self.backbones.items()])), + ' classes: {}, anchors: {}'.format(self.classes, len(self.ratios) * len(self.scales)), + 
]) + + def initialize(self, pre_trained): + if pre_trained: + # Initialize using weights from pre-trained model + if not os.path.isfile(pre_trained): + raise ValueError('No checkpoint {}'.format(pre_trained)) + + print('Fine-tuning weights from {}...'.format(os.path.basename(pre_trained))) + state_dict = self.state_dict() + chk = torch.load(pre_trained, map_location=lambda storage, loc: storage) + ignored = ['cls_head.8.bias', 'cls_head.8.weight'] + weights = { k: v for k, v in chk['state_dict'].items() if k not in ignored } + state_dict.update(weights) + self.load_state_dict(state_dict) + + del chk, weights + torch.cuda.empty_cache() + + else: + # Initialize backbone(s) + for _, backbone in self.backbones.items(): + backbone.initialize() + + # Initialize heads + def initialize_layer(layer): + if isinstance(layer, nn.Conv2d): + nn.init.normal_(layer.weight, std=0.01) + if layer.bias is not None: + nn.init.constant_(layer.bias, val=0) + self.cls_head.apply(initialize_layer) + self.box_head.apply(initialize_layer) + + # Initialize class head prior + def initialize_prior(layer): + pi = 0.01 + b = - math.log((1 - pi) / pi) + nn.init.constant_(layer.bias, b) + nn.init.normal_(layer.weight, std=0.01) + self.cls_head[-1].apply(initialize_prior) + + def forward(self, x): + if self.training: x, targets = x + + # Backbones forward pass + features = [] + for _, backbone in self.backbones.items(): + features.extend(backbone(x)) + + # Heads forward pass + cls_heads = [self.cls_head(t) for t in features] + box_heads = [self.box_head(t) for t in features] + + if self.training: + return self._compute_loss(x, cls_heads, box_heads, targets.float()) + + cls_heads = [cls_head.sigmoid() for cls_head in cls_heads] + + if self.exporting: + self.strides = [x.shape[-1] // cls_head.shape[-1] for cls_head in cls_heads] + return cls_heads, box_heads + + # Inference post-processing + decoded = [] + for cls_head, box_head in zip(cls_heads, box_heads): + # Generate level's anchors + stride = x.shape[-1] // cls_head.shape[-1] + if stride not in self.anchors: + self.anchors[stride] = generate_anchors(stride, self.ratios, self.scales) + + # Decode and filter boxes + decoded.append(decode(cls_head, box_head, stride, + self.threshold, self.top_n, self.anchors[stride])) + + # Perform non-maximum suppression + decoded = [torch.cat(tensors, 1) for tensors in zip(*decoded)] + return nms(*decoded, self.nms, self.detections) + + def _extract_targets(self, targets, stride, size): + cls_target, box_target, depth = [], [], [] + for target in targets: + target = target[target[:, -1] > -1] + if stride not in self.anchors: + self.anchors[stride] = generate_anchors(stride, self.ratios, self.scales) + snapped = snap_to_anchors( + target, [s * stride for s in size[::-1]], stride, + self.anchors[stride].to(targets.device), self.classes, targets.device) + for l, s in zip((cls_target, box_target, depth), snapped): l.append(s) + return torch.stack(cls_target), torch.stack(box_target), torch.stack(depth) + + def _compute_loss(self, x, cls_heads, box_heads, targets): + cls_losses, box_losses, fg_targets = [], [], [] + for cls_head, box_head in zip(cls_heads, box_heads): + size = cls_head.shape[-2:] + stride = x.shape[-1] / cls_head.shape[-1] + + cls_target, box_target, depth = self._extract_targets(targets, stride, size) + fg_targets.append((depth > 0).sum().float().clamp(min=1)) + + cls_head = cls_head.view_as(cls_target).float() + cls_mask = (depth >= 0).expand_as(cls_target).float() + cls_loss = self.cls_criterion(cls_head, cls_target) + 
cls_loss = cls_mask * cls_loss + cls_losses.append(cls_loss.sum()) + + box_head = box_head.view_as(box_target).float() + box_mask = (depth > 0).expand_as(box_target).float() + box_loss = self.box_criterion(box_head, box_target) + box_loss = box_mask * box_loss + box_losses.append(box_loss.sum()) + + fg_targets = torch.stack(fg_targets).sum() + cls_loss = torch.stack(cls_losses).sum() / fg_targets + box_loss = torch.stack(box_losses).sum() / fg_targets + return cls_loss, box_loss + + def save(self, state): + checkpoint = { + 'backbone': [k for k, _ in self.backbones.items()], + 'classes': self.classes, + 'state_dict': self.state_dict() + } + + for key in ('iteration', 'optimizer', 'scheduler'): + if key in state: + checkpoint[key] = state[key] + + torch.save(checkpoint, state['path']) + + @classmethod + def load(cls, filename): + if not os.path.isfile(filename): + raise ValueError('No checkpoint {}'.format(filename)) + + checkpoint = torch.load(filename, map_location=lambda storage, loc: storage) + # Recreate model from checkpoint instead of from individual backbones + model = cls(backbones=checkpoint['backbone'], classes=checkpoint['classes']) + model.load_state_dict(checkpoint['state_dict']) + + state = {} + for key in ('iteration', 'optimizer', 'scheduler'): + if key in checkpoint: + state[key] = checkpoint[key] + + del checkpoint + torch.cuda.empty_cache() + + return model, state + + def export(self, size, batch, precision, calibration_files, calibration_table, verbose, onnx_only=False, opset=None): + import torch.onnx.symbolic + + if opset is not None and opset < 9: + # Override Upsample's ONNX export from old opset if required (not needed for TRT 5.1+) + @torch.onnx.symbolic.parse_args('v', 'is') + def upsample_nearest2d(g, input, output_size): + height_scale = float(output_size[-2]) / input.type().sizes()[-2] + width_scale = float(output_size[-1]) / input.type().sizes()[-1] + return g.op("Upsample", input, + scales_f=(1, 1, height_scale, width_scale), + mode_s="nearest") + torch.onnx.symbolic.upsample_nearest2d = upsample_nearest2d + + # Export to ONNX + print('Exporting to ONNX...') + self.exporting = True + onnx_bytes = io.BytesIO() + zero_input = torch.zeros([1, 3, *size]).cuda() + extra_args = { 'opset_version': opset } if opset else {} + torch.onnx.export(self.cuda(), zero_input, onnx_bytes, *extra_args) + self.exporting = False + + if onnx_only: + return onnx_bytes.getvalue() + + # Build TensorRT engine + model_name = '_'.join([k for k, _ in self.backbones.items()]) + anchors = [generate_anchors(stride, self.ratios, self.scales).view(-1).tolist() + for stride in self.strides] + return Engine(onnx_bytes.getvalue(), len(onnx_bytes.getvalue()), batch, precision, + self.threshold, self.top_n, anchors, self.nms, self.detections, calibration_files, model_name, calibration_table, verbose) + + class RetinaNet(TrainableNM): """ Wrapper class around the RetinaNet model. 
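
The Model class added in this patch follows the retinanet-examples calling convention: in training mode forward() unpacks a single (images, targets) tuple and returns the focal classification loss and smooth-L1 box loss from _compute_loss, while in eval mode it takes images only and returns decoded, NMS-filtered detections. Below is a minimal usage sketch, not part of the patch. It assumes the upstream retinanet-examples package is installed (so `from retinanet.model import Model` resolves, as the wrapper in the next patch does) and that targets are padded per image with -1 in the last (class) column, which is what _extract_targets expects; the exact box-coordinate encoding of the first four columns comes from the retinanet-examples data pipeline and is not fixed by this patch.

```python
# Usage sketch only (not part of the patch). Assumes retinanet-examples is
# installed; box column encoding is an assumption about its data pipeline.
import torch
from retinanet.model import Model

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Model(backbones='ResNet18FPN', classes=80).to(device)

images = torch.randn(2, 3, 512, 512, device=device)
# Per-image padded targets: last column is the class label, rows with
# class == -1 are dropped by _extract_targets (`target[target[:, -1] > -1]`).
targets = -1 * torch.ones(2, 8, 5, device=device)
targets[0, 0] = torch.tensor([10., 20., 110., 220., 3.], device=device)
targets[1, 0] = torch.tensor([30., 40., 150., 100., 7.], device=device)

# Training mode: forward() takes a single (images, targets) argument and
# returns the focal (classification) and smooth-L1 (box) loss terms.
model.train()
cls_loss, box_loss = model((images, targets))

# Eval mode: forward() takes images only; per-level outputs are decoded
# against the generated anchors and filtered with non-maximum suppression.
model.eval()
with torch.no_grad():
    detections = model(images)
```

The NeMo wrapper introduced in the next patch drives this same model through the retinanet-examples parse/load_model/worker entry points rather than calling forward() directly.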
From be693ad26d5c836c2fb7ea579ed3905ba86b581d Mon Sep 17 00:00:00 2001 From: husseina Date: Fri, 22 Nov 2019 11:25:41 -0800 Subject: [PATCH 3/3] NeMo RetinaNet simple wrapper --- .../nemo_cv/modules/README_RetinaNet.md | 20 + .../nemo_cv/modules/pascal_retinanet.py | 31 ++ .../nemo_cv/nemo_cv/modules/retinanet.py | 444 ------------------ .../nemo_cv/modules/retinanet_module.py | 61 +++ 4 files changed, 112 insertions(+), 444 deletions(-) create mode 100644 collections/nemo_cv/nemo_cv/modules/README_RetinaNet.md create mode 100644 collections/nemo_cv/nemo_cv/modules/pascal_retinanet.py delete mode 100644 collections/nemo_cv/nemo_cv/modules/retinanet.py create mode 100644 collections/nemo_cv/nemo_cv/modules/retinanet_module.py diff --git a/collections/nemo_cv/nemo_cv/modules/README_RetinaNet.md b/collections/nemo_cv/nemo_cv/modules/README_RetinaNet.md new file mode 100644 index 000000000000..1bf3af553029 --- /dev/null +++ b/collections/nemo_cv/nemo_cv/modules/README_RetinaNet.md @@ -0,0 +1,20 @@ + +# The docker image that is tested working with RetinaNet is nvcr.io/nvidia/pytorch:19.09-py3 + +# To run RetinaNet Nemo Wrapper follow the below steps: + +docker run -it --rm --ipc=host --gpus all -v {path to the project}:/workspace nvcr.io/nvidia/pytorch:19.09-py3 + +pip install --no-cache-dir git+https://github.com/nvidia/retinanet-examples + +git clone https://github.com/NVIDIA/NeMo.git + +cd NeMo/nemo +# Change the version of Pytorch in Nemo/nemo/setup.py to 'torch==1.2.0' + +python setup install + +cd .. +cd collections/nemo_cv/nemo_cv/modules/ + +python pascal_retinanet.py train model_mydataset.pth --backbone ResNet18FPN --classes 20 --iters 10000 --val-iters 1000 --lr 0.0005 --resize 512 --jitter 480 640 --images /workspace/PASCAL_VOC/JPEGImages/ --annotations /workspace/PASCAL_VOC/pascal_train2012.json --val-annotations /workspace/PASCAL_VOC/pascal_val2012.json diff --git a/collections/nemo_cv/nemo_cv/modules/pascal_retinanet.py b/collections/nemo_cv/nemo_cv/modules/pascal_retinanet.py new file mode 100644 index 000000000000..0011bbe233d8 --- /dev/null +++ b/collections/nemo_cv/nemo_cv/modules/pascal_retinanet.py @@ -0,0 +1,31 @@ +# Copyright (C) , NVIDIA INC. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__author__ = "Hussein Al-barazanchi" + + +import sys +from retinanet_module import * + + +def main(args=None): + 'Entry point for the retinanet command' + + args = parse(args or sys.argv[1:]) + + detector = RetinaNet(args) + detector.execute(args) + +if __name__ == '__main__': + main() diff --git a/collections/nemo_cv/nemo_cv/modules/retinanet.py b/collections/nemo_cv/nemo_cv/modules/retinanet.py deleted file mode 100644 index 44679c0dc4a3..000000000000 --- a/collections/nemo_cv/nemo_cv/modules/retinanet.py +++ /dev/null @@ -1,444 +0,0 @@ -# Copyright (C) husseina, NVIDIA Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -__author__ = "Hussein Al-barazanchi" - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.model_zoo as model_zoo - -from torchvision.models import resnet as vrn - -import os.path -import io -import math - -from . import backbones as backbones_mod -from ._C import Engine -from .box import generate_anchors, snap_to_anchors, decode, nms - -from nemo.backends.pytorch.nm import TrainableNM - -from nemo.core import NeuralType, AxisType, DeviceType,\ - BatchTag, ChannelTag, HeightTag, WidthTag, ListTag, BoundingBoxTag, \ - LogProbabilityTag - -""" -Alot of the code below is heavily borrowed from -https://github.com/NVIDIA/retinanet-examples -""" - -class FocalLoss(nn.Module): - 'Focal Loss - https://arxiv.org/abs/1708.02002' - - def __init__(self, alpha=0.25, gamma=2): - super().__init__() - self.alpha = alpha - self.gamma = gamma - - def forward(self, pred_logits, target): - pred = pred_logits.sigmoid() - ce = F.binary_cross_entropy_with_logits(pred_logits, target, reduction='none') - alpha = target * self.alpha + (1. - target) * (1. - self.alpha) - pt = torch.where(target == 1, pred, 1 - pred) - return alpha * (1. - pt) ** self.gamma * ce - - -class SmoothL1Loss(nn.Module): - 'Smooth L1 Loss' - - def __init__(self, beta=0.11): - super().__init__() - self.beta = beta - - def forward(self, pred, target): - x = (pred - target).abs() - l1 = x - 0.5 * self.beta - l2 = 0.5 * x ** 2 / self.beta - return torch.where(x >= self.beta, l1, l2) - - -class ResNet(vrn.ResNet): - 'Deep Residual Network - https://arxiv.org/abs/1512.03385' - - def __init__(self, layers=[3, 4, 6, 3], bottleneck=vrn.Bottleneck, outputs=[5], url=None): - self.stride = 128 - self.bottleneck = bottleneck - self.outputs = outputs - self.url = url - super().__init__(bottleneck, layers) - - def initialize(self): - if self.url: - self.load_state_dict(model_zoo.load_url(self.url)) - - def forward(self, x): - x = self.conv1(x) - x = self.bn1(x) - x = self.relu(x) - x = self.maxpool(x) - - outputs = [] - for i, layer in enumerate([self.layer1, self.layer2, self.layer3, self.layer4]): - level = i + 2 - if level > max(self.outputs): - break - x = layer(x) - if level in self.outputs: - outputs.append(x) - - return outputs - - -class FPN(nn.Module): - 'Feature Pyramid Network - https://arxiv.org/abs/1612.03144' - - def __init__(self, features): - super().__init__() - - self.stride = 128 - self.features = features - - is_light = features.bottleneck == vrn.BasicBlock - channels = [128, 256, 512] if is_light else [512, 1024, 2048] - - self.lateral3 = nn.Conv2d(channels[0], 256, 1) - self.lateral4 = nn.Conv2d(channels[1], 256, 1) - self.lateral5 = nn.Conv2d(channels[2], 256, 1) - self.pyramid6 = nn.Conv2d(channels[2], 256, 3, stride=2, padding=1) - self.pyramid7 = nn.Conv2d(256, 256, 3, stride=2, padding=1) - self.smooth3 = nn.Conv2d(256, 256, 3, padding=1) - self.smooth4 = nn.Conv2d(256, 256, 3, padding=1) - self.smooth5 = nn.Conv2d(256, 256, 3, padding=1) - - def initialize(self): - def init_layer(layer): - if isinstance(layer, nn.Conv2d): - nn.init.xavier_uniform_(layer.weight) - if layer.bias is 
not None: - nn.init.constant_(layer.bias, val=0) - self.apply(init_layer) - - self.features.initialize() - - def forward(self, x): - c3, c4, c5 = self.features(x) - - p5 = self.lateral5(c5) - p4 = self.lateral4(c4) - p4 = F.interpolate(p5, scale_factor=2) + p4 - p3 = self.lateral3(c3) - p3 = F.interpolate(p4, scale_factor=2) + p3 - - p6 = self.pyramid6(c5) - p7 = self.pyramid7(F.relu(p6)) - - p3 = self.smooth3(p3) - p4 = self.smooth4(p4) - p5 = self.smooth5(p5) - - return [p3, p4, p5, p6, p7] - - -class Model(nn.Module): - 'RetinaNet - https://arxiv.org/abs/1708.02002' - - def __init__(self, backbones='ResNet18FPN', classes=80, config={}): - super().__init__() - - if not isinstance(backbones, list): - backbones = [backbones] - - #self.backbones = nn.ModuleDict({b: getattr(backbones_mod, b)() for b in backbones}) - self.backbones = FPN(ResNet(layers=[2, 2, 2, 2], bottleneck=vrn.BasicBlock, outputs=[3, 4, 5], url=vrn.model_urls['resnet18'])) - self.name = 'RetinaNet' - self.exporting = False - - self.ratios = [1.0, 2.0, 0.5] - self.scales = [4 * 2**(i/3) for i in range(3)] - self.anchors = {} - self.classes = classes - - self.threshold = config.get('threshold', 0.05) - self.top_n = config.get('top_n', 1000) - self.nms = config.get('nms', 0.5) - self.detections = config.get('detections', 100) - - self.stride = max([b.stride for _, b in self.backbones.items()]) - - # classification and box regression heads - def make_head(out_size): - layers = [] - for _ in range(4): - layers += [nn.Conv2d(256, 256, 3, padding=1), nn.ReLU()] - layers += [nn.Conv2d(256, out_size, 3, padding=1)] - return nn.Sequential(*layers) - - anchors = len(self.ratios) * len(self.scales) - self.cls_head = make_head(classes * anchors) - self.box_head = make_head(4 * anchors) - - self.cls_criterion = FocalLoss() - self.box_criterion = SmoothL1Loss(beta=0.11) - - def __repr__(self): - return '\n'.join([ - ' model: {}'.format(self.name), - ' backbone: {}'.format(', '.join([k for k, _ in self.backbones.items()])), - ' classes: {}, anchors: {}'.format(self.classes, len(self.ratios) * len(self.scales)), - ]) - - def initialize(self, pre_trained): - if pre_trained: - # Initialize using weights from pre-trained model - if not os.path.isfile(pre_trained): - raise ValueError('No checkpoint {}'.format(pre_trained)) - - print('Fine-tuning weights from {}...'.format(os.path.basename(pre_trained))) - state_dict = self.state_dict() - chk = torch.load(pre_trained, map_location=lambda storage, loc: storage) - ignored = ['cls_head.8.bias', 'cls_head.8.weight'] - weights = { k: v for k, v in chk['state_dict'].items() if k not in ignored } - state_dict.update(weights) - self.load_state_dict(state_dict) - - del chk, weights - torch.cuda.empty_cache() - - else: - # Initialize backbone(s) - for _, backbone in self.backbones.items(): - backbone.initialize() - - # Initialize heads - def initialize_layer(layer): - if isinstance(layer, nn.Conv2d): - nn.init.normal_(layer.weight, std=0.01) - if layer.bias is not None: - nn.init.constant_(layer.bias, val=0) - self.cls_head.apply(initialize_layer) - self.box_head.apply(initialize_layer) - - # Initialize class head prior - def initialize_prior(layer): - pi = 0.01 - b = - math.log((1 - pi) / pi) - nn.init.constant_(layer.bias, b) - nn.init.normal_(layer.weight, std=0.01) - self.cls_head[-1].apply(initialize_prior) - - def forward(self, x): - if self.training: x, targets = x - - # Backbones forward pass - features = [] - for _, backbone in self.backbones.items(): - features.extend(backbone(x)) - - # Heads 
forward pass - cls_heads = [self.cls_head(t) for t in features] - box_heads = [self.box_head(t) for t in features] - - if self.training: - return self._compute_loss(x, cls_heads, box_heads, targets.float()) - - cls_heads = [cls_head.sigmoid() for cls_head in cls_heads] - - if self.exporting: - self.strides = [x.shape[-1] // cls_head.shape[-1] for cls_head in cls_heads] - return cls_heads, box_heads - - # Inference post-processing - decoded = [] - for cls_head, box_head in zip(cls_heads, box_heads): - # Generate level's anchors - stride = x.shape[-1] // cls_head.shape[-1] - if stride not in self.anchors: - self.anchors[stride] = generate_anchors(stride, self.ratios, self.scales) - - # Decode and filter boxes - decoded.append(decode(cls_head, box_head, stride, - self.threshold, self.top_n, self.anchors[stride])) - - # Perform non-maximum suppression - decoded = [torch.cat(tensors, 1) for tensors in zip(*decoded)] - return nms(*decoded, self.nms, self.detections) - - def _extract_targets(self, targets, stride, size): - cls_target, box_target, depth = [], [], [] - for target in targets: - target = target[target[:, -1] > -1] - if stride not in self.anchors: - self.anchors[stride] = generate_anchors(stride, self.ratios, self.scales) - snapped = snap_to_anchors( - target, [s * stride for s in size[::-1]], stride, - self.anchors[stride].to(targets.device), self.classes, targets.device) - for l, s in zip((cls_target, box_target, depth), snapped): l.append(s) - return torch.stack(cls_target), torch.stack(box_target), torch.stack(depth) - - def _compute_loss(self, x, cls_heads, box_heads, targets): - cls_losses, box_losses, fg_targets = [], [], [] - for cls_head, box_head in zip(cls_heads, box_heads): - size = cls_head.shape[-2:] - stride = x.shape[-1] / cls_head.shape[-1] - - cls_target, box_target, depth = self._extract_targets(targets, stride, size) - fg_targets.append((depth > 0).sum().float().clamp(min=1)) - - cls_head = cls_head.view_as(cls_target).float() - cls_mask = (depth >= 0).expand_as(cls_target).float() - cls_loss = self.cls_criterion(cls_head, cls_target) - cls_loss = cls_mask * cls_loss - cls_losses.append(cls_loss.sum()) - - box_head = box_head.view_as(box_target).float() - box_mask = (depth > 0).expand_as(box_target).float() - box_loss = self.box_criterion(box_head, box_target) - box_loss = box_mask * box_loss - box_losses.append(box_loss.sum()) - - fg_targets = torch.stack(fg_targets).sum() - cls_loss = torch.stack(cls_losses).sum() / fg_targets - box_loss = torch.stack(box_losses).sum() / fg_targets - return cls_loss, box_loss - - def save(self, state): - checkpoint = { - 'backbone': [k for k, _ in self.backbones.items()], - 'classes': self.classes, - 'state_dict': self.state_dict() - } - - for key in ('iteration', 'optimizer', 'scheduler'): - if key in state: - checkpoint[key] = state[key] - - torch.save(checkpoint, state['path']) - - @classmethod - def load(cls, filename): - if not os.path.isfile(filename): - raise ValueError('No checkpoint {}'.format(filename)) - - checkpoint = torch.load(filename, map_location=lambda storage, loc: storage) - # Recreate model from checkpoint instead of from individual backbones - model = cls(backbones=checkpoint['backbone'], classes=checkpoint['classes']) - model.load_state_dict(checkpoint['state_dict']) - - state = {} - for key in ('iteration', 'optimizer', 'scheduler'): - if key in checkpoint: - state[key] = checkpoint[key] - - del checkpoint - torch.cuda.empty_cache() - - return model, state - - def export(self, size, batch, precision, 
calibration_files, calibration_table, verbose, onnx_only=False, opset=None): - import torch.onnx.symbolic - - if opset is not None and opset < 9: - # Override Upsample's ONNX export from old opset if required (not needed for TRT 5.1+) - @torch.onnx.symbolic.parse_args('v', 'is') - def upsample_nearest2d(g, input, output_size): - height_scale = float(output_size[-2]) / input.type().sizes()[-2] - width_scale = float(output_size[-1]) / input.type().sizes()[-1] - return g.op("Upsample", input, - scales_f=(1, 1, height_scale, width_scale), - mode_s="nearest") - torch.onnx.symbolic.upsample_nearest2d = upsample_nearest2d - - # Export to ONNX - print('Exporting to ONNX...') - self.exporting = True - onnx_bytes = io.BytesIO() - zero_input = torch.zeros([1, 3, *size]).cuda() - extra_args = { 'opset_version': opset } if opset else {} - torch.onnx.export(self.cuda(), zero_input, onnx_bytes, *extra_args) - self.exporting = False - - if onnx_only: - return onnx_bytes.getvalue() - - # Build TensorRT engine - model_name = '_'.join([k for k, _ in self.backbones.items()]) - anchors = [generate_anchors(stride, self.ratios, self.scales).view(-1).tolist() - for stride in self.strides] - return Engine(onnx_bytes.getvalue(), len(onnx_bytes.getvalue()), batch, precision, - self.threshold, self.top_n, anchors, self.nms, self.detections, calibration_files, model_name, calibration_table, verbose) - - -class RetinaNet(TrainableNM): - """ - Wrapper class around the RetinaNet model. - """ - - @staticmethod - def create_ports(): - input_ports = { - # Batch of images. - "images": NeuralType({0: AxisType(BatchTag), - 1: AxisType(ChannelTag, 3), - 2: AxisType(HeightTag), - 3: AxisType(WidthTag)}), - # Batch of bounding boxes. - "bounding_boxes": NeuralType({0: AxisType(BatchTag), - 1: AxisType(ListTag), - 2: AxisType(BoundingBoxTag)}), - # Batch of targets. - "targets": NeuralType({0: AxisType(BatchTag)}) - } - output_ports = { - "predictions": NeuralType({0: AxisType(BatchTag), - 1: AxisType(LogProbabilityTag) - }) - - } - return input_ports, output_ports - - def __init__(self, num_classes, pretrained=False): - """ - Creates the Faster R-CNN model. - - Args: - num_classes: Number of output classes of the model. - pretrained: use weights of model pretrained on COCO train2017. - """ - - super().__init__() - - # Create - self.model = FPN(ResNet(layers=[2, 2, 2, 2], bottleneck=vrn.BasicBlock, outputs=[3, 4, 5], url=vrn.model_urls['resnet18'])) - - # Get number of input features for the classifier. - in_features = self.model.roi_heads.box_predictor.cls_score.in_features - - self.to(self._device) - - def forward(self, images, bounding_boxes, targets): - """ - Performs the forward step of the model. - - Args: - images: Batch of images to be classified. - """ - - # We need to put this in a tuple again, as OD "framework" assumes it :] - targets_tuple = [{"boxes": b, "labels": t} for b, t - in zip(bounding_boxes, targets)] - - predictions = self.model(images, targets_tuple) - return predictions diff --git a/collections/nemo_cv/nemo_cv/modules/retinanet_module.py b/collections/nemo_cv/nemo_cv/modules/retinanet_module.py new file mode 100644 index 000000000000..408e0e597752 --- /dev/null +++ b/collections/nemo_cv/nemo_cv/modules/retinanet_module.py @@ -0,0 +1,61 @@ +# Copyright (C) , NVIDIA INC. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__author__ = "Hussein Al-barazanchi" + + +import torch + +from retinanet.model import Model +from retinanet.main import parse, load_model, worker + +from nemo.backends.pytorch.nm import TrainableNM + + +class RetinaNet(TrainableNM): + """ + Wrapper class around the RetinaNet model. + """ + + @staticmethod + def create_ports(): + + return None, None + + def __init__(self, args): + """ + Creates the RetinaNet model. + + Args: + num_classes: Number of output classes of the model. + pretrained: use weights of model pretrained on COCO train2017. + """ + + super().__init__() + + # Create + self.model, self.state = load_model(args, verbose=True) + if self.model: + self.model.share_memory() + + def forward(self, images, bounding_boxes, targets): + pass + + def execute(self, args): + + world = torch.cuda.device_count() + if args.command == 'export' or world <= 1: + worker(0, args, 1, self.model, self.state) + else: + torch.multiprocessing.spawn(worker, args=(args, world, self.model, self.state), nprocs=world)
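
The retinanet_module.py wrapper above leaves create_ports() as a stub (`return None, None`) and forward() as a pass-through, since training and inference are delegated to retinanet-examples' worker. If typed ports are wanted later, a sketch that reuses the NeuralType/AxisType declarations from the retinanet.py revision deleted in this patch could look like the following; the tag choices are carried over from that earlier revision and are not a finalized interface. It is shown as a standalone function; in the wrapper it would replace the @staticmethod create_ports stub.

```python
# Port-definition sketch for the wrapper above, reusing the declarations
# from the earlier retinanet.py revision in this patch series.
from nemo.core import NeuralType, AxisType, \
    BatchTag, ChannelTag, HeightTag, WidthTag, ListTag, BoundingBoxTag, \
    LogProbabilityTag


def create_ports():
    input_ports = {
        # Batch of images [batch, 3, height, width].
        "images": NeuralType({0: AxisType(BatchTag),
                              1: AxisType(ChannelTag, 3),
                              2: AxisType(HeightTag),
                              3: AxisType(WidthTag)}),
        # Per-image list of bounding boxes.
        "bounding_boxes": NeuralType({0: AxisType(BatchTag),
                                      1: AxisType(ListTag),
                                      2: AxisType(BoundingBoxTag)}),
        # Per-image class labels.
        "targets": NeuralType({0: AxisType(BatchTag)})
    }
    output_ports = {
        # Batch of predictions.
        "predictions": NeuralType({0: AxisType(BatchTag),
                                   1: AxisType(LogProbabilityTag)})
    }
    return input_ports, output_ports
```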