From abd5a194a440900a2779653dd76de567a401eebf Mon Sep 17 00:00:00 2001
From: Dillon Erb <585865+dte@users.noreply.github.com>
Date: Sat, 1 Feb 2020 21:18:31 -0500
Subject: [PATCH 01/13] Set model path

---
 examples/pytorch_mnist.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/examples/pytorch_mnist.py b/examples/pytorch_mnist.py
index 783e603d3d..68281c379a 100644
--- a/examples/pytorch_mnist.py
+++ b/examples/pytorch_mnist.py
@@ -7,7 +7,12 @@
 import torch.utils.data.distributed
 import horovod.torch as hvd
 
+import os
+
 # Training settings
+
+export_dir = os.path.abspath(os.environ.get('PS_MODEL_PATH', os.getcwd() + '/models'))
+
 parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
 parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                     help='input batch size for training (default: 64)')
@@ -173,7 +178,7 @@ def test():
     if hvd.rank() == 0:
         print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
             test_loss, 100. * test_accuracy))
-
+        model.save(export_dir, save_format='tf')
 
 for epoch in range(1, args.epochs + 1):
     train(epoch)

From 4f725e49a00cbb380c44b212dce519067df9a0f2 Mon Sep 17 00:00:00 2001
From: Dillon Erb <585865+dte@users.noreply.github.com>
Date: Sat, 1 Feb 2020 21:28:42 -0500
Subject: [PATCH 02/13] Use Horovod v0.18.2 syntax

---
 examples/pytorch_mnist.py | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/examples/pytorch_mnist.py b/examples/pytorch_mnist.py
index 68281c379a..a867fb7445 100644
--- a/examples/pytorch_mnist.py
+++ b/examples/pytorch_mnist.py
@@ -32,9 +32,6 @@
                     help='how many batches to wait before logging training status')
 parser.add_argument('--fp16-allreduce', action='store_true', default=False,
                     help='use fp16 compression during allreduce')
-parser.add_argument('--use-adasum', action='store_true', default=False,
-                    help='use adasum algorithm to do reduction')
-
 args = parser.parse_args()
 args.cuda = not args.no_cuda and torch.cuda.is_available()
 
@@ -97,18 +94,12 @@ def forward(self, x):
 
 model = Net()
 
-# By default, Adasum doesn't need scaling up learning rate.
-lr_scaler = hvd.size() if not args.use_adasum else 1
-
 if args.cuda:
     # Move model to GPU.
     model.cuda()
-    # If using GPU Adasum allreduce, scale learning rate by local_size.
-    if args.use_adasum and hvd.nccl_built():
-        lr_scaler = hvd.local_size()
 
-# Horovod: scale learning rate by lr_scaler.
-optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler,
+# Horovod: scale learning rate by the number of GPUs.
+optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(),
                       momentum=args.momentum)
 
 # Horovod: broadcast parameters & optimizer state.
@@ -121,8 +112,7 @@ def forward(self, x):
 # Horovod: wrap optimizer with DistributedOptimizer.
 optimizer = hvd.DistributedOptimizer(optimizer,
                                      named_parameters=model.named_parameters(),
-                                     compression=compression,
-                                     op=hvd.Adasum if args.use_adasum else hvd.Average)
+                                     compression=compression)
 
 
 def train(epoch):

From 57c395723daf4528bd223247dac33287997d5633 Mon Sep 17 00:00:00 2001
From: Dillon Erb <585865+dte@users.noreply.github.com>
Date: Sat, 1 Feb 2020 21:55:34 -0500
Subject: [PATCH 03/13] Use torch.save() for model export

---
 examples/pytorch_mnist.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch_mnist.py b/examples/pytorch_mnist.py
index a867fb7445..36306cf106 100644
--- a/examples/pytorch_mnist.py
+++ b/examples/pytorch_mnist.py
@@ -168,7 +168,7 @@ def test():
     if hvd.rank() == 0:
         print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
             test_loss, 100. * test_accuracy))
-        model.save(export_dir, save_format='tf')
+        torch.save(model.state_dict(), export_dir)
 
 for epoch in range(1, args.epochs + 1):
     train(epoch)

From d71e4895289af225a441278d83a28bd98de65557 Mon Sep 17 00:00:00 2001
From: Dillon Erb <585865+dte@users.noreply.github.com>
Date: Sat, 1 Feb 2020 22:06:00 -0500
Subject: [PATCH 04/13] ONNX export

---
 examples/pytorch_mnist.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/examples/pytorch_mnist.py b/examples/pytorch_mnist.py
index 36306cf106..abe8a3b24e 100644
--- a/examples/pytorch_mnist.py
+++ b/examples/pytorch_mnist.py
@@ -7,6 +7,8 @@
 import torch.utils.data.distributed
 import horovod.torch as hvd
 
+from torch.autograd import Variable
+
 import os
 
 # Training settings
@@ -169,6 +171,18 @@ def test():
         print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
             test_loss, 100. * test_accuracy))
         torch.save(model.state_dict(), export_dir)
+        # Save to ONNX model format
+        # dummy_input = torch.randn(10, 3, 224, 224, device='cuda')
+        dummy_input = Variable(torch.randn(1, 1, 28, 28, device='cuda')) # one black and white 28 x 28 picture will be the input to the model
+
+        torch.onnx.export(model,               # model being run
+                  dummy_input,                 # model input (or a tuple for multiple inputs)
+                  export_dir + "model.onnx",   # where to save the model (can be a file or file-like object)
+                  export_params=True)        # store the trained parameter weights inside the model file
+                  # opset_version=10,          # the ONNX version to export the model to
+                  #do_constant_folding=True,  # whether to execute constant folding for optimization
+                  #input_names = ['input'],   # the model's input names
+                  #output_names = ['output']) # the model's output names
 
 for epoch in range(1, args.epochs + 1):
     train(epoch)

From 6625ac04b825c4eb63550974741d8f5c77099218 Mon Sep 17 00:00:00 2001
From: Dillon Erb <585865+dte@users.noreply.github.com>
Date: Sat, 1 Feb 2020 22:09:46 -0500
Subject: [PATCH 05/13] Model save path file not directory

---
 examples/pytorch_mnist.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch_mnist.py b/examples/pytorch_mnist.py
index abe8a3b24e..f620585ce5 100644
--- a/examples/pytorch_mnist.py
+++ b/examples/pytorch_mnist.py
@@ -170,7 +170,7 @@ def test():
     if hvd.rank() == 0:
         print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
             test_loss, 100. * test_accuracy))
-        torch.save(model.state_dict(), export_dir)
+        torch.save(model.state_dict(), export_dir + 'model.pth')
         # Save to ONNX model format
         # dummy_input = torch.randn(10, 3, 224, 224, device='cuda')
         dummy_input = Variable(torch.randn(1, 1, 28, 28, device='cuda')) # one black and white 28 x 28 picture will be the input to the model

From c62af3c8b80192826d5ceb34f1a702430164ce2f Mon Sep 17 00:00:00 2001
From: Dillon Erb <585865+dte@users.noreply.github.com>
Date: Sun, 2 Feb 2020 13:33:00 -0500
Subject: [PATCH 06/13] Logging

---
 examples/pytorch_mnist.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/pytorch_mnist.py b/examples/pytorch_mnist.py
index f620585ce5..23efcece29 100644
--- a/examples/pytorch_mnist.py
+++ b/examples/pytorch_mnist.py
@@ -170,14 +170,16 @@ def test():
     if hvd.rank() == 0:
         print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
             test_loss, 100. * test_accuracy))
-        torch.save(model.state_dict(), export_dir + 'model.pth')
+        print('Saving PyTorch  model to: ' + export_dir)
+        torch.save(model.state_dict(), export_dir + '/model.pth')
         # Save to ONNX model format
         # dummy_input = torch.randn(10, 3, 224, 224, device='cuda')
         dummy_input = Variable(torch.randn(1, 1, 28, 28, device='cuda')) # one black and white 28 x 28 picture will be the input to the model
-
+        
+        print('Saving ONNX model to: ' + export_dir)
         torch.onnx.export(model,               # model being run
                   dummy_input,                 # model input (or a tuple for multiple inputs)
-                  export_dir + "model.onnx",   # where to save the model (can be a file or file-like object)
+                  export_dir + "/model.onnx",   # where to save the model (can be a file or file-like object)
                   export_params=True)        # store the trained parameter weights inside the model file
                   # opset_version=10,          # the ONNX version to export the model to
                   #do_constant_folding=True,  # whether to execute constant folding for optimization

From 598dabacad71a3323c493d9af542dac4ca90111d Mon Sep 17 00:00:00 2001
From: Dillon Erb <585865+dte@users.noreply.github.com>
Date: Sun, 2 Feb 2020 14:09:41 -0500
Subject: [PATCH 07/13] Update pytorch_imagenet_resnet50.py

---
 examples/pytorch_imagenet_resnet50.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch_imagenet_resnet50.py b/examples/pytorch_imagenet_resnet50.py
index ce050d902a..515f7e7aeb 100644
--- a/examples/pytorch_imagenet_resnet50.py
+++ b/examples/pytorch_imagenet_resnet50.py
@@ -14,6 +14,9 @@
 from tqdm import tqdm
 
 # Training settings
+
+export_dir = os.path.abspath(os.environ.get('PS_MODEL_PATH', os.getcwd() + '/models'))
+
 parser = argparse.ArgumentParser(description='PyTorch ImageNet Example',
                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 parser.add_argument('--train-dir', default=os.path.expanduser('~/imagenet/train'),
@@ -262,7 +265,22 @@ def save_checkpoint(epoch):
             'optimizer': optimizer.state_dict(),
         }
         torch.save(state, filepath)
-
+        print('Saving PyTorch  model to: ' + export_dir)
+        torch.save(state, export_dir + '/' + filepath)
+        # Save to ONNX model format
+        # dummy_input = torch.randn(10, 3, 224, 224, device='cuda')
+        dummy_input = Variable(torch.randn(1, 3, 256, 256, device='cuda')) # one color 256 x 256 picture will be the input to the model
+
+        print('Saving ONNX model to: ' + export_dir)
+        torch.onnx.export(model,               # model being run
+                  dummy_input,                 # model input (or a tuple for multiple inputs)
+                  export_dir + "/model.onnx",   # where to save the model (can be a file or file-like object)
+                  export_params=True)        # store the trained parameter weights inside the model file
+                  # opset_version=10,          # the ONNX version to export the model to
+                  #do_constant_folding=True,  # whether to execute constant folding for optimization
+                  #input_names = ['input'],   # the model's input names
+                  #output_names = ['output']) # the model's output names
+                
 
 # Horovod: average metrics from distributed training.
 class Metric(object):
@@ -279,7 +297,6 @@ def update(self, val):
     def avg(self):
         return self.sum / self.n
 
-
 for epoch in range(resume_from_epoch, args.epochs):
     train(epoch)
     validate(epoch)

From d022cdcb990638d2f7515c4ebc94c0e258450d03 Mon Sep 17 00:00:00 2001
From: mkutsovsky <misha@paperspace.com>
Date: Sun, 2 Feb 2020 16:15:59 -0500
Subject: [PATCH 08/13] export model at the end of training to onnx

---
 examples/pytorch_imagenet_resnet50.py | 33 +++++++++++++++------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/examples/pytorch_imagenet_resnet50.py b/examples/pytorch_imagenet_resnet50.py
index 515f7e7aeb..63af41152f 100644
--- a/examples/pytorch_imagenet_resnet50.py
+++ b/examples/pytorch_imagenet_resnet50.py
@@ -3,6 +3,7 @@
 import torch
 import argparse
 import torch.backends.cudnn as cudnn
+from torch.autograd import Variable
 import torch.nn.functional as F
 import torch.optim as optim
 import torch.utils.data.distributed
@@ -264,22 +265,25 @@ def save_checkpoint(epoch):
             'model': model.state_dict(),
             'optimizer': optimizer.state_dict(),
         }
-        torch.save(state, filepath)
+        #torch.save(state, filepath)
         print('Saving PyTorch  model to: ' + export_dir)
         torch.save(state, export_dir + '/' + filepath)
-        # Save to ONNX model format
-        # dummy_input = torch.randn(10, 3, 224, 224, device='cuda')
-        dummy_input = Variable(torch.randn(1, 3, 256, 256, device='cuda')) # one color 256 x 256 picture will be the input to the model
-
-        print('Saving ONNX model to: ' + export_dir)
-        torch.onnx.export(model,               # model being run
-                  dummy_input,                 # model input (or a tuple for multiple inputs)
-                  export_dir + "/model.onnx",   # where to save the model (can be a file or file-like object)
-                  export_params=True)        # store the trained parameter weights inside the model file
-                  # opset_version=10,          # the ONNX version to export the model to
-                  #do_constant_folding=True,  # whether to execute constant folding for optimization
-                  #input_names = ['input'],   # the model's input names
-                  #output_names = ['output']) # the model's output names
+
+def export_model():
+  if hvd.rank() == 0:
+    # Save to ONNX model format
+    dummy_input = Variable(torch.randn(1, 3, 224, 224, device='cuda')) # one color 224 x 224 picture will be the input to the model
+
+    print('Saving ONNX model to: ' + export_dir)
+    torch.onnx.export(model,               # model being run
+              dummy_input,                 # model input (or a tuple for multiple inputs)
+              export_dir + "/model.onnx",   # where to save the model (can be a file or file-like object)
+              export_params=True)        # store the trained parameter weights inside the model file
+              # opset_version=10,          # the ONNX version to export the model to
+              #do_constant_folding=True,  # whether to execute constant folding for optimization
+              #input_names = ['input'],   # the model's input names
+              #output_names = ['output']) # the model's output names 
+  
                 
 
 # Horovod: average metrics from distributed training.
@@ -301,3 +305,4 @@ def avg(self):
     train(epoch)
     validate(epoch)
     save_checkpoint(epoch)
+export_model()   

From a8fed1bf8f5b42e4bfc1808c003c245bde5dc966 Mon Sep 17 00:00:00 2001
From: Dillon Erb <585865+dte@users.noreply.github.com>
Date: Sun, 2 Feb 2020 17:07:16 -0500
Subject: [PATCH 09/13] Change back to horovod 0.18.2

---
 examples/pytorch_imagenet_resnet50.py | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/examples/pytorch_imagenet_resnet50.py b/examples/pytorch_imagenet_resnet50.py
index 63af41152f..db17d89b6d 100644
--- a/examples/pytorch_imagenet_resnet50.py
+++ b/examples/pytorch_imagenet_resnet50.py
@@ -34,8 +34,6 @@
                     help='number of batches processed locally before '
                          'executing allreduce across workers; it multiplies '
                          'total batch size.')
-parser.add_argument('--use-adasum', action='store_true', default=False,
-                    help='use adasum algorithm to do reduction')
 
 # Default settings from https://arxiv.org/abs/1706.02677.
 parser.add_argument('--batch-size', type=int, default=32,
@@ -131,21 +129,15 @@
 # Set up standard ResNet-50 model.
 model = models.resnet50()
 
-# By default, Adasum doesn't need scaling up learning rate.
-# For sum/average with gradient Accumulation: scale learning rate by batches_per_allreduce
-lr_scaler = args.batches_per_allreduce * hvd.size() if not args.use_adasum else 1
-
 if args.cuda:
     # Move model to GPU.
     model.cuda()
-    # If using GPU Adasum allreduce, scale learning rate by local_size.
-    if args.use_adasum and hvd.nccl_built():
-        lr_scaler = args.batches_per_allreduce * hvd.local_size()
 
 # Horovod: scale learning rate by the number of GPUs.
+# Gradient Accumulation: scale learning rate by batches_per_allreduce
 optimizer = optim.SGD(model.parameters(),
                       lr=(args.base_lr *
-                          lr_scaler),
+                          args.batches_per_allreduce * hvd.size()),
                       momentum=args.momentum, weight_decay=args.wd)
 
 # Horovod: (optional) compression algorithm.
@@ -155,8 +147,7 @@
 optimizer = hvd.DistributedOptimizer(
     optimizer, named_parameters=model.named_parameters(),
     compression=compression,
-    backward_passes_per_step=args.batches_per_allreduce,
-    op=hvd.Adasum if args.use_adasum else hvd.Average)
+    backward_passes_per_step=args.batches_per_allreduce)
 
 # Restore from a previous checkpoint, if initial_epoch is specified.
 # Horovod: restore on the first worker which will broadcast weights to other workers.
@@ -265,10 +256,8 @@ def save_checkpoint(epoch):
             'model': model.state_dict(),
             'optimizer': optimizer.state_dict(),
         }
-        #torch.save(state, filepath)
-        print('Saving PyTorch  model to: ' + export_dir)
         torch.save(state, export_dir + '/' + filepath)
-
+        
 def export_model():
   if hvd.rank() == 0:
     # Save to ONNX model format
@@ -283,9 +272,7 @@ def export_model():
               #do_constant_folding=True,  # whether to execute constant folding for optimization
               #input_names = ['input'],   # the model's input names
               #output_names = ['output']) # the model's output names 
-  
-                
-
+            
 # Horovod: average metrics from distributed training.
 class Metric(object):
     def __init__(self, name):
@@ -301,8 +288,10 @@ def update(self, val):
     def avg(self):
         return self.sum / self.n
 
+
 for epoch in range(resume_from_epoch, args.epochs):
     train(epoch)
     validate(epoch)
     save_checkpoint(epoch)
-export_model()   
+export_model()    
+    

From 3f4879e13b7ca05faeaf72152d97ec76ae9481b8 Mon Sep 17 00:00:00 2001
From: Dillon Erb <585865+dte@users.noreply.github.com>
Date: Sun, 2 Feb 2020 19:01:31 -0500
Subject: [PATCH 10/13] Create TF2 keras imagenet

---
 examples/tensorflow2_keras_imagenet.py | 198 +++++++++++++++++++++++++
 1 file changed, 198 insertions(+)
 create mode 100644 examples/tensorflow2_keras_imagenet.py

diff --git a/examples/tensorflow2_keras_imagenet.py b/examples/tensorflow2_keras_imagenet.py
new file mode 100644
index 0000000000..cfdff2f80e
--- /dev/null
+++ b/examples/tensorflow2_keras_imagenet.py
@@ -0,0 +1,198 @@
+#
+# ResNet-50 model training using Keras and Horovod.
+#
+# This model is an example of a computation-intensive model that achieves good accuracy on an image
+# classification task.  It brings together distributed training concepts such as learning rate
+# schedule adjustments with a warmup, randomized data reading, and checkpointing on the first worker
+# only.
+#
+# Note: This model uses Keras native ImageDataGenerator and not the sophisticated preprocessing
+# pipeline that is typically used to train state-of-the-art ResNet-50 model.  This results in ~0.5%
+# increase in the top-1 validation error compared to the single-crop top-1 validation error from
+# https://github.com/KaimingHe/deep-residual-networks.
+#
+from __future__ import print_function
+
+import argparse
+# import keras
+# from keras import backend as K
+# from keras.preprocessing import image
+import tensorflow as tf
+from tensorflow import keras
+from keras import backend as K
+from keras.preprocessing import image
+
+import horovod.keras as hvd
+import os
+
+parser = argparse.ArgumentParser(description='Keras ImageNet Example',
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--train-dir', default=os.path.expanduser('~/imagenet/train'),
+                    help='path to training data')
+parser.add_argument('--val-dir', default=os.path.expanduser('~/imagenet/validation'),
+                    help='path to validation data')
+parser.add_argument('--log-dir', default='./logs',
+                    help='tensorboard log directory')
+parser.add_argument('--checkpoint-format', default='./checkpoint-{epoch}.h5',
+                    help='checkpoint file format')
+parser.add_argument('--fp16-allreduce', action='store_true', default=False,
+                    help='use fp16 compression during allreduce')
+
+# Default settings from https://arxiv.org/abs/1706.02677.
+parser.add_argument('--batch-size', type=int, default=32,
+                    help='input batch size for training')
+parser.add_argument('--val-batch-size', type=int, default=32,
+                    help='input batch size for validation')
+parser.add_argument('--epochs', type=int, default=90,
+                    help='number of epochs to train')
+parser.add_argument('--base-lr', type=float, default=0.0125,
+                    help='learning rate for a single GPU')
+parser.add_argument('--warmup-epochs', type=float, default=5,
+                    help='number of warmup epochs')
+parser.add_argument('--momentum', type=float, default=0.9,
+                    help='SGD momentum')
+parser.add_argument('--wd', type=float, default=0.00005,
+                    help='weight decay')
+
+args = parser.parse_args()
+
+print(tf.test.is_built_with_cuda())
+# data_format = ('channels_first'
+#                 if tf.test.is_built_with_cuda() else 'channels_last')
+tf.keras.backend.set_image_data_format('channels_last')
+
+# Horovod: initialize Horovod.
+hvd.init()
+
+# Horovod: pin GPU to be used to process local rank (one GPU per process)
+# config = tf.compat.v1.ConfigProto()
+# config.gpu_options.allow_growth = True
+# config.gpu_options.visible_device_list = str(hvd.local_rank())
+# K.set_session(tf.compat.v1.Session(config=config))
+
+# Horovod: pin GPU to be used to process local rank (one GPU per process)
+gpus = tf.config.experimental.list_physical_devices('GPU')
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+if gpus:
+    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
+
+# If set > 0, will resume training from a given checkpoint.
+resume_from_epoch = 0
+for try_epoch in range(args.epochs, 0, -1):
+    if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
+        resume_from_epoch = try_epoch
+        break
+
+# Horovod: broadcast resume_from_epoch from rank 0 (which will have
+# checkpoints) to other ranks.
+resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')
+
+# Horovod: print logs on the first worker.
+verbose = 1 if hvd.rank() == 0 else 0
+
+# Training data iterator.
+train_gen = image.ImageDataGenerator(
+    width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5, horizontal_flip=True,
+    preprocessing_function=keras.applications.resnet50.preprocess_input)
+
+train_iter = train_gen.flow_from_directory(args.train_dir,
+                                           batch_size=args.batch_size,
+                                           target_size=(224, 224))
+
+# Validation data iterator.
+test_gen = image.ImageDataGenerator(
+    zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input)
+test_iter = test_gen.flow_from_directory(args.val_dir,
+                                         batch_size=args.val_batch_size,
+                                         target_size=(224, 224))
+
+
+# Set up standard ResNet-50 model.
+model = keras.applications.resnet50.ResNet50(weights=None, classes=200)
+
+# Horovod: (optional) compression algorithm.
+compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
+
+# Restore from a previous checkpoint, if initial_epoch is specified.
+# Horovod: restore on the first worker which will broadcast both model and optimizer weights
+# to other workers.
+if resume_from_epoch > 0 and hvd.rank() == 0:
+    model = hvd.load_model(args.checkpoint_format.format(epoch=resume_from_epoch),
+                           compression=compression)
+else:
+    # ResNet-50 model that is included with Keras is optimized for inference.
+    # Add L2 weight decay & adjust BN settings.
+    model_config = model.get_config()
+    for layer, layer_config in zip(model.layers, model_config['layers']):
+        if hasattr(layer, 'kernel_regularizer'):
+            regularizer = keras.regularizers.l2(args.wd)
+            layer_config['config']['kernel_regularizer'] = \
+                {'class_name': regularizer.__class__.__name__,
+                 'config': regularizer.get_config()}
+        if type(layer) == keras.layers.BatchNormalization:
+            layer_config['config']['momentum'] = 0.9
+            layer_config['config']['epsilon'] = 1e-5
+
+    model = keras.models.Model.from_config(model_config)
+
+    # Horovod: adjust learning rate based on number of GPUs.
+    opt = keras.optimizers.SGD(lr=args.base_lr * hvd.size(),
+                               momentum=args.momentum)
+
+    # Horovod: add Horovod Distributed Optimizer.
+    opt = hvd.DistributedOptimizer(opt, compression=compression)
+
+    model.compile(loss=keras.losses.categorical_crossentropy,
+                  optimizer=opt,
+                  metrics=['accuracy', 'top_k_categorical_accuracy'],
+                  experimental_run_tf_function=False)
+
+callbacks = [
+    # Horovod: broadcast initial variable states from rank 0 to all other processes.
+    # This is necessary to ensure consistent initialization of all workers when
+    # training is started with random weights or restored from a checkpoint.
+    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
+
+    # Horovod: average metrics among workers at the end of every epoch.
+    #
+    # Note: This callback must be in the list before the ReduceLROnPlateau,
+    # TensorBoard, or other metrics-based callbacks.
+    hvd.callbacks.MetricAverageCallback(),
+
+    # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
+    # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
+    # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
+    hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=args.warmup_epochs, verbose=verbose),
+
+    # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs.
+    hvd.callbacks.LearningRateScheduleCallback(start_epoch=args.warmup_epochs, end_epoch=30, multiplier=1.),
+    hvd.callbacks.LearningRateScheduleCallback(start_epoch=30, end_epoch=60, multiplier=1e-1),
+    hvd.callbacks.LearningRateScheduleCallback(start_epoch=60, end_epoch=80, multiplier=1e-2),
+    hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3),
+]
+
+# Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
+if hvd.rank() == 0:
+    callbacks.append(keras.callbacks.ModelCheckpoint(args.checkpoint_format))
+    callbacks.append(keras.callbacks.TensorBoard(args.log_dir))
+
+# Train the model. The training will randomly sample 1 / N batches of training data and
+# 3 / N batches of validation data on every worker, where N is the number of workers.
+# Over-sampling of validation data helps to increase probability that every validation
+# example will be evaluated.
+model.fit_generator(train_iter,
+                    steps_per_epoch=len(train_iter) // hvd.size(),
+                    callbacks=callbacks,
+                    epochs=args.epochs,
+                    verbose=verbose,
+                    workers=4,
+                    initial_epoch=resume_from_epoch,
+                    validation_data=test_iter,
+                    validation_steps=3 * len(test_iter) // hvd.size())
+
+# Evaluate the model on the full data set.
+score = hvd.allreduce(model.evaluate_generator(test_iter, len(test_iter), workers=4))
+if verbose:
+    print('Test loss:', score[0])
+    print('Test accuracy:', score[1])

From e47de2c6b63101090bf067052e9ecd1158637d31 Mon Sep 17 00:00:00 2001
From: Dillon Erb <585865+dte@users.noreply.github.com>
Date: Sun, 2 Feb 2020 19:11:19 -0500
Subject: [PATCH 11/13] Pass in ENV variable to distinguish ImageNet vs
 TinyImageNet

---
 examples/tensorflow2_keras_imagenet.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/tensorflow2_keras_imagenet.py b/examples/tensorflow2_keras_imagenet.py
index cfdff2f80e..774ce67c7a 100644
--- a/examples/tensorflow2_keras_imagenet.py
+++ b/examples/tensorflow2_keras_imagenet.py
@@ -109,6 +109,7 @@
 
 
 # Set up standard ResNet-50 model.
+num_classes = int(os.environ.get('IMAGENET_CLASSES', 1000))
 model = keras.applications.resnet50.ResNet50(weights=None, classes=200)
 
 # Horovod: (optional) compression algorithm.

From 8e6d8237e209c9e185b69f6639882445228a837a Mon Sep 17 00:00:00 2001
From: Dillon Erb <585865+dte@users.noreply.github.com>
Date: Sun, 2 Feb 2020 19:29:31 -0500
Subject: [PATCH 12/13] Update tensorflow2_keras_imagenet.py

---
 examples/tensorflow2_keras_imagenet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/tensorflow2_keras_imagenet.py b/examples/tensorflow2_keras_imagenet.py
index 774ce67c7a..ec2fb37378 100644
--- a/examples/tensorflow2_keras_imagenet.py
+++ b/examples/tensorflow2_keras_imagenet.py
@@ -110,7 +110,7 @@
 
 # Set up standard ResNet-50 model.
 num_classes = int(os.environ.get('IMAGENET_CLASSES', 1000))
-model = keras.applications.resnet50.ResNet50(weights=None, classes=200)
+model = keras.applications.resnet50.ResNet50(weights=None, classes=num_classes)
 
 # Horovod: (optional) compression algorithm.
 compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

From 9f9a624d5684b4515d9def6b2fd2e51d498935c7 Mon Sep 17 00:00:00 2001
From: Dillon Erb <585865+dte@users.noreply.github.com>
Date: Sun, 2 Feb 2020 19:56:40 -0500
Subject: [PATCH 13/13] Update log and model dir

---
 examples/tensorflow2_keras_imagenet.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/tensorflow2_keras_imagenet.py b/examples/tensorflow2_keras_imagenet.py
index ec2fb37378..7e4971f1c1 100644
--- a/examples/tensorflow2_keras_imagenet.py
+++ b/examples/tensorflow2_keras_imagenet.py
@@ -33,7 +33,7 @@
                     help='path to validation data')
 parser.add_argument('--log-dir', default='./logs',
                     help='tensorboard log directory')
-parser.add_argument('--checkpoint-format', default='./checkpoint-{epoch}.h5',
+parser.add_argument('--checkpoint-format', default='checkpoint-{epoch}.h5',
                     help='checkpoint file format')
 parser.add_argument('--fp16-allreduce', action='store_true', default=False,
                     help='use fp16 compression during allreduce')
@@ -56,6 +56,8 @@
 
 args = parser.parse_args()
 
+export_dir = os.path.abspath(os.environ.get('PS_MODEL_PATH', os.getcwd() + '/models'))
+
 print(tf.test.is_built_with_cuda())
 # data_format = ('channels_first'
 #                 if tf.test.is_built_with_cuda() else 'channels_last')
@@ -175,8 +177,9 @@
 
 # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
 if hvd.rank() == 0:
-    callbacks.append(keras.callbacks.ModelCheckpoint(args.checkpoint_format))
-    callbacks.append(keras.callbacks.TensorBoard(args.log_dir))
+    # abspath() strips any trailing separator from export_dir, so join the path explicitly.
+    callbacks.append(keras.callbacks.ModelCheckpoint(os.path.join(export_dir, args.checkpoint_format)))
+    callbacks.append(keras.callbacks.TensorBoard(export_dir))
 
 # Train the model. The training will randomly sample 1 / N batches of training data and
 # 3 / N batches of validation data on every worker, where N is the number of workers.