From abd5a194a440900a2779653dd76de567a401eebf Mon Sep 17 00:00:00 2001 From: Dillon Erb <585865+dte@users.noreply.github.com> Date: Sat, 1 Feb 2020 21:18:31 -0500 Subject: [PATCH 01/13] Set model path --- examples/pytorch_mnist.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/pytorch_mnist.py b/examples/pytorch_mnist.py index 783e603d3d..68281c379a 100644 --- a/examples/pytorch_mnist.py +++ b/examples/pytorch_mnist.py @@ -7,7 +7,12 @@ import torch.utils.data.distributed import horovod.torch as hvd +import os + # Training settings + +export_dir = os.path.abspath(os.environ.get('PS_MODEL_PATH', os.getcwd() + '/models')) + parser = argparse.ArgumentParser(description='PyTorch MNIST Example') parser.add_argument('--batch-size', type=int, default=64, metavar='N', help='input batch size for training (default: 64)') @@ -173,7 +178,7 @@ def test(): if hvd.rank() == 0: print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( test_loss, 100. * test_accuracy)) - + model.save(export_dir, save_format='tf') for epoch in range(1, args.epochs + 1): train(epoch) From 4f725e49a00cbb380c44b212dce519067df9a0f2 Mon Sep 17 00:00:00 2001 From: Dillon Erb <585865+dte@users.noreply.github.com> Date: Sat, 1 Feb 2020 21:28:42 -0500 Subject: [PATCH 02/13] Use Horovod v0.18.2 syntax --- examples/pytorch_mnist.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/examples/pytorch_mnist.py b/examples/pytorch_mnist.py index 68281c379a..a867fb7445 100644 --- a/examples/pytorch_mnist.py +++ b/examples/pytorch_mnist.py @@ -32,9 +32,6 @@ help='how many batches to wait before logging training status') parser.add_argument('--fp16-allreduce', action='store_true', default=False, help='use fp16 compression during allreduce') -parser.add_argument('--use-adasum', action='store_true', default=False, - help='use adasum algorithm to do reduction') - args = parser.parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() @@ -97,18 +94,12 @@ def forward(self, x): model = Net() -# By default, Adasum doesn't need scaling up learning rate. -lr_scaler = hvd.size() if not args.use_adasum else 1 - if args.cuda: # Move model to GPU. model.cuda() - # If using GPU Adasum allreduce, scale learning rate by local_size. - if args.use_adasum and hvd.nccl_built(): - lr_scaler = hvd.local_size() -# Horovod: scale learning rate by lr_scaler. -optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler, +# Horovod: scale learning rate by the number of GPUs. +optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), momentum=args.momentum) # Horovod: broadcast parameters & optimizer state. @@ -121,8 +112,7 @@ def forward(self, x): # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters(), - compression=compression, - op=hvd.Adasum if args.use_adasum else hvd.Average) + compression=compression) def train(epoch): From 57c395723daf4528bd223247dac33287997d5633 Mon Sep 17 00:00:00 2001 From: Dillon Erb <585865+dte@users.noreply.github.com> Date: Sat, 1 Feb 2020 21:55:34 -0500 Subject: [PATCH 03/13] Use toch.save() for model export --- examples/pytorch_mnist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch_mnist.py b/examples/pytorch_mnist.py index a867fb7445..36306cf106 100644 --- a/examples/pytorch_mnist.py +++ b/examples/pytorch_mnist.py @@ -168,7 +168,7 @@ def test(): if hvd.rank() == 0: print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( test_loss, 100. * test_accuracy)) - model.save(export_dir, save_format='tf') + torch.save(model.state_dict(), export_dir) for epoch in range(1, args.epochs + 1): train(epoch) From d71e4895289af225a441278d83a28bd98de65557 Mon Sep 17 00:00:00 2001 From: Dillon Erb <585865+dte@users.noreply.github.com> Date: Sat, 1 Feb 2020 22:06:00 -0500 Subject: [PATCH 04/13] ONNX export --- examples/pytorch_mnist.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/examples/pytorch_mnist.py b/examples/pytorch_mnist.py index 36306cf106..abe8a3b24e 100644 --- a/examples/pytorch_mnist.py +++ b/examples/pytorch_mnist.py @@ -7,6 +7,8 @@ import torch.utils.data.distributed import horovod.torch as hvd +from torch.autograd import Variable + import os # Training settings @@ -169,6 +171,18 @@ def test(): print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( test_loss, 100. * test_accuracy)) torch.save(model.state_dict(), export_dir) + # Save to ONNX model format + # dummy_input = torch.randn(10, 3, 224, 224, device='cuda') + dummy_input = Variable(torch.randn(1, 1, 28, 28, device='cuda')) # one black and white 28 x 28 picture will be the input to the model + + torch.onnx.export(model, # model being run + dummy_input, # model input (or a tuple for multiple inputs) + export_dir + "model.onnx", # where to save the model (can be a file or file-like object) + export_params=True) # store the trained parameter weights inside the model file + # opset_version=10, # the ONNX version to export the model to + #do_constant_folding=True, # whether to execute constant folding for optimization + #input_names = ['input'], # the model's input names + #output_names = ['output']) # the model's output names for epoch in range(1, args.epochs + 1): train(epoch) From 6625ac04b825c4eb63550974741d8f5c77099218 Mon Sep 17 00:00:00 2001 From: Dillon Erb <585865+dte@users.noreply.github.com> Date: Sat, 1 Feb 2020 22:09:46 -0500 Subject: [PATCH 05/13] Model save path file not directory --- examples/pytorch_mnist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch_mnist.py b/examples/pytorch_mnist.py index abe8a3b24e..f620585ce5 100644 --- a/examples/pytorch_mnist.py +++ b/examples/pytorch_mnist.py @@ -170,7 +170,7 @@ def test(): if hvd.rank() == 0: print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( test_loss, 100. * test_accuracy)) - torch.save(model.state_dict(), export_dir) + torch.save(model.state_dict(), export_dir + 'model.pth') # Save to ONNX model format # dummy_input = torch.randn(10, 3, 224, 224, device='cuda') dummy_input = Variable(torch.randn(1, 1, 28, 28, device='cuda')) # one black and white 28 x 28 picture will be the input to the model From c62af3c8b80192826d5ceb34f1a702430164ce2f Mon Sep 17 00:00:00 2001 From: Dillon Erb <585865+dte@users.noreply.github.com> Date: Sun, 2 Feb 2020 13:33:00 -0500 Subject: [PATCH 06/13] Logging --- examples/pytorch_mnist.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/pytorch_mnist.py b/examples/pytorch_mnist.py index f620585ce5..23efcece29 100644 --- a/examples/pytorch_mnist.py +++ b/examples/pytorch_mnist.py @@ -170,14 +170,16 @@ def test(): if hvd.rank() == 0: print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( test_loss, 100. * test_accuracy)) - torch.save(model.state_dict(), export_dir + 'model.pth') + print('Saving PyTorch model to: ' + export_dir) + torch.save(model.state_dict(), export_dir + '/model.pth') # Save to ONNX model format # dummy_input = torch.randn(10, 3, 224, 224, device='cuda') dummy_input = Variable(torch.randn(1, 1, 28, 28, device='cuda')) # one black and white 28 x 28 picture will be the input to the model - + + print('Saving ONNX model to: ' + export_dir) torch.onnx.export(model, # model being run dummy_input, # model input (or a tuple for multiple inputs) - export_dir + "model.onnx", # where to save the model (can be a file or file-like object) + export_dir + "/model.onnx", # where to save the model (can be a file or file-like object) export_params=True) # store the trained parameter weights inside the model file # opset_version=10, # the ONNX version to export the model to #do_constant_folding=True, # whether to execute constant folding for optimization From 598dabacad71a3323c493d9af542dac4ca90111d Mon Sep 17 00:00:00 2001 From: Dillon Erb <585865+dte@users.noreply.github.com> Date: Sun, 2 Feb 2020 14:09:41 -0500 Subject: [PATCH 07/13] Update pytorch_imagenet_resnet50.py --- examples/pytorch_imagenet_resnet50.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/examples/pytorch_imagenet_resnet50.py b/examples/pytorch_imagenet_resnet50.py index ce050d902a..515f7e7aeb 100644 --- a/examples/pytorch_imagenet_resnet50.py +++ b/examples/pytorch_imagenet_resnet50.py @@ -14,6 +14,9 @@ from tqdm import tqdm # Training settings + +export_dir = os.path.abspath(os.environ.get('PS_MODEL_PATH', os.getcwd() + '/models')) + parser = argparse.ArgumentParser(description='PyTorch ImageNet Example', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--train-dir', default=os.path.expanduser('~/imagenet/train'), @@ -262,7 +265,22 @@ def save_checkpoint(epoch): 'optimizer': optimizer.state_dict(), } torch.save(state, filepath) - + print('Saving PyTorch model to: ' + export_dir) + torch.save(state, export_dir + '/' + filepath) + # Save to ONNX model format + # dummy_input = torch.randn(10, 3, 224, 224, device='cuda') + dummy_input = Variable(torch.randn(1, 3, 256, 256, device='cuda')) # one color 256 x 256 picture will be the input to the model + + print('Saving ONNX model to: ' + export_dir) + torch.onnx.export(model, # model being run + dummy_input, # model input (or a tuple for multiple inputs) + export_dir + "/model.onnx", # where to save the model (can be a file or file-like object) + export_params=True) # store the trained parameter weights inside the model file + # opset_version=10, # the ONNX version to export the model to + #do_constant_folding=True, # whether to execute constant folding for optimization + #input_names = ['input'], # the model's input names + #output_names = ['output']) # the model's output names + # Horovod: average metrics from distributed training. class Metric(object): @@ -279,7 +297,6 @@ def update(self, val): def avg(self): return self.sum / self.n - for epoch in range(resume_from_epoch, args.epochs): train(epoch) validate(epoch) From d022cdcb990638d2f7515c4ebc94c0e258450d03 Mon Sep 17 00:00:00 2001 From: mkutsovsky Date: Sun, 2 Feb 2020 16:15:59 -0500 Subject: [PATCH 08/13] export model at the end of training to onnx --- examples/pytorch_imagenet_resnet50.py | 33 +++++++++++++++------------ 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/examples/pytorch_imagenet_resnet50.py b/examples/pytorch_imagenet_resnet50.py index 515f7e7aeb..63af41152f 100644 --- a/examples/pytorch_imagenet_resnet50.py +++ b/examples/pytorch_imagenet_resnet50.py @@ -3,6 +3,7 @@ import torch import argparse import torch.backends.cudnn as cudnn +from torch.autograd import Variable import torch.nn.functional as F import torch.optim as optim import torch.utils.data.distributed @@ -264,22 +265,25 @@ def save_checkpoint(epoch): 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), } - torch.save(state, filepath) + #torch.save(state, filepath) print('Saving PyTorch model to: ' + export_dir) torch.save(state, export_dir + '/' + filepath) - # Save to ONNX model format - # dummy_input = torch.randn(10, 3, 224, 224, device='cuda') - dummy_input = Variable(torch.randn(1, 3, 256, 256, device='cuda')) # one color 256 x 256 picture will be the input to the model - - print('Saving ONNX model to: ' + export_dir) - torch.onnx.export(model, # model being run - dummy_input, # model input (or a tuple for multiple inputs) - export_dir + "/model.onnx", # where to save the model (can be a file or file-like object) - export_params=True) # store the trained parameter weights inside the model file - # opset_version=10, # the ONNX version to export the model to - #do_constant_folding=True, # whether to execute constant folding for optimization - #input_names = ['input'], # the model's input names - #output_names = ['output']) # the model's output names + +def export_model(): + if hvd.rank() == 0: + # Save to ONNX model format + dummy_input = Variable(torch.randn(1, 3, 224, 224, device='cuda')) # one color 224 x 224 picture will be the input to the model + + print('Saving ONNX model to: ' + export_dir) + torch.onnx.export(model, # model being run + dummy_input, # model input (or a tuple for multiple inputs) + export_dir + "/model.onnx", # where to save the model (can be a file or file-like object) + export_params=True) # store the trained parameter weights inside the model file + # opset_version=10, # the ONNX version to export the model to + #do_constant_folding=True, # whether to execute constant folding for optimization + #input_names = ['input'], # the model's input names + #output_names = ['output']) # the model's output names + # Horovod: average metrics from distributed training. @@ -301,3 +305,4 @@ def avg(self): train(epoch) validate(epoch) save_checkpoint(epoch) +export_model() From a8fed1bf8f5b42e4bfc1808c003c245bde5dc966 Mon Sep 17 00:00:00 2001 From: Dillon Erb <585865+dte@users.noreply.github.com> Date: Sun, 2 Feb 2020 17:07:16 -0500 Subject: [PATCH 09/13] Change back to horovod 0.18.2 --- examples/pytorch_imagenet_resnet50.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/examples/pytorch_imagenet_resnet50.py b/examples/pytorch_imagenet_resnet50.py index 63af41152f..db17d89b6d 100644 --- a/examples/pytorch_imagenet_resnet50.py +++ b/examples/pytorch_imagenet_resnet50.py @@ -34,8 +34,6 @@ help='number of batches processed locally before ' 'executing allreduce across workers; it multiplies ' 'total batch size.') -parser.add_argument('--use-adasum', action='store_true', default=False, - help='use adasum algorithm to do reduction') # Default settings from https://arxiv.org/abs/1706.02677. parser.add_argument('--batch-size', type=int, default=32, @@ -131,21 +129,15 @@ # Set up standard ResNet-50 model. model = models.resnet50() -# By default, Adasum doesn't need scaling up learning rate. -# For sum/average with gradient Accumulation: scale learning rate by batches_per_allreduce -lr_scaler = args.batches_per_allreduce * hvd.size() if not args.use_adasum else 1 - if args.cuda: # Move model to GPU. model.cuda() - # If using GPU Adasum allreduce, scale learning rate by local_size. - if args.use_adasum and hvd.nccl_built(): - lr_scaler = args.batches_per_allreduce * hvd.local_size() # Horovod: scale learning rate by the number of GPUs. +# Gradient Accumulation: scale learning rate by batches_per_allreduce optimizer = optim.SGD(model.parameters(), lr=(args.base_lr * - lr_scaler), + args.batches_per_allreduce * hvd.size()), momentum=args.momentum, weight_decay=args.wd) # Horovod: (optional) compression algorithm. @@ -155,8 +147,7 @@ optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression, - backward_passes_per_step=args.batches_per_allreduce, - op=hvd.Adasum if args.use_adasum else hvd.Average) + backward_passes_per_step=args.batches_per_allreduce) # Restore from a previous checkpoint, if initial_epoch is specified. # Horovod: restore on the first worker which will broadcast weights to other workers. @@ -265,10 +256,8 @@ def save_checkpoint(epoch): 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), } - #torch.save(state, filepath) - print('Saving PyTorch model to: ' + export_dir) torch.save(state, export_dir + '/' + filepath) - + def export_model(): if hvd.rank() == 0: # Save to ONNX model format @@ -283,9 +272,7 @@ def export_model(): #do_constant_folding=True, # whether to execute constant folding for optimization #input_names = ['input'], # the model's input names #output_names = ['output']) # the model's output names - - - + # Horovod: average metrics from distributed training. class Metric(object): def __init__(self, name): @@ -301,8 +288,10 @@ def update(self, val): def avg(self): return self.sum / self.n + for epoch in range(resume_from_epoch, args.epochs): train(epoch) validate(epoch) save_checkpoint(epoch) -export_model() +export_model() + From 3f4879e13b7ca05faeaf72152d97ec76ae9481b8 Mon Sep 17 00:00:00 2001 From: Dillon Erb <585865+dte@users.noreply.github.com> Date: Sun, 2 Feb 2020 19:01:31 -0500 Subject: [PATCH 10/13] Create TF2 keras imagenet --- examples/tensorflow2_keras_imagenet.py | 198 +++++++++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 examples/tensorflow2_keras_imagenet.py diff --git a/examples/tensorflow2_keras_imagenet.py b/examples/tensorflow2_keras_imagenet.py new file mode 100644 index 0000000000..cfdff2f80e --- /dev/null +++ b/examples/tensorflow2_keras_imagenet.py @@ -0,0 +1,198 @@ +# +# ResNet-50 model training using Keras and Horovod. +# +# This model is an example of a computation-intensive model that achieves good accuracy on an image +# classification task. It brings together distributed training concepts such as learning rate +# schedule adjustments with a warmup, randomized data reading, and checkpointing on the first worker +# only. +# +# Note: This model uses Keras native ImageDataGenerator and not the sophisticated preprocessing +# pipeline that is typically used to train state-of-the-art ResNet-50 model. This results in ~0.5% +# increase in the top-1 validation error compared to the single-crop top-1 validation error from +# https://github.com/KaimingHe/deep-residual-networks. +# +from __future__ import print_function + +import argparse +# import keras +# from keras import backend as K +# from keras.preprocessing import image +import tensorflow as tf +from tensorflow import keras +from keras import backend as K +from keras.preprocessing import image + +import horovod.keras as hvd +import os + +parser = argparse.ArgumentParser(description='Keras ImageNet Example', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--train-dir', default=os.path.expanduser('~/imagenet/train'), + help='path to training data') +parser.add_argument('--val-dir', default=os.path.expanduser('~/imagenet/validation'), + help='path to validation data') +parser.add_argument('--log-dir', default='./logs', + help='tensorboard log directory') +parser.add_argument('--checkpoint-format', default='./checkpoint-{epoch}.h5', + help='checkpoint file format') +parser.add_argument('--fp16-allreduce', action='store_true', default=False, + help='use fp16 compression during allreduce') + +# Default settings from https://arxiv.org/abs/1706.02677. +parser.add_argument('--batch-size', type=int, default=32, + help='input batch size for training') +parser.add_argument('--val-batch-size', type=int, default=32, + help='input batch size for validation') +parser.add_argument('--epochs', type=int, default=90, + help='number of epochs to train') +parser.add_argument('--base-lr', type=float, default=0.0125, + help='learning rate for a single GPU') +parser.add_argument('--warmup-epochs', type=float, default=5, + help='number of warmup epochs') +parser.add_argument('--momentum', type=float, default=0.9, + help='SGD momentum') +parser.add_argument('--wd', type=float, default=0.00005, + help='weight decay') + +args = parser.parse_args() + +print(tf.test.is_built_with_cuda()) +# data_format = ('channels_first' +# if tf.test.is_built_with_cuda() else 'channels_last') +tf.keras.backend.set_image_data_format('channels_last') + +# Horovod: initialize Horovod. +hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +# config = tf.compat.v1.ConfigProto() +# config.gpu_options.allow_growth = True +# config.gpu_options.visible_device_list = str(hvd.local_rank()) +# K.set_session(tf.compat.v1.Session(config=config)) + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +gpus = tf.config.experimental.list_physical_devices('GPU') +for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) +if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + +# If set > 0, will resume training from a given checkpoint. +resume_from_epoch = 0 +for try_epoch in range(args.epochs, 0, -1): + if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)): + resume_from_epoch = try_epoch + break + +# Horovod: broadcast resume_from_epoch from rank 0 (which will have +# checkpoints) to other ranks. +resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch') + +# Horovod: print logs on the first worker. +verbose = 1 if hvd.rank() == 0 else 0 + +# Training data iterator. +train_gen = image.ImageDataGenerator( + width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5, horizontal_flip=True, + preprocessing_function=keras.applications.resnet50.preprocess_input) + +train_iter = train_gen.flow_from_directory(args.train_dir, + batch_size=args.batch_size, + target_size=(224, 224)) + +# Validation data iterator. +test_gen = image.ImageDataGenerator( + zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input) +test_iter = test_gen.flow_from_directory(args.val_dir, + batch_size=args.val_batch_size, + target_size=(224, 224)) + + +# Set up standard ResNet-50 model. +model = keras.applications.resnet50.ResNet50(weights=None, classes=200) + +# Horovod: (optional) compression algorithm. +compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none + +# Restore from a previous checkpoint, if initial_epoch is specified. +# Horovod: restore on the first worker which will broadcast both model and optimizer weights +# to other workers. +if resume_from_epoch > 0 and hvd.rank() == 0: + model = hvd.load_model(args.checkpoint_format.format(epoch=resume_from_epoch), + compression=compression) +else: + # ResNet-50 model that is included with Keras is optimized for inference. + # Add L2 weight decay & adjust BN settings. + model_config = model.get_config() + for layer, layer_config in zip(model.layers, model_config['layers']): + if hasattr(layer, 'kernel_regularizer'): + regularizer = keras.regularizers.l2(args.wd) + layer_config['config']['kernel_regularizer'] = \ + {'class_name': regularizer.__class__.__name__, + 'config': regularizer.get_config()} + if type(layer) == keras.layers.BatchNormalization: + layer_config['config']['momentum'] = 0.9 + layer_config['config']['epsilon'] = 1e-5 + + model = keras.models.Model.from_config(model_config) + + # Horovod: adjust learning rate based on number of GPUs. + opt = keras.optimizers.SGD(lr=args.base_lr * hvd.size(), + momentum=args.momentum) + + # Horovod: add Horovod Distributed Optimizer. + opt = hvd.DistributedOptimizer(opt, compression=compression) + + model.compile(loss=keras.losses.categorical_crossentropy, + optimizer=opt, + metrics=['accuracy', 'top_k_categorical_accuracy'], + experimental_run_tf_function=False) + +callbacks = [ + # Horovod: broadcast initial variable states from rank 0 to all other processes. + # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + hvd.callbacks.BroadcastGlobalVariablesCallback(0), + + # Horovod: average metrics among workers at the end of every epoch. + # + # Note: This callback must be in the list before the ReduceLROnPlateau, + # TensorBoard, or other metrics-based callbacks. + hvd.callbacks.MetricAverageCallback(), + + # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final + # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during + # the first five epochs. See https://arxiv.org/abs/1706.02677 for details. + hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=args.warmup_epochs, verbose=verbose), + + # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs. + hvd.callbacks.LearningRateScheduleCallback(start_epoch=args.warmup_epochs, end_epoch=30, multiplier=1.), + hvd.callbacks.LearningRateScheduleCallback(start_epoch=30, end_epoch=60, multiplier=1e-1), + hvd.callbacks.LearningRateScheduleCallback(start_epoch=60, end_epoch=80, multiplier=1e-2), + hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3), +] + +# Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them. +if hvd.rank() == 0: + callbacks.append(keras.callbacks.ModelCheckpoint(args.checkpoint_format)) + callbacks.append(keras.callbacks.TensorBoard(args.log_dir)) + +# Train the model. The training will randomly sample 1 / N batches of training data and +# 3 / N batches of validation data on every worker, where N is the number of workers. +# Over-sampling of validation data helps to increase probability that every validation +# example will be evaluated. +model.fit_generator(train_iter, + steps_per_epoch=len(train_iter) // hvd.size(), + callbacks=callbacks, + epochs=args.epochs, + verbose=verbose, + workers=4, + initial_epoch=resume_from_epoch, + validation_data=test_iter, + validation_steps=3 * len(test_iter) // hvd.size()) + +# Evaluate the model on the full data set. +score = hvd.allreduce(model.evaluate_generator(test_iter, len(test_iter), workers=4)) +if verbose: + print('Test loss:', score[0]) + print('Test accuracy:', score[1]) From e47de2c6b63101090bf067052e9ecd1158637d31 Mon Sep 17 00:00:00 2001 From: Dillon Erb <585865+dte@users.noreply.github.com> Date: Sun, 2 Feb 2020 19:11:19 -0500 Subject: [PATCH 11/13] Pass in ENV variable to distinguish ImageNet vs TinyImageNet --- examples/tensorflow2_keras_imagenet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/tensorflow2_keras_imagenet.py b/examples/tensorflow2_keras_imagenet.py index cfdff2f80e..774ce67c7a 100644 --- a/examples/tensorflow2_keras_imagenet.py +++ b/examples/tensorflow2_keras_imagenet.py @@ -109,6 +109,7 @@ # Set up standard ResNet-50 model. +num_classes = int(os.environ.get('IMAGENET_CLASSES', 1000)) model = keras.applications.resnet50.ResNet50(weights=None, classes=200) # Horovod: (optional) compression algorithm. From 8e6d8237e209c9e185b69f6639882445228a837a Mon Sep 17 00:00:00 2001 From: Dillon Erb <585865+dte@users.noreply.github.com> Date: Sun, 2 Feb 2020 19:29:31 -0500 Subject: [PATCH 12/13] Update tensorflow2_keras_imagenet.py --- examples/tensorflow2_keras_imagenet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tensorflow2_keras_imagenet.py b/examples/tensorflow2_keras_imagenet.py index 774ce67c7a..ec2fb37378 100644 --- a/examples/tensorflow2_keras_imagenet.py +++ b/examples/tensorflow2_keras_imagenet.py @@ -110,7 +110,7 @@ # Set up standard ResNet-50 model. num_classes = int(os.environ.get('IMAGENET_CLASSES', 1000)) -model = keras.applications.resnet50.ResNet50(weights=None, classes=200) +model = keras.applications.resnet50.ResNet50(weights=None, classes=num_classes) # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none From 9f9a624d5684b4515d9def6b2fd2e51d498935c7 Mon Sep 17 00:00:00 2001 From: Dillon Erb <585865+dte@users.noreply.github.com> Date: Sun, 2 Feb 2020 19:56:40 -0500 Subject: [PATCH 13/13] Update log and model dir --- examples/tensorflow2_keras_imagenet.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/tensorflow2_keras_imagenet.py b/examples/tensorflow2_keras_imagenet.py index ec2fb37378..7e4971f1c1 100644 --- a/examples/tensorflow2_keras_imagenet.py +++ b/examples/tensorflow2_keras_imagenet.py @@ -33,7 +33,7 @@ help='path to validation data') parser.add_argument('--log-dir', default='./logs', help='tensorboard log directory') -parser.add_argument('--checkpoint-format', default='./checkpoint-{epoch}.h5', +parser.add_argument('--checkpoint-format', default='checkpoint-{epoch}.h5', help='checkpoint file format') parser.add_argument('--fp16-allreduce', action='store_true', default=False, help='use fp16 compression during allreduce') @@ -56,6 +56,8 @@ args = parser.parse_args() +export_dir = os.path.abspath(os.environ.get('PS_MODEL_PATH', os.getcwd() + '/models')) + print(tf.test.is_built_with_cuda()) # data_format = ('channels_first' # if tf.test.is_built_with_cuda() else 'channels_last') @@ -175,8 +177,9 @@ # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them. if hvd.rank() == 0: - callbacks.append(keras.callbacks.ModelCheckpoint(args.checkpoint_format)) - callbacks.append(keras.callbacks.TensorBoard(args.log_dir)) + callbacks.append(keras.callbacks.ModelCheckpoint(export_dir + args.checkpoint_format)) + + callbacks.append(keras.callbacks.TensorBoard(export_dir)) # Train the model. The training will randomly sample 1 / N batches of training data and # 3 / N batches of validation data on every worker, where N is the number of workers.