From a1e0d5b8d8052a01e813b4b894a5226844f75b55 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Tue, 24 Aug 2021 12:21:33 +0800
Subject: [PATCH 1/7] for resnet regression

---
 Classification/cnns/of_cnn_train_val.py | 3 ++-
 Classification/cnns/train.sh            | 6 ++++--
 Classification/cnns/train_fp16.sh       | 8 ++------
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/Classification/cnns/of_cnn_train_val.py b/Classification/cnns/of_cnn_train_val.py
index 3d5cbbd..3258849 100755
--- a/Classification/cnns/of_cnn_train_val.py
+++ b/Classification/cnns/of_cnn_train_val.py
@@ -21,7 +21,8 @@
 import config as configs
 from util import Snapshot, InitNodes, Metric
 from job_function_util import get_train_config, get_val_config
-import resnet_model
+# import resnet_model
+import resnet_rename as resnet_model
 import resnext_model
 import vgg_model
 import alexnet_model
diff --git a/Classification/cnns/train.sh b/Classification/cnns/train.sh
index 6aa2b80..5a78d34 100755
--- a/Classification/cnns/train.sh
+++ b/Classification/cnns/train.sh
@@ -19,6 +19,7 @@ echo DATA_ROOT=$DATA_ROOT
 LOG_FOLDER=../logs
 mkdir -p $LOG_FOLDER
 LOGFILE=$LOG_FOLDER/resnet_training.log
+export PYTHONUNBUFFERED=1
 
 python3 of_cnn_train_val.py \
     --train_data_dir=$DATA_ROOT/train \
@@ -26,15 +27,16 @@ python3 of_cnn_train_val.py \
     --val_data_dir=$DATA_ROOT/validation \
     --val_data_part_num=256 \
     --num_nodes=1 \
-    --gpu_num_per_node=8 \
+    --gpu_num_per_node=4 \
     --optimizer="sgd" \
     --momentum=0.875 \
     --label_smoothing=0.1 \
     --learning_rate=1.024 \
     --loss_print_every_n_iter=100 \
-    --batch_size_per_device=128 \
+    --batch_size_per_device=32 \
     --val_batch_size_per_device=50 \
     --num_epoch=$NUM_EPOCH \
     --model="resnet50" 2>&1 | tee ${LOGFILE}
+    #--model="resnet50" 2>&1 | tee ${LOGFILE}
 
 echo "Writting log to ${LOGFILE}"
diff --git a/Classification/cnns/train_fp16.sh b/Classification/cnns/train_fp16.sh
index 7ecfa5c..0c59ef0 100755
--- a/Classification/cnns/train_fp16.sh
+++ b/Classification/cnns/train_fp16.sh
@@ -26,18 +26,14 @@ export NCCL_LAUNCH_MODE=PARALLEL
 echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
 
 python3 of_cnn_train_val.py \
-    --train_data_dir=$DATA_ROOT/train \
-    --train_data_part_num=256 \
-    --val_data_dir=$DATA_ROOT/validation \
-    --val_data_part_num=256 \
     --num_nodes=1 \
-    --gpu_num_per_node=8 \
+    --gpu_num_per_node=1 \
     --optimizer="sgd" \
     --momentum=0.875 \
     --label_smoothing=0.1 \
     --learning_rate=1.536 \
     --loss_print_every_n_iter=100 \
-    --batch_size_per_device=192 \
+    --batch_size_per_device=64 \
     --val_batch_size_per_device=50 \
     --use_fp16 \
     --channel_last=True \

From 2632d98411449111b560b040c2112b7d26baf123 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Tue, 24 Aug 2021 14:41:31 +0800
Subject: [PATCH 2/7] var renamed resnet

---
 Classification/cnns/resnet_rename.py | 261 +++++++++++++++++++++++++++
 1 file changed, 261 insertions(+)
 create mode 100644 Classification/cnns/resnet_rename.py

diff --git a/Classification/cnns/resnet_rename.py b/Classification/cnns/resnet_rename.py
new file mode 100644
index 0000000..117dad4
--- /dev/null
+++ b/Classification/cnns/resnet_rename.py
@@ -0,0 +1,261 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import oneflow.compatible.single_client as flow + +BLOCK_COUNTS = [3, 4, 6, 3] +BLOCK_FILTERS = [256, 512, 1024, 2048] +BLOCK_FILTERS_INNER = [64, 128, 256, 512] + + +class ResnetBuilder(object): + def __init__( + self, + weight_regularizer, + trainable=True, + training=True, + channel_last=False, + fuse_bn_relu=True, + fuse_bn_add_relu=True, + ): + self.data_format = "NHWC" if channel_last else "NCHW" + self.weight_initializer = flow.variance_scaling_initializer( + 2, "fan_in", "random_normal", data_format=self.data_format + ) + self.weight_regularizer = weight_regularizer + self.trainable = trainable + self.training = training + self.fuse_bn_relu = fuse_bn_relu + self.fuse_bn_add_relu = fuse_bn_add_relu + + def _conv2d( + self, name, input, filters, kernel_size, strides=1, padding="SAME", dilations=1, + ): + # There are different shapes of weight metric between 'NCHW' and 'NHWC' mode + if self.data_format == "NHWC": + shape = (filters, kernel_size, kernel_size, input.shape[3]) + else: + shape = (filters, input.shape[1], kernel_size, kernel_size) + weight = flow.get_variable( + name + ".weight", + shape=shape, + dtype=input.dtype, + initializer=self.weight_initializer, + regularizer=self.weight_regularizer, + model_name="weight", + trainable=self.trainable, + ) + + return flow.nn.conv2d( + input, + weight, + strides, + padding, + None, + self.data_format, + dilations, + name=name, + ) + + def _batch_norm(self, inputs, name=None, last=False): + initializer = flow.zeros_initializer() if last else flow.ones_initializer() + axis = 1 + if self.data_format == "NHWC": + axis = 3 + return flow.layers.batch_normalization( + inputs=inputs, + axis=axis, + momentum=0.9, # 97, + epsilon=1e-5, + center=True, + scale=True, + trainable=self.trainable, + training=self.training, + gamma_initializer=initializer, + moving_variance_initializer=initializer, + gamma_regularizer=self.weight_regularizer, + beta_regularizer=self.weight_regularizer, + name=name, + ) + + def _batch_norm_relu(self, inputs, name=None, last=False): + # if self.fuse_bn_relu: + # initializer = flow.zeros_initializer() if last else flow.ones_initializer() + # axis = 1 + # if self.data_format == "NHWC": + # axis = 3 + # return flow.layers.batch_normalization_relu( + # inputs=inputs, + # axis=axis, + # momentum=0.9, + # epsilon=1e-5, + # center=True, + # scale=True, + # trainable=self.trainable, + # training=self.training, + # gamma_initializer=initializer, + # moving_variance_initializer=initializer, + # gamma_regularizer=self.weight_regularizer, + # beta_regularizer=self.weight_regularizer, + # name=name + "_bn_relu", + # ) + # else: + # return flow.nn.relu(self._batch_norm(inputs, name + "_bn", last=last)) + return flow.nn.relu(self._batch_norm(inputs, name, last=last)) + + def _batch_norm_add_relu(self, inputs, addend, name=None, last=False): + # if self.fuse_bn_add_relu: + # initializer = flow.zeros_initializer() if last else flow.ones_initializer() + # axis = 1 + # if self.data_format == "NHWC": + # axis = 3 + # return flow.layers.batch_normalization_add_relu( + # inputs=inputs, + # addend=addend, + # axis=axis, + # momentum=0.9, + # 
epsilon=1e-5, + # center=True, + # scale=True, + # trainable=self.trainable, + # training=self.training, + # gamma_initializer=initializer, + # moving_variance_initializer=initializer, + # gamma_regularizer=self.weight_regularizer, + # beta_regularizer=self.weight_regularizer, + # name=name + "_bn_add_relu", + # ) + # else: + return flow.nn.relu( + self._batch_norm(inputs, name, last=last) + addend + ) + + def conv2d_affine(self, input, name, filters, kernel_size, strides): + padding = "SAME" if strides > 1 or kernel_size > 1 else "VALID" + output = self._conv2d(name, input, filters, kernel_size, strides, padding) + return output + + def bottleneck_transformation( + self, input, block_name, filters, filters_inner, strides + ): + a = self.conv2d_affine(input, block_name + ".conv1", filters_inner, 1, 1) + a = self._batch_norm_relu(a, block_name + ".bn1") + + b = self.conv2d_affine(a, block_name + ".conv2", filters_inner, 3, strides) + b = self._batch_norm_relu(b, block_name + ".bn2") + + c = self.conv2d_affine(b, block_name + ".conv3", filters, 1, 1) + return c + + def residual_block(self, input, block_name, filters, filters_inner, strides_init): + if strides_init != 1 or block_name == "layer1.0": + shortcut = self.conv2d_affine( + input, block_name + ".downsample.0", filters, 1, strides_init + ) + shortcut = self._batch_norm(shortcut, block_name + ".downsample.1") + else: + shortcut = input + + bottleneck = self.bottleneck_transformation( + input, block_name, filters, filters_inner, strides_init, + ) + return self._batch_norm_add_relu( + bottleneck, shortcut, block_name + ".bn3", last=True + ) + + def residual_stage( + self, input, stage_name, counts, filters, filters_inner, stride_init=2 + ): + output = input + for i in range(counts): + block_name = "%s.%d" % (stage_name, i) + output = self.residual_block( + output, block_name, filters, filters_inner, stride_init if i == 0 else 1 + ) + + return output + + def resnet_conv_x_body(self, input): + output = input + for i, (counts, filters, filters_inner) in enumerate( + zip(BLOCK_COUNTS, BLOCK_FILTERS, BLOCK_FILTERS_INNER) + ): + stage_name = "layer%d" % (i + 1) + output = self.residual_stage( + output, stage_name, counts, filters, filters_inner, 1 if i == 0 else 2 + ) + + return output + + def resnet_stem(self, input): + conv1 = self._conv2d("conv1", input, 64, 7, 2) + conv1_bn = self._batch_norm_relu(conv1, "bn1") + pool1 = flow.nn.max_pool2d( + conv1_bn, + ksize=3, + strides=2, + padding="SAME", + data_format=self.data_format, + name="pool1", + ) + return pool1 + + +def resnet50(images, args, trainable=True, training=True): + weight_regularizer = ( + flow.regularizers.l2(args.wd) if args.wd > 0.0 and args.wd < 1.0 else None + ) + builder = ResnetBuilder( + weight_regularizer, + trainable, + training, + args.channel_last, + args.fuse_bn_relu, + args.fuse_bn_add_relu, + ) + + # if args.pad_output: + # if args.channel_last: + # paddings = ((0, 0), (0, 0), (0, 0), (0, 1)) + # else: + # paddings = ((0, 0), (0, 1), (0, 0), (0, 0)) + # images = flow.pad(images, paddings=paddings) + # with flow.scope.namespace("resnet50"): + stem = builder.resnet_stem(images) + body = builder.resnet_conv_x_body(stem) + pool5 = flow.nn.avg_pool2d( + body, + ksize=7, + strides=1, + padding="VALID", + data_format=builder.data_format, + name="avgpool", + ) + fc1001 = flow.layers.dense( + flow.reshape(pool5, (pool5.shape[0], -1)), + units=1000, + use_bias=True, + kernel_initializer=flow.variance_scaling_initializer( + 2, "fan_in", "random_normal" + ), + 
bias_initializer=flow.zeros_initializer(),
+        kernel_regularizer=weight_regularizer,
+        bias_regularizer=weight_regularizer,
+        trainable=trainable,
+        name="fc",
+    )
+    return fc1001
+

From 55921dbc35eea9b9a1648dc137d62a2e9ca9710b Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Thu, 26 Aug 2021 14:42:11 +0800
Subject: [PATCH 3/7] compare with eager

---
 Classification/cnns/align.sh            | 56 +++++++++++++++++++++++++
 Classification/cnns/of_cnn_train_val.py | 18 ++++++--
 Classification/cnns/optimizer_util.py   |  7 +++-
 Classification/cnns/resnet_rename.py    |  4 ++
 Classification/cnns/util.py             | 30 ++++++++++++-
 5 files changed, 109 insertions(+), 6 deletions(-)
 create mode 100755 Classification/cnns/align.sh

diff --git a/Classification/cnns/align.sh b/Classification/cnns/align.sh
new file mode 100755
index 0000000..ca6cc06
--- /dev/null
+++ b/Classification/cnns/align.sh
@@ -0,0 +1,56 @@
+rm -rf core.*
+rm -rf ./output/snapshots/*
+
+if [ -n "$1" ]; then
+    NUM_EPOCH=$1
+else
+    NUM_EPOCH=50
+fi
+echo NUM_EPOCH=$NUM_EPOCH
+
+# training with imagenet
+if [ -n "$2" ]; then
+    DATA_ROOT=$2
+else
+    DATA_ROOT=/dataset/ImageNet/ofrecord
+fi
+echo DATA_ROOT=$DATA_ROOT
+
+LOG_FOLDER=../logs
+mkdir -p $LOG_FOLDER
+LOGFILE=$LOG_FOLDER/resnet_training.log
+
+export PYTHONUNBUFFERED=1
+echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
+export NCCL_LAUNCH_MODE=PARALLEL
+echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
+
+    #--momentum=0.875 \
+python3 of_cnn_train_val.py \
+    --train_data_dir=$DATA_ROOT/train \
+    --train_data_part_num=256 \
+    --val_data_dir=$DATA_ROOT/validation \
+    --val_data_part_num=256 \
+    --num_nodes=1 \
+    --model_load_dir=/ssd/xiexuan/models/resnet50/init_ckpt \
+    --gpu_num_per_node=1 \
+    --optimizer="sgd" \
+    --momentum=0.0 \
+    --lr_decay="none" \
+    --label_smoothing=0.1 \
+    --learning_rate=0.1 \
+    --loss_print_every_n_iter=1 \
+    --batch_size_per_device=64 \
+    --val_batch_size_per_device=64 \
+    --channel_last=False \
+    --pad_output \
+    --fuse_bn_relu=True \
+    --fuse_bn_add_relu=True \
+    --nccl_fusion_threshold_mb=16 \
+    --nccl_fusion_max_ops=24 \
+    --gpu_image_decoder=True \
+    --num_epoch=$NUM_EPOCH \
+    --model="resnet50" 2>&1 | tee ${LOGFILE}
+    # --use_fp16 \
+
+echo "Writting log to ${LOGFILE}"
diff --git a/Classification/cnns/of_cnn_train_val.py b/Classification/cnns/of_cnn_train_val.py
index 3258849..6ea6864 100755
--- a/Classification/cnns/of_cnn_train_val.py
+++ b/Classification/cnns/of_cnn_train_val.py
@@ -28,6 +28,7 @@ import alexnet_model
 import inception_model
 import mobilenet_v2_model
+from util import build_watch_cb, build_watch_diff_cb
 
 parser = configs.get_parser()
 args = parser.parse_args()
@@ -52,7 +53,7 @@
 
 flow.config.gpu_device_num(args.gpu_num_per_node)
-# flow.config.enable_debug_mode(True)
+flow.config.enable_debug_mode(True)
 if args.use_fp16 and args.num_nodes * args.gpu_num_per_node > 1:
     flow.config.collective_boxing.nccl_fusion_all_reduce_use_buffer(False)
@@ -85,12 +86,15 @@ def TrainNet():
     if args.train_data_dir:
         assert os.path.exists(args.train_data_dir)
         print("Loading data from {}".format(args.train_data_dir))
-        (labels, images) = ofrecord_util.load_imagenet_for_training(args)
+        #(labels, images) = ofrecord_util.load_imagenet_for_training(args)
+        (labels, images) = ofrecord_util.load_imagenet_for_validation(args)
     else:
         print("Loading synthetic data.")
         (labels, images) = ofrecord_util.load_synthetic(args)
 
     logits = model_dict[args.model](images, args)
+    flow.watch(logits, build_watch_cb('logits'))
+    flow.watch_diff(logits, build_watch_diff_cb('logits_grad'))
     if args.label_smoothing > 0:
one_hot_labels = label_smoothing( labels, args.num_classes, args.label_smoothing, logits.dtype @@ -105,7 +109,7 @@ def TrainNet(): loss = flow.math.reduce_mean(loss) predictions = flow.nn.softmax(logits) - outputs = {"loss": loss, "predictions": predictions, "labels": labels} + outputs = {"loss": loss, "predictions": predictions, "labels": labels, 'images': images, 'logits': logits} # set up warmup,learning rate and optimizer optimizer_util.set_up_optimizer(loss, args) @@ -145,7 +149,13 @@ def main(): loss_key="loss", ) for i in range(epoch_size): - TrainNet().async_get(metric.metric_cb(epoch, i)) + # TrainNet().async_get(metric.metric_cb(epoch, i)) + a = TrainNet().get() + snapshot.save("epoch_{}_iter{}".format(epoch, i)) + print(a['loss'].numpy()) + if i>=1: + break + break if args.val_data_dir: metric = Metric( diff --git a/Classification/cnns/optimizer_util.py b/Classification/cnns/optimizer_util.py index 43cd977..a5f4193 100755 --- a/Classification/cnns/optimizer_util.py +++ b/Classification/cnns/optimizer_util.py @@ -116,6 +116,11 @@ def set_up_optimizer(loss, args): staircase=False, warmup=warmup, ) + elif args.lr_decay == "none": + lr_scheduler = flow.optimizer.PiecewiseConstantScheduler( + boundaries=[], + values=[args.learning_rate], + ) else: lr_scheduler = flow.optimizer.PiecewiseScalingScheduler( base_lr=args.learning_rate, @@ -134,7 +139,7 @@ def set_up_optimizer(loss, args): print("Optimizer: SGD") flow.optimizer.SGD( lr_scheduler, - momentum=args.momentum if args.momentum > 0 else None, + momentum=args.momentum if args.momentum > 0 else 0.0, grad_clipping=grad_clipping, loss_scale_policy=loss_scale_policy, ).minimize(loss) diff --git a/Classification/cnns/resnet_rename.py b/Classification/cnns/resnet_rename.py index 117dad4..eac8c7a 100644 --- a/Classification/cnns/resnet_rename.py +++ b/Classification/cnns/resnet_rename.py @@ -15,6 +15,7 @@ """ import oneflow.compatible.single_client as flow +from util import build_watch_cb, build_watch_diff_cb BLOCK_COUNTS = [3, 4, 6, 3] BLOCK_FILTERS = [256, 512, 1024, 2048] @@ -58,6 +59,9 @@ def _conv2d( model_name="weight", trainable=self.trainable, ) + if 'conv1' == name: + flow.watch(weight, build_watch_cb('conv1_weight')) + flow.watch_diff(weight, build_watch_diff_cb('conv1_weight_grad')) return flow.nn.conv2d( input, diff --git a/Classification/cnns/util.py b/Classification/cnns/util.py index 6b7ce9a..c865cec 100755 --- a/Classification/cnns/util.py +++ b/Classification/cnns/util.py @@ -41,7 +41,8 @@ def __init__(self, model_save_dir, model_load_dir): if model_load_dir: assert os.path.isdir(model_load_dir) print("Restoring model from {}.".format(model_load_dir)) - flow.load_variables(flow.checkpoint.get(model_load_dir)) + flow.load_variables(flow.checkpoint.get(model_load_dir), ignore_mismatch=False) + # flow.checkpoint.save('loaded_init_ckpt') else: # flow.checkpoint.save("initial_model") print("Init model on demand.") @@ -84,6 +85,15 @@ def match_top_k(predictions, labels, top_k=1): return num_matched, match_array.shape[0] +def dump_outputs(outputs, step, dump_dir='output'): + for k, v in outputs.items(): + root = os.path.join(dump_dir, str(step)) + if not os.path.isdir(root): + os.makedirs(root) + path = os.path.join(root, k) + np.save(path, v.numpy()) + + class Metric(object): def __init__( self, @@ -142,6 +152,7 @@ def callback(outputs): self.num_samples += num_samples if (step + 1) % self.calculate_batches == 0: + dump_outputs(outputs, step) throughput = self.num_samples / self.timer.split() if self.prediction_key: 
top_1_accuracy = self.top_1_num_matched / self.num_samples @@ -180,3 +191,20 @@ def callback(outputs): self._clear() return callback + + +from oneflow.compatible.single_client import typing as tp + +def build_watch_cb(name, iter=0, root='output'): + path = os.path.join(root, str(iter), f'{name}.npy') + def cb(blob: tp.Numpy): + np.save(path, blob) + return cb + + +def build_watch_diff_cb(name, iter=0, root='output'): + path = os.path.join(root, str(iter), f'{name}_grad.npy') + def cb(blob: tp.Numpy): + np.save(path, blob) + return cb + From d8a281d0bb7e83b3e16f31bb17f32e374d8c2f56 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 26 Aug 2021 14:52:13 +0800 Subject: [PATCH 4/7] train.sh --- Classification/cnns/train.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Classification/cnns/train.sh b/Classification/cnns/train.sh index 5a78d34..5444cf4 100755 --- a/Classification/cnns/train.sh +++ b/Classification/cnns/train.sh @@ -19,7 +19,6 @@ echo DATA_ROOT=$DATA_ROOT LOG_FOLDER=../logs mkdir -p $LOG_FOLDER LOGFILE=$LOG_FOLDER/resnet_training.log -export PYTHONUNBUFFERED=1 python3 of_cnn_train_val.py \ --train_data_dir=$DATA_ROOT/train \ @@ -27,7 +26,7 @@ python3 of_cnn_train_val.py \ --val_data_dir=$DATA_ROOT/validation \ --val_data_part_num=256 \ --num_nodes=1 \ - --gpu_num_per_node=4 \ + --gpu_num_per_node=8 \ --optimizer="sgd" \ --momentum=0.875 \ --label_smoothing=0.1 \ @@ -37,6 +36,5 @@ python3 of_cnn_train_val.py \ --val_batch_size_per_device=50 \ --num_epoch=$NUM_EPOCH \ --model="resnet50" 2>&1 | tee ${LOGFILE} - #--model="resnet50" 2>&1 | tee ${LOGFILE} echo "Writting log to ${LOGFILE}" From 4a4cbda064a52491aa4a028aab1f4aef67d5fc7d Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 26 Aug 2021 14:55:23 +0800 Subject: [PATCH 5/7] manual modify --- Classification/cnns/train.sh | 2 +- Classification/cnns/train_fp16.sh | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Classification/cnns/train.sh b/Classification/cnns/train.sh index 5444cf4..6aa2b80 100755 --- a/Classification/cnns/train.sh +++ b/Classification/cnns/train.sh @@ -32,7 +32,7 @@ python3 of_cnn_train_val.py \ --label_smoothing=0.1 \ --learning_rate=1.024 \ --loss_print_every_n_iter=100 \ - --batch_size_per_device=32 \ + --batch_size_per_device=128 \ --val_batch_size_per_device=50 \ --num_epoch=$NUM_EPOCH \ --model="resnet50" 2>&1 | tee ${LOGFILE} diff --git a/Classification/cnns/train_fp16.sh b/Classification/cnns/train_fp16.sh index 0c59ef0..7ecfa5c 100755 --- a/Classification/cnns/train_fp16.sh +++ b/Classification/cnns/train_fp16.sh @@ -26,14 +26,18 @@ export NCCL_LAUNCH_MODE=PARALLEL echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE python3 of_cnn_train_val.py \ + --train_data_dir=$DATA_ROOT/train \ + --train_data_part_num=256 \ + --val_data_dir=$DATA_ROOT/validation \ + --val_data_part_num=256 \ --num_nodes=1 \ - --gpu_num_per_node=1 \ + --gpu_num_per_node=8 \ --optimizer="sgd" \ --momentum=0.875 \ --label_smoothing=0.1 \ --learning_rate=1.536 \ --loss_print_every_n_iter=100 \ - --batch_size_per_device=64 \ + --batch_size_per_device=192 \ --val_batch_size_per_device=50 \ --use_fp16 \ --channel_last=True \ From b95528d31131993edc6b6aba5829771f8fbb2150 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 26 Aug 2021 16:10:13 +0800 Subject: [PATCH 6/7] fix conv algo --- Classification/cnns/align.sh | 6 +- Classification/cnns/job_function_util.py | 3 + Classification/cnns/resnet_rename.py | 107 +++++++++++------------ 3 files changed, 59 insertions(+), 57 deletions(-) diff --git 
a/Classification/cnns/align.sh b/Classification/cnns/align.sh index ca6cc06..88d2273 100755 --- a/Classification/cnns/align.sh +++ b/Classification/cnns/align.sh @@ -43,14 +43,14 @@ python3 of_cnn_train_val.py \ --batch_size_per_device=64 \ --val_batch_size_per_device=64 \ --channel_last=False \ - --pad_output \ - --fuse_bn_relu=True \ - --fuse_bn_add_relu=True \ + --fuse_bn_relu=False \ + --fuse_bn_add_relu=False \ --nccl_fusion_threshold_mb=16 \ --nccl_fusion_max_ops=24 \ --gpu_image_decoder=True \ --num_epoch=$NUM_EPOCH \ --model="resnet50" 2>&1 | tee ${LOGFILE} # --use_fp16 \ + #--pad_output \ echo "Writting log to ${LOGFILE}" diff --git a/Classification/cnns/job_function_util.py b/Classification/cnns/job_function_util.py index c651e95..3eaafba 100755 --- a/Classification/cnns/job_function_util.py +++ b/Classification/cnns/job_function_util.py @@ -26,6 +26,9 @@ def _default_config(args): if args.use_xla: config.use_xla_jit(True) config.enable_fuse_add_to_output(True) + config.cudnn_conv_force_fwd_algo(0) + config.cudnn_conv_force_bwd_data_algo(1) + config.cudnn_conv_force_bwd_filter_algo(1) return config diff --git a/Classification/cnns/resnet_rename.py b/Classification/cnns/resnet_rename.py index eac8c7a..784f924 100644 --- a/Classification/cnns/resnet_rename.py +++ b/Classification/cnns/resnet_rename.py @@ -96,56 +96,55 @@ def _batch_norm(self, inputs, name=None, last=False): ) def _batch_norm_relu(self, inputs, name=None, last=False): - # if self.fuse_bn_relu: - # initializer = flow.zeros_initializer() if last else flow.ones_initializer() - # axis = 1 - # if self.data_format == "NHWC": - # axis = 3 - # return flow.layers.batch_normalization_relu( - # inputs=inputs, - # axis=axis, - # momentum=0.9, - # epsilon=1e-5, - # center=True, - # scale=True, - # trainable=self.trainable, - # training=self.training, - # gamma_initializer=initializer, - # moving_variance_initializer=initializer, - # gamma_regularizer=self.weight_regularizer, - # beta_regularizer=self.weight_regularizer, - # name=name + "_bn_relu", - # ) - # else: - # return flow.nn.relu(self._batch_norm(inputs, name + "_bn", last=last)) - return flow.nn.relu(self._batch_norm(inputs, name, last=last)) + if self.fuse_bn_relu: + initializer = flow.zeros_initializer() if last else flow.ones_initializer() + axis = 1 + if self.data_format == "NHWC": + axis = 3 + return flow.layers.batch_normalization_relu( + inputs=inputs, + axis=axis, + momentum=0.9, + epsilon=1e-5, + center=True, + scale=True, + trainable=self.trainable, + training=self.training, + gamma_initializer=initializer, + moving_variance_initializer=initializer, + gamma_regularizer=self.weight_regularizer, + beta_regularizer=self.weight_regularizer, + name=name + "_bn_relu", + ) + else: + return flow.nn.relu(self._batch_norm(inputs, name, last=last)) def _batch_norm_add_relu(self, inputs, addend, name=None, last=False): - # if self.fuse_bn_add_relu: - # initializer = flow.zeros_initializer() if last else flow.ones_initializer() - # axis = 1 - # if self.data_format == "NHWC": - # axis = 3 - # return flow.layers.batch_normalization_add_relu( - # inputs=inputs, - # addend=addend, - # axis=axis, - # momentum=0.9, - # epsilon=1e-5, - # center=True, - # scale=True, - # trainable=self.trainable, - # training=self.training, - # gamma_initializer=initializer, - # moving_variance_initializer=initializer, - # gamma_regularizer=self.weight_regularizer, - # beta_regularizer=self.weight_regularizer, - # name=name + "_bn_add_relu", - # ) - # else: - return flow.nn.relu( - 
self._batch_norm(inputs, name, last=last) + addend - ) + if self.fuse_bn_add_relu: + initializer = flow.zeros_initializer() if last else flow.ones_initializer() + axis = 1 + if self.data_format == "NHWC": + axis = 3 + return flow.layers.batch_normalization_add_relu( + inputs=inputs, + addend=addend, + axis=axis, + momentum=0.9, + epsilon=1e-5, + center=True, + scale=True, + trainable=self.trainable, + training=self.training, + gamma_initializer=initializer, + moving_variance_initializer=initializer, + gamma_regularizer=self.weight_regularizer, + beta_regularizer=self.weight_regularizer, + name=name + "_bn_add_relu", + ) + else: + return flow.nn.relu( + self._batch_norm(inputs, name, last=last) + addend + ) def conv2d_affine(self, input, name, filters, kernel_size, strides): padding = "SAME" if strides > 1 or kernel_size > 1 else "VALID" @@ -231,12 +230,12 @@ def resnet50(images, args, trainable=True, training=True): args.fuse_bn_add_relu, ) - # if args.pad_output: - # if args.channel_last: - # paddings = ((0, 0), (0, 0), (0, 0), (0, 1)) - # else: - # paddings = ((0, 0), (0, 1), (0, 0), (0, 0)) - # images = flow.pad(images, paddings=paddings) + if args.pad_output: + if args.channel_last: + paddings = ((0, 0), (0, 0), (0, 0), (0, 1)) + else: + paddings = ((0, 0), (0, 1), (0, 0), (0, 0)) + images = flow.pad(images, paddings=paddings) # with flow.scope.namespace("resnet50"): stem = builder.resnet_stem(images) body = builder.resnet_conv_x_body(stem) From 708e293ccf851fc45e5790b1830186247a3a6a1f Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 26 Aug 2021 16:16:50 +0800 Subject: [PATCH 7/7] rm rename.py --- Classification/cnns/of_cnn_train_val.py | 9 +- Classification/cnns/resnet_model.py | 83 ++++---- Classification/cnns/resnet_rename.py | 264 ------------------------ 3 files changed, 48 insertions(+), 308 deletions(-) delete mode 100644 Classification/cnns/resnet_rename.py diff --git a/Classification/cnns/of_cnn_train_val.py b/Classification/cnns/of_cnn_train_val.py index 6ea6864..440fbf9 100755 --- a/Classification/cnns/of_cnn_train_val.py +++ b/Classification/cnns/of_cnn_train_val.py @@ -21,8 +21,7 @@ import config as configs from util import Snapshot, InitNodes, Metric from job_function_util import get_train_config, get_val_config -# import resnet_model -import resnet_rename as resnet_model +import resnet_model import resnext_model import vgg_model import alexnet_model @@ -151,9 +150,9 @@ def main(): for i in range(epoch_size): # TrainNet().async_get(metric.metric_cb(epoch, i)) a = TrainNet().get() - snapshot.save("epoch_{}_iter{}".format(epoch, i)) - print(a['loss'].numpy()) - if i>=1: + # snapshot.save("epoch_{}_iter{}".format(epoch, i)) + print('loss:', a['loss'].numpy()) + if i>=100: break break diff --git a/Classification/cnns/resnet_model.py b/Classification/cnns/resnet_model.py index 7e9c1fc..784f924 100755 --- a/Classification/cnns/resnet_model.py +++ b/Classification/cnns/resnet_model.py @@ -15,6 +15,7 @@ """ import oneflow.compatible.single_client as flow +from util import build_watch_cb, build_watch_diff_cb BLOCK_COUNTS = [3, 4, 6, 3] BLOCK_FILTERS = [256, 512, 1024, 2048] @@ -50,7 +51,7 @@ def _conv2d( else: shape = (filters, input.shape[1], kernel_size, kernel_size) weight = flow.get_variable( - name + "-weight", + name + ".weight", shape=shape, dtype=input.dtype, initializer=self.weight_initializer, @@ -58,6 +59,9 @@ def _conv2d( model_name="weight", trainable=self.trainable, ) + if 'conv1' == name: + flow.watch(weight, build_watch_cb('conv1_weight')) + 
flow.watch_diff(weight, build_watch_diff_cb('conv1_weight_grad')) return flow.nn.conv2d( input, @@ -113,7 +117,7 @@ def _batch_norm_relu(self, inputs, name=None, last=False): name=name + "_bn_relu", ) else: - return flow.nn.relu(self._batch_norm(inputs, name + "_bn", last=last)) + return flow.nn.relu(self._batch_norm(inputs, name, last=last)) def _batch_norm_add_relu(self, inputs, addend, name=None, last=False): if self.fuse_bn_add_relu: @@ -139,7 +143,7 @@ def _batch_norm_add_relu(self, inputs, addend, name=None, last=False): ) else: return flow.nn.relu( - self._batch_norm(inputs, name + "_bn", last=last) + addend + self._batch_norm(inputs, name, last=last) + addend ) def conv2d_affine(self, input, name, filters, kernel_size, strides): @@ -150,21 +154,21 @@ def conv2d_affine(self, input, name, filters, kernel_size, strides): def bottleneck_transformation( self, input, block_name, filters, filters_inner, strides ): - a = self.conv2d_affine(input, block_name + "_branch2a", filters_inner, 1, 1) - a = self._batch_norm_relu(a, block_name + "_branch2a") + a = self.conv2d_affine(input, block_name + ".conv1", filters_inner, 1, 1) + a = self._batch_norm_relu(a, block_name + ".bn1") - b = self.conv2d_affine(a, block_name + "_branch2b", filters_inner, 3, strides) - b = self._batch_norm_relu(b, block_name + "_branch2b") + b = self.conv2d_affine(a, block_name + ".conv2", filters_inner, 3, strides) + b = self._batch_norm_relu(b, block_name + ".bn2") - c = self.conv2d_affine(b, block_name + "_branch2c", filters, 1, 1) + c = self.conv2d_affine(b, block_name + ".conv3", filters, 1, 1) return c def residual_block(self, input, block_name, filters, filters_inner, strides_init): - if strides_init != 1 or block_name == "res2_0": + if strides_init != 1 or block_name == "layer1.0": shortcut = self.conv2d_affine( - input, block_name + "_branch1", filters, 1, strides_init + input, block_name + ".downsample.0", filters, 1, strides_init ) - shortcut = self._batch_norm(shortcut, block_name + "_branch1_bn") + shortcut = self._batch_norm(shortcut, block_name + ".downsample.1") else: shortcut = input @@ -172,7 +176,7 @@ def residual_block(self, input, block_name, filters, filters_inner, strides_init input, block_name, filters, filters_inner, strides_init, ) return self._batch_norm_add_relu( - bottleneck, shortcut, block_name + "_branch2c", last=True + bottleneck, shortcut, block_name + ".bn3", last=True ) def residual_stage( @@ -180,7 +184,7 @@ def residual_stage( ): output = input for i in range(counts): - block_name = "%s_%d" % (stage_name, i) + block_name = "%s.%d" % (stage_name, i) output = self.residual_block( output, block_name, filters, filters_inner, stride_init if i == 0 else 1 ) @@ -192,7 +196,7 @@ def resnet_conv_x_body(self, input): for i, (counts, filters, filters_inner) in enumerate( zip(BLOCK_COUNTS, BLOCK_FILTERS, BLOCK_FILTERS_INNER) ): - stage_name = "res%d" % (i + 2) + stage_name = "layer%d" % (i + 1) output = self.residual_stage( output, stage_name, counts, filters, filters_inner, 1 if i == 0 else 2 ) @@ -201,7 +205,7 @@ def resnet_conv_x_body(self, input): def resnet_stem(self, input): conv1 = self._conv2d("conv1", input, 64, 7, 2) - conv1_bn = self._batch_norm_relu(conv1, "conv1") + conv1_bn = self._batch_norm_relu(conv1, "bn1") pool1 = flow.nn.max_pool2d( conv1_bn, ksize=3, @@ -232,28 +236,29 @@ def resnet50(images, args, trainable=True, training=True): else: paddings = ((0, 0), (0, 1), (0, 0), (0, 0)) images = flow.pad(images, paddings=paddings) - with flow.scope.namespace("Resnet"): - stem = 
builder.resnet_stem(images) - body = builder.resnet_conv_x_body(stem) - pool5 = flow.nn.avg_pool2d( - body, - ksize=7, - strides=1, - padding="VALID", - data_format=builder.data_format, - name="pool5", - ) - fc1001 = flow.layers.dense( - flow.reshape(pool5, (pool5.shape[0], -1)), - units=1000, - use_bias=True, - kernel_initializer=flow.variance_scaling_initializer( - 2, "fan_in", "random_normal" - ), - bias_initializer=flow.zeros_initializer(), - kernel_regularizer=weight_regularizer, - bias_regularizer=weight_regularizer, - trainable=trainable, - name="fc1001", - ) + # with flow.scope.namespace("resnet50"): + stem = builder.resnet_stem(images) + body = builder.resnet_conv_x_body(stem) + pool5 = flow.nn.avg_pool2d( + body, + ksize=7, + strides=1, + padding="VALID", + data_format=builder.data_format, + name="avgpool", + ) + fc1001 = flow.layers.dense( + flow.reshape(pool5, (pool5.shape[0], -1)), + units=1000, + use_bias=True, + kernel_initializer=flow.variance_scaling_initializer( + 2, "fan_in", "random_normal" + ), + bias_initializer=flow.zeros_initializer(), + kernel_regularizer=weight_regularizer, + bias_regularizer=weight_regularizer, + trainable=trainable, + name="fc", + ) return fc1001 + diff --git a/Classification/cnns/resnet_rename.py b/Classification/cnns/resnet_rename.py deleted file mode 100644 index 784f924..0000000 --- a/Classification/cnns/resnet_rename.py +++ /dev/null @@ -1,264 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" - -import oneflow.compatible.single_client as flow -from util import build_watch_cb, build_watch_diff_cb - -BLOCK_COUNTS = [3, 4, 6, 3] -BLOCK_FILTERS = [256, 512, 1024, 2048] -BLOCK_FILTERS_INNER = [64, 128, 256, 512] - - -class ResnetBuilder(object): - def __init__( - self, - weight_regularizer, - trainable=True, - training=True, - channel_last=False, - fuse_bn_relu=True, - fuse_bn_add_relu=True, - ): - self.data_format = "NHWC" if channel_last else "NCHW" - self.weight_initializer = flow.variance_scaling_initializer( - 2, "fan_in", "random_normal", data_format=self.data_format - ) - self.weight_regularizer = weight_regularizer - self.trainable = trainable - self.training = training - self.fuse_bn_relu = fuse_bn_relu - self.fuse_bn_add_relu = fuse_bn_add_relu - - def _conv2d( - self, name, input, filters, kernel_size, strides=1, padding="SAME", dilations=1, - ): - # There are different shapes of weight metric between 'NCHW' and 'NHWC' mode - if self.data_format == "NHWC": - shape = (filters, kernel_size, kernel_size, input.shape[3]) - else: - shape = (filters, input.shape[1], kernel_size, kernel_size) - weight = flow.get_variable( - name + ".weight", - shape=shape, - dtype=input.dtype, - initializer=self.weight_initializer, - regularizer=self.weight_regularizer, - model_name="weight", - trainable=self.trainable, - ) - if 'conv1' == name: - flow.watch(weight, build_watch_cb('conv1_weight')) - flow.watch_diff(weight, build_watch_diff_cb('conv1_weight_grad')) - - return flow.nn.conv2d( - input, - weight, - strides, - padding, - None, - self.data_format, - dilations, - name=name, - ) - - def _batch_norm(self, inputs, name=None, last=False): - initializer = flow.zeros_initializer() if last else flow.ones_initializer() - axis = 1 - if self.data_format == "NHWC": - axis = 3 - return flow.layers.batch_normalization( - inputs=inputs, - axis=axis, - momentum=0.9, # 97, - epsilon=1e-5, - center=True, - scale=True, - trainable=self.trainable, - training=self.training, - gamma_initializer=initializer, - moving_variance_initializer=initializer, - gamma_regularizer=self.weight_regularizer, - beta_regularizer=self.weight_regularizer, - name=name, - ) - - def _batch_norm_relu(self, inputs, name=None, last=False): - if self.fuse_bn_relu: - initializer = flow.zeros_initializer() if last else flow.ones_initializer() - axis = 1 - if self.data_format == "NHWC": - axis = 3 - return flow.layers.batch_normalization_relu( - inputs=inputs, - axis=axis, - momentum=0.9, - epsilon=1e-5, - center=True, - scale=True, - trainable=self.trainable, - training=self.training, - gamma_initializer=initializer, - moving_variance_initializer=initializer, - gamma_regularizer=self.weight_regularizer, - beta_regularizer=self.weight_regularizer, - name=name + "_bn_relu", - ) - else: - return flow.nn.relu(self._batch_norm(inputs, name, last=last)) - - def _batch_norm_add_relu(self, inputs, addend, name=None, last=False): - if self.fuse_bn_add_relu: - initializer = flow.zeros_initializer() if last else flow.ones_initializer() - axis = 1 - if self.data_format == "NHWC": - axis = 3 - return flow.layers.batch_normalization_add_relu( - inputs=inputs, - addend=addend, - axis=axis, - momentum=0.9, - epsilon=1e-5, - center=True, - scale=True, - trainable=self.trainable, - training=self.training, - gamma_initializer=initializer, - moving_variance_initializer=initializer, - gamma_regularizer=self.weight_regularizer, - beta_regularizer=self.weight_regularizer, - name=name + "_bn_add_relu", - ) - else: - return flow.nn.relu( - 
self._batch_norm(inputs, name, last=last) + addend - ) - - def conv2d_affine(self, input, name, filters, kernel_size, strides): - padding = "SAME" if strides > 1 or kernel_size > 1 else "VALID" - output = self._conv2d(name, input, filters, kernel_size, strides, padding) - return output - - def bottleneck_transformation( - self, input, block_name, filters, filters_inner, strides - ): - a = self.conv2d_affine(input, block_name + ".conv1", filters_inner, 1, 1) - a = self._batch_norm_relu(a, block_name + ".bn1") - - b = self.conv2d_affine(a, block_name + ".conv2", filters_inner, 3, strides) - b = self._batch_norm_relu(b, block_name + ".bn2") - - c = self.conv2d_affine(b, block_name + ".conv3", filters, 1, 1) - return c - - def residual_block(self, input, block_name, filters, filters_inner, strides_init): - if strides_init != 1 or block_name == "layer1.0": - shortcut = self.conv2d_affine( - input, block_name + ".downsample.0", filters, 1, strides_init - ) - shortcut = self._batch_norm(shortcut, block_name + ".downsample.1") - else: - shortcut = input - - bottleneck = self.bottleneck_transformation( - input, block_name, filters, filters_inner, strides_init, - ) - return self._batch_norm_add_relu( - bottleneck, shortcut, block_name + ".bn3", last=True - ) - - def residual_stage( - self, input, stage_name, counts, filters, filters_inner, stride_init=2 - ): - output = input - for i in range(counts): - block_name = "%s.%d" % (stage_name, i) - output = self.residual_block( - output, block_name, filters, filters_inner, stride_init if i == 0 else 1 - ) - - return output - - def resnet_conv_x_body(self, input): - output = input - for i, (counts, filters, filters_inner) in enumerate( - zip(BLOCK_COUNTS, BLOCK_FILTERS, BLOCK_FILTERS_INNER) - ): - stage_name = "layer%d" % (i + 1) - output = self.residual_stage( - output, stage_name, counts, filters, filters_inner, 1 if i == 0 else 2 - ) - - return output - - def resnet_stem(self, input): - conv1 = self._conv2d("conv1", input, 64, 7, 2) - conv1_bn = self._batch_norm_relu(conv1, "bn1") - pool1 = flow.nn.max_pool2d( - conv1_bn, - ksize=3, - strides=2, - padding="SAME", - data_format=self.data_format, - name="pool1", - ) - return pool1 - - -def resnet50(images, args, trainable=True, training=True): - weight_regularizer = ( - flow.regularizers.l2(args.wd) if args.wd > 0.0 and args.wd < 1.0 else None - ) - builder = ResnetBuilder( - weight_regularizer, - trainable, - training, - args.channel_last, - args.fuse_bn_relu, - args.fuse_bn_add_relu, - ) - - if args.pad_output: - if args.channel_last: - paddings = ((0, 0), (0, 0), (0, 0), (0, 1)) - else: - paddings = ((0, 0), (0, 1), (0, 0), (0, 0)) - images = flow.pad(images, paddings=paddings) - # with flow.scope.namespace("resnet50"): - stem = builder.resnet_stem(images) - body = builder.resnet_conv_x_body(stem) - pool5 = flow.nn.avg_pool2d( - body, - ksize=7, - strides=1, - padding="VALID", - data_format=builder.data_format, - name="avgpool", - ) - fc1001 = flow.layers.dense( - flow.reshape(pool5, (pool5.shape[0], -1)), - units=1000, - use_bias=True, - kernel_initializer=flow.variance_scaling_initializer( - 2, "fan_in", "random_normal" - ), - bias_initializer=flow.zeros_initializer(), - kernel_regularizer=weight_regularizer, - bias_regularizer=weight_regularizer, - trainable=trainable, - name="fc", - ) - return fc1001 -
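
A few notes on using the alignment tooling in these patches. The build_watch_cb / build_watch_diff_cb helpers from patch 3 bake their target path in at construction time (output/<iter>/<name>.npy with iter=0), so every training step overwrites the same files, and nothing creates output/0/ before the first np.save fires (dump_outputs only makes the directory later, at metric-print time), so on a clean tree the first watch callback can fail with FileNotFoundError. A minimal variant that creates the directory eagerly, otherwise matching the patched helper:

import os
import numpy as np

def build_watch_cb(name, iter=0, root='output'):
    dump_dir = os.path.join(root, str(iter))
    os.makedirs(dump_dir, exist_ok=True)  # ensure the dump dir exists up front
    path = os.path.join(dump_dir, f'{name}.npy')
    def cb(blob):
        np.save(path, blob)  # blob is the watched tensor, already as numpy
    return cb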
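
Once a run has populated output/<step>/<name>.npy through the watch callbacks and dump_outputs, the dumps can be diffed offline against a reference run that writes the same layout. A minimal sketch, assuming the reference lives in a sibling directory such as eager_output (that name and the tolerances are illustrative, not part of the patches):

import os
import sys
import numpy as np

def compare_step(dump_root, ref_root, step, rtol=1e-4, atol=1e-5):
    step_dir = os.path.join(dump_root, str(step))
    for fname in sorted(os.listdir(step_dir)):
        if not fname.endswith(".npy"):
            continue
        ref_path = os.path.join(ref_root, str(step), fname)
        if not os.path.exists(ref_path):
            print("%s: no reference file, skipped" % fname)
            continue
        a = np.load(os.path.join(step_dir, fname))
        b = np.load(ref_path)
        if a.shape != b.shape:
            print("%s: shape mismatch %s vs %s" % (fname, a.shape, b.shape))
            continue
        status = "OK" if np.allclose(a, b, rtol=rtol, atol=atol) else "MISMATCH"
        print("%s: %s (max abs diff %.3e)" % (fname, status, np.abs(a - b).max()))

if __name__ == "__main__":
    # e.g. python3 compare_dumps.py output eager_output 0
    compare_step(sys.argv[1], sys.argv[2], int(sys.argv[3]))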
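
Patch 6 pins cuDNN's convolution algorithms through the job function config. Left alone, cuDNN autotunes an algorithm per process (and per shape), which is a common source of small run-to-run numerical drift; pinning the algorithms, together with --fuse_bn_relu=False / --fuse_bn_add_relu=False in align.sh, keeps the executed kernels stable while diffing against the eager reference. A sketch of a config built with the same calls (the 0/1/1 algorithm indices are simply the ones the patch picks):

import oneflow.compatible.single_client as flow

config = flow.function_config()
config.enable_fuse_add_to_output(True)
# Force fixed cuDNN conv algorithms instead of per-run autotuning:
config.cudnn_conv_force_fwd_algo(0)
config.cudnn_conv_force_bwd_data_algo(1)
config.cudnn_conv_force_bwd_filter_algo(1)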
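
The point of the renaming in patches 2 and 7 is that variable prefixes now follow torchvision's ResNet-50 layout (layerX.Y.convZ, layerX.Y.downsample.0, fc) instead of the old resN_M_branch2a / fc1001 scheme, which makes per-tensor comparison and weight transfer straightforward. Conv weights map one to one; note that flow.layers.batch_normalization and flow.layers.dense still append their own suffixes (-gamma, -beta, -weight) behind these prefixes, so a checkpoint converter still has to translate suffixes. A quick name-level sanity check against torchvision, assuming it is installed (no weights are copied here):

import torchvision

model = torchvision.models.resnet50()
torch_names = {name for name, _ in model.named_parameters()}
# Conv variable names the renamed graph produces via get_variable(name + ".weight"):
renamed_convs = {
    "conv1.weight",
    "layer1.0.conv1.weight",
    "layer1.0.downsample.0.weight",
    "layer4.2.conv3.weight",
}
print("unmatched:", sorted(renamed_convs - torch_names) or "none")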
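
Since align.sh keeps --label_smoothing=0.1, the loss being compared is softmax cross entropy against smoothed targets. Under the common formulation (assumed here, since util.py's label_smoothing body is not shown in these patches), the target puts 1 - eps on the true class plus eps/K spread uniformly over all K classes, so the expected per-sample loss can be recomputed from a dumped logits row:

import numpy as np

def smoothed_softmax_xent(logits, label, eps=0.1):
    k = logits.shape[-1]
    target = np.full(k, eps / k)       # eps/K on every class...
    target[label] += 1.0 - eps         # ...plus 1 - eps on the true class
    m = logits.max()
    log_p = logits - (m + np.log(np.exp(logits - m).sum()))  # stable log-softmax
    return -(target * log_p).sum()

# toy 3-class example; for a real check, feed one row of output/0/logits.npy
print(smoothed_softmax_xent(np.array([2.0, 0.5, -1.0]), label=0))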