diff --git a/Classification/cnns/align.sh b/Classification/cnns/align.sh
new file mode 100755
index 0000000..88d2273
--- /dev/null
+++ b/Classification/cnns/align.sh
@@ -0,0 +1,56 @@
+rm -rf core.*
+rm -rf ./output/snapshots/*
+
+if [ -n "$1" ]; then
+    NUM_EPOCH=$1
+else
+    NUM_EPOCH=50
+fi
+echo NUM_EPOCH=$NUM_EPOCH
+
+# ImageNet ofrecord directory
+if [ -n "$2" ]; then
+    DATA_ROOT=$2
+else
+    DATA_ROOT=/dataset/ImageNet/ofrecord
+fi
+echo DATA_ROOT=$DATA_ROOT
+
+LOG_FOLDER=../logs
+mkdir -p $LOG_FOLDER
+LOGFILE=$LOG_FOLDER/resnet_training.log
+
+export PYTHONUNBUFFERED=1
+echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
+export NCCL_LAUNCH_MODE=PARALLEL
+echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
+
+# --momentum=0.875 \
+python3 of_cnn_train_val.py \
+    --train_data_dir=$DATA_ROOT/train \
+    --train_data_part_num=256 \
+    --val_data_dir=$DATA_ROOT/validation \
+    --val_data_part_num=256 \
+    --num_nodes=1 \
+    --model_load_dir=/ssd/xiexuan/models/resnet50/init_ckpt \
+    --gpu_num_per_node=1 \
+    --optimizer="sgd" \
+    --momentum=0.0 \
+    --lr_decay="none" \
+    --label_smoothing=0.1 \
+    --learning_rate=0.1 \
+    --loss_print_every_n_iter=1 \
+    --batch_size_per_device=64 \
+    --val_batch_size_per_device=64 \
+    --channel_last=False \
+    --fuse_bn_relu=False \
+    --fuse_bn_add_relu=False \
+    --nccl_fusion_threshold_mb=16 \
+    --nccl_fusion_max_ops=24 \
+    --gpu_image_decoder=True \
+    --num_epoch=$NUM_EPOCH \
+    --model="resnet50" 2>&1 | tee ${LOGFILE}
+    # --use_fp16 \
+    # --pad_output \
+
+echo "Writing log to ${LOGFILE}"
diff --git a/Classification/cnns/job_function_util.py b/Classification/cnns/job_function_util.py
index c651e95..3eaafba 100755
--- a/Classification/cnns/job_function_util.py
+++ b/Classification/cnns/job_function_util.py
@@ -26,6 +26,9 @@ def _default_config(args):
     if args.use_xla:
         config.use_xla_jit(True)
     config.enable_fuse_add_to_output(True)
+    config.cudnn_conv_force_fwd_algo(0)  # pin cuDNN to its deterministic
+    config.cudnn_conv_force_bwd_data_algo(1)  # conv algorithms so results
+    config.cudnn_conv_force_bwd_filter_algo(1)  # are reproducible run to run
     return config
diff --git a/Classification/cnns/of_cnn_train_val.py b/Classification/cnns/of_cnn_train_val.py
index 3d5cbbd..440fbf9 100755
--- a/Classification/cnns/of_cnn_train_val.py
+++ b/Classification/cnns/of_cnn_train_val.py
@@ -27,6 +27,7 @@
 import alexnet_model
 import inception_model
 import mobilenet_v2_model
+from util import build_watch_cb, build_watch_diff_cb
 
 parser = configs.get_parser()
 args = parser.parse_args()
@@ -51,7 +52,7 @@
 flow.config.gpu_device_num(args.gpu_num_per_node)
-# flow.config.enable_debug_mode(True)
+flow.config.enable_debug_mode(True)
 
 if args.use_fp16 and args.num_nodes * args.gpu_num_per_node > 1:
     flow.config.collective_boxing.nccl_fusion_all_reduce_use_buffer(False)
@@ -84,12 +85,15 @@ def TrainNet():
     if args.train_data_dir:
         assert os.path.exists(args.train_data_dir)
         print("Loading data from {}".format(args.train_data_dir))
-        (labels, images) = ofrecord_util.load_imagenet_for_training(args)
+        # (labels, images) = ofrecord_util.load_imagenet_for_training(args)
+        (labels, images) = ofrecord_util.load_imagenet_for_validation(args)  # deterministic preprocessing: no random crop/flip
     else:
         print("Loading synthetic data.")
         (labels, images) = ofrecord_util.load_synthetic(args)
 
     logits = model_dict[args.model](images, args)
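+    # dump the logits and their gradient so the forward and backward passes
+    # can be compared blob-by-blob against a reference run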
{"loss": loss, "predictions": predictions, "labels": labels} + outputs = {"loss": loss, "predictions": predictions, "labels": labels, 'images': images, 'logits': logits} # set up warmup,learning rate and optimizer optimizer_util.set_up_optimizer(loss, args) @@ -144,7 +148,13 @@ def main(): loss_key="loss", ) for i in range(epoch_size): - TrainNet().async_get(metric.metric_cb(epoch, i)) + # TrainNet().async_get(metric.metric_cb(epoch, i)) + a = TrainNet().get() + # snapshot.save("epoch_{}_iter{}".format(epoch, i)) + print('loss:', a['loss'].numpy()) + if i>=100: + break + break if args.val_data_dir: metric = Metric( diff --git a/Classification/cnns/optimizer_util.py b/Classification/cnns/optimizer_util.py index 43cd977..a5f4193 100755 --- a/Classification/cnns/optimizer_util.py +++ b/Classification/cnns/optimizer_util.py @@ -116,6 +116,11 @@ def set_up_optimizer(loss, args): staircase=False, warmup=warmup, ) + elif args.lr_decay == "none": + lr_scheduler = flow.optimizer.PiecewiseConstantScheduler( + boundaries=[], + values=[args.learning_rate], + ) else: lr_scheduler = flow.optimizer.PiecewiseScalingScheduler( base_lr=args.learning_rate, @@ -134,7 +139,7 @@ def set_up_optimizer(loss, args): print("Optimizer: SGD") flow.optimizer.SGD( lr_scheduler, - momentum=args.momentum if args.momentum > 0 else None, + momentum=args.momentum if args.momentum > 0 else 0.0, grad_clipping=grad_clipping, loss_scale_policy=loss_scale_policy, ).minimize(loss) diff --git a/Classification/cnns/resnet_model.py b/Classification/cnns/resnet_model.py index 7e9c1fc..784f924 100755 --- a/Classification/cnns/resnet_model.py +++ b/Classification/cnns/resnet_model.py @@ -15,6 +15,7 @@ """ import oneflow.compatible.single_client as flow +from util import build_watch_cb, build_watch_diff_cb BLOCK_COUNTS = [3, 4, 6, 3] BLOCK_FILTERS = [256, 512, 1024, 2048] @@ -50,7 +51,7 @@ def _conv2d( else: shape = (filters, input.shape[1], kernel_size, kernel_size) weight = flow.get_variable( - name + "-weight", + name + ".weight", shape=shape, dtype=input.dtype, initializer=self.weight_initializer, @@ -58,6 +59,9 @@ def _conv2d( model_name="weight", trainable=self.trainable, ) + if 'conv1' == name: + flow.watch(weight, build_watch_cb('conv1_weight')) + flow.watch_diff(weight, build_watch_diff_cb('conv1_weight_grad')) return flow.nn.conv2d( input, @@ -113,7 +117,7 @@ def _batch_norm_relu(self, inputs, name=None, last=False): name=name + "_bn_relu", ) else: - return flow.nn.relu(self._batch_norm(inputs, name + "_bn", last=last)) + return flow.nn.relu(self._batch_norm(inputs, name, last=last)) def _batch_norm_add_relu(self, inputs, addend, name=None, last=False): if self.fuse_bn_add_relu: @@ -139,7 +143,7 @@ def _batch_norm_add_relu(self, inputs, addend, name=None, last=False): ) else: return flow.nn.relu( - self._batch_norm(inputs, name + "_bn", last=last) + addend + self._batch_norm(inputs, name, last=last) + addend ) def conv2d_affine(self, input, name, filters, kernel_size, strides): @@ -150,21 +154,21 @@ def conv2d_affine(self, input, name, filters, kernel_size, strides): def bottleneck_transformation( self, input, block_name, filters, filters_inner, strides ): - a = self.conv2d_affine(input, block_name + "_branch2a", filters_inner, 1, 1) - a = self._batch_norm_relu(a, block_name + "_branch2a") + a = self.conv2d_affine(input, block_name + ".conv1", filters_inner, 1, 1) + a = self._batch_norm_relu(a, block_name + ".bn1") - b = self.conv2d_affine(a, block_name + "_branch2b", filters_inner, 3, strides) - b = self._batch_norm_relu(b, 
block_name + "_branch2b") + b = self.conv2d_affine(a, block_name + ".conv2", filters_inner, 3, strides) + b = self._batch_norm_relu(b, block_name + ".bn2") - c = self.conv2d_affine(b, block_name + "_branch2c", filters, 1, 1) + c = self.conv2d_affine(b, block_name + ".conv3", filters, 1, 1) return c def residual_block(self, input, block_name, filters, filters_inner, strides_init): - if strides_init != 1 or block_name == "res2_0": + if strides_init != 1 or block_name == "layer1.0": shortcut = self.conv2d_affine( - input, block_name + "_branch1", filters, 1, strides_init + input, block_name + ".downsample.0", filters, 1, strides_init ) - shortcut = self._batch_norm(shortcut, block_name + "_branch1_bn") + shortcut = self._batch_norm(shortcut, block_name + ".downsample.1") else: shortcut = input @@ -172,7 +176,7 @@ def residual_block(self, input, block_name, filters, filters_inner, strides_init input, block_name, filters, filters_inner, strides_init, ) return self._batch_norm_add_relu( - bottleneck, shortcut, block_name + "_branch2c", last=True + bottleneck, shortcut, block_name + ".bn3", last=True ) def residual_stage( @@ -180,7 +184,7 @@ def residual_stage( ): output = input for i in range(counts): - block_name = "%s_%d" % (stage_name, i) + block_name = "%s.%d" % (stage_name, i) output = self.residual_block( output, block_name, filters, filters_inner, stride_init if i == 0 else 1 ) @@ -192,7 +196,7 @@ def resnet_conv_x_body(self, input): for i, (counts, filters, filters_inner) in enumerate( zip(BLOCK_COUNTS, BLOCK_FILTERS, BLOCK_FILTERS_INNER) ): - stage_name = "res%d" % (i + 2) + stage_name = "layer%d" % (i + 1) output = self.residual_stage( output, stage_name, counts, filters, filters_inner, 1 if i == 0 else 2 ) @@ -201,7 +205,7 @@ def resnet_conv_x_body(self, input): def resnet_stem(self, input): conv1 = self._conv2d("conv1", input, 64, 7, 2) - conv1_bn = self._batch_norm_relu(conv1, "conv1") + conv1_bn = self._batch_norm_relu(conv1, "bn1") pool1 = flow.nn.max_pool2d( conv1_bn, ksize=3, @@ -232,28 +236,29 @@ def resnet50(images, args, trainable=True, training=True): else: paddings = ((0, 0), (0, 1), (0, 0), (0, 0)) images = flow.pad(images, paddings=paddings) - with flow.scope.namespace("Resnet"): - stem = builder.resnet_stem(images) - body = builder.resnet_conv_x_body(stem) - pool5 = flow.nn.avg_pool2d( - body, - ksize=7, - strides=1, - padding="VALID", - data_format=builder.data_format, - name="pool5", - ) - fc1001 = flow.layers.dense( - flow.reshape(pool5, (pool5.shape[0], -1)), - units=1000, - use_bias=True, - kernel_initializer=flow.variance_scaling_initializer( - 2, "fan_in", "random_normal" - ), - bias_initializer=flow.zeros_initializer(), - kernel_regularizer=weight_regularizer, - bias_regularizer=weight_regularizer, - trainable=trainable, - name="fc1001", - ) + # with flow.scope.namespace("resnet50"): + stem = builder.resnet_stem(images) + body = builder.resnet_conv_x_body(stem) + pool5 = flow.nn.avg_pool2d( + body, + ksize=7, + strides=1, + padding="VALID", + data_format=builder.data_format, + name="avgpool", + ) + fc1001 = flow.layers.dense( + flow.reshape(pool5, (pool5.shape[0], -1)), + units=1000, + use_bias=True, + kernel_initializer=flow.variance_scaling_initializer( + 2, "fan_in", "random_normal" + ), + bias_initializer=flow.zeros_initializer(), + kernel_regularizer=weight_regularizer, + bias_regularizer=weight_regularizer, + trainable=trainable, + name="fc", + ) return fc1001 + diff --git a/Classification/cnns/util.py b/Classification/cnns/util.py index 6b7ce9a..c865cec 
+def dump_outputs(outputs, step, dump_dir='output'):
+    root = os.path.join(dump_dir, str(step))
+    os.makedirs(root, exist_ok=True)
+    for k, v in outputs.items():
+        path = os.path.join(root, k)
+        np.save(path, v.numpy())
+
+
 class Metric(object):
     def __init__(
         self,
@@ -142,6 +152,7 @@ def callback(outputs):
             self.num_samples += num_samples
 
             if (step + 1) % self.calculate_batches == 0:
+                dump_outputs(outputs, step)
                 throughput = self.num_samples / self.timer.split()
                 if self.prediction_key:
                     top_1_accuracy = self.top_1_num_matched / self.num_samples
@@ -180,3 +191,20 @@ def callback(outputs):
             self._clear()
 
     return callback
+
+
+from oneflow.compatible.single_client import typing as tp
+
+
+def build_watch_cb(name, iter=0, root='output'):
+    # callback for flow.watch: saves the watched blob to <root>/<iter>/<name>.npy
+    path = os.path.join(root, str(iter), f'{name}.npy')
+    def cb(blob: tp.Numpy):
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        np.save(path, blob)
+    return cb
+
+
+def build_watch_diff_cb(name, iter=0, root='output'):
+    # callback for flow.watch_diff: saves the gradient to <root>/<iter>/<name>_grad.npy
+    path = os.path.join(root, str(iter), f'{name}_grad.npy')
+    def cb(blob: tp.Numpy):
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        np.save(path, blob)
+    return cb
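+
+
+# typical usage (see of_cnn_train_val.py and resnet_model.py):
+#   flow.watch(logits, build_watch_cb('logits'))
+#   flow.watch_diff(logits, build_watch_diff_cb('logits'))
+# writes output/0/logits.npy and output/0/logits_grad.npy for iteration 0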