From f7cc7239911eefa407030c7e95f5306787f32572 Mon Sep 17 00:00:00 2001 From: lyon Date: Thu, 15 Apr 2021 16:21:38 +0800 Subject: [PATCH 1/9] add scripts --- .../Megatron-LM/scripts/run_multi_node.sh | 24 +++++ .../Megatron-LM/scripts/run_single_node.sh | 45 +++++++++ DeepSpeed/Megatron-LM/scripts/run_two_node.sh | 22 +++++ DeepSpeed/Megatron-LM/scripts/runner.sh | 94 +++++++++++++++++++ 4 files changed, 185 insertions(+) create mode 100644 DeepSpeed/Megatron-LM/scripts/run_multi_node.sh create mode 100644 DeepSpeed/Megatron-LM/scripts/run_single_node.sh create mode 100644 DeepSpeed/Megatron-LM/scripts/run_two_node.sh create mode 100644 DeepSpeed/Megatron-LM/scripts/runner.sh diff --git a/DeepSpeed/Megatron-LM/scripts/run_multi_node.sh b/DeepSpeed/Megatron-LM/scripts/run_multi_node.sh new file mode 100644 index 00000000..075e629e --- /dev/null +++ b/DeepSpeed/Megatron-LM/scripts/run_multi_node.sh @@ -0,0 +1,24 @@ +#!/usr/bin/bash +SHELL_FOLDER=$(dirname $(readlink -f "$0")) +MODEL=${1:-gpt2-small} +BATCH_SIZE_PER_DEVICE=${2:-16} +ZERO_STAGE=${3:-2} +CHECKPOINT_ACTIVATIONS=${4:-"on"} +DTYPE=${5:-'fp16'} +TEST_NUM=${6:-4} + +export NODE1=10.11.0.2 +export NODE2=10.11.0.3 +export NODE3=10.11.0.4 +export NODE4=10.11.0.5 + + +i=1 +while [ $i -le ${TEST_NUM} ] +do + bash $SHELL_FOLDER/runner.sh $MODEL $BATCH_SIZE_PER_DEVICE 4 8 $ZERO_STAGE $CHECKPOINT_ACTIVATIONS $DTYPE ${i} + echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " + let i++ + pkill python3 + sleep 30s +done diff --git a/DeepSpeed/Megatron-LM/scripts/run_single_node.sh b/DeepSpeed/Megatron-LM/scripts/run_single_node.sh new file mode 100644 index 00000000..715e388b --- /dev/null +++ b/DeepSpeed/Megatron-LM/scripts/run_single_node.sh @@ -0,0 +1,45 @@ +#!/usr/bin/bash +SHELL_FOLDER=$(dirname $(readlink -f "$0")) +MODEL=${1:-gpt2-small} +BATCH_SIZE_PER_DEVICE=${2:-4} +ZERO_STAGE=${3:-2} +CHECKPOINT_ACTIVATIONS=${4:-"on"} +DTYPE=${5:-'fp16'} +TEST_NUM=${6:-5} + 
+ +export NODE1=10.11.0.2 +export NODE2=10.11.0.3 +export NODE3=10.11.0.4 +export NODE4=10.11.0.5 + + +i=4 +while [ $i -le 4 ] +do + bash $SHELL_FOLDER/runner.sh $MODEL $BATCH_SIZE_PER_DEVICE 1 1 $ZERO_STAGE $CHECKPOINT_ACTIVATIONS $DTYPE ${i} + echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " + let i++ + pkill python3 + sleep 30s +done + +# i=1 +# while [ $i -le ${TEST_NUM} ] +# do +# bash $SHELL_FOLDER/runner.sh $MODEL $BATCH_SIZE_PER_DEVICE 1 4 $ZERO_STAGE $CHECKPOINT_ACTIVATIONS $DTYPE ${i} +# echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " +# let i++ +# pkill python3 +# sleep 30s +# done + +# i=1 +# while [ $i -le ${TEST_NUM} ] +# do +# bash $SHELL_FOLDER/runner.sh $MODEL $BATCH_SIZE_PER_DEVICE 1 8 $ZERO_STAGE $CHECKPOINT_ACTIVATIONS $DTYPE ${i} +# echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " +# let i++ +# pkill python3 +# sleep 30s +# done diff --git a/DeepSpeed/Megatron-LM/scripts/run_two_node.sh b/DeepSpeed/Megatron-LM/scripts/run_two_node.sh new file mode 100644 index 00000000..ec4d2c96 --- /dev/null +++ b/DeepSpeed/Megatron-LM/scripts/run_two_node.sh @@ -0,0 +1,22 @@ +#!/usr/bin/bash +SHELL_FOLDER=$(dirname $(readlink -f "$0")) +MODEL=${1:-gpt2-small} +BATCH_SIZE_PER_DEVICE=${2:-16} +ZERO_STAGE=${3:-2} +CHECKPOINT_ACTIVATIONS=${4:-"on"} +DTYPE=${5:-'fp16'} +TEST_NUM=${6:-4} + +export NODE1=10.11.0.2 +export NODE2=10.11.0.3 + + +i=1 +while [ $i -le ${TEST_NUM} ] +do + bash $SHELL_FOLDER/runner.sh $MODEL $BATCH_SIZE_PER_DEVICE 2 8 $ZERO_STAGE $CHECKPOINT_ACTIVATIONS $DTYPE ${i} + echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " + let i++ + pkill python3 + sleep 30s +done diff --git a/DeepSpeed/Megatron-LM/scripts/runner.sh b/DeepSpeed/Megatron-LM/scripts/runner.sh new file mode 100644 index 00000000..83944189 --- /dev/null +++ 
b/DeepSpeed/Megatron-LM/scripts/runner.sh @@ -0,0 +1,94 @@ +#! /bin/bash +MODEL=${1:-gpt2-small} +BATCH_SIZE_PER_DEVICE=${2:-8} +NUM_WORKERS=${3:-1} +NUM_GPUS_PER_WORKER=${4:-8} +ZERO_STAGE=${5:-2} +CHECKPOINT_ACTIVATIONS=${6:-"on"} +DTYPE=${7:-'fp16'} +TEST_NUM=${8:-1} +ITER_NUM=${9:-200} +MP_SIZE=${10:-1} + +script_path=$(realpath $0) +script_dir=$(dirname $script_path) + +a=`expr ${#GPUS} + 1` +gpu_num_per_node=`expr ${a} / 2` +gpu_num=`expr ${NUM_GPUS_PER_WORKER} \* ${NUM_WORKERS}` +total_bz=`expr ${BATCH_SIZE_PER_DEVICE} \* ${gpu_num}` + +sed -i "s/\"train_batch_size\":.*$/\"train_batch_size\": $total_bz,/" $script_dir/ds_zero2_config.json +if [ ${CHECKPOINT_ACTIVATIONS} == "on" ];then + sed -i "s/\"partition_activations\":.*$/\"partition_activations\": true,/" $script_dir/ds_zero2_config.json +else + sed -i "s/\"partition_activations\":.*$/\"partition_activations\": false,/" $script_dir/ds_zero2_config.json +fi +sed -i "s/\"stage\":.*$/\"stage\": $ZERO_STAGE/" $script_dir/ds_zero2_config.json + + +if [ ${MODEL} == "gpt2-small" ];then + echo "Using network >> gpt2-small" + num_layers=12 + num_attention_heads=12 + hidden_size=768 +elif [ ${MODEL} == "gpt2-medium" ];then + echo "Using network >> gpt2-medium" + num_layers=24 + num_attention_heads=16 + hidden_size=1024 +fi + +PREFIX=logs-20210414-stage${ZERO_STAGE}-${CHECKPOINT_ACTIVATIONS}-activation +rm -rf test-checkpoints +LOG_FOLDER=./${PREFIX}/deepspeed/${MODEL}/bz${BATCH_SIZE_PER_DEVICE}/${NUM_WORKERS}n${NUM_GPUS_PER_WORKER}g +mkdir -p $LOG_FOLDER +LOG=${LOG_FOLDER}/${MODEL}_b${BATCH_SIZE_PER_DEVICE}_fp16_${TEST_NUM}.log + + +config_json="$script_dir/ds_zero2_config.json" +gpt_options=" \ + --save test-checkpoints \ + --model-parallel-size ${MP_SIZE} \ + --num-layers ${num_layers} \ + --hidden-size ${hidden_size} \ + --num-attention-heads ${num_attention_heads} \ + --batch-size ${BATCH_SIZE_PER_DEVICE} \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --train-iters ${ITER_NUM} \ + 
--resume-dataloader \ + --train-data wikipedia \ + --lazy-loader \ + --tokenizer-type GPT2BPETokenizer \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 0.00015 \ + --no-load-optim \ + --lr-decay-style cosine \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --warmup .01 \ +" + +if [ "$DTYPE" = "fp16" ] ; then + echo "Using data type >> fp16" + gpt_options="${gpt_options} --fp16 " +else + echo "Using data type >> fp32" +fi + + +if [ ${CHECKPOINT_ACTIVATIONS} == "on" ];then + gpt_options="${gpt_options} + --checkpoint-activations --deepspeed-activation-checkpointing --deepspeed --deepspeed_config ${config_json} " +else + gpt_options="${gpt_options} + --deepspeed \ + --deepspeed_config ${config_json} \ + " +fi + +run_cmd="deepspeed --hostfile=deepspeed_hosts --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} pretrain_gpt2.py ${gpt_options} " +echo ${run_cmd} +eval ${run_cmd} 2>&1 | tee ${LOG} From 2f4ee9100a1215d5b79949a6f33d1ed6e08eb76f Mon Sep 17 00:00:00 2001 From: lyon Date: Thu, 15 Apr 2021 16:28:31 +0800 Subject: [PATCH 2/9] refine --- .../Megatron-LM/scripts/run_multi_node.sh | 3 +- .../Megatron-LM/scripts/run_single_node.sh | 44 ++++++++++--------- DeepSpeed/Megatron-LM/scripts/run_two_node.sh | 4 +- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/DeepSpeed/Megatron-LM/scripts/run_multi_node.sh b/DeepSpeed/Megatron-LM/scripts/run_multi_node.sh index 075e629e..671a7d09 100644 --- a/DeepSpeed/Megatron-LM/scripts/run_multi_node.sh +++ b/DeepSpeed/Megatron-LM/scripts/run_multi_node.sh @@ -5,7 +5,7 @@ BATCH_SIZE_PER_DEVICE=${2:-16} ZERO_STAGE=${3:-2} CHECKPOINT_ACTIVATIONS=${4:-"on"} DTYPE=${5:-'fp16'} -TEST_NUM=${6:-4} +TEST_NUM=${6:-5} export NODE1=10.11.0.2 export NODE2=10.11.0.3 @@ -22,3 +22,4 @@ do pkill python3 sleep 30s done + diff --git a/DeepSpeed/Megatron-LM/scripts/run_single_node.sh b/DeepSpeed/Megatron-LM/scripts/run_single_node.sh index 715e388b..6ff8502d 100644 --- a/DeepSpeed/Megatron-LM/scripts/run_single_node.sh 
+++ b/DeepSpeed/Megatron-LM/scripts/run_single_node.sh @@ -14,8 +14,8 @@ export NODE3=10.11.0.4 export NODE4=10.11.0.5 -i=4 -while [ $i -le 4 ] +i=1 +while [ $i -le ${TEST_NUM} ] do bash $SHELL_FOLDER/runner.sh $MODEL $BATCH_SIZE_PER_DEVICE 1 1 $ZERO_STAGE $CHECKPOINT_ACTIVATIONS $DTYPE ${i} echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " @@ -24,22 +24,24 @@ do sleep 30s done -# i=1 -# while [ $i -le ${TEST_NUM} ] -# do -# bash $SHELL_FOLDER/runner.sh $MODEL $BATCH_SIZE_PER_DEVICE 1 4 $ZERO_STAGE $CHECKPOINT_ACTIVATIONS $DTYPE ${i} -# echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " -# let i++ -# pkill python3 -# sleep 30s -# done - -# i=1 -# while [ $i -le ${TEST_NUM} ] -# do -# bash $SHELL_FOLDER/runner.sh $MODEL $BATCH_SIZE_PER_DEVICE 1 8 $ZERO_STAGE $CHECKPOINT_ACTIVATIONS $DTYPE ${i} -# echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " -# let i++ -# pkill python3 -# sleep 30s -# done +i=1 +while [ $i -le ${TEST_NUM} ] +do + bash $SHELL_FOLDER/runner.sh $MODEL $BATCH_SIZE_PER_DEVICE 1 4 $ZERO_STAGE $CHECKPOINT_ACTIVATIONS $DTYPE ${i} + echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " + let i++ + pkill python3 + sleep 30s +done + +i=1 +while [ $i -le ${TEST_NUM} ] +do + bash $SHELL_FOLDER/runner.sh $MODEL $BATCH_SIZE_PER_DEVICE 1 8 $ZERO_STAGE $CHECKPOINT_ACTIVATIONS $DTYPE ${i} + echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " + let i++ + pkill python3 + sleep 30s +done + + diff --git a/DeepSpeed/Megatron-LM/scripts/run_two_node.sh b/DeepSpeed/Megatron-LM/scripts/run_two_node.sh index ec4d2c96..74f3db55 100644 --- a/DeepSpeed/Megatron-LM/scripts/run_two_node.sh +++ b/DeepSpeed/Megatron-LM/scripts/run_two_node.sh @@ -5,7 +5,7 @@ BATCH_SIZE_PER_DEVICE=${2:-16} ZERO_STAGE=${3:-2} 
CHECKPOINT_ACTIVATIONS=${4:-"on"} DTYPE=${5:-'fp16'} -TEST_NUM=${6:-4} +TEST_NUM=${6:-5} export NODE1=10.11.0.2 export NODE2=10.11.0.3 @@ -20,3 +20,5 @@ do pkill python3 sleep 30s done + + From fd65b7cf11a857ea0db459afbc7372138dc004a9 Mon Sep 17 00:00:00 2001 From: lyon Date: Thu, 15 Apr 2021 21:44:20 +0800 Subject: [PATCH 3/9] add extract_deepspeed_logs.py --- .../Megatron-LM/extract_deepspeed_logs.py | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 DeepSpeed/Megatron-LM/extract_deepspeed_logs.py diff --git a/DeepSpeed/Megatron-LM/extract_deepspeed_logs.py b/DeepSpeed/Megatron-LM/extract_deepspeed_logs.py new file mode 100644 index 00000000..ed9525bd --- /dev/null +++ b/DeepSpeed/Megatron-LM/extract_deepspeed_logs.py @@ -0,0 +1,130 @@ +import os +import re +import sys +import glob +import json +import argparse +import pprint + +import numpy as np + +pp = pprint.PrettyPrinter(indent=1) +os.chdir(sys.path[0]) + +parser = argparse.ArgumentParser(description="flags for benchmark") +parser.add_argument("--log_dir", type=str, default="./logs/deepspeed/gpt2-small/bz8", required=True) +parser.add_argument("--output_dir", type=str, default="./result", required=False) +parser.add_argument('--warmup_batches', type=int, default=100) +parser.add_argument('--train_batches', type=int, default=200) +parser.add_argument('--batch_size_per_device', type=int, default=8) + +args = parser.parse_args() + + +class AutoVivification(dict): + """Implementation of perl's autovivification feature.""" + + def __getitem__(self, item): + try: + return dict.__getitem__(self, item) + except KeyError: + value = self[item] = type(self)() + return value + + +def extract_info_from_file(log_file, result_dict, speed_dict): + # extract info from file name + fname = os.path.basename(log_file) + run_case = log_file.split("/")[-2] # eg: 1n1g + model = fname.split("_")[0] + batch_size = int(fname.split("_")[1].strip("b")) + pricition = fname.split("_")[2] + test_iter = 
int(fname.split("_")[3].strip(".log")) + node_num = int(run_case[0]) + if len(run_case) == 4: + card_num = int(run_case[-2]) + elif len(run_case) == 5: + card_num = int(run_case[-3:-1]) + + total_batch_size = node_num * card_num * batch_size + tmp_dict = { + 'average_speed': 0, + 'batch_size_per_device': batch_size, + } + + avg_speed_list = [] + # extract info from file content + with open(log_file) as f: + lines = f.readlines() + for line in lines: + if "SamplesPerSec" in line: + p1 = re.compile(r"SamplesPerSec=(.*\.?.*)\n", re.S) + item = re.findall(p1, line) + a = float(item[0].strip()) + avg_speed_list.append(round(a, 4)) + + # compute avg throughoutput + begin_index=args.warmup_batches-2 + avg_speed = round(np.mean(avg_speed_list[begin_index:args.train_batches]), 2) + tmp_dict['average_speed'] = avg_speed + + result_dict[model][run_case]['average_speed'] = tmp_dict['average_speed'] + result_dict[model][run_case]['batch_size_per_device'] = tmp_dict['batch_size_per_device'] + + speed_dict[model][run_case][test_iter] = avg_speed + + print(log_file, speed_dict[model][run_case]) + + +def compute_median(iter_dict): + speed_list = [i for i in iter_dict.values()] + return round(np.median(speed_list), 2) + + +def compute_speedup(result_dict, speed_dict): + model_list = [key for key in result_dict] # eg.['vgg16', 'rn50'] + for m in model_list: + run_case = [key for key in result_dict[m]] # eg.['4n8g', '2n8g', '1n8g', '1n4g', '1n1g'] + for d in run_case: + speed_up = 1.0 + if result_dict[m]['1n1g']['average_speed']: + result_dict[m][d]['average_speed'] = compute_average(speed_dict[m][d]) + result_dict[m][d]['median_speed'] = compute_median(speed_dict[m][d]) + speed_up = result_dict[m][d]['median_speed'] / compute_median(speed_dict[m]['1n1g']) + result_dict[m][d]['speedup'] = round(speed_up, 2) + + +def compute_average(iter_dict): + i = 0 + total_speed = 0 + for iter in iter_dict: + i += 1 + total_speed += iter_dict[iter] + return round(total_speed / i, 2) + + +def 
extract_result(): + result_dict = AutoVivification() + speed_dict = AutoVivification() + logs_list = glob.glob(os.path.join(args.log_dir, "*/*.log")) + for l in logs_list: + extract_info_from_file(l, result_dict, speed_dict) + + # compute speedup + compute_speedup(result_dict, speed_dict) + + # print result + pp.pprint(result_dict) + + # write to file as JSON format + os.makedirs(args.output_dir, exist_ok=True) + framwork = args.log_dir.split('/')[-1] + result_file_name = os.path.join(args.output_dir, framwork + "_result.json") + print("Saving result to {}".format(result_file_name)) + with open(result_file_name, 'w') as f: + json.dump(result_dict, f) + + +if __name__ == "__main__": + extract_result() + From 84e8c3eb97f7d58f680ae05cc363c7c1b6def7c2 Mon Sep 17 00:00:00 2001 From: lyon Date: Fri, 16 Apr 2021 11:48:59 +0800 Subject: [PATCH 4/9] update scripts --- DeepSpeed/Megatron-LM/scripts/run_multi_node.sh | 10 +++++----- DeepSpeed/Megatron-LM/scripts/run_single_node.sh | 7 ++----- DeepSpeed/Megatron-LM/scripts/run_two_node.sh | 6 +++--- DeepSpeed/Megatron-LM/scripts/runner.sh | 2 ++ 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/DeepSpeed/Megatron-LM/scripts/run_multi_node.sh b/DeepSpeed/Megatron-LM/scripts/run_multi_node.sh index 671a7d09..c313196e 100644 --- a/DeepSpeed/Megatron-LM/scripts/run_multi_node.sh +++ b/DeepSpeed/Megatron-LM/scripts/run_multi_node.sh @@ -1,16 +1,16 @@ #!/usr/bin/bash SHELL_FOLDER=$(dirname $(readlink -f "$0")) MODEL=${1:-gpt2-small} -BATCH_SIZE_PER_DEVICE=${2:-16} +BATCH_SIZE_PER_DEVICE=${2:-8} ZERO_STAGE=${3:-2} CHECKPOINT_ACTIVATIONS=${4:-"on"} DTYPE=${5:-'fp16'} TEST_NUM=${6:-5} -export NODE1=10.11.0.2 -export NODE2=10.11.0.3 -export NODE3=10.11.0.4 -export NODE4=10.11.0.5 +# export NODE1=10.11.0.2 +# export NODE2=10.11.0.3 +# export NODE3=10.11.0.4 +# export NODE4=10.11.0.5 i=1 diff --git a/DeepSpeed/Megatron-LM/scripts/run_single_node.sh b/DeepSpeed/Megatron-LM/scripts/run_single_node.sh index 6ff8502d..8980833a 
100644 --- a/DeepSpeed/Megatron-LM/scripts/run_single_node.sh +++ b/DeepSpeed/Megatron-LM/scripts/run_single_node.sh @@ -1,17 +1,14 @@ #!/usr/bin/bash SHELL_FOLDER=$(dirname $(readlink -f "$0")) MODEL=${1:-gpt2-small} -BATCH_SIZE_PER_DEVICE=${2:-4} +BATCH_SIZE_PER_DEVICE=${2:-8} ZERO_STAGE=${3:-2} CHECKPOINT_ACTIVATIONS=${4:-"on"} DTYPE=${5:-'fp16'} TEST_NUM=${6:-5} -export NODE1=10.11.0.2 -export NODE2=10.11.0.3 -export NODE3=10.11.0.4 -export NODE4=10.11.0.5 +# export NODE1=10.11.0.2 i=1 diff --git a/DeepSpeed/Megatron-LM/scripts/run_two_node.sh b/DeepSpeed/Megatron-LM/scripts/run_two_node.sh index 74f3db55..aa7c2228 100644 --- a/DeepSpeed/Megatron-LM/scripts/run_two_node.sh +++ b/DeepSpeed/Megatron-LM/scripts/run_two_node.sh @@ -1,14 +1,14 @@ #!/usr/bin/bash SHELL_FOLDER=$(dirname $(readlink -f "$0")) MODEL=${1:-gpt2-small} -BATCH_SIZE_PER_DEVICE=${2:-16} +BATCH_SIZE_PER_DEVICE=${2:-8} ZERO_STAGE=${3:-2} CHECKPOINT_ACTIVATIONS=${4:-"on"} DTYPE=${5:-'fp16'} TEST_NUM=${6:-5} -export NODE1=10.11.0.2 -export NODE2=10.11.0.3 +# export NODE1=10.11.0.2 +# export NODE2=10.11.0.3 i=1 diff --git a/DeepSpeed/Megatron-LM/scripts/runner.sh b/DeepSpeed/Megatron-LM/scripts/runner.sh index 83944189..3ef5d349 100644 --- a/DeepSpeed/Megatron-LM/scripts/runner.sh +++ b/DeepSpeed/Megatron-LM/scripts/runner.sh @@ -92,3 +92,5 @@ fi run_cmd="deepspeed --hostfile=deepspeed_hosts --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} pretrain_gpt2.py ${gpt_options} " echo ${run_cmd} eval ${run_cmd} 2>&1 | tee ${LOG} + + From ce01e8b9cb5a5d923d679cd8ac9d933bed0cdda6 Mon Sep 17 00:00:00 2001 From: lyon Date: Fri, 16 Apr 2021 11:49:51 +0800 Subject: [PATCH 5/9] remove useless file --- .../ds_zero2_pretrain_gpt2_model_parallel.sh | 87 ------------------- 1 file changed, 87 deletions(-) delete mode 100644 DeepSpeed/Megatron-LM/scripts/ds_zero2_pretrain_gpt2_model_parallel.sh diff --git a/DeepSpeed/Megatron-LM/scripts/ds_zero2_pretrain_gpt2_model_parallel.sh 
b/DeepSpeed/Megatron-LM/scripts/ds_zero2_pretrain_gpt2_model_parallel.sh deleted file mode 100644 index 78e20d50..00000000 --- a/DeepSpeed/Megatron-LM/scripts/ds_zero2_pretrain_gpt2_model_parallel.sh +++ /dev/null @@ -1,87 +0,0 @@ -#! /bin/bash - -# Change for multinode config -BATCH_SIZE=${1:-4} -NUM_GPUS_PER_WORKER=${2:-8} -ZERO_STAGE=${3:-0} -CHECKPOINT_ACTIVATIONS=${4:-"off"} -NUM_WORKERS=${5:-1} -MP_SIZE=${6:-1} -ITER_NUM=${7:-1000} - -script_path=$(realpath $0) -script_dir=$(dirname $script_path) - -echo "BATCH_SIZE: ${BATCH_SIZE}, NUM_GPUS_PER_WORKER:${NUM_GPUS_PER_WORKER}, ZERO_STAGE:${ZERO_STAGE}, CHECKPOINT_ACTIVATIONS:${CHECKPOINT_ACTIVATIONS} " - -a=`expr ${#GPUS} + 1` -gpu_num_per_node=`expr ${a} / 2` -gpu_num=`expr ${NUM_GPUS_PER_WORKER} \* ${NUM_WORKERS}` -total_bz=`expr ${BATCH_SIZE} \* ${gpu_num}` - -sed -i "s/\"train_batch_size\":.*$/\"train_batch_size\": $total_bz,/" $script_dir/ds_zero2_config.json -if [ ${CHECKPOINT_ACTIVATIONS} == "on" ];then - sed -i "s/\"partition_activations\":.*$/\"partition_activations\": true,/" $script_dir/ds_zero2_config.json -else - sed -i "s/\"partition_activations\":.*$/\"partition_activations\": false,/" $script_dir/ds_zero2_config.json -fi -sed -i "s/\"stage\":.*$/\"stage\": $ZERO_STAGE/" $script_dir/ds_zero2_config.json - -# gpt2-small -num_layers=12 -num_attention_heads=12 -hidden_size=768 - -# # gpt2-medium -# num_layers=24 -# num_attention_heads=16 -# hidden_size=1024 - - -PREFIX=20201209-test_zero_gpt2-small -rm -rf checkpoints -LOG_FOLDER=./logs -mkdir -p $LOG_FOLDER -LOG=${LOG_FOLDER}/${PREFIX}_${NUM_WORKERS}n${NUM_GPUS_PER_WORKER}g_bz${BATCH_SIZE}_zero_stage${ZERO_STAGE}_${CHECKPOINT_ACTIVATIONS}_checkpoint_activation.log - - - -config_json="$script_dir/ds_zero2_config.json" -gpt_options=" \ - --save $PREFIX_checkpoint_${NUM_WORKERS}n${NUM_GPUS_PER_WORKER}g_bz${BATCH_SIZE}_zero_stage${ZERO_STAGE}_${CHECKPOINT_ACTIVATIONS}_checkpoint_activation \ - --model-parallel-size ${MP_SIZE} \ - --num-layers 
${num_layers} \ - --hidden-size ${hidden_size} \ - --num-attention-heads ${num_attention_heads} \ - --batch-size ${BATCH_SIZE} \ - --seq-length 1024 \ - --max-position-embeddings 1024 \ - --train-iters ${ITER_NUM} \ - --resume-dataloader \ - --train-data wikipedia \ - --lazy-loader \ - --tokenizer-type GPT2BPETokenizer \ - --split 949,50,1 \ - --distributed-backend nccl \ - --lr 0.00015 \ - --no-load-optim \ - --lr-decay-style cosine \ - --weight-decay 1e-2 \ - --clip-grad 1.0 \ - --warmup .01 \ - --fp16 \ -" - -if [ ${CHECKPOINT_ACTIVATIONS} == "on" ];then - gpt_options="${gpt_options} - --checkpoint-activations --deepspeed-activation-checkpointing --deepspeed --deepspeed_config ${config_json} " -else - gpt_options="${gpt_options} - --deepspeed \ - --deepspeed_config ${config_json} \ - " -fi - -run_cmd="deepspeed --hostfile=deepspeed_hosts --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} pretrain_gpt2.py ${gpt_options} " -echo ${run_cmd} -eval ${run_cmd} 2>&1 | tee ${LOG} \ No newline at end of file From 96699668c6ba9b03715388e0801e8b74d1d578a2 Mon Sep 17 00:00:00 2001 From: lyon Date: Fri, 16 Apr 2021 11:50:21 +0800 Subject: [PATCH 6/9] update readme --- DeepSpeed/Megatron-LM/README.md | 212 ++++++++++++++++++++------------ 1 file changed, 134 insertions(+), 78 deletions(-) diff --git a/DeepSpeed/Megatron-LM/README.md b/DeepSpeed/Megatron-LM/README.md index abbe1add..61bdc52b 100644 --- a/DeepSpeed/Megatron-LM/README.md +++ b/DeepSpeed/Megatron-LM/README.md @@ -2,25 +2,26 @@ ## 概述 Overview -本次测评提供了多组真实测试数据。测评基于微软[DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/a79272cc8b8f0c5b66c803e581a1355341eacb77) 仓库中的[Megatron-LM](https://github.com/microsoft/DeepSpeedExamples/tree/a79272cc8b8f0c5b66c803e581a1355341eacb77/Megatron-LM)实现,框架依赖[DeepSpeed](https://github.com/microsoft/DeepSpeed/tree/7d4d742bf03f8e1707130391e0b39bd6d93a702a) 以及pytorch,基于以上环境,对gpt-2 small、gpt-2 medium在单机单卡~4机32卡情况下进行了多组测试。 
+本测评基于微软[DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/a79272cc8b8f0c5b66c803e581a1355341eacb77) 仓库中的[Megatron-LM](https://github.com/microsoft/DeepSpeedExamples/tree/a79272cc8b8f0c5b66c803e581a1355341eacb77/Megatron-LM)实现,框架依赖[DeepSpeed](https://github.com/microsoft/DeepSpeed/tree/7d4d742bf03f8e1707130391e0b39bd6d93a702a) 以及pytorch,基于以上环境,对gpt-2 small在单机单卡~4机32卡情况下进行了多组测试。测评主要目标在于评价ZeRO不同优化阶段时的训练速度、加速比情况。 -测评背景:DeepSpeed是一个深度学习优化库,使分布式训练简单、高效和有效,DeepSpeed实现了[论文](https://arxiv.org/abs/1910.02054)中提出的ZeRo内存优化技术,减少了显存占用以支持更大模型,这里主要关注的是Optimizer state的拆分。 - -测评目标:主要验证的是stage 1的显存缩减。 +测评背景:DeepSpeed是一个深度学习优化库,使分布式训练简单、高效和有效,DeepSpeed实现了[论文](https://arxiv.org/abs/1910.02054)中提出的ZeRo内存优化技术,减少了显存占用以支持更大模型。 ## 环境 Environment -### 系统 +所有的测试都是在4台配置了8张 V100-SXM2-16GB GPU的服务器中,主要硬软件配置信息如下: - #### 硬件 - GPU:8x Tesla V100-SXM2-16GB + - InfiniBand 100 Gb/sec (4X EDR), Mellanox Technologies MT27700 Family + - Intel(R) Xeon(R) Gold 5118 CPU @ 2.30GHz + - Memory 384G - #### 软件 - - 驱动:NVIDIA 440.33.01 + - 驱动:Driver Version: 460.67 - - 系统:[ Ubuntu 16.04](http://releases.ubuntu.com/16.04/) + - 系统:[ Ubuntu 16.04.4 LTS (GNU/Linux 4.4.0-116-generic x86_64)](http://releases.ubuntu.com/16.04/) - CUDA:10.2 @@ -31,10 +32,36 @@ - Python:3.7.9 - #### 框架 - + - **pytorch 1.6.0** - **deepspeed 0.3.0+7d4d742** +- #### GPU拓扑 + +```python + + GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 CPU Affinity +GPU0 X NV1 NV1 NV2 NV2 SYS SYS SYS NODE 0-11,24-35 +GPU1 NV1 X NV2 NV1 SYS NV2 SYS SYS NODE 0-11,24-35 +GPU2 NV1 NV2 X NV2 SYS SYS NV1 SYS PIX 0-11,24-35 +GPU3 NV2 NV1 NV2 X SYS SYS SYS NV1 PIX 0-11,24-35 +GPU4 NV2 SYS SYS SYS X NV1 NV1 NV2 SYS 12-23,36-47 +GPU5 SYS NV2 SYS SYS NV1 X NV2 NV1 SYS 12-23,36-47 +GPU6 SYS SYS NV1 SYS NV1 NV2 X NV2 SYS 12-23,36-47 +GPU7 SYS SYS SYS NV1 NV2 NV1 NV2 X SYS 12-23,36-47 +mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X + +Legend: + + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + 
NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) + PIX = Connection traversing at most a single PCIe bridge + NV# = Connection traversing a bonded set of # NVLinks +``` + ## 快速开始 Quick Start @@ -190,19 +217,18 @@ class wikipedia(json_dataset): 将本仓库scripts目录下的文件放入`DeepSpeed/DeepSpeedExamples/Megatron-LM/scripts` -- ds_zero2_pretrain_gpt2_model_parallel.sh为测试使用的主要脚本 +- run_xxx_node.sh为单机~多机测试的启动脚本 +- runner.sh为训练主脚本(会在启动脚本中被调用) - ds_zero2_config.json为测试脚本相关的配置文件 -如果需要进行多机测试,需要在`DeepSpeed/DeepSpeedExamples/Megatron-LM`下新建集群的hosts文件,可参考本仓库中的deepspeed_hosts: +如需要进行多机测试,需要在`DeepSpeed/DeepSpeedExamples/Megatron-LM`下新建集群的hosts文件,可参考本仓库中的deepspeed_hosts: ```shell vs002 slots=8 vs003 slots=8 -vs004 slots=8 -vs005 slots=8 ``` -表示集群使用4台机器,每个机器使用8张GPU设备 +表示集群使用2台机器,每个机器使用8张GPU设备;4机及以上,相应地在deepspeed_hosts中增加集群配置即可。 ### 4.测试 @@ -217,95 +243,123 @@ vs005 slots=8 #### 参数及配置 -测试使用`ds_zero2_pretrain_gpt2_model_parallel.sh`脚本,其中,默认使用gpt-2 small的网络配置,主要参数如下: +测试主脚本为 `runner.sh` ,其中,默认使用gpt-2 small的网络配置,主要参数说明如下: -- BATCH_SIZE为单卡batch_size,默认为4 -- NUM_GPUS_PER_WORKER 每台机器使用的gpu数,默认为8 -- ZERO_STAGE zero优化阶段,默认为0,可选0,1,2(3目前DeepSpeed框架暂不支持) -- CHECKPOINT_ACTIVATIONS 是否开启亚线性activation优化,默认为off关闭 -- NUM_WORKERS 分布式训练集群中,机器节点数(单机情况下设为1;4机情况下设为4,根据情况设置) -- MP_SIZE 模型并行度,可以在1~NUM_GPUS_PER_WORKER数字中设置(默认1为不开启模型并行) -- ITER_NUM 测试迭代的iter数,默认迭代1000 iter +- `BATCH_SIZE_PER_DEVICE` 为单卡batch_size,默认为8 +- `NUM_WORKERS` 分布式训练集群中,机器节点数(单机情况下设为1;4机情况下设为4,根据情况设置) +- `NUM_GPUS_PER_WORKER` 每台机器使用的gpu数,默认为8 +- `ZERO_STAGE` zero优化阶段,默认为2,可选0,1,2(3目前DeepSpeed框架暂不支持) +- `CHECKPOINT_ACTIVATIONS` 是否开启activation/gradient checkpointing优化,默认为on开启 +- `MP_SIZE` 模型并行度,可以在1~NUM_GPUS_PER_WORKER数字中设置(默认1为不开启模型并行) +- `ITER_NUM` 测试迭代的iter数,默认迭代200 iter -**默认测试的网络为gpt2-small** 
,也可以修改脚本中的参数设置不同型号的gpt2网络,如: +配置相关的参数如下: ```shell -# gpt2-small -num_layers=12 -num_attention_heads=12 -hidden_size=768 -# # gpt2-medium -# num_layers=24 -# num_attention_heads=16 -# hidden_size=1024 +MODEL=${1:-gpt2-small} +BATCH_SIZE_PER_DEVICE=${2:-8} +NUM_WORKERS=${3:-1} +NUM_GPUS_PER_WORKER=${4:-8} +ZERO_STAGE=${5:-2} +CHECKPOINT_ACTIVATIONS=${6:-"on"} +DTYPE=${7:-'fp16'} +TEST_NUM=${8:-1} +ITER_NUM=${9:-200} ``` -配置相关的参数如下: +#### 单机测试 -```shell -BATCH_SIZE=${1:-4} -NUM_GPUS_PER_WORKER=${2:-8} -ZERO_STAGE=${3:-0} -CHECKPOINT_ACTIVATIONS=${4:-"off"} -NUM_WORKERS=${5:-1} -MP_SIZE=${6:-1} -ITER_NUM=${7:-1000} -``` +运行 `bash scripts/run_single_node.sh` 即可,默认测试条件为:gpt-2-small网络、batch size为8、fp16混合精度、zero-stage-2优化阶段,也可自定义参数,如不同的batch size和不同zero优化阶段: `bash scripts/run_single_node.sh gpt2-small 16 1` 。 + +#### 多机测试 + +多机测试时需要保证多机上的数据集及路径、ds_zero2_config.json配置完全一样,然后运行相应脚本即可,如2机,可运行: `bash scripts/run_two_node.sh` ,4机可运行: `bash scripts/run_multi_node.sh` 。 -#### 运行脚本 +#### 其他测试 -运行以下脚本将对单机单卡~4机32卡进行测试 +除了以上测试外,还可以通过设置不同参数进行多种类型测试。如,可以将CHECKPOINT_ACTIVATIONS设置为off来测试关闭checkpointing的情况,由于checkpointing关闭后,内存占用较大,故可以相应地降低batch size(如设置为4)。 + +off-checkpointing 的测试脚本示例如下: ```shell # 单机1卡 -bash scripts/ds_zero2_pretrain_gpt2_model_parallel.sh 4 1 0 off 1 +bash scripts/runner.sh gpt2-small 4 1 1 0 off # 单机4卡 -bash scripts/ds_zero2_pretrain_gpt2_model_parallel.sh 4 4 0 off 1 +bash scripts/runner.sh gpt2-small 4 1 4 0 off # 单机8卡 -bash scripts/ds_zero2_pretrain_gpt2_model_parallel.sh 4 8 0 off 1 +bash scripts/runner.sh gpt2-small 4 1 8 0 off # 2机16卡 -bash scripts/ds_zero2_pretrain_gpt2_model_parallel.sh 4 8 0 off 2 +bash scripts/runner.sh gpt2-small 4 2 8 0 off # 4机32卡 -bash scripts/ds_zero2_pretrain_gpt2_model_parallel.sh 4 8 0 off 4 +bash scripts/runner.sh gpt2-small 4 4 8 0 off ``` -## 测试结果 Performance +### 5. 
吞吐率及加速比 -#### 测试环境 +执行以下命令,即可根据logs文件计算出训练吞吐率及加速比: -所有的测试都是在4台配置了8张 V100-SXM2-16GB GPU的服务器中,主要硬软件配置信息: +`python3 extract_deepspeed_logs.py --log_dir=./logs/deepspeed/gpt2-small/bz8` -```shell -Tesla V100-SXM2-16GB x 8 -InfiniBand 100 Gb/sec (4X EDR), Mellanox Technologies MT27700 Family -Intel(R) Xeon(R) Gold 5118 CPU @ 2.30GHz -Memory 384G -Ubuntu 16.04.4 LTS (GNU/Linux 4.4.0-116-generic x86_64) -CUDA Version: 10.2, Driver Version: 440.33.01 +输出: - GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 CPU Affinity -GPU0 X NV1 NV1 NV2 NV2 SYS SYS SYS NODE 0-11,24-35 -GPU1 NV1 X NV2 NV1 SYS NV2 SYS SYS NODE 0-11,24-35 -GPU2 NV1 NV2 X NV2 SYS SYS NV1 SYS PIX 0-11,24-35 -GPU3 NV2 NV1 NV2 X SYS SYS SYS NV1 PIX 0-11,24-35 -GPU4 NV2 SYS SYS SYS X NV1 NV1 NV2 SYS 12-23,36-47 -GPU5 SYS NV2 SYS SYS NV1 X NV2 NV1 SYS 12-23,36-47 -GPU6 SYS SYS NV1 SYS NV1 NV2 X NV2 SYS 12-23,36-47 -GPU7 SYS SYS SYS NV1 NV2 NV1 NV2 X SYS 12-23,36-47 -mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X - -Legend: +```python3 - X = Self - SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) - NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node - PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) - PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) - PIX = Connection traversing at most a single PCIe bridge - NV# = Connection traversing a bonded set of # NVLinks ``` -#### 测试结果 + + +## 测试结果 Result + +### Gpt2-small & AMP + +#### zero-stage-0 + +| node_num | batch_size_per_device | gpu_num_per_node | samples/s | speedup | +| -------- | --------------------- | ---------------- | --------- | ------- | +| 1 | 8 | 1 | | 1.00 | +| 1 | 8 | 4 | | | +| 1 | 8 | 8 | | | +| 2 | 8 | 8 | | | +| 4 | 8 | 8 | | | +| 1 | 16(max) | 1 | | 1.00 | +| 1 | 16 | 4 | | | +| 1 | 16 | 8 | | | +| 2 | 16 | 8 | | | +| 4 | 16 | 8 | | | + +#### zero-stage-1 + +| node_num | 
batch_size_per_device | gpu_num_per_node | samples/s | speedup | +| -------- | --------------------- | ---------------- | --------- | ------- | +| 1 | 8 | 1 | | 1.00 | +| 1 | 8 | 4 | | | +| 1 | 8 | 8 | | | +| 2 | 8 | 8 | | | +| 4 | 8 | 8 | | | +| 1 | 16(max) | 1 | | 1.00 | +| 1 | 16 | 4 | | | +| 1 | 16 | 8 | | | +| 2 | 16 | 8 | | | +| 4 | 16 | 8 | | | + +#### zero-stage-2 + +| node_num | batch_size_per_device | gpu_num_per_node | samples/s | speedup | +| -------- | --------------------- | ---------------- | --------- | ------- | +| 1 | 8 | 1 | | 1.00 | +| 1 | 8 | 4 | | | +| 1 | 8 | 8 | | | +| 2 | 8 | 8 | | | +| 4 | 8 | 8 | | | +| 1 | 16(max) | 1 | | 1.00 | +| 1 | 16 | 4 | | | +| 1 | 16 | 8 | | | +| 2 | 16 | 8 | | | +| 4 | 16 | 8 | | | + + + +#### off-checkpointing测试结果 | date | test_num | test_desc | xn_xg_xdp_xmp_xbs | gpu_mem(mB) | gpu_util(%) | throuthput(sample/sec) | | -------- | -------- | ------------ | ----------------- | ------------- | ----------- | ---------------------- | @@ -328,7 +382,9 @@ Legend: ### 日志下载 -详细 Log 信息可点击下载:[deepspeed-logs.zip](https://oneflow-public.oss-cn-beijing.aliyuncs.com/DLPerf/logs/DeepSpeed/gpt2/osdi-deepspeed-logs.zip) +详细 Log 信息可点击下载: + +- [deepspeed-off-checkpointing-logs.zip](https://oneflow-public.oss-cn-beijing.aliyuncs.com/DLPerf/logs/DeepSpeed/gpt2/deepspeed-off-checkpointing-logs.zip) From 81bcb12415a72aacebce22f1a0af358db57af8b3 Mon Sep 17 00:00:00 2001 From: lyon Date: Tue, 20 Apr 2021 10:18:44 +0800 Subject: [PATCH 7/9] update data --- DeepSpeed/Megatron-LM/README.md | 48 ++++++++++++++++----------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/DeepSpeed/Megatron-LM/README.md b/DeepSpeed/Megatron-LM/README.md index 61bdc52b..4db406c8 100644 --- a/DeepSpeed/Megatron-LM/README.md +++ b/DeepSpeed/Megatron-LM/README.md @@ -316,45 +316,45 @@ bash scripts/runner.sh gpt2-small 4 4 8 0 off | node_num | batch_size_per_device | gpu_num_per_node | samples/s | speedup | | -------- | --------------------- | 
---------------- | --------- | ------- | -| 1 | 8 | 1 | | 1.00 | -| 1 | 8 | 4 | | | -| 1 | 8 | 8 | | | -| 2 | 8 | 8 | | | +| 1 | 8 | 1 | 22.65 | 1 | +| 1 | 8 | 4 | 89.1 | 3.93 | +| 1 | 8 | 8 | 178.09 | 7.86 | +| 2 | 8 | 8 | 292.63 | 12.92 | | 4 | 8 | 8 | | | -| 1 | 16(max) | 1 | | 1.00 | -| 1 | 16 | 4 | | | -| 1 | 16 | 8 | | | -| 2 | 16 | 8 | | | +| 1 | 16(max) | 1 | 23.96 | 1 | +| 1 | 16 | 4 | 95.16 | 3.97 | +| 1 | 16 | 8 | 190.36 | 7.94 | +| 2 | 16 | 8 | 332.98 | 13.9 | | 4 | 16 | 8 | | | #### zero-stage-1 | node_num | batch_size_per_device | gpu_num_per_node | samples/s | speedup | | -------- | --------------------- | ---------------- | --------- | ------- | -| 1 | 8 | 1 | | 1.00 | -| 1 | 8 | 4 | | | -| 1 | 8 | 8 | | | -| 2 | 8 | 8 | | | +| 1 | 8 | 1 | 21.93 | 1 | +| 1 | 8 | 4 | 88.29 | 4.03 | +| 1 | 8 | 8 | 176.66 | 8.06 | +| 2 | 8 | 8 | 290.72 | 13.26 | | 4 | 8 | 8 | | | -| 1 | 16(max) | 1 | | 1.00 | -| 1 | 16 | 4 | | | -| 1 | 16 | 8 | | | -| 2 | 16 | 8 | | | +| 1 | 16(max) | 1 | 23.5 | 1 | +| 1 | 16 | 4 | 94.96 | 4.04 | +| 1 | 16 | 8 | 189.44 | 8.06 | +| 2 | 16 | 8 | 334.97 | 14.25 | | 4 | 16 | 8 | | | #### zero-stage-2 | node_num | batch_size_per_device | gpu_num_per_node | samples/s | speedup | | -------- | --------------------- | ---------------- | --------- | ------- | -| 1 | 8 | 1 | | 1.00 | -| 1 | 8 | 4 | | | -| 1 | 8 | 8 | | | -| 2 | 8 | 8 | | | +| 1 | 8 | 1 | 22.08 | 1 | +| 1 | 8 | 4 | 90.92 | 4.12 | +| 1 | 8 | 8 | 183.7 | 8.32 | +| 2 | 8 | 8 | 278.19 | 12.6 | | 4 | 8 | 8 | | | -| 1 | 16(max) | 1 | | 1.00 | -| 1 | 16 | 4 | | | -| 1 | 16 | 8 | | | -| 2 | 16 | 8 | | | +| 1 | 16(max) | 1 | 23.65 | 1 | +| 1 | 16 | 4 | 96.07 | 4.06 | +| 1 | 16 | 8 | 193.01 | 8.16 | +| 2 | 16 | 8 | 330.08 | 13.96 | | 4 | 16 | 8 | | | From 773d64e8d25b8b18a0c614d80bc1dc7b4785f1f3 Mon Sep 17 00:00:00 2001 From: lyon Date: Thu, 22 Apr 2021 16:10:29 +0800 Subject: [PATCH 8/9] update scripts --- DeepSpeed/Megatron-LM/scripts/run_multi_node.sh | 5 ----- 
DeepSpeed/Megatron-LM/scripts/run_single_node.sh | 2 -- 2 files changed, 7 deletions(-) diff --git a/DeepSpeed/Megatron-LM/scripts/run_multi_node.sh b/DeepSpeed/Megatron-LM/scripts/run_multi_node.sh index c313196e..adc8dc93 100644 --- a/DeepSpeed/Megatron-LM/scripts/run_multi_node.sh +++ b/DeepSpeed/Megatron-LM/scripts/run_multi_node.sh @@ -7,11 +7,6 @@ CHECKPOINT_ACTIVATIONS=${4:-"on"} DTYPE=${5:-'fp16'} TEST_NUM=${6:-5} -# export NODE1=10.11.0.2 -# export NODE2=10.11.0.3 -# export NODE3=10.11.0.4 -# export NODE4=10.11.0.5 - i=1 while [ $i -le ${TEST_NUM} ] diff --git a/DeepSpeed/Megatron-LM/scripts/run_single_node.sh b/DeepSpeed/Megatron-LM/scripts/run_single_node.sh index 8980833a..7f5ea686 100644 --- a/DeepSpeed/Megatron-LM/scripts/run_single_node.sh +++ b/DeepSpeed/Megatron-LM/scripts/run_single_node.sh @@ -8,8 +8,6 @@ DTYPE=${5:-'fp16'} TEST_NUM=${6:-5} -# export NODE1=10.11.0.2 - i=1 while [ $i -le ${TEST_NUM} ] From 0c9a475d81e8e21045a562fccb8f7944b24caae6 Mon Sep 17 00:00:00 2001 From: lyon Date: Thu, 22 Apr 2021 16:10:54 +0800 Subject: [PATCH 9/9] add multi node testing data --- DeepSpeed/Megatron-LM/README.md | 72 ++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 9 deletions(-) diff --git a/DeepSpeed/Megatron-LM/README.md b/DeepSpeed/Megatron-LM/README.md index 4db406c8..62f5a1a3 100644 --- a/DeepSpeed/Megatron-LM/README.md +++ b/DeepSpeed/Megatron-LM/README.md @@ -298,12 +298,56 @@ bash scripts/runner.sh gpt2-small 4 4 8 0 off 执行以下命令,即可根据logs文件计算出训练吞吐率及加速比: -`python3 extract_deepspeed_logs.py --log_dir=./logs/deepspeed/gpt2-small/bz8` +`python3 extract_deepspeed_logs.py --log_dir=./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16` 输出: ```python3 - +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/1n1g/gpt2-small_b16_fp16_4.log {4: 23.7} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/1n1g/gpt2-small_b16_fp16_5.log {4: 23.7, 5: 23.6} 
+./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/1n1g/gpt2-small_b16_fp16_2.log {4: 23.7, 5: 23.6, 2: 23.7} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/1n1g/gpt2-small_b16_fp16_1.log {4: 23.7, 5: 23.6, 2: 23.7, 1: 23.58} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/1n8g/gpt2-small_b16_fp16_4.log {4: 193.13} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/1n8g/gpt2-small_b16_fp16_5.log {4: 193.13, 5: 193.01} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/1n8g/gpt2-small_b16_fp16_2.log {4: 193.13, 5: 193.01, 2: 192.99} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/1n8g/gpt2-small_b16_fp16_3.log {4: 193.13, 5: 193.01, 2: 192.99, 3: 193.01} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/1n8g/gpt2-small_b16_fp16_1.log {4: 193.13, 5: 193.01, 2: 192.99, 3: 193.01, 1: 193.44} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/4n8g/gpt2-small_b16_fp16_4.log {4: 630.52} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/4n8g/gpt2-small_b16_fp16_5.log {4: 630.52, 5: 629.65} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/4n8g/gpt2-small_b16_fp16_2.log {4: 630.52, 5: 629.65, 2: 631.74} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/4n8g/gpt2-small_b16_fp16_3.log {4: 630.52, 5: 629.65, 2: 631.74, 3: 630.87} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/4n8g/gpt2-small_b16_fp16_1.log {4: 630.52, 5: 629.65, 2: 631.74, 3: 630.87, 1: 637.21} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/2n8g/gpt2-small_b16_fp16_4.log {4: 330.14} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/2n8g/gpt2-small_b16_fp16_5.log {4: 330.14, 5: 330.08} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/2n8g/gpt2-small_b16_fp16_2.log {4: 330.14, 5: 330.08, 2: 325.84} 
+./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/2n8g/gpt2-small_b16_fp16_3.log {4: 330.14, 5: 330.08, 2: 325.84, 3: 324.32} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/2n8g/gpt2-small_b16_fp16_1.log {4: 330.14, 5: 330.08, 2: 325.84, 3: 324.32, 1: 330.73} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/1n4g/gpt2-small_b16_fp16_4.log {4: 95.99} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/1n4g/gpt2-small_b16_fp16_5.log {4: 95.99, 5: 96.1} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/1n4g/gpt2-small_b16_fp16_2.log {4: 95.99, 5: 96.1, 2: 96.18} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/1n4g/gpt2-small_b16_fp16_3.log {4: 95.99, 5: 96.1, 2: 96.18, 3: 96.07} +./logs/logs-20210414-stage2-on-activation/deepspeed/gpt2-small/bz16/1n4g/gpt2-small_b16_fp16_1.log {4: 95.99, 5: 96.1, 2: 96.18, 3: 96.07, 1: 95.99} +{'gpt2-small': {'1n1g': {'average_speed': 23.64, + 'batch_size_per_device': 16, + 'median_speed': 23.65, + 'speedup': 1.0}, + '1n4g': {'average_speed': 96.07, + 'batch_size_per_device': 16, + 'median_speed': 96.07, + 'speedup': 4.06}, + '1n8g': {'average_speed': 193.12, + 'batch_size_per_device': 16, + 'median_speed': 193.01, + 'speedup': 8.16}, + '2n8g': {'average_speed': 328.22, + 'batch_size_per_device': 16, + 'median_speed': 330.08, + 'speedup': 13.96}, + '4n8g': {'average_speed': 632.0, + 'batch_size_per_device': 16, + 'median_speed': 630.87, + 'speedup': 26.68}}} +Saving result to ./result/bz16_result.json ``` @@ -320,12 +364,12 @@ bash scripts/runner.sh gpt2-small 4 4 8 0 off | 1 | 8 | 4 | 89.1 | 3.93 | | 1 | 8 | 8 | 178.09 | 7.86 | | 2 | 8 | 8 | 292.63 | 12.92 | -| 4 | 8 | 8 | | | +| 4 | 8 | 8 | 581.0 | 25.65 | | 1 | 16(max) | 1 | 23.96 | 1 | | 1 | 16 | 4 | 95.16 | 3.97 | | 1 | 16 | 8 | 190.36 | 7.94 | | 2 | 16 | 8 | 332.98 | 13.9 | -| 4 | 16 | 8 | | | +| 4 | 16 | 8 | 660.7 | 27.58 | #### zero-stage-1 @@ -335,12 +379,12 @@ 
bash scripts/runner.sh gpt2-small 4 4 8 0 off | 1 | 8 | 4 | 88.29 | 4.03 | | 1 | 8 | 8 | 176.66 | 8.06 | | 2 | 8 | 8 | 290.72 | 13.26 | -| 4 | 8 | 8 | | | +| 4 | 8 | 8 | 572.33 | 26.1 | | 1 | 16(max) | 1 | 23.5 | 1 | | 1 | 16 | 4 | 94.96 | 4.04 | | 1 | 16 | 8 | 189.44 | 8.06 | | 2 | 16 | 8 | 334.97 | 14.25 | -| 4 | 16 | 8 | | | +| 4 | 16 | 8 | 648.69 | 27.6 | #### zero-stage-2 @@ -350,12 +394,18 @@ bash scripts/runner.sh gpt2-small 4 4 8 0 off | 1 | 8 | 4 | 90.92 | 4.12 | | 1 | 8 | 8 | 183.7 | 8.32 | | 2 | 8 | 8 | 278.19 | 12.6 | -| 4 | 8 | 8 | | | +| 4 | 8 | 8 | 548.5 | 24.84 | | 1 | 16(max) | 1 | 23.65 | 1 | | 1 | 16 | 4 | 96.07 | 4.06 | | 1 | 16 | 8 | 193.01 | 8.16 | | 2 | 16 | 8 | 330.08 | 13.96 | -| 4 | 16 | 8 | | | +| 4 | 16 | 8 | 630.87 | 26.68 | + +注: + +1.zero-stage-0表示未开启zero优化,zero-stage-1、zero-stage-2则分别表示开启1阶段、2阶段优化。 + +2.通过对比各优化阶段的速度和加速比可以看出,开启stage-1、stage-2优化后,速度和加速比并没有提升(甚至有一定程度的下降),因为zero 1、2阶段优化的主要目标是缩减GPU显存占用,而不是优化速度。 @@ -376,7 +426,9 @@ bash scripts/runner.sh gpt2-small 4 4 8 0 off | 20201209 | test-18-1 | zero-stage-0 | 4n_8g_dp_4bs | 15336 | 95 | 533 | | | test-18-2 | zero-stage-1 | 4n_8g_dp_4bs | 13975(-1361,↓8.9%) | 92 | 506(↓5.3%) | -注:xn_xg_xdp_xmp_xbs表示x node, x gpu, x data parallel, x model parallel, x batch size per gpu +注: + +1.xn_xg_xdp_xmp_xbs表示x node, x gpu, x data parallel, x model parallel, x batch size per gpu @@ -384,6 +436,8 @@ bash scripts/runner.sh gpt2-small 4 4 8 0 off 详细 Log 信息可点击下载: +- [logs.zip](https://oneflow-public.oss-cn-beijing.aliyuncs.com/DLPerf/logs/DeepSpeed/gpt2/logs.zip) + - [deepspeed-off-checkpointing-logs.zip](https://oneflow-public.oss-cn-beijing.aliyuncs.com/DLPerf/logs/DeepSpeed/gpt2/deepspeed-off-checkpointing-logs.zip)