diff --git a/PaddlePaddle/PLSC/README.md b/PaddlePaddle/PLSC/README.md new file mode 100644 index 00000000..83628982 --- /dev/null +++ b/PaddlePaddle/PLSC/README.md @@ -0,0 +1,235 @@ +# Overview + +本次复现采用了[PaddlePaddle-PLSC官方仓库](https://github.com/PaddlePaddle/PLSC/tree/9bba9c90f542a5e1d8e6d461fcd1f6af40da0918)中的paddle版的arcface人脸分类模型,目的在于速度测评,同时根据测速结果给出1机、2机、4机情况下的加速比,评判框架在分布式多机训练情况下的横向拓展能力。 + +目前,该测试覆盖了FP32精度,后续将持续维护,增加更多方式的测评。 + + + +# Environment + +## 系统 + +- 系统:Ubuntu 16.04.4 LTS (GNU/Linux 4.4.0-116-generic x86_64) +- 显卡:Tesla V100-SXM2-16GB x 8 +- 驱动:NVIDIA 440.33.01 +- CUDA:10.2 +- cuDNN:7.6.5 +- NCCL:2.7.3 + +## 框架 + +- **paddlepaddle-gpu==1.8.5.post107** + +## Feature support matrix + +| Feature | Paddle | +| ----------------------------- | ------ | +| Multi-node,multi-gpu training | Yes | +| NVIDIA NCCL | Yes | + +# Quick Start + +## 项目代码 + +- [PaddlePaddle-PLSC官方仓库](https://github.com/PaddlePaddle/PLSC/tree/9bba9c90f542a5e1d8e6d461fcd1f6af40da0918) + +下载官方源码: + +```shell +git clone https://github.com/PaddlePaddle/PLSC.git +cd PLSC +git checkout 9bba9c90f542a5e1d8e6d461fcd1f6af40da0918 +``` + +将本页面中scripts文件夹中的脚本和代码全部放入:`PLSC/`路径下。 + +修改PLSC/plsc/entry.py,在[Line:984](https://github.com/PaddlePaddle/PLSC/blob/9bba9c90f542a5e1d8e6d461fcd1f6af40da0918/plsc/entry.py#L984)下加入代码: + +```python +local_train_info = [[], [], [], []] + if batch_id==150: + exit() +``` + +以使测试进行150 iter 后自动退出。 + + + +## 依赖安装 + + +### 环境 + +1.本测试使用 conda 环境, 可以使用如下命令创建 plsc 环境: + + +``` +conda env create -f environment.yaml +``` + +### NCCL + +paddle的分布式训练底层依赖NCCL库,需要从[NVIDIA-NCCL官网下载](https://developer.nvidia.com/nccl/nccl-download)并安装和操作系统、CUDA版本适配的NCCL。本次测试中安装2.7.3版本的NCCL: + +```shell +sudo dpkg -i nccl-repo-ubuntu1604-2.7.3-ga-cuda10.2_1-1_amd64.deb +sudo apt update +sudo apt install libnccl2=2.7.3-1+cuda10.2 libnccl-dev=2.7.3-1+cuda10.2 +``` + +## 数据集 + 
+本次训练使用虚拟合成数据,无需准备数据集,如需真实训练数据集准备过程可参考[官方README](https://github.com/PaddlePaddle/PLSC/blob/master/docs/source/md/quick_start.md#%E6%95%B0%E6%8D%AE%E5%87%86%E5%A4%87)
+
+
+# Training
+
+集群中有4台节点:
+
+
+- NODE1=10.11.0.2
+- NODE2=10.11.0.3
+- NODE3=10.11.0.4
+- NODE4=10.11.0.5
+
+每个节点有8张显卡,这里默认使用resnet50作为backbone,batch size设为128,分别在1机1卡~4机32卡的情况下进行了多组训练。
+
+## 单机
+
+`PLSC/`目录下,执行脚本:
+
+```shell
+bash run_single_node.sh
+```
+
+对单机1卡、4卡、8卡分别做5组测试,默认测试fp32精度,batch_size=128。
+
+## 2机16卡
+
+2机、4机等多机情况下,需要在所有机器节点上相同路径准备同样的代码和脚本以完成分布式训练。
+
+如2机:NODE1='10.11.0.2' NODE2='10.11.0.3' 的训练,需在NODE1节点`PLSC/`目录下执行脚本:
+
+```shell
+bash run_two_node.sh r50 128 fp32 10.11.0.2 5
+```
+
+NODE2节点`PLSC/`目录下,执行:
+
+```shell
+bash run_two_node.sh r50 128 fp32 10.11.0.3 5
+```
+
+## 4机32卡
+
+流程同上,在4个机器节点上分别执行(注意:4机训练使用 run_multi_node.sh,其内部以 node_num=4 启动;run_two_node.sh 固定为 2 机):
+
+```shell
+bash run_multi_node.sh r50 128 fp32 $NODE 5
+```
+
+# Result
+
+## 吞吐率及加速比
+
+执行以下命令,即可计算各种测试配置下的吞吐率及加速比:
+
+```shell
+python extract_paddle_logs.py --log_dir=logs/paddle-plsc/arcface/bz128 --batch_size_per_device=128
+```
+
+输出:
+
+```shell
+logs/paddle-plsc/arcface/bz128/4n8g/r50_b128_fp32_1.log {1: 11165.17}
+logs/paddle-plsc/arcface/bz128/4n8g/r50_b128_fp32_4.log {1: 11165.17, 4: 11077.58}
+logs/paddle-plsc/arcface/bz128/4n8g/r50_b128_fp32_2.log {1: 11165.17, 4: 11077.58, 2: 11109.91}
+logs/paddle-plsc/arcface/bz128/4n8g/r50_b128_fp32_3.log {1: 11165.17, 4: 11077.58, 2: 11109.91, 3: 11084.53}
+logs/paddle-plsc/arcface/bz128/4n8g/r50_b128_fp32_5.log {1: 11165.17, 4: 11077.58, 2: 11109.91, 3: 11084.53, 5: 10997.67}
+logs/paddle-plsc/arcface/bz128/1n8g/r50_b128_fp32_1.log {1: 2536.04}
+logs/paddle-plsc/arcface/bz128/1n8g/r50_b128_fp32_4.log {1: 2536.04, 4: 2554.13}
+logs/paddle-plsc/arcface/bz128/1n8g/r50_b128_fp32_2.log {1: 2536.04, 4: 2554.13, 2: 2545.3}
+logs/paddle-plsc/arcface/bz128/1n8g/r50_b128_fp32_3.log {1: 2536.04, 4: 2554.13, 2: 2545.3, 3: 2563.28}
+logs/paddle-plsc/arcface/bz128/1n8g/r50_b128_fp32_5.log {1: 2536.04, 4: 2554.13, 2: 2545.3, 3: 2563.28, 5: 2542.14}
+logs/paddle-plsc/arcface/bz128/1n4g/r50_b128_fp32_1.log {1: 1555.18} +logs/paddle-plsc/arcface/bz128/1n4g/r50_b128_fp32_4.log {1: 1555.18, 4: 1539.66} +logs/paddle-plsc/arcface/bz128/1n4g/r50_b128_fp32_2.log {1: 1555.18, 4: 1539.66, 2: 1534.62} +logs/paddle-plsc/arcface/bz128/1n4g/r50_b128_fp32_3.log {1: 1555.18, 4: 1539.66, 2: 1534.62, 3: 1540.66} +logs/paddle-plsc/arcface/bz128/1n4g/r50_b128_fp32_5.log {1: 1555.18, 4: 1539.66, 2: 1534.62, 3: 1540.66, 5: 1535.04} +logs/paddle-plsc/arcface/bz128/1n1g/r50_b128_fp32_1.log {1: 397.44} +logs/paddle-plsc/arcface/bz128/1n1g/r50_b128_fp32_4.log {1: 397.44, 4: 397.64} +logs/paddle-plsc/arcface/bz128/1n1g/r50_b128_fp32_2.log {1: 397.44, 4: 397.64, 2: 398.05} +logs/paddle-plsc/arcface/bz128/1n1g/r50_b128_fp32_3.log {1: 397.44, 4: 397.64, 2: 398.05, 3: 398.98} +logs/paddle-plsc/arcface/bz128/1n1g/r50_b128_fp32_5.log {1: 397.44, 4: 397.64, 2: 398.05, 3: 398.98, 5: 397.78} +logs/paddle-plsc/arcface/bz128/2n8g/r50_b128_fp32_1.log {1: 5950.34} +logs/paddle-plsc/arcface/bz128/2n8g/r50_b128_fp32_4.log {1: 5950.34, 4: 5961.68} +logs/paddle-plsc/arcface/bz128/2n8g/r50_b128_fp32_2.log {1: 5950.34, 4: 5961.68, 2: 5953.84} +logs/paddle-plsc/arcface/bz128/2n8g/r50_b128_fp32_3.log {1: 5950.34, 4: 5961.68, 2: 5953.84, 3: 5982.75} +logs/paddle-plsc/arcface/bz128/2n8g/r50_b128_fp32_5.log {1: 5950.34, 4: 5961.68, 2: 5953.84, 3: 5982.75, 5: 5934.43} +{'r50': {'1n1g': {'average_speed': 397.98, + 'batch_size_per_device': 128, + 'median_speed': 397.78, + 'speedup': 1.0}, + '1n4g': {'average_speed': 1541.03, + 'batch_size_per_device': 128, + 'median_speed': 1539.66, + 'speedup': 3.87}, + '1n8g': {'average_speed': 2548.18, + 'batch_size_per_device': 128, + 'median_speed': 2545.3, + 'speedup': 6.4}, + '2n8g': {'average_speed': 5956.61, + 'batch_size_per_device': 128, + 'median_speed': 5953.84, + 'speedup': 14.97}, + '4n8g': {'average_speed': 11086.97, + 'batch_size_per_device': 128, + 'median_speed': 11084.53, + 'speedup': 27.87}}} +Saving result 
to ./result/bz128_result.json
+```
+
+## 计算规则
+
+### 1.测速脚本
+
+- extract_paddle_logs.py
+
+extract_paddle_logs.py根据官方在log中打印的速度,在150个iter中,排除前50iter,取后100个iter的速度做平均;
+
+### 2.均值速度和中值速度
+
+- average_speed均值速度
+
+- median_speed中值速度
+
+ 每个batch size进行5次训练测试,记为一组,每一组取average_speed为均值速度,median_speed为中值速度
+
+### 3.加速比以中值速度计算
+
+脚本和表格中的 **加速比** 是以单机单卡下的中值速度为基准进行计算的。例如:
+
+单机单卡情况下速度为200(samples/s),单机2卡速度为400,单机4卡速度为700,则加速比分别为:1.0、2.0、3.5
+
+
+
+## ResNet 50 FP32
+
+
+### batch size = 128
+
+| node_num | gpu_num | samples/s | speedup |
+| -------- | ------- | --------- | ------- |
+| 1 | 1 | 397.78 | 1 |
+| 1 | 4 | 1539.66 | 3.87 |
+| 1 | 8 | 2545.3 | 6.4 |
+| 2 | 16 | 5953.84 | 14.97 |
+| 4 | 32 | 11084.53 | 27.87 |
+
+
+
+
+## 完整日志
+
+[logs-20210312](https://oneflow-public.oss-cn-beijing.aliyuncs.com/DLPerf/logs/PaddlePaddle/plsc/logs-20210312.zip)
\ No newline at end of file
diff --git a/PaddlePaddle/PLSC/extract_paddle_logs.py b/PaddlePaddle/PLSC/extract_paddle_logs.py
new file mode 100755
index 00000000..1f8169cf
--- /dev/null
+++ b/PaddlePaddle/PLSC/extract_paddle_logs.py
@@ -0,0 +1,132 @@
+import os
+import re
+import sys
+import glob
+import json
+import argparse
+import pprint
+
+import numpy as np
+
+pp = pprint.PrettyPrinter(indent=1)
+os.chdir(sys.path[0])
+
+parser = argparse.ArgumentParser(description="flags for benchmark")
+parser.add_argument("--log_dir", type=str, default="./logs/paddle-plsc/arcface", required=True)
+parser.add_argument("--output_dir", type=str, default="./result", required=False)
+parser.add_argument('--warmup_batches', type=int, default=50)
+parser.add_argument('--train_batches', type=int, default=150)
+parser.add_argument('--batch_size_per_device', type=int, default=128)
+
+args = parser.parse_args()
+
+
+class AutoVivification(dict):
+    """Dict that creates a nested AutoVivification on missing keys
+    (perl-style autovivification)."""
+
+    def __getitem__(self, item):
+        try:
+            return dict.__getitem__(self, item)
+        except KeyError:
+            value = self[item] = type(self)()
+            return value
+
+
+# Hoisted out of the per-line loop: the original compiled the pattern on
+# every matching log line.
+QPS_PATTERN = re.compile(r" qps:([0-9]+\.[0-9]+)", re.S)
+# Run-case directory names look like "<nodes>n<gpus>g", e.g. "1n8g", "2n16g".
+# A regex handles multi-digit counts; the original indexed by string length.
+CASE_PATTERN = re.compile(r"(\d+)n(\d+)g")
+
+
+def extract_info_from_file(log_file, result_dict, speed_dict):
+    """Parse one training log and record its averaged throughput (qps).
+
+    Log files are named <model>_b<batch>_<precision>_<repeat>.log and live
+    inside a "<nodes>n<gpus>g" directory, e.g. 1n8g/r50_b128_fp32_3.log.
+    """
+    fname = os.path.basename(log_file)
+    run_case = log_file.split("/")[-2]  # eg: 1n1g
+    model = fname.split("_")[0]
+    batch_size = int(fname.split("_")[1].strip("b"))
+    # repeat index; split on "." instead of the fragile strip(".log")
+    test_iter = int(fname.split("_")[3].split(".")[0])
+    node_num = int(CASE_PATTERN.match(run_case).group(1))
+
+    # Collect the qps value of every logged step (only epoch 0 is trained,
+    # so every progress line starts with "Pass:0 batch:").
+    avg_speed_list = []
+    with open(log_file) as f:
+        for line in f:
+            if "Pass:0 batch:" in line:
+                item = QPS_PATTERN.findall(line)
+                avg_speed_list.append(float(item[0].strip()))
+
+    # qps is logged every 10 iters (log_period=10 in train.py), so the
+    # warmup window maps to warmup_batches//10 entries.
+    # NOTE(review): the node_num scaling assumes every node contributes its
+    # own "Pass:0 batch:" lines to this log file -- confirm with real logs.
+    begin = args.warmup_batches // 10 * node_num
+    end = args.train_batches // 10 * node_num
+    avg_speed = round(np.mean(avg_speed_list[begin:end]), 2)
+
+    result_dict[model][run_case]['average_speed'] = avg_speed
+    result_dict[model][run_case]['batch_size_per_device'] = batch_size
+    speed_dict[model][run_case][test_iter] = avg_speed
+
+    print(log_file, speed_dict[model][run_case])
+
+
+def compute_median(iter_dict):
+    """Median of the per-repeat speeds, rounded to 2 decimals."""
+    return round(np.median(list(iter_dict.values())), 2)
+
+
+def compute_average(iter_dict):
+    """Arithmetic mean of the per-repeat speeds, rounded to 2 decimals."""
+    return round(sum(iter_dict.values()) / len(iter_dict), 2)
+
+
+def compute_speedup(result_dict, speed_dict):
+    """Fill average/median speed for every case and the speedup relative to
+    the single-device (1n1g) median.  Without a 1n1g baseline the speedup
+    defaults to 1.0 but the medians are still recorded (the original left
+    median_speed unset in that case)."""
+    for model in result_dict:
+        for case in result_dict[model]:
+            result_dict[model][case]['average_speed'] = compute_average(speed_dict[model][case])
+            result_dict[model][case]['median_speed'] = compute_median(speed_dict[model][case])
+        # Membership test avoids autovivifying an empty '1n1g' entry, which
+        # the original did via result_dict[m]['1n1g']['average_speed'].
+        baseline = compute_median(speed_dict[model]['1n1g']) if '1n1g' in speed_dict[model] else 0.0
+        for case in result_dict[model]:
+            speed_up = result_dict[model][case]['median_speed'] / baseline if baseline else 1.0
+            result_dict[model][case]['speedup'] = round(speed_up, 2)
+
+
+def extract_result():
+    """Walk log_dir, aggregate per-case speeds, print and dump them as JSON."""
+    result_dict = AutoVivification()
+    speed_dict = AutoVivification()
+    for log in glob.glob(os.path.join(args.log_dir, "*/*.log")):
+        extract_info_from_file(log, result_dict, speed_dict)
+
+    # compute speedup
+    compute_speedup(result_dict, speed_dict)
+
+    # print result
+    pp.pprint(result_dict)
+
+    # write to file as JSON format; the file is named after the last path
+    # component of log_dir (eg: "bz128" -> bz128_result.json)
+    os.makedirs(args.output_dir, exist_ok=True)
+    framework = args.log_dir.split('/')[-1]
+    result_file_name = os.path.join(args.output_dir, framework + "_result.json")
+    print("Saving result to {}".format(result_file_name))
+    with open(result_file_name, 'w') as f:
+        json.dump(result_dict, f)
+
+
+if __name__ == "__main__":
+    extract_result()
+
diff --git a/PaddlePaddle/PLSC/scripts/environment.yaml b/PaddlePaddle/PLSC/scripts/environment.yaml
new file mode 100644
index 00000000..ffaab118
--- /dev/null
+++ b/PaddlePaddle/PLSC/scripts/environment.yaml
@@ -0,0 +1,14 @@
+name: plsc
+dependencies:
+  - python=3.7
+  - pip
+  - requests
+  - ipython
+  - jupyter
+  - numpy=1.18.*
+  - scipy==1.3.1
+  - scikit-learn
+  - opencv==3.4.2
+  - pip:
+    - paddlepaddle-gpu==1.8.5.post107
+    - easydict
\ No newline at end of file
diff --git a/PaddlePaddle/PLSC/scripts/run_multi_node.sh b/PaddlePaddle/PLSC/scripts/run_multi_node.sh
new file mode 100644
index 00000000..3bdd5f53
--- /dev/null
+++ b/PaddlePaddle/PLSC/scripts/run_multi_node.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/bash
+export NODE1=10.11.0.2
+export NODE2=10.11.0.3
+export NODE3=10.11.0.4
+export NODE4=10.11.0.5
+shell_folder=$(dirname $(readlink -f "$0"))
+model=${1:-r50}
+batch_size_per_device=${2:-128}
+dtype=${3:-'fp32'}
+current_node=${4:-$NODE1}
+test_num=${5:-5}
+
+
+# Run test_num repeats of 4-node x 8-GPU training.
+i=1
+while [ $i -le ${test_num} ]
+do
+    bash $shell_folder/runner.sh ${model} ${batch_size_per_device} 0,1,2,3,4,5,6,7 4 $dtype $current_node ${i}
+    echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< "
+    let i++
+    sleep 20s
+done
+
diff --git a/PaddlePaddle/PLSC/scripts/run_single_node.sh b/PaddlePaddle/PLSC/scripts/run_single_node.sh
new file mode 100644
index 00000000..4eb681a3
--- /dev/null
+++ b/PaddlePaddle/PLSC/scripts/run_single_node.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/bash
+export NODE1=10.11.0.2
+export NODE2=10.11.0.3
+export NODE3=10.11.0.4
+export NODE4=10.11.0.5
+shell_folder=$(dirname $(readlink -f "$0"))
+model=${1:-r50}
+batch_size_per_device=${2:-128}
+dtype=${3:-'fp32'}
+current_node=${4:-$NODE1}
+test_num=${5:-5}
+
+
+i=1
+while [ $i -le ${test_num} ]
+do
+    bash $shell_folder/runner.sh ${model} ${batch_size_per_device} 0 1 $dtype $current_node ${i}
+    echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< "
+    let i++
+    sleep 20s
+done
+
+
+i=1
+while [ $i -le ${test_num} ]
+do
+    bash $shell_folder/runner.sh ${model} ${batch_size_per_device} 0,1,2,3 1 $dtype $current_node ${i}
+    echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< "
+    let i++
+    sleep 20s
+done
+
+
+i=1
+while [ $i -le ${test_num} ]
+do
+    bash $shell_folder/runner.sh ${model} ${batch_size_per_device} 0,1,2,3,4,5,6,7 1 $dtype $current_node ${i}
+    echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< "
+    let i++
+    sleep 20s
+done
diff --git a/PaddlePaddle/PLSC/scripts/run_two_node.sh b/PaddlePaddle/PLSC/scripts/run_two_node.sh
new file mode 100644
index 00000000..6fb582a2
--- /dev/null
+++ b/PaddlePaddle/PLSC/scripts/run_two_node.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/bash
+export NODE1=10.11.0.2
+export NODE2=10.11.0.3
+export NODE3=10.11.0.4
+export NODE4=10.11.0.5
+shell_folder=$(dirname $(readlink -f "$0"))
+model=${1:-r50}
+batch_size_per_device=${2:-128}
+dtype=${3:-'fp32'}
+current_node=${4:-$NODE1}
+test_num=${5:-5}
+
+
+i=1
+while [ $i -le ${test_num} ]
+do
+    bash $shell_folder/runner.sh ${model} ${batch_size_per_device} 0,1,2,3,4,5,6,7 2 $dtype $current_node ${i}
+    echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< "
+    let i++
+    sleep 20s
+done
+
diff --git a/PaddlePaddle/PLSC/scripts/runner.sh b/PaddlePaddle/PLSC/scripts/runner.sh
new file mode 100644
index 00000000..8f328a66
--- /dev/null
+++ b/PaddlePaddle/PLSC/scripts/runner.sh
@@ -0,0 +1,41 @@
+model=${1:-"r50"}
+batch_size_per_device=${2:-128}
+gpus=${3:-0}
+node_num=${4:-1}
+dtype=${5:-"fp32"}
+current_node=${6:-$NODE1}
+test_num=${7:-1}
+# Count the comma-separated GPU ids.  The original derived this from the
+# string length ((len+1)/2), which breaks for multi-digit GPU ids.
+gpu_num_per_node=$(echo "${gpus}" | awk -F',' '{print NF}')
+
+
+if [ ${node_num} -eq 1 ] ; then
+    node_ips=${current_node}
+elif [ ${node_num} -eq 2 ] ; then
+    node_ips=${NODE1},${NODE2}
+elif [ ${node_num} -eq 4 ] ; then
+    node_ips=${NODE1},${NODE2},${NODE3},${NODE4}
+else
+    echo "Not a valid node."
+    # Abort instead of launching with an empty node_ips (original fell through).
+    exit 1
+fi
+
+
+log_dir=./logs/paddle-plsc/arcface/bz${batch_size_per_device}/${node_num}n${gpu_num_per_node}g
+mkdir -p $log_dir
+log_file=$log_dir/${model}_b${batch_size_per_device}_${dtype}_${test_num}.log
+
+
+# Keep train.py's batch size in sync with this run's setting.  The pattern
+# matches the method *call* in train.py (see the fix there).
+sed -i "s/ins.set_train_batch_size(\S*)/ins.set_train_batch_size(${batch_size_per_device})/" train.py
+
+if [ ${gpu_num_per_node} -eq 1 ] ; then
+    sed -i "s/\(LOSS_TYPE = \)\S*/LOSS_TYPE = 'arcface'/" train.py
+    python3 train.py 2>&1 | tee $log_file
+else
+    sed -i "s/\(LOSS_TYPE = \)\S*/LOSS_TYPE = 'dist_arcface'/" train.py
+    python3 -m paddle.distributed.launch \
+        --cluster_node_ips="${node_ips}" \
+        --node_ip="${current_node}" \
+        --selected_gpus=${gpus} train.py 2>&1 | tee $log_file
+fi
+
+
diff --git a/PaddlePaddle/PLSC/scripts/train.py b/PaddlePaddle/PLSC/scripts/train.py
new file mode 100644
index 00000000..31afdcf0
--- /dev/null
+++ b/PaddlePaddle/PLSC/scripts/train.py
@@ -0,0 +1,44 @@
+import numpy as np
+
+import paddle
+from plsc import Entry
+
+NUM_EPOCHES = 1
+LOSS_TYPE = 'dist_arcface'  # rewritten by runner.sh depending on GPU count
+
+NUM_SAMPLES = 5822653
+NUM_CLASSES = 85742
+
+
+def arc_train(*args):
+    """Return a multi-thread reader yielding random fake face samples
+    (3x112x112 image, random class label) -- no real dataset is needed."""
+    def reader():
+        for _ in range(NUM_SAMPLES):
+            yield np.random.normal(size=(3, 112, 112)), int(np.random.randint(NUM_CLASSES))
+
+    def mapper(sample):
+        # identity mapper: xmap_readers is used only for threaded prefetch
+        return sample
+
+    THREAD = 8
+    BUF_SIZE = 5000
+
+    return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
+
+
+def main():
+    ins = Entry()
+    ins.set_model_save_dir("./checkpoints")
+    ins.set_train_epochs(NUM_EPOCHES)
+    ins.set_loss_type(LOSS_TYPE)
+    # BUG FIX: the original wrote `ins.set_train_batch_size=128`, which
+    # *assigned* the int 128 over the bound method instead of calling it,
+    # so the requested batch size was never applied.
+    ins.set_train_batch_size(128)
+
+    ins.set_with_test(False)
+    ins.set_class_num(NUM_CLASSES)
+    ins.set_log_period(10)
+
+    ins.train_reader = arc_train()
+    ins.train()
+
+
+if __name__ == "__main__":
+    main()