From bcf466637a74dc25e7a42678aabb60ca1e0215fd Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Mon, 10 Jan 2022 14:29:12 +0800 Subject: [PATCH 01/20] Hugectr DLRM baseline --- HugeCTR/dlrm/README.md | 119 ++++++++++++++++++++++++++++++++ HugeCTR/dlrm/dlrm.py | 153 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 272 insertions(+) create mode 100644 HugeCTR/dlrm/README.md create mode 100644 HugeCTR/dlrm/dlrm.py diff --git a/HugeCTR/dlrm/README.md b/HugeCTR/dlrm/README.md new file mode 100644 index 00000000..1c7886a1 --- /dev/null +++ b/HugeCTR/dlrm/README.md @@ -0,0 +1,119 @@ +# NVIDIA HugeCTR DLRM Benchmark Test +This folder holds NVIDIA HugeCTR DLRM Benchmark Test scripts, tools and reports. + +You can refer to [HugeCTR User Guide](https://github.com/NVIDIA/HugeCTR/blob/master/docs/hugectr_user_guide.md) for additional information. + +## folder structure +## Benchmark Test Cases + +This report summarized HugeCTR test on 1 nodes with 8 x Tesla V100 on Dec 2021 + +### Test Environment +- 1 nodes with Tesla V100-SXM2-16GB x 8 +- InfiniBand 100 Gb/sec (4X EDR), Mellanox Technologies MT27700 Family +- Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz ($ cat /proc/cpuinfo | grep name | cut -f2 -d: | uniq -c*) +- Memory 384G ($ cat /proc/meminfo) +- Ubuntu 20.04.3 LTS ($ cat /etc/issue/) (GNU/Linux 5.4.0-26-generic x86_64) ($ uname -a) +- CUDA Version: 11.4 ($ nvcc -V), Driver Version: 470.57.02 ($ cat /proc/driver/nvidia/version) +- HugeCTR version: 3.2 +- `nvidia-smi topo -m` + +``` + GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_1 CPU Affinity NUMA Affinity +GPU0 X NV1 NV2 NV1 SYS SYS SYS NV2 NODE SYS 0-23,48-71 0 +GPU1 NV1 X NV1 NV2 SYS SYS NV2 SYS NODE SYS 0-23,48-71 0 +GPU2 NV2 NV1 X NV2 SYS NV1 SYS SYS PIX SYS 0-23,48-71 0 +GPU3 NV1 NV2 NV2 X NV1 SYS SYS SYS PIX SYS 0-23,48-71 0 +GPU4 SYS SYS SYS NV1 X NV2 NV2 NV1 SYS NODE 24-47,72-95 1 +GPU5 SYS SYS NV1 SYS NV2 X NV1 NV2 SYS NODE 24-47,72-95 1 +GPU6 SYS NV2 SYS SYS NV2 NV1 X NV1 SYS PIX 
24-47,72-95 1 +GPU7 NV2 SYS SYS SYS NV1 NV2 NV1 X SYS PIX 24-47,72-95 1 +mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X SYS +mlx5_1 SYS SYS SYS SYS NODE NODE PIX PIX SYS X + +Legend: + + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) + PIX = Connection traversing at most a single PCIe bridge + NV# = Connection traversing a bonded set of # NVLinks +``` + + + +### baseline + +command: python dlrm.py --gpu_num_pre_node 4 + +gpu数量需大于等于2,目前单gpu运行报错: + +Traceback (most recent call last): + File "dlrm_kaggle_fp32.py", line 146, in + model.compile() +RuntimeError: Runtime error: out of memory /var/tmp/HugeCTR/HugeCTR/include/general_buffer2.hpp:57 + +baseline运行默认参数: + +batch_size=65536 + +learning_rate=0.5 (base learning rate) + +warmup_steps=300 (warmup期间 lr = step_ * base_lr_ / warmup_steps_) + +decay_start=0 + +workspace_size_per_gpu_in_mb=11645 + +embedding_vec_size=128 + +max_iter=600 + +eval_interval=50 + +### baseline 运行log + +[HUGECTR][03:13:26][INFO][RANK0]: Iter: 50 Time(50 iters): 20.430227s Loss: 0.558395 lr:0.085000 +[HUGECTR][03:13:43][INFO][RANK0]: Evaluation, AUC: 0.675481 +[HUGECTR][03:13:43][INFO][RANK0]: Eval Time for 70 iters: 17.182972s +[HUGECTR][03:14:03][INFO][RANK0]: Iter: 100 Time(50 iters): 37.239699s Loss: 0.536418 lr:0.168333 +[HUGECTR][03:14:21][INFO][RANK0]: Evaluation, AUC: 0.695105 +[HUGECTR][03:14:21][INFO][RANK0]: Eval Time for 70 iters: 17.082333s +[HUGECTR][03:14:41][INFO][RANK0]: Iter: 150 Time(50 iters): 37.131152s Loss: 0.528158 lr:0.251667 +[HUGECTR][03:14:58][INFO][RANK0]: Evaluation, AUC: 0.708745 +[HUGECTR][03:14:58][INFO][RANK0]: Eval Time for 70 iters: 17.140160s 
+[HUGECTR][03:15:18][INFO][RANK0]: Iter: 200 Time(50 iters): 37.168557s Loss: 0.546343 lr:0.335000 +[HUGECTR][03:15:35][INFO][RANK0]: Evaluation, AUC: 0.715132 +[HUGECTR][03:15:35][INFO][RANK0]: Eval Time for 70 iters: 17.145760s +[HUGECTR][03:15:55][INFO][RANK0]: Iter: 250 Time(50 iters): 37.172421s Loss: 0.534963 lr:0.418333 +[HUGECTR][03:16:12][INFO][RANK0]: Evaluation, AUC: 0.720022 +[HUGECTR][03:16:12][INFO][RANK0]: Eval Time for 70 iters: 17.117377s +[HUGECTR][03:16:33][INFO][RANK0]: Iter: 300 Time(50 iters): 37.171914s Loss: 0.495738 lr:0.500000 +[HUGECTR][03:16:50][INFO][RANK0]: Evaluation, AUC: 0.724995 +[HUGECTR][03:16:50][INFO][RANK0]: Eval Time for 70 iters: 17.130679s +[HUGECTR][03:17:10][INFO][RANK0]: Iter: 350 Time(50 iters): 37.130778s Loss: 0.530376 lr:0.500000 +[HUGECTR][03:17:27][INFO][RANK0]: Evaluation, AUC: 0.727772 +[HUGECTR][03:17:27][INFO][RANK0]: Eval Time for 70 iters: 17.159518s +[HUGECTR][03:17:47][INFO][RANK0]: Iter: 400 Time(50 iters): 37.222825s Loss: 0.526999 lr:0.500000 +[HUGECTR][03:18:04][INFO][RANK0]: Evaluation, AUC: 0.728558 +[HUGECTR][03:18:04][INFO][RANK0]: Eval Time for 70 iters: 17.187404s +[HUGECTR][03:18:25][INFO][RANK0]: Iter: 450 Time(50 iters): 37.232422s Loss: 0.516090 lr:0.500000 +[HUGECTR][03:18:42][INFO][RANK0]: Evaluation, AUC: 0.732136 +[HUGECTR][03:18:42][INFO][RANK0]: Eval Time for 70 iters: 17.184398s +[HUGECTR][03:19:02][INFO][RANK0]: Iter: 500 Time(50 iters): 37.203517s Loss: 0.503241 lr:0.500000 +[HUGECTR][03:19:19][INFO][RANK0]: Evaluation, AUC: 0.735191 +[HUGECTR][03:19:19][INFO][RANK0]: Eval Time for 70 iters: 17.160128s +[HUGECTR][03:19:39][INFO][RANK0]: Iter: 550 Time(50 iters): 37.228689s Loss: 0.504160 lr:0.500000 +[HUGECTR][03:19:57][INFO][RANK0]: Evaluation, AUC: 0.737055 +[HUGECTR][03:19:57][INFO][RANK0]: Eval Time for 70 iters: 17.186027s + +... 
+[HUGECTR][04:17:04][INFO][RANK0]: Evaluation, AUC: 0.759263 +[HUGECTR][04:17:04][INFO][RANK0]: Eval Time for 70 iters: 17.218921s +[HUGECTR][04:17:24][INFO][RANK0]: Finish 1200 iterations with batchsize: 65536 in 879.05s. + + + +### \ No newline at end of file diff --git a/HugeCTR/dlrm/dlrm.py b/HugeCTR/dlrm/dlrm.py new file mode 100644 index 00000000..cba254d8 --- /dev/null +++ b/HugeCTR/dlrm/dlrm.py @@ -0,0 +1,153 @@ +import hugectr +from mpi4py import MPI + +def DLRM(args): + vvgpu = [[g for g in range(args.gpu_num_per_node)] for n in range(args.num_nodes)] + solver = hugectr.CreateSolver(max_eval_batches = args.eval_batchs, + batchsize_eval = args.batch_size, + batchsize = args.batch_size, + lr = args.learning_rate, + warmup_steps = args.warmup_steps, + decay_start = args.decay_start, + decay_steps = args.decay_steps, + decay_power = args.decay_power, + end_lr = args.end_lr, + vvgpu = vvgpu, + repeat_dataset = True) + reader = hugectr.DataReaderParams(data_reader_type = hugectr.DataReaderType_t.Raw, + source = [f"{args.data_dir}/train_data.bin"], + eval_source = f"{args.data_dir}/test_data.bin", + num_samples = 36672493, + eval_num_samples = 4584062, + check_type = hugectr.Check_t.Non) + optimizer = hugectr.CreateOptimizer(optimizer_type = hugectr.Optimizer_t.SGD, + update_type = hugectr.Update_t.Local, + atomic_update = True) + model = hugectr.Model(solver, reader, optimizer) + model.add(hugectr.Input(label_dim = 1, label_name = "label", + dense_dim = 13, dense_name = "dense", + data_reader_sparse_param_array = + [hugectr.DataReaderSparseParam("data1", 2, False, 26)])) + model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.LocalizedSlotSparseEmbeddingOneHot, + slot_size_array = [1460,583,10131227,2202608,305,24,12517,633,3,93145,5683,8351593,3194,27,14992,5461306,10,5652,2173,4,7046547,18,15,286181,105,142572], + workspace_size_per_gpu_in_mb = args.workspace_size_per_gpu_in_mb, + embedding_vec_size = args.embedding_vec_size, + combiner = 
"sum", + sparse_embedding_name = "sparse_embedding1", + bottom_name = "data1", + optimizer = optimizer)) + model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct, + bottom_names = ["dense"], + top_names = ["fc1"], + num_output=512)) + model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU, + bottom_names = ["fc1"], + top_names = ["relu1"])) + model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct, + bottom_names = ["relu1"], + top_names = ["fc2"], + num_output=256)) + model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU, + bottom_names = ["fc2"], + top_names = ["relu2"])) + model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct, + bottom_names = ["relu2"], + top_names = ["fc3"], + num_output=128)) + model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU, + bottom_names = ["fc3"], + top_names = ["relu3"])) + model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Interaction, + bottom_names = ["relu3","sparse_embedding1"], + top_names = ["interaction1"])) + model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct, + bottom_names = ["interaction1"], + top_names = ["fc4"], + num_output=1024)) + model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU, + bottom_names = ["fc4"], + top_names = ["relu4"])) + model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct, + bottom_names = ["relu4"], + top_names = ["fc5"], + num_output=1024)) + model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU, + bottom_names = ["fc5"], + top_names = ["relu5"])) + model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct, + bottom_names = ["relu5"], + top_names = ["fc6"], + num_output=512)) + model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU, + bottom_names = ["fc6"], + top_names = ["relu6"])) + model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct, + bottom_names = ["relu6"], + top_names = ["fc7"], + num_output=256)) + 
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU, + bottom_names = ["fc7"], + top_names = ["relu7"])) + model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct, + bottom_names = ["relu7"], + top_names = ["fc8"], + num_output=1)) + model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.BinaryCrossEntropyLoss, + bottom_names = ["fc8", "label"], + top_names = ["loss"])) + return model + +def get_args(print_args=True): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--gpu_num_per_node', type=int, default=1) + parser.add_argument('--num_nodes', type=int, default=1, + help='node/machine number for training') + parser.add_argument('--eval_batchs', type=int, default=70) + parser.add_argument('--batch_size', type=int, default=65536) + parser.add_argument('--learning_rate', type=float, default=0.5) + parser.add_argument('--warmup_steps', type=int, default=300) + parser.add_argument('--decay_start', type=int, default=0) + parser.add_argument('--decay_steps', type=int, default=1) + parser.add_argument('--decay_power', type=int, default=2) + parser.add_argument('--end_lr', type=int, default=0) + parser.add_argument('--data_dir', type=str, default='/dataset/f9f659c5/hugectr_dlrm') + parser.add_argument('--workspace_size_per_gpu_in_mb', type=int, default=11645) + parser.add_argument('--embedding_vec_size', type=int, default=128) + parser.add_argument('--max_iter', type=int, default=600) + parser.add_argument('--loss_print_every_n_iter', type=int, default=50) + parser.add_argument('--eval_interval', type=int, default=1000) + + + FLAGS = parser.parse_args() + + def _print_args(args): + from datetime import datetime + print("=".ljust(66, "=")) + print( + "Running {}: gpu_num_per_node = {}, num_nodes = {}.".format( + "HugeCTR-WDL", args.gpu_num_per_node, args.num_nodes + ) + ) + print("=".ljust(66, "=")) + for arg in vars(args): + print("{} = {}".format(arg, getattr(args, arg))) + print("-".ljust(66, "-")) + print("Time 
stamp: {}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S")))) + + if print_args: + _print_args(FLAGS) + return FLAGS + + +if __name__ == "__main__": + args = get_args() + model=DLRM(args) + model.compile() + model.summary() + model.fit( + max_iter = args.max_iter, + display = args.loss_print_every_n_iter, + eval_interval = args.eval_interval, + snapshot = 10000000, + snapshot_prefix = "dlrm") From 0149daa5c3a3350170b742e47639691e5167e2b6 Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Mon, 10 Jan 2022 14:47:43 +0800 Subject: [PATCH 02/20] Update README.md --- HugeCTR/dlrm/README.md | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/HugeCTR/dlrm/README.md b/HugeCTR/dlrm/README.md index 1c7886a1..687621b0 100644 --- a/HugeCTR/dlrm/README.md +++ b/HugeCTR/dlrm/README.md @@ -46,14 +46,7 @@ Legend: ### baseline -command: python dlrm.py --gpu_num_pre_node 4 - -gpu数量需大于等于2,目前单gpu运行报错: - -Traceback (most recent call last): - File "dlrm_kaggle_fp32.py", line 146, in - model.compile() -RuntimeError: Runtime error: out of memory /var/tmp/HugeCTR/HugeCTR/include/general_buffer2.hpp:57 +command: python dlrm.py --gpu_num_per_node 4 baseline运行默认参数: From 3fd1b36ed66442ee3d50b89e2b104a95207f6123 Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Thu, 13 Jan 2022 21:38:47 +0800 Subject: [PATCH 03/20] update baseline info --- HugeCTR/dlrm/README.md | 76 ++++++----------------------- HugeCTR/dlrm/baseline_log_info.csv | Bin 0 -> 13785 bytes HugeCTR/dlrm/dlrm.sh | 14 ++++++ 3 files changed, 30 insertions(+), 60 deletions(-) create mode 100644 HugeCTR/dlrm/baseline_log_info.csv create mode 100644 HugeCTR/dlrm/dlrm.sh diff --git a/HugeCTR/dlrm/README.md b/HugeCTR/dlrm/README.md index 687621b0..b9a322f7 100644 --- a/HugeCTR/dlrm/README.md +++ b/HugeCTR/dlrm/README.md @@ -46,67 +46,23 @@ Legend: ### baseline -command: python dlrm.py --gpu_num_per_node 4 - -baseline运行默认参数: - -batch_size=65536 - -learning_rate=0.5 (base 
learning rate) - -warmup_steps=300 (warmup期间 lr = step_ * base_lr_ / warmup_steps_) - -decay_start=0 - -workspace_size_per_gpu_in_mb=11645 - -embedding_vec_size=128 - -max_iter=600 - -eval_interval=50 +command: bash dlrm.sh + +| gpu_num_per_node | 8 | +| ---------------------------- | ------------------------------ | +| num_nodes | 1 | +| eval_batchs | 70 | +| batch_size | 65536 | +| learning_rate | 0.5 | +| warmup_steps | 1000 | +| data_dir | /dataset/f9f659c5/hugectr_dlrm | +| workspace_size_per_gpu_in_mb | 11645 | +| embedding_vec_size | 128 | +| max_iter | 12000 | +| loss_print_every_n_iter | 100 | +| eval_interval | 100 | ### baseline 运行log -[HUGECTR][03:13:26][INFO][RANK0]: Iter: 50 Time(50 iters): 20.430227s Loss: 0.558395 lr:0.085000 -[HUGECTR][03:13:43][INFO][RANK0]: Evaluation, AUC: 0.675481 -[HUGECTR][03:13:43][INFO][RANK0]: Eval Time for 70 iters: 17.182972s -[HUGECTR][03:14:03][INFO][RANK0]: Iter: 100 Time(50 iters): 37.239699s Loss: 0.536418 lr:0.168333 -[HUGECTR][03:14:21][INFO][RANK0]: Evaluation, AUC: 0.695105 -[HUGECTR][03:14:21][INFO][RANK0]: Eval Time for 70 iters: 17.082333s -[HUGECTR][03:14:41][INFO][RANK0]: Iter: 150 Time(50 iters): 37.131152s Loss: 0.528158 lr:0.251667 -[HUGECTR][03:14:58][INFO][RANK0]: Evaluation, AUC: 0.708745 -[HUGECTR][03:14:58][INFO][RANK0]: Eval Time for 70 iters: 17.140160s -[HUGECTR][03:15:18][INFO][RANK0]: Iter: 200 Time(50 iters): 37.168557s Loss: 0.546343 lr:0.335000 -[HUGECTR][03:15:35][INFO][RANK0]: Evaluation, AUC: 0.715132 -[HUGECTR][03:15:35][INFO][RANK0]: Eval Time for 70 iters: 17.145760s -[HUGECTR][03:15:55][INFO][RANK0]: Iter: 250 Time(50 iters): 37.172421s Loss: 0.534963 lr:0.418333 -[HUGECTR][03:16:12][INFO][RANK0]: Evaluation, AUC: 0.720022 -[HUGECTR][03:16:12][INFO][RANK0]: Eval Time for 70 iters: 17.117377s -[HUGECTR][03:16:33][INFO][RANK0]: Iter: 300 Time(50 iters): 37.171914s Loss: 0.495738 lr:0.500000 -[HUGECTR][03:16:50][INFO][RANK0]: Evaluation, AUC: 0.724995 -[HUGECTR][03:16:50][INFO][RANK0]: 
Eval Time for 70 iters: 17.130679s -[HUGECTR][03:17:10][INFO][RANK0]: Iter: 350 Time(50 iters): 37.130778s Loss: 0.530376 lr:0.500000 -[HUGECTR][03:17:27][INFO][RANK0]: Evaluation, AUC: 0.727772 -[HUGECTR][03:17:27][INFO][RANK0]: Eval Time for 70 iters: 17.159518s -[HUGECTR][03:17:47][INFO][RANK0]: Iter: 400 Time(50 iters): 37.222825s Loss: 0.526999 lr:0.500000 -[HUGECTR][03:18:04][INFO][RANK0]: Evaluation, AUC: 0.728558 -[HUGECTR][03:18:04][INFO][RANK0]: Eval Time for 70 iters: 17.187404s -[HUGECTR][03:18:25][INFO][RANK0]: Iter: 450 Time(50 iters): 37.232422s Loss: 0.516090 lr:0.500000 -[HUGECTR][03:18:42][INFO][RANK0]: Evaluation, AUC: 0.732136 -[HUGECTR][03:18:42][INFO][RANK0]: Eval Time for 70 iters: 17.184398s -[HUGECTR][03:19:02][INFO][RANK0]: Iter: 500 Time(50 iters): 37.203517s Loss: 0.503241 lr:0.500000 -[HUGECTR][03:19:19][INFO][RANK0]: Evaluation, AUC: 0.735191 -[HUGECTR][03:19:19][INFO][RANK0]: Eval Time for 70 iters: 17.160128s -[HUGECTR][03:19:39][INFO][RANK0]: Iter: 550 Time(50 iters): 37.228689s Loss: 0.504160 lr:0.500000 -[HUGECTR][03:19:57][INFO][RANK0]: Evaluation, AUC: 0.737055 -[HUGECTR][03:19:57][INFO][RANK0]: Eval Time for 70 iters: 17.186027s - -... -[HUGECTR][04:17:04][INFO][RANK0]: Evaluation, AUC: 0.759263 -[HUGECTR][04:17:04][INFO][RANK0]: Eval Time for 70 iters: 17.218921s -[HUGECTR][04:17:24][INFO][RANK0]: Finish 1200 iterations with batchsize: 65536 in 879.05s. 
- - +见baseline_log_info.csv -### \ No newline at end of file diff --git a/HugeCTR/dlrm/baseline_log_info.csv b/HugeCTR/dlrm/baseline_log_info.csv new file mode 100644 index 0000000000000000000000000000000000000000..b15e98ce55140963db22b3d262a0ae7397f50140 GIT binary patch literal 13785 zcmeHu^+S|f*Y-#w9a4fwcMRR#jUwIM-JMcHr_v(bjkMCz-Cfck-QVEx9DRJw_ZPg+ z{4n?2_w04;YpuQ3-fQ+=qaY0lg$aNGzySaNB7h^CdanBu03Z+=0KfpiL1+luSUVb7 zJL)L8*%~=$(Ysn%zWe|ML754F0Js0&>;JF?ij@ZBAb_YXDVNA8+DIBlx*)w3Uw0++ zqbKIv~gs)9n#J;~gSR4|vF7$=8lj)7kL zJYf)$ZrPeg3ZY1uILOvE*JH9!v>dYrWab|(Ny&Eb@Pdp&mah!4US)^)jA@GHV4EX< zZ<+Io=5mFQCol(gWIiA=*ElLQ6}x#9t&h1Yf@&|+3#IaL(k8-G^EKU0f zelH$n7xhSr`E;Wu_tmq}RpvlZH~jVa@pa6Eh{}8M3$%MrTqA9F6|B+gzBUR+)G3^v z+h$Fnx153@)pov9avm_eoti~Z#zLCSCq)RC5ne*U%5qWW^f5FPdQl#r3=Q{GA(3=E z_(ANWu@6>`6ile3mE|_0!ksN9>oE=$2Bs$OpE&qxY~pfV!TsWRpH$Xr;hMRl*B^6!&TNzWqA4z!S04Lv+5%QXGzs$xTx0Qt~$G+Rh1< zn%q87%&vH)1I>BjY~uW#xRe{Eb8{4RQGI@vbng<0*x0d11?mvJ3LZR04*pv#-gkcL zU9xJ6`j@3oCWKz^m%J^nW6#`)8%pt+O3d9x;t%1J*&BWLtj9s$WTw=!$CBvm3h&Kp zQ%#;y=$A}G;@xDKXFlrHEXRFmZXN9K?L|FD zz5d~Z`HSGy>%W7_^243b+z?N~fWN@*ux6rq-v3Ok5ir-k-q;O&0d*|H2 zcDOtxM}?zKlk%ODT6i<=w4WrA07QA;bLIo|jvm?U!R=6%#e`p$Xwy7_I!rS?NKYxzNyV-vz6fw+ov(dZQ8y|at$B*@Ze1ysXPh^= z5{Z-6SEb1!J5pJc~rg8mRFd%8ASRRwIWBc`+N$Tc9i=rjRjrx1njVu z^8`~Y-6l$UZx5W`sWfS~#8UBBfwvk(9wplad3w3e6xq}VFNMf$PuqU1yH1gAL#l{R<{W5Tu;mGBa>>$z zb71(XugM?T33k*||2nWP8(K|J`~)(>X7T{2^dQVU+d>mOLEs|aD~8YVA$RuNkBxIK zu))vLc4x!2VLJkaC5bG=(wMW{NVSpJQ47Lg{n{MOTo8zGMk03HMj6ztFn~sK@+{|` z4cgqRjMWR;Q<8|wQUY!OCpW6VBu2e)p! 
zb11?*jaMS7zL_w=zK0f-F}vcEP?TI$Qb@1^v?O79cKzW^7E$x%Aa(aPQ<@_a?$k@u-TgyVK%0lpJyp|N{NSPg?`b@TWeraP z0RZ?Q0s!EETl}2H9ZZdk932>bT!4@Bd1_o=!eM7N`XC_KcT>E>4 zats#!Q3jaw^vjw`XA2%L?U-J%ji|EeEUC=Y!p3=9u*|IyW_M@*jcCDT>kyfp}FKf@J;#^ym zX^4CvmjN0e*D}3WhJMpxCE2;~?v1wY82gulq4IjIQar=gW35-)?{u{~h+8q&ruiE* zS`^IW-e9 zv`1UV;`4jOddJj_{}88H9tyL!chtb!z?&0)zbOGpN2qjZ?V=-C+UQ$bRov9BYEY>( zu!7dqMk{()4ARtlCR9$UzG~I3HR74ElV&Ltd~G%w{u#E&C*xY~YqqiTs4lNLYGnEp zUgF;UkICc5=*OoF*w2Zf2I`1*8X#j~J=pdZRvPezNF`E#;-Ko?AhpTR9*oB6*L`_0 zAep@qC4EF9vp3&-cYJ%hw!0S9ebYGAdjHAo&XGDVrLL8*wRJ_FFnDaBHOI&E`f|Uu zY{*7_XvxQui~7&T?LO9aKi2p-nVpxK2}z)AT0_B|DMD!ydpCJOP#tVRin+f6@H!NY+rcB zYME$Gn%TuXj+$v3NuzYQI6KSfE0t(YoS(})fs+q@HgP@G^J+p)F2R%D1j3w8SoX=& zt$Lvd>jLvF0JU0SP^S8zl0)Vf&Qhhg?83BEbFW5rl{aQMxNGf5ceGE6(l@@cdL&Fv zK2fdIG|9UCJ|)5#_eTD@49>#-0!cB)DKIexxX3_#Z|Y4dR=s#P%Ps9QbQ+|{i=4a9N>fsp zFEr{NoKOa9#9Z_v-4cj_?3L&Ca9u5;ry3u8VsPMWboaqvmf_R2A`8SoRPFj%X(IKm z5{}DTv
    JT;r1CaszzVMdp^5>w2Jz!f$p2jM;5LU0caudxhjxi7v>CR^2OB!U^> zR82Ilxq5&)=55>!%o&1pK{H`jv!*g&4ohc0?}2ka?w`MYdM0%ey z3Jqa6eEV;u@?jyX&3*K$TB?G@lAs*}qW!;pG7|Cj6aOIPm6gUXRAt z2F-~7G`=(uOjk0;rKg^l!Q@ymcJ5CW0XP^_Ils}|eA$Kn!hA+kl(@ z0oVZiR+ZKw$1{M&)0iO9h=tj7cQ zU!14LPV~1#DDc?Y^*1Hu^!ussFnXT?-TrbH&QM_9?~sxpPSO8r{!dIepr6S8QowvG z7RO*kxkmJ z4x!%}4(j)F{$ThR4&W%}3H;x5e}ei;7K55sXB zLjUQ5u}iZfS^AeB0>R>s&HsrDY~UvP-zEX8_bCLJ^t*bZ{dK=PCB1&Xk;zRX>n1u| zn>0gEn|S){UTNdPMQx*o8NFS5BMXWEj#Z-Z!u&y}8uLkQ;N8AD?@PUTwfB9PV}sVt zhBT5rg!BuLm|dAUBXs++j-~^&QqvQqk5965Mk4kH1_zxDu{s-1M|lPGEYV%w{dpFhc(Q5b zM%W^unR6zwYIgBz0#fSFwX?heS(YBSXm2-Lf8rwykXhQ@P=MDz2tzbb?<@=2!Von@Cp2*@_ zbw57w2oSqeVxmk^%&s(;+b>syua)hNu%6<4c=)kN@gvD$i4=VLMF3H>jAf2^|GPtVHT8$jUY;BRBC!YhMoV1 zQc}<~3;~tw!P@{g)iUSh_^x_GgR2@1TNAdm8zD5|D6TP>%1|r62kYQ-R0=@_?rX!V z7x*-KmAYz&f~vBRYUc!$t>d{2TdzCA&$vWIidz3Q`*z}cXBG$Eg9PC{9uEA-z8y`C ztc)0by#GkN_tZ4P=fp6*8BY1(nz>sfE1Nr>Th7SM?vWGieq6V+j4J7kWD#RtjDYux za{COg~T#EoOGWY6v%dm#>*UA^6rcyr7yRd%tH0!h|8d-!{7)zwpNoP zL+&7&9H*Y-T#0smJ+fSvrBJNeTr(?v{QJ^&XesiLN~pR8vdx2>g(&`p5NDA06jWR=tw6dd>V!;ulGu7Ve|=jIFD^2J zJ;WSo7nhAD`4u)aO*W93`%5i@b|F2A;5VVm(l18GIfbhCh4?X?c{(lvcNIFIc}yFh zooyw$FeHx1S&MXHX$)9r-rG0M>)NZn_R6gqX?Q(!(wtd!UItLIT)mF)9!8nG#{@)l zGme7755^8_8+iG<%`^8#R~x6kO%%aspXe+t&mV6Lwb3YSF3q)}zz~UyfIq*zpaYtX z3q3x5i^6skA(gkqir54p8c!Ph{972|c>9FuR5Vq#O7T(RArfBAyqllDxvEOip8i+# zijqlv$D|xcfW%8tf6_?R!DGybtT17bUP`jT7w<$BTpi*Wkp~jgjT8#neXdT%6T;wT zKTA|4Hu=IYnZ|nJ^U3vURxikZE-DgC=l5{_eze`&t|R|~m&@(aVX@BMXj)6KSEkP6 zs$Ip$>*CYN7Q<4j=dIJ^nHkU2F@yK@YNt(W>sMY-JzVFBO?s2-_4-wv&i(P4LHLDS zm>8F*gnoaT#pU#;UVWU*ApRFQHpV~@Dv(wIs)erfhM38BDluf%R?G73k@Hh(14wP{ zdDi+DBHKc!(nBiQ(|`kJu}0b&nB~Gf>F^`lnEt)AJOlz4j4Tq3B_b&h)CIOfw2Hn1 zuGn;@TfhgAOkBBg+jVB=kP0J57dYT*gM1g&nk98AcgHrdK2mQq2qZ)eDvieZURjM6 zsPw#Irgh&DwQ|m+7$<`J{?a8EDSxt9fD2cFzlLL=w!Ql2l3b?P>W-qIgPs zYev0*L*^5tn2TuGcuUy}EB^?`3_7Sf0Zk_Ir}unCR9SiMMy41JsZXM*+oaCJAW z*audJ->>?dvW8zScB*TY%&(OoTu^153eZw5u9@!->S@16%v7|Z9v`%Cj!6->eP@G# zI8Q73wM+8dB`D&gIU+ZRhX;;12de4g{3g=Ud 
zS3k#4&eGgB5S-%?3&Td}qtrT=IoB}PsA!`_*V+gP?e>!@)Gj)Co;z#a!w1wVg{+45zjh`q|5_?D_{rE^gRwySNqLy+ z8-bYZ#PtRzS9?dbf@ZN?By{yp??|heK*La569t5+sV1cwqEbliOqfM=S(ziIEm4}% zXq-7nEF%g262(3~O$bdrS zFRJ4lnc54}M0_FAA{Izv5))hN(Yor=RK6LUBQ!} zM240)tjPO(#0UVZ==jdZU2*?;hR*7Q+FTGYyidkUn~G+2=R~@u_ULt=YP2{Aqx70d`=-VkY+Wz~r*uE>)s%d{ zO2B%_;&DX`bnf1s2zwO{5PFYUR;(rOwS-AS9m1(-K`r>2iY_FmNc-Bjgo-M{gS4Iy z`g0Fb_p zPitb>I?+&_sFD{Y&IC8*pr0KSS9W-cXnYYw*XNgh+L<7RB?bG=f;1LxR(;;B>v(eWB`5DLXhlHTuqp*-L{dLFD+sn8JkWHxWXsQ#G|d|pCO zIjctL{8S<9Eq#2=u%X&3rb?w)+oa07@B_dKX8Z{ayWwFG$zFqyJ&C;@!&l7q`Ub

    7loxPEH_{5Z+nwkB18+QxH`5P98TmbJ(WC$PjdFAPw>dAsoiO z>uCSBH*F2@Hn9$e&`gS>VV3pHEh@Q}w%)?hwP=hOX%iZ?Z)*J{j#oUlsh>rrqNvYF zLs}u~RFBYhW(=B?k)NFKeDC^fxrYSHa!!71dC=KjHRG8s?@!5bGe5^XG|e=i-jANV zRIpMW?uczEe!ufl+U{_4)W#jt^x~8J2gM^UWJm@=s4RB;m?WTNqgM({4 z2X;{z^$`&Kh;z-GJ>9SSpf+REzjmVwJ}2P20u{ zKFyM)ySrT)t+LHA=Z1wv!rX<4>l173)=&3Wd3|ntCA!=LD!Ywi+s9|d%uDn8FKSOZ zSM1nV_zzLKgZJU1x=~WDj_c5VL2)B(X%zSl*KJii)&6!Hzu40$DNZF4n)aE_W=c8- zHK~+o)V1)c{%F z(&kQj46N0=jP`tv)~;^@TUutO3~DbVAWDNLziT3dAoSOpTwo9r4(Ww_vx@yRMAnqMB;>#p{B&y4@NOEee6&h33b3_4Femr2;|UQx?|49h9$AS-b-?o0 zw{+=*WoP5=z>Y@usnYuyYTv{u9mU|N3e=b=1hp!}DL9XdrjO(UQ)tYRn@nTtLQBA2&$R5lo4L}3>JeRQYN zyMus1;R#cSU{P+|k!VX8T zFai2n@2Xwh5sy(*VBvASPzq6$LkDz!>IT8>v+opH_iqs4`)y?biEHqX+Fm?0=3y33 zqHY>k{YC7(@U$9@@17I}^D<;{@pxk*rH?jEt=jQm!Q)=4d*lwGp5=!lxw9?@q1BPn ze;kF>=7QFS@N!W1dE0UAdX9cg7x>BWPOuP~&N^*+7yGEl(ChVZOF)Lu-dSLlEprnf zB8Fd-?i)lxb(AEHppW{LtG3_tkgam(E}#dynm!o2CfAHN9`Bf`9+rDp3@_H-U>o+`(b&c93}?gKI&~NdO|~8(+Ha9wYxZi5r&d)$m)vs88;wJjPcR z1SE<*8EiuV$;zXgfH_}PpGl#6%tl{O888}pA&irwe@wunkUwjXSX;ox8(GDSw;eYs z^LemfCiI;@w4||kX4UY6FL%`LqPX;Nf!<=f8`dDki_|Vr2p`@n++6~m*z;%Su9|}< zXHojS`v4c6ymAD$&%>5-!r(#Tj0f7Q+7Ysh zqS%p?d}DX#f|7&bn^k^iEl0Q=)pW`cxB zH|atJa$OC;Q?zNY$`vG?;h3asM<#kv%;P@NXHbSAThhz@hjxi>WQR1+H`%E2$PKBV zm)x2Sd@%RQIqioKXkT%YUPG7mt|gR5OS(Y_0XNG!y_D(b#-O46u5hE_Fm;&;e0fqO$QQvwJ~!(XR?oL}rd3HWg0VE1w%*fEp~ z%AjrtGB*d`1P0>xvSWJX*dXBbg4D2;pt%1gqz|&CW>rJyJ;OKVC#Caucrc55-hmNL zW$e8G5OfDnByD--IC0=oB=+uqIB9lIJG;zvHY zJ+15J_9SweUAlGCZ048ATe3N39d&oQz6~i0&267+Ki$wMIV1OcIO%JwW@Z1ceLs=go+?V z(mR}X?~a*@Nce(izRelQP&dZjtMKQ;#f9cp-*sTC8RTjXY1h3}0=Rfzson2IdJ*zi zXSWVYd>;ywOID+6dO3_89;` z@s|Z`!FhzEk^O5UN5{v6g5i&QAichs^f?BP^3%=NFLhhN(ppUm}cOXtawhsQ~lu;0F+ zI*WuuT$4uW8145xWMvQ~Re=LHtD~ zDxe7%3km*-*Pp)X;F)gND4xgj!!Gh0W^Av)X^*PC-(!Dq0}8+@c@g>PMb^N&53^M% zw83Vg>Kb>$o+iCZ{)rMrIBpyfRG=>^Duj>#9NZJ4m%ncO?BI9+|2F=jzI(*=;BQ30 zK7b0of{)@2^ehd8P4&#IAAd_}4oN@#!iWm9+`LTiWzTB%Ix$x(BwYXnb+iJ0T`~5h zyGOQZ&H`e8(h;-I1P_SZ2w!aj~B!&WD-{?zHycbMU*|H4mTS!r6X9P&i;)^h$I1#x<={EV2PZU)AU19Rb 
zP!p^HAC}_oUC%b2Oun9$(c=ypU&Y`x+{;t=xVgq4+F(e#LHKRB)f=qGf5*8WRiK>& z_!~9wPewFw(Tbssfr7n_tpkIBjlI#Mn&1-4|JAC1hu=>G*KUm`g(Tb@k);^tf4*mYG9x?zCk&*?A>|z73uI5?AuPT-z@kqL+jGJw8E`4NjLFzJNm# zR51K%Y3=(Pg;`$f`padHHYy^T;?EAfyxqKv*5t%oL^;XP19>GdzL+@F!@5oFELoS` zyPwT3OxiR1Q`N6Z;hED`2WMV^R@eEsSaIiSWoJ84N!If&t!^%PKzKA`hf)A)CaUD} zHZQAi#G&x=`Y`TRB<@A3XcsLaH`thE>h>h{WyA_`oE}OjJ%hcf?+x`nlgKyPuw|?f<+FY_gxP)Y#9~bB_hC(AUUs4>)YN&N;Td$gYmam79BQL+Z0SCH`vFb8fmgC>i7eog!zS=r`8F zV*R@OY-eX3qYs^0g?g6&(-Jl`t#6H_B7{#s1d@sbA?c|y$y?kl$*JLd7NVJ$S69=N zA6cyH%c!$gMt5&u@V}4o@UNAN+LFan=-KqZbgP9QveGplT#gQyW5cP2g`*aaJT;TL zn)voDu8i@fX+js8Fx>xJV&-A?=d^xk!aV(ggdk4bKB*Zfe}3!^)_iL-1S9>}51ncV zcwYUnS8^8IMl&T+!z-A@Z$>DC11D;bBnjuD8Nd4Yc=O)g9Y1W@e3m3M_u{Z7=-$ju zJx9XCXLE*ugoO_dqzS+ptuxOeZh68(0C|B3dEi@B{KVGPp?^wVhN1Z={f3;fjzo{p z3wS_Z(*9M4Ixx@b1y*OJGMW@uoA=y}e|@f~*xb8*iw!mfzOyE*hPfp~LQT0L@~hy~ zL9Rmh``d(#$E4`5>GTN%9eBUt-~V9p_xSyN{g=O+C`kV&z<-vz|1J15)5dR_ZU*&9nqWpOf z_X~vpTonga<XDa#&ked1j;2(+Tzb2-Cg8rG`{02(<1N1+#oj;}jGjI4M3!cX50D%9> mBK{Qr&w2lM@hFDh#DA^=3ewPE4}RRwMgkZ<1_9IK-TwntIi@fG literal 0 HcmV?d00001 diff --git a/HugeCTR/dlrm/dlrm.sh b/HugeCTR/dlrm/dlrm.sh new file mode 100644 index 00000000..13bbf586 --- /dev/null +++ b/HugeCTR/dlrm/dlrm.sh @@ -0,0 +1,14 @@ +max_iter=12000 +warmup_steps=1000 +lr=0.5 +test_case=dlrm_baseline_${max_iter}_${warmup_steps}_${lr} + +python dlrm_kaggle_fp32.py \ + --gpu_num_per_node 8 \ + --eval_batchs 70 \ + --max_iter ${max_iter} \ + --batch_size 65536 \ + --learning_rate ${lr} \ + --warmup_steps ${warmup_steps} \ + --loss_print_every_n_iter 100 \ + --eval_interval 100 | tee log/${test_case}.log \ No newline at end of file From cf50a5884838c4e0ec8a8ae19f5d03e78d1d8837 Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Mon, 17 Jan 2022 10:28:16 +0800 Subject: [PATCH 04/20] update baseline parameters table and rename dlrm.sh --- HugeCTR/dlrm/README.md | 8 +++++++- HugeCTR/dlrm/{dlrm.sh => dlrm_baseline_auc_lossses.sh} | 0 2 files changed, 7 insertions(+), 1 
deletion(-) rename HugeCTR/dlrm/{dlrm.sh => dlrm_baseline_auc_lossses.sh} (100%) diff --git a/HugeCTR/dlrm/README.md b/HugeCTR/dlrm/README.md index b9a322f7..2a699497 100644 --- a/HugeCTR/dlrm/README.md +++ b/HugeCTR/dlrm/README.md @@ -48,8 +48,9 @@ Legend: command: bash dlrm.sh -| gpu_num_per_node | 8 | +| parameter | value | | ---------------------------- | ------------------------------ | +| gpu_num_per_node | 8 | | num_nodes | 1 | | eval_batchs | 70 | | batch_size | 65536 | @@ -61,6 +62,11 @@ command: bash dlrm.sh | max_iter | 12000 | | loss_print_every_n_iter | 100 | | eval_interval | 100 | +| eval_batch_size | 65536 | +| decay_start | 0 | +| decay_steps | 1 | +| decay_power | 2 | +| end_lr', | 0 | ### baseline 运行log diff --git a/HugeCTR/dlrm/dlrm.sh b/HugeCTR/dlrm/dlrm_baseline_auc_lossses.sh similarity index 100% rename from HugeCTR/dlrm/dlrm.sh rename to HugeCTR/dlrm/dlrm_baseline_auc_lossses.sh From 0a7aa0f144dd5c5753c541c5df1a5f781a0bd27a Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Mon, 17 Jan 2022 10:32:19 +0800 Subject: [PATCH 05/20] Update dlrm_baseline_auc_lossses.sh --- HugeCTR/dlrm/dlrm_baseline_auc_lossses.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HugeCTR/dlrm/dlrm_baseline_auc_lossses.sh b/HugeCTR/dlrm/dlrm_baseline_auc_lossses.sh index 13bbf586..08c1dee7 100644 --- a/HugeCTR/dlrm/dlrm_baseline_auc_lossses.sh +++ b/HugeCTR/dlrm/dlrm_baseline_auc_lossses.sh @@ -11,4 +11,4 @@ python dlrm_kaggle_fp32.py \ --learning_rate ${lr} \ --warmup_steps ${warmup_steps} \ --loss_print_every_n_iter 100 \ - --eval_interval 100 | tee log/${test_case}.log \ No newline at end of file + --eval_interval 100 | tee log/${test_case}.log From ede0ef9521b27869e129dc7e8cd578f3cb39f75b Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Tue, 18 Jan 2022 12:26:03 +0800 Subject: [PATCH 06/20] upload dlrm hugectr testing scripts --- HugeCTR/dlrm/dlrm_bsz_test.sh | 19 +++++++++++++++++++ 
HugeCTR/dlrm/dlrm_embedding_size_test.sh | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 HugeCTR/dlrm/dlrm_bsz_test.sh create mode 100644 HugeCTR/dlrm/dlrm_embedding_size_test.sh diff --git a/HugeCTR/dlrm/dlrm_bsz_test.sh b/HugeCTR/dlrm/dlrm_bsz_test.sh new file mode 100644 index 00000000..bb2b3953 --- /dev/null +++ b/HugeCTR/dlrm/dlrm_bsz_test.sh @@ -0,0 +1,19 @@ +max_iter=12000 +warmup_steps=1000 +lr=0.5 +for bsz in 16 64 256 1024 4096 16384 65536 +do + test_case=dlrm_test_bsz${bsz} + + python dlrm.py \ + --gpu_num_per_node 8 \ + --data_dir /dataset/criteo_kaggle/hugectr_dlrm \ + --eval_batchs 70 \ + --batch_size ${bsz} \ + --learning_rate ${lr} \ + --warmup_steps ${warmup_steps} \ + --max_iter ${max_iter} \ + --loss_print_every_n_iter 100 \ + --embedding_vec_size 128 \ + --eval_interval 100 | tee log/${test_case}.log +done diff --git a/HugeCTR/dlrm/dlrm_embedding_size_test.sh b/HugeCTR/dlrm/dlrm_embedding_size_test.sh new file mode 100644 index 00000000..2354cf28 --- /dev/null +++ b/HugeCTR/dlrm/dlrm_embedding_size_test.sh @@ -0,0 +1,19 @@ +max_iter=12000 +warmup_steps=1000 +lr=0.5 +for embedding_vec_size in 2 8 32 128 512 +do + test_case=dlrm_test_embsz${embedding_vec_size} + + python dlrm.py \ + --gpu_num_per_node 8 \ + --data_dir /dataset/criteo_kaggle/hugectr_dlrm \ + --eval_batchs 70 \ + --batch_size 65536 \ + --learning_rate ${lr} \ + --warmup_steps ${warmup_steps} \ + --max_iter ${max_iter} \ + --loss_print_every_n_iter 100 \ + --embedding_vec_size ${embedding_vec_size} \ + --eval_interval 100 | tee log/${test_case}.log +done From 1513022462669eb39280a01f8131d1d9ff231ffe Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Tue, 18 Jan 2022 14:45:42 +0800 Subject: [PATCH 07/20] update testing scripts --- HugeCTR/dlrm/dlrm.py | 4 +-- HugeCTR/dlrm/dlrm_bsz_test.sh | 31 ++++++++++++++---------- HugeCTR/dlrm/dlrm_embedding_size_test.sh | 7 ++++-- HugeCTR/dlrm/gpu_memory_usage.py.txt | 25 +++++++++++++++++++ 4 files 
changed, 50 insertions(+), 17 deletions(-) create mode 100644 HugeCTR/dlrm/gpu_memory_usage.py.txt diff --git a/HugeCTR/dlrm/dlrm.py b/HugeCTR/dlrm/dlrm.py index cba254d8..cd68dce5 100644 --- a/HugeCTR/dlrm/dlrm.py +++ b/HugeCTR/dlrm/dlrm.py @@ -53,7 +53,7 @@ def DLRM(args): model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct, bottom_names = ["relu2"], top_names = ["fc3"], - num_output=128)) + num_output=args.embedding_vec_size)) model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU, bottom_names = ["fc3"], top_names = ["relu3"])) @@ -126,7 +126,7 @@ def _print_args(args): print("=".ljust(66, "=")) print( "Running {}: gpu_num_per_node = {}, num_nodes = {}.".format( - "HugeCTR-WDL", args.gpu_num_per_node, args.num_nodes + "HugeCTR-DLRM", args.gpu_num_per_node, args.num_nodes ) ) print("=".ljust(66, "=")) diff --git a/HugeCTR/dlrm/dlrm_bsz_test.sh b/HugeCTR/dlrm/dlrm_bsz_test.sh index bb2b3953..35fa0c74 100644 --- a/HugeCTR/dlrm/dlrm_bsz_test.sh +++ b/HugeCTR/dlrm/dlrm_bsz_test.sh @@ -1,19 +1,24 @@ max_iter=12000 warmup_steps=1000 lr=0.5 -for bsz in 16 64 256 1024 4096 16384 65536 +for bsz in 16 64 256 1024 4096 16384 32768 do - test_case=dlrm_test_bsz${bsz} + for ngpu in 1 8 + do + test_case=dlrm_test_bsz${bsz} + mem_usage_file=${test_case}.mem - python dlrm.py \ - --gpu_num_per_node 8 \ - --data_dir /dataset/criteo_kaggle/hugectr_dlrm \ - --eval_batchs 70 \ - --batch_size ${bsz} \ - --learning_rate ${lr} \ - --warmup_steps ${warmup_steps} \ - --max_iter ${max_iter} \ - --loss_print_every_n_iter 100 \ - --embedding_vec_size 128 \ - --eval_interval 100 | tee log/${test_case}.log + python gpu_memory_usage.py 1> log/$mem_usage_file 2>&1 log/$mem_usage_file 2>&1 mem_threshold: + state = 'Detecting' + elif state == 'Detecting': + if info.used < mem_threshold: + running = False + else: + device0_max_used_mem = max(device0_max_used_mem, info.used) + +nvmlShutdown() +print('max device0 memory usage is:', device0_max_used_mem) From 
188448b231cb16cfe616173a379ac8ccec9e8a88 Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Wed, 19 Jan 2022 10:40:09 +0800 Subject: [PATCH 08/20] update testing scripts --- HugeCTR/dlrm/dlrm_bsz_test.sh | 2 +- HugeCTR/dlrm/dlrm_embedding_size_test.sh | 33 +++++++++++++----------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/HugeCTR/dlrm/dlrm_bsz_test.sh b/HugeCTR/dlrm/dlrm_bsz_test.sh index 35fa0c74..6bccc5a9 100644 --- a/HugeCTR/dlrm/dlrm_bsz_test.sh +++ b/HugeCTR/dlrm/dlrm_bsz_test.sh @@ -5,7 +5,7 @@ for bsz in 16 64 256 1024 4096 16384 32768 do for ngpu in 1 8 do - test_case=dlrm_test_bsz${bsz} + test_case=dlrm_test_n1g$ngpu}_bsz${bsz} mem_usage_file=${test_case}.mem python gpu_memory_usage.py 1> log/$mem_usage_file 2>&1 log/$mem_usage_file 2>&1 log/$mem_usage_file 2>&1 Date: Wed, 19 Jan 2022 14:35:25 +0800 Subject: [PATCH 09/20] update test result --- HugeCTR/dlrm/README.md | 52 ++++++++++++++++++++++++ HugeCTR/dlrm/dlrm_embedding_size_test.sh | 2 +- 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/HugeCTR/dlrm/README.md b/HugeCTR/dlrm/README.md index 2a699497..34674f19 100644 --- a/HugeCTR/dlrm/README.md +++ b/HugeCTR/dlrm/README.md @@ -1,4 +1,7 @@ + + # NVIDIA HugeCTR DLRM Benchmark Test + This folder holds NVIDIA HugeCTR DLRM Benchmark Test scripts, tools and reports. You can refer to [HugeCTR User Guide](https://github.com/NVIDIA/HugeCTR/blob/master/docs/hugectr_user_guide.md) for additional information. 
@@ -72,3 +75,52 @@ command: bash dlrm.sh 见baseline_log_info.csv +### Test Case + +#### embedding size + +one GPU + +| gpu | batch_size | embedding_vec_size | latency(ms) | memory_usage(MB) | +| ---- | ---------- | ------------------ | ----------- | ---------------- | +| n1g1 | 32768 | 2 | 51.235 | 4,256 | +| n1g1 | 32768 | 8 | 51.501 | 5,288 | +| n1g1 | 32768 | 32 | 53.238 | 9,428 | +| n1g1 | 32768 | 128 | 65.268 | 25,968 | + +eight GPUs + +| gpu | batch_size | embedding_vec_size | latency(ms) | memory_usage(MB) | +| ---- | ---------- | ------------------ | ----------- | ---------------- | +| n1g8 | 32768 | 2 | 106.352 | 2,112 | +| n1g8 | 32768 | 8 | 112.477 | 2,180 | +| n1g8 | 32768 | 32 | 102.875 | 2,452 | +| n1g8 | 32768 | 128 | 295.483 | 3,540 | + + + +#### batch size + +one GPU + +| gpu | batch_size | embedding_vec_size | latency(ms) | memory_usage(MB) | +| ---- | ---------- | ------------------ | ----------- | ---------------- | +| n1g1 | 16 | 128 | 0.535 | 17,890 | +| n1g1 | 64 | 128 | 0.626 | 17,910 | +| n1g1 | 256 | 128 | 0.980 | 17,942 | +| n1g1 | 1024 | 128 | 2.219 | 18,138 | +| n1g1 | 4096 | 128 | 7.236 | 18,900 | +| n1g1 | 16384 | 128 | 27.148 | 21,930 | +| n1g1 | 32768 | 128 | 53.337 | 25,966 | + +eight GPUs + +| gpu | batch_size | embedding_vec_size | latency(ms) | memory_usage(MB) | +| ---- | ---------- | ------------------ | ----------- | ---------------- | +| n1g8 | 16 | 128 | 0.889 | 1,670 | +| n1g8 | 64 | 128 | 1.161 | 1,682 | +| n1g8 | 256 | 128 | 2.333 | 1,700 | +| n1g8 | 1024 | 128 | 7.667 | 1,738 | +| n1g8 | 4096 | 128 | 33.855 | 1,910 | +| n1g8 | 16384 | 128 | 119.610 | 2,606 | +| n1g8 | 32768 | 128 | 211.966 | 3,540log_file | \ No newline at end of file diff --git a/HugeCTR/dlrm/dlrm_embedding_size_test.sh b/HugeCTR/dlrm/dlrm_embedding_size_test.sh index 4428be66..605c13b3 100644 --- a/HugeCTR/dlrm/dlrm_embedding_size_test.sh +++ b/HugeCTR/dlrm/dlrm_embedding_size_test.sh @@ -5,7 +5,7 @@ for embedding_vec_size in 2 8 32 128 do for ngpu in 
1 8 do - test_case=dlrm_test_n1g$ngpu}_embsz${embedding_vec_size} + test_case=dlrm_test_n1g${ngpu}_embsz${embedding_vec_size} mem_usage_file=${test_case}.mem python gpu_memory_usage.py 1> log/$mem_usage_file 2>&1 Date: Wed, 19 Jan 2022 14:58:31 +0800 Subject: [PATCH 10/20] update extract log_info tool --- HugeCTR/dlrm/extract_hugectr_log.py | 115 ++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 HugeCTR/dlrm/extract_hugectr_log.py diff --git a/HugeCTR/dlrm/extract_hugectr_log.py b/HugeCTR/dlrm/extract_hugectr_log.py new file mode 100644 index 00000000..7c35dbb2 --- /dev/null +++ b/HugeCTR/dlrm/extract_hugectr_log.py @@ -0,0 +1,115 @@ +import argparse +import os +import glob +from statistics import median + + + + +def write_line(f, lst, separator=',', start_end=False): + lst = ['', *lst, ''] if start_end else lst + f.write(separator.join(lst)) + f.write('\n') + + +def value_format(value): + if isinstance(value, float): + return '{:.3f}'.format(value) + elif isinstance(value, int): + return f'{value:,}' + else: + return str(value) + + +def extract_mem_info(mem_file): + if not os.path.isfile(mem_file): + return 'NA' + + with open(mem_file, 'r') as f: + for line in f.readlines(): + ss = line.split(' ') + if len(ss) < 5: + continue + if ss[0] == 'max': + return int(float(ss[-1].strip()) / 1024 /1024) + return 'NA' + + +def extract_info_from_file(log_file, start_iter): + ''' + batch_size_per_device = 128 + gpu_num_per_node = 8 + num_nodes = 2 + [HUGECTR][02:51:59][INFO][RANK0]: Iter: 100 Time(100 iters): 0.315066s Loss: 0.125152 lr:0.001000 + [HUGECTR][02:51:59][INFO][RANK0]: Iter: 200 Time(100 iters): 0.213347s Loss: 0.106469 lr:0.001000 + ... 
+ [HUGECTR][02:52:01][INFO][RANK0]: Iter: 1100 Time(100 iters): 0.222373s Loss: 0.100451 lr:0.001000 + max_iter = 1200 + loss_print_every_n_iter = 100 + gpu_num_per_node = 1 + num_nodes = 1 + ''' + # extract info from file name + result_dict = {} + loss_print_every_n_iter = 0 + with open(log_file, 'r') as f: + latencies = [] + vocab_size = [] + for line in f.readlines(): + ss = line.split(' ') + if ss[0] in ['num_nodes', 'gpu_num_per_node', 'batch_size', 'embedding_vec_size']: + result_dict[ss[0]] = ss[2].strip() + # if ss[0] in ['workspace_size_per_gpu_in_mb']: + # result_dict['vocab_size'] = int((int(ss[2].strip()) * 1024 * 1024 / 4) // 1000000 * 1000000) + if ss[0] == 'loss_print_every_n_iter': + loss_print_every_n_iter = float(ss[2].strip()) + elif len(ss) > 3 and ss[1] == 'Iter:' and '[INFO]' in ss[0]: + if int(ss[2].strip()) != start_iter: + latencies.append(float(ss[5].strip()[:-1])) + if loss_print_every_n_iter > 0: + result_dict['latency(ms)'] = 1000 * sum(latencies) / len(latencies) / loss_print_every_n_iter + mem = extract_mem_info(log_file[:-3] + 'mem') + result_dict['memory_usage(MB)'] = mem + return result_dict + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="flags for HugeCTR WDL") + parser.add_argument("--benchmark_log_dir", type=str, required=True) + parser.add_argument("--start_iter", type=int, default=1000) + # parser.add_argument("--end_iter", type=int, default=1100) + args = parser.parse_args() + + logs_list = sorted(glob.glob(os.path.join(args.benchmark_log_dir, "*.log")), key=os.path.getmtime) + #logs_list = sorted(logs_list) + chunk_list = {} + for log_file in logs_list: + test_result = extract_info_from_file(log_file, args.start_iter) + print(test_result) + json_file = os.path.basename(log_file)[:-4] + # print(json_file) + test_result['log_file'] = json_file + if json_file not in chunk_list.keys(): + chunk_list[json_file] = [] + chunk_list[json_file].append(test_result) + result_list = [] + for log_name,chunk 
in chunk_list.items(): + latency_list = [] + for single_result in chunk: + latency_list.append(single_result['latency(ms)']) + tmp_chunk = chunk[0] + tmp_chunk['gpu'] = 'n{}g{}'.format(tmp_chunk['num_nodes'], tmp_chunk['gpu_num_per_node']) + tmp_chunk['latency(ms)'] = median(latency_list) + result_list.append(tmp_chunk) + #with open(os.path.join(args.benchmark_log_dir, 'latency_reprot.md'), 'w') as f: + report_file = args.benchmark_log_dir + '_latency_report.md' + with open(report_file, 'w') as f: + titles = ['log_file', 'gpu', 'batch_size', 'embedding_vec_size', 'latency(ms)', 'memory_usage(MB)'] + write_line(f, titles, '|', True) + write_line(f, ['----' for _ in titles], '|', True) + for result in result_list: + if 'latency(ms)' not in result.keys(): + print(result['log_file'], 'is not complete!') + continue + cells = [value_format(result[title]) for title in titles] + write_line(f, cells, '|', True) From 507cdb57e07d8a75c81e1481552731029608e2bc Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Wed, 19 Jan 2022 23:15:09 +0800 Subject: [PATCH 11/20] update testing result --- HugeCTR/dlrm/README.md | 6 +++++- HugeCTR/dlrm/dlrm_bsz_test.sh | 4 ++-- .../dlrm/{gpu_memory_usage.py.txt => gpu_memory_usage.py} | 0 3 files changed, 7 insertions(+), 3 deletions(-) rename HugeCTR/dlrm/{gpu_memory_usage.py.txt => gpu_memory_usage.py} (100%) diff --git a/HugeCTR/dlrm/README.md b/HugeCTR/dlrm/README.md index 34674f19..c9838bbb 100644 --- a/HugeCTR/dlrm/README.md +++ b/HugeCTR/dlrm/README.md @@ -123,4 +123,8 @@ eight GPUs | n1g8 | 1024 | 128 | 7.667 | 1,738 | | n1g8 | 4096 | 128 | 33.855 | 1,910 | | n1g8 | 16384 | 128 | 119.610 | 2,606 | -| n1g8 | 32768 | 128 | 211.966 | 3,540log_file | \ No newline at end of file +| n1g8 | 32768 | 128 | 211.966 | 3,540 | +| n1g8 | 65536 | 128 | 535.517 | 5,408 | + + + diff --git a/HugeCTR/dlrm/dlrm_bsz_test.sh b/HugeCTR/dlrm/dlrm_bsz_test.sh index 6bccc5a9..d339a5af 100644 --- a/HugeCTR/dlrm/dlrm_bsz_test.sh +++ 
b/HugeCTR/dlrm/dlrm_bsz_test.sh @@ -17,8 +17,8 @@ do --learning_rate ${lr} \ --warmup_steps ${warmup_steps} \ --max_iter ${max_iter} \ - --loss_print_every_n_iter 1000 \ + --loss_print_every_n_iter 100 \ --embedding_vec_size 128 \ - --eval_interval 1000 | tee log/${test_case}.log + --eval_interval 100 | tee log/${test_case}.log done done diff --git a/HugeCTR/dlrm/gpu_memory_usage.py.txt b/HugeCTR/dlrm/gpu_memory_usage.py similarity index 100% rename from HugeCTR/dlrm/gpu_memory_usage.py.txt rename to HugeCTR/dlrm/gpu_memory_usage.py From 83d1d688baa413d84fedf60a18ed06a7d1e0bd0d Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Mon, 24 Jan 2022 09:31:42 +0800 Subject: [PATCH 12/20] Update README.md update test result --- HugeCTR/dlrm/README.md | 78 ++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 41 deletions(-) diff --git a/HugeCTR/dlrm/README.md b/HugeCTR/dlrm/README.md index c9838bbb..fb4068d0 100644 --- a/HugeCTR/dlrm/README.md +++ b/HugeCTR/dlrm/README.md @@ -6,13 +6,12 @@ This folder holds NVIDIA HugeCTR DLRM Benchmark Test scripts, tools and reports. You can refer to [HugeCTR User Guide](https://github.com/NVIDIA/HugeCTR/blob/master/docs/hugectr_user_guide.md) for additional information. 
-## folder structure ## Benchmark Test Cases -This report summarized HugeCTR test on 1 nodes with 8 x Tesla V100 on Dec 2021 +This report summarized HugeCTR test on 1 nodes with 8 x Tesla V100 in Jan 2022 ### Test Environment -- 1 nodes with Tesla V100-SXM2-16GB x 8 +- 1 nodes with Tesla V100-SXM2-32GB x 8 - InfiniBand 100 Gb/sec (4X EDR), Mellanox Technologies MT27700 Family - Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz ($ cat /proc/cpuinfo | grep name | cut -f2 -d: | uniq -c*) - Memory 384G ($ cat /proc/meminfo) @@ -22,15 +21,15 @@ This report summarized HugeCTR test on 1 nodes with 8 x Tesla V100 on Dec 2021 - `nvidia-smi topo -m` ``` - GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_1 CPU Affinity NUMA Affinity -GPU0 X NV1 NV2 NV1 SYS SYS SYS NV2 NODE SYS 0-23,48-71 0 -GPU1 NV1 X NV1 NV2 SYS SYS NV2 SYS NODE SYS 0-23,48-71 0 -GPU2 NV2 NV1 X NV2 SYS NV1 SYS SYS PIX SYS 0-23,48-71 0 -GPU3 NV1 NV2 NV2 X NV1 SYS SYS SYS PIX SYS 0-23,48-71 0 -GPU4 SYS SYS SYS NV1 X NV2 NV2 NV1 SYS NODE 24-47,72-95 1 -GPU5 SYS SYS NV1 SYS NV2 X NV1 NV2 SYS NODE 24-47,72-95 1 -GPU6 SYS NV2 SYS SYS NV2 NV1 X NV1 SYS PIX 24-47,72-95 1 -GPU7 NV2 SYS SYS SYS NV1 NV2 NV1 X SYS PIX 24-47,72-95 1 + GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_1 CPU Affinity NUMA Affinity +GPU0 X NV1 NV2 NV1 SYS SYS SYS NV2 NODE SYS 0-23,48-71 0 +GPU1 NV1 X NV1 NV2 SYS SYS NV2 SYS NODE SYS 0-23,48-71 0 +GPU2 NV2 NV1 X NV2 SYS NV1 SYS SYS PIX SYS 0-23,48-71 0 +GPU3 NV1 NV2 NV2 X NV1 SYS SYS SYS PIX SYS 0-23,48-71 0 +GPU4 SYS SYS SYS NV1 X NV2 NV2 NV1 SYS NODE 24-47,72-95 1 +GPU5 SYS SYS NV1 SYS NV2 X NV1 NV2 SYS NODE 24-47,72-95 1 +GPU6 SYS NV2 SYS SYS NV2 NV1 X NV1 SYS PIX 24-47,72-95 1 +GPU7 NV2 SYS SYS SYS NV1 NV2 NV1 X SYS PIX 24-47,72-95 1 mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X SYS mlx5_1 SYS SYS SYS SYS NODE NODE PIX PIX SYS X @@ -69,13 +68,13 @@ command: bash dlrm.sh | decay_start | 0 | | decay_steps | 1 | | decay_power | 2 | -| end_lr', | 0 | +| end_lr | 0 | -### baseline 运行log +### 
baseline running log -见baseline_log_info.csv +see baseline_log_info.csv -### Test Case +### Test Result #### embedding size @@ -83,21 +82,19 @@ one GPU | gpu | batch_size | embedding_vec_size | latency(ms) | memory_usage(MB) | | ---- | ---------- | ------------------ | ----------- | ---------------- | -| n1g1 | 32768 | 2 | 51.235 | 4,256 | -| n1g1 | 32768 | 8 | 51.501 | 5,288 | -| n1g1 | 32768 | 32 | 53.238 | 9,428 | -| n1g1 | 32768 | 128 | 65.268 | 25,968 | +| n1g1 | 32768 | 2 | 51.340 | 4,256 | +| n1g1 | 32768 | 8 | 51.467 | 5,288 | +| n1g1 | 32768 | 32 | 53.179 | 9,428 | +| n1g1 | 32768 | 128 | 65.284 | 25,966 | eight GPUs | gpu | batch_size | embedding_vec_size | latency(ms) | memory_usage(MB) | | ---- | ---------- | ------------------ | ----------- | ---------------- | -| n1g8 | 32768 | 2 | 106.352 | 2,112 | -| n1g8 | 32768 | 8 | 112.477 | 2,180 | -| n1g8 | 32768 | 32 | 102.875 | 2,452 | -| n1g8 | 32768 | 128 | 295.483 | 3,540 | - - +| n1g8 | 32768 | 2 | 106.358 | 2,112 | +| n1g8 | 32768 | 8 | 112.861 | 2,180 | +| n1g8 | 32768 | 32 | 102.251 | 2,452 | +| n1g8 | 32768 | 128 | 298.217 | 3,540 | #### batch size @@ -105,26 +102,25 @@ one GPU | gpu | batch_size | embedding_vec_size | latency(ms) | memory_usage(MB) | | ---- | ---------- | ------------------ | ----------- | ---------------- | -| n1g1 | 16 | 128 | 0.535 | 17,890 | -| n1g1 | 64 | 128 | 0.626 | 17,910 | -| n1g1 | 256 | 128 | 0.980 | 17,942 | -| n1g1 | 1024 | 128 | 2.219 | 18,138 | -| n1g1 | 4096 | 128 | 7.236 | 18,900 | -| n1g1 | 16384 | 128 | 27.148 | 21,930 | -| n1g1 | 32768 | 128 | 53.337 | 25,966 | +| n1g1 | 16 | 128 | 0.687 | 17,890 | +| n1g1 | 64 | 128 | 0.785 | 17,910 | +| n1g1 | 256 | 128 | 1.206 | 17,942 | +| n1g1 | 1024 | 128 | 2.699 | 18,138 | +| n1g1 | 4096 | 128 | 8.860 | 18,900 | +| n1g1 | 16384 | 128 | 33.391 | 21,930 | +| n1g1 | 32768 | 128 | 65.386 | 25,966 | eight GPUs | gpu | batch_size | embedding_vec_size | latency(ms) | memory_usage(MB) | | ---- | ---------- | ------------------ | 
----------- | ---------------- | -| n1g8 | 16 | 128 | 0.889 | 1,670 | -| n1g8 | 64 | 128 | 1.161 | 1,682 | -| n1g8 | 256 | 128 | 2.333 | 1,700 | -| n1g8 | 1024 | 128 | 7.667 | 1,738 | -| n1g8 | 4096 | 128 | 33.855 | 1,910 | -| n1g8 | 16384 | 128 | 119.610 | 2,606 | -| n1g8 | 32768 | 128 | 211.966 | 3,540 | -| n1g8 | 65536 | 128 | 535.517 | 5,408 | +| n1g8 | 16 | 128 | 1.152 | 1,670 | +| n1g8 | 64 | 128 | 1.410 | 1,680 | +| n1g8 | 256 | 128 | 2.781 | 1,700 | +| n1g8 | 1024 | 128 | 10.886 | 1,738 | +| n1g8 | 4096 | 128 | 52.476 | 1,910 | +| n1g8 | 16384 | 128 | 173.699 | 2,608 | +| n1g8 | 32768 | 128 | 296.878 | 3,540 | From a7cf174a0f3de6f775a8af1d5c4b1513e529e51d Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Mon, 24 Jan 2022 09:35:16 +0800 Subject: [PATCH 13/20] Update README.md --- HugeCTR/dlrm/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HugeCTR/dlrm/README.md b/HugeCTR/dlrm/README.md index fb4068d0..8580f423 100644 --- a/HugeCTR/dlrm/README.md +++ b/HugeCTR/dlrm/README.md @@ -21,7 +21,7 @@ This report summarized HugeCTR test on 1 nodes with 8 x Tesla V100 in Jan 2022 - `nvidia-smi topo -m` ``` - GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_1 CPU Affinity NUMA Affinity + GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_1 CPU Affinity NUMA Affinity GPU0 X NV1 NV2 NV1 SYS SYS SYS NV2 NODE SYS 0-23,48-71 0 GPU1 NV1 X NV1 NV2 SYS SYS NV2 SYS NODE SYS 0-23,48-71 0 GPU2 NV2 NV1 X NV2 SYS NV1 SYS SYS PIX SYS 0-23,48-71 0 From fc75a5736a311916fde1e00ebd91d20333b06ee2 Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Mon, 24 Jan 2022 09:36:28 +0800 Subject: [PATCH 14/20] Update README.md --- HugeCTR/dlrm/README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/HugeCTR/dlrm/README.md b/HugeCTR/dlrm/README.md index 8580f423..a474dcec 100644 --- a/HugeCTR/dlrm/README.md +++ b/HugeCTR/dlrm/README.md @@ -22,16 +22,16 @@ This report summarized HugeCTR test on 
1 nodes with 8 x Tesla V100 in Jan 2022 ``` GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_1 CPU Affinity NUMA Affinity -GPU0 X NV1 NV2 NV1 SYS SYS SYS NV2 NODE SYS 0-23,48-71 0 -GPU1 NV1 X NV1 NV2 SYS SYS NV2 SYS NODE SYS 0-23,48-71 0 -GPU2 NV2 NV1 X NV2 SYS NV1 SYS SYS PIX SYS 0-23,48-71 0 -GPU3 NV1 NV2 NV2 X NV1 SYS SYS SYS PIX SYS 0-23,48-71 0 -GPU4 SYS SYS SYS NV1 X NV2 NV2 NV1 SYS NODE 24-47,72-95 1 -GPU5 SYS SYS NV1 SYS NV2 X NV1 NV2 SYS NODE 24-47,72-95 1 -GPU6 SYS NV2 SYS SYS NV2 NV1 X NV1 SYS PIX 24-47,72-95 1 -GPU7 NV2 SYS SYS SYS NV1 NV2 NV1 X SYS PIX 24-47,72-95 1 -mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X SYS -mlx5_1 SYS SYS SYS SYS NODE NODE PIX PIX SYS X +GPU0 X NV1 NV2 NV1 SYS SYS SYS NV2 NODE SYS 0-23,48-71 0 +GPU1 NV1 X NV1 NV2 SYS SYS NV2 SYS NODE SYS 0-23,48-71 0 +GPU2 NV2 NV1 X NV2 SYS NV1 SYS SYS PIX SYS 0-23,48-71 0 +GPU3 NV1 NV2 NV2 X NV1 SYS SYS SYS PIX SYS 0-23,48-71 0 +GPU4 SYS SYS SYS NV1 X NV2 NV2 NV1 SYS NODE 24-47,72-95 1 +GPU5 SYS SYS NV1 SYS NV2 X NV1 NV2 SYS NODE 24-47,72-95 1 +GPU6 SYS NV2 SYS SYS NV2 NV1 X NV1 SYS PIX 24-47,72-95 1 +GPU7 NV2 SYS SYS SYS NV1 NV2 NV1 X SYS PIX 24-47,72-95 1 +mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X SYS +mlx5_1 SYS SYS SYS SYS NODE NODE PIX PIX SYS X Legend: From 4babb5417be6ecbc36a87316d242971fb7184657 Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Mon, 24 Jan 2022 09:38:39 +0800 Subject: [PATCH 15/20] Update README.md --- HugeCTR/dlrm/README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/HugeCTR/dlrm/README.md b/HugeCTR/dlrm/README.md index a474dcec..959739e0 100644 --- a/HugeCTR/dlrm/README.md +++ b/HugeCTR/dlrm/README.md @@ -22,16 +22,16 @@ This report summarized HugeCTR test on 1 nodes with 8 x Tesla V100 in Jan 2022 ``` GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_1 CPU Affinity NUMA Affinity -GPU0 X NV1 NV2 NV1 SYS SYS SYS NV2 NODE SYS 0-23,48-71 0 -GPU1 NV1 X NV1 NV2 SYS SYS NV2 SYS NODE SYS 0-23,48-71 0 -GPU2 
NV2 NV1 X NV2 SYS NV1 SYS SYS PIX SYS 0-23,48-71 0 -GPU3 NV1 NV2 NV2 X NV1 SYS SYS SYS PIX SYS 0-23,48-71 0 -GPU4 SYS SYS SYS NV1 X NV2 NV2 NV1 SYS NODE 24-47,72-95 1 -GPU5 SYS SYS NV1 SYS NV2 X NV1 NV2 SYS NODE 24-47,72-95 1 -GPU6 SYS NV2 SYS SYS NV2 NV1 X NV1 SYS PIX 24-47,72-95 1 -GPU7 NV2 SYS SYS SYS NV1 NV2 NV1 X SYS PIX 24-47,72-95 1 -mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X SYS -mlx5_1 SYS SYS SYS SYS NODE NODE PIX PIX SYS X +GPU0 X NV1 NV2 NV1 SYS SYS SYS NV2 NODE SYS 0-23,48-71 0 +GPU1 NV1 X NV1 NV2 SYS SYS NV2 SYS NODE SYS 0-23,48-71 0 +GPU2 NV2 NV1 X NV2 SYS NV1 SYS SYS PIX SYS 0-23,48-71 0 +GPU3 NV1 NV2 NV2 X NV1 SYS SYS SYS PIX SYS 0-23,48-71 0 +GPU4 SYS SYS SYS NV1 X NV2 NV2 NV1 SYS NODE 24-47,72-95 1 +GPU5 SYS SYS NV1 SYS NV2 X NV1 NV2 SYS NODE 24-47,72-95 1 +GPU6 SYS NV2 SYS SYS NV2 NV1 X NV1 SYS PIX 24-47,72-95 1 +GPU7 NV2 SYS SYS SYS NV1 NV2 NV1 X SYS PIX 24-47,72-95 1 +mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X SYS +mlx5_1 SYS SYS SYS SYS NODE NODE PIX PIX SYS X Legend: From 8598e32dca7cb66f6cd61226feada6675a9508a1 Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Mon, 24 Jan 2022 09:41:43 +0800 Subject: [PATCH 16/20] Update README.md --- HugeCTR/dlrm/README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/HugeCTR/dlrm/README.md b/HugeCTR/dlrm/README.md index 959739e0..42c40595 100644 --- a/HugeCTR/dlrm/README.md +++ b/HugeCTR/dlrm/README.md @@ -22,16 +22,16 @@ This report summarized HugeCTR test on 1 nodes with 8 x Tesla V100 in Jan 2022 ``` GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_1 CPU Affinity NUMA Affinity -GPU0 X NV1 NV2 NV1 SYS SYS SYS NV2 NODE SYS 0-23,48-71 0 -GPU1 NV1 X NV1 NV2 SYS SYS NV2 SYS NODE SYS 0-23,48-71 0 -GPU2 NV2 NV1 X NV2 SYS NV1 SYS SYS PIX SYS 0-23,48-71 0 -GPU3 NV1 NV2 NV2 X NV1 SYS SYS SYS PIX SYS 0-23,48-71 0 -GPU4 SYS SYS SYS NV1 X NV2 NV2 NV1 SYS NODE 24-47,72-95 1 -GPU5 SYS SYS NV1 SYS NV2 X NV1 NV2 SYS NODE 24-47,72-95 1 -GPU6 SYS NV2 SYS SYS 
NV2 NV1 X NV1 SYS PIX 24-47,72-95 1 -GPU7 NV2 SYS SYS SYS NV1 NV2 NV1 X SYS PIX 24-47,72-95 1 -mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X SYS -mlx5_1 SYS SYS SYS SYS NODE NODE PIX PIX SYS X +GPU0 X NV1 NV2 NV1 SYS SYS SYS NV2 NODE SYS 0-23,48-71 0 +GPU1 NV1 X NV1 NV2 SYS SYS NV2 SYS NODE SYS 0-23,48-71 0 +GPU2 NV2 NV1 X NV2 SYS NV1 SYS SYS PIX SYS 0-23,48-71 0 +GPU3 NV1 NV2 NV2 X NV1 SYS SYS SYS PIX SYS 0-23,48-71 0 +GPU4 SYS SYS SYS NV1 X NV2 NV2 NV1 SYS NODE 24-47,72-95 1 +GPU5 SYS SYS NV1 SYS NV2 X NV1 NV2 SYS NODE 24-47,72-95 1 +GPU6 SYS NV2 SYS SYS NV2 NV1 X NV1 SYS PIX 24-47,72-95 1 +GPU7 NV2 SYS SYS SYS NV1 NV2 NV1 X SYS PIX 24-47,72-95 1 +mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X SYS +mlx5_1 SYS SYS SYS SYS NODE NODE PIX PIX SYS X Legend: From 31c2ab9a912424126bf1c820c47b4934369c74ad Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Mon, 24 Jan 2022 09:44:39 +0800 Subject: [PATCH 17/20] Update README.md --- HugeCTR/dlrm/README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/HugeCTR/dlrm/README.md b/HugeCTR/dlrm/README.md index 42c40595..08224c65 100644 --- a/HugeCTR/dlrm/README.md +++ b/HugeCTR/dlrm/README.md @@ -21,16 +21,16 @@ This report summarized HugeCTR test on 1 nodes with 8 x Tesla V100 in Jan 2022 - `nvidia-smi topo -m` ``` - GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_1 CPU Affinity NUMA Affinity + GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_1 CPU Affinity NUMA Affinity GPU0 X NV1 NV2 NV1 SYS SYS SYS NV2 NODE SYS 0-23,48-71 0 GPU1 NV1 X NV1 NV2 SYS SYS NV2 SYS NODE SYS 0-23,48-71 0 -GPU2 NV2 NV1 X NV2 SYS NV1 SYS SYS PIX SYS 0-23,48-71 0 -GPU3 NV1 NV2 NV2 X NV1 SYS SYS SYS PIX SYS 0-23,48-71 0 -GPU4 SYS SYS SYS NV1 X NV2 NV2 NV1 SYS NODE 24-47,72-95 1 -GPU5 SYS SYS NV1 SYS NV2 X NV1 NV2 SYS NODE 24-47,72-95 1 -GPU6 SYS NV2 SYS SYS NV2 NV1 X NV1 SYS PIX 24-47,72-95 1 -GPU7 NV2 SYS SYS SYS NV1 NV2 NV1 X SYS PIX 24-47,72-95 1 -mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X SYS +GPU2 
NV2 NV1 X NV2 SYS NV1 SYS SYS PIX SYS 0-23,48-71 0 +GPU3 NV1 NV2 NV2 X NV1 SYS SYS SYS PIX SYS 0-23,48-71 0 +GPU4 SYS SYS SYS NV1 X NV2 NV2 NV1 SYS NODE 24-47,72-95 1 +GPU5 SYS SYS NV1 SYS NV2 X NV1 NV2 SYS NODE 24-47,72-95 1 +GPU6 SYS NV2 SYS SYS NV2 NV1 X NV1 SYS PIX 24-47,72-95 1 +GPU7 NV2 SYS SYS SYS NV1 NV2 NV1 X SYS PIX 24-47,72-95 1 +mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X SYS mlx5_1 SYS SYS SYS SYS NODE NODE PIX PIX SYS X Legend: From a6a139c592f0777bfa256f8313081fd1b48c955a Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Mon, 24 Jan 2022 09:45:27 +0800 Subject: [PATCH 18/20] Update README.md --- HugeCTR/dlrm/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HugeCTR/dlrm/README.md b/HugeCTR/dlrm/README.md index 08224c65..9ad5485f 100644 --- a/HugeCTR/dlrm/README.md +++ b/HugeCTR/dlrm/README.md @@ -30,7 +30,7 @@ GPU4 SYS SYS SYS NV1 X NV2 NV2 NV1 SYS NODE 24-47,72-95 1 GPU5 SYS SYS NV1 SYS NV2 X NV1 NV2 SYS NODE 24-47,72-95 1 GPU6 SYS NV2 SYS SYS NV2 NV1 X NV1 SYS PIX 24-47,72-95 1 GPU7 NV2 SYS SYS SYS NV1 NV2 NV1 X SYS PIX 24-47,72-95 1 -mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X SYS +mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X SYS mlx5_1 SYS SYS SYS SYS NODE NODE PIX PIX SYS X Legend: From 0b769c5ecfdac879eb0523ef51ac7fca8c51e74e Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Mon, 24 Jan 2022 09:46:58 +0800 Subject: [PATCH 19/20] Update README.md --- HugeCTR/dlrm/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/HugeCTR/dlrm/README.md b/HugeCTR/dlrm/README.md index 9ad5485f..86b29d79 100644 --- a/HugeCTR/dlrm/README.md +++ b/HugeCTR/dlrm/README.md @@ -27,9 +27,9 @@ GPU1 NV1 X NV1 NV2 SYS SYS NV2 SYS NODE SYS 0-23,48-71 0 GPU2 NV2 NV1 X NV2 SYS NV1 SYS SYS PIX SYS 0-23,48-71 0 GPU3 NV1 NV2 NV2 X NV1 SYS SYS SYS PIX SYS 0-23,48-71 0 GPU4 SYS SYS SYS NV1 X NV2 NV2 NV1 SYS NODE 24-47,72-95 1 -GPU5 SYS SYS NV1 SYS NV2 X NV1 NV2 SYS NODE 24-47,72-95 1 -GPU6 SYS NV2 
SYS SYS NV2 NV1 X NV1 SYS PIX 24-47,72-95 1 -GPU7 NV2 SYS SYS SYS NV1 NV2 NV1 X SYS PIX 24-47,72-95 1 +GPU5 SYS SYS NV1 SYS NV2 X NV1 NV2 SYS NODE 24-47,72-95 1 +GPU6 SYS NV2 SYS SYS NV2 NV1 X NV1 SYS PIX 24-47,72-95 1 +GPU7 NV2 SYS SYS SYS NV1 NV2 NV1 X SYS PIX 24-47,72-95 1 mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X SYS mlx5_1 SYS SYS SYS SYS NODE NODE PIX PIX SYS X From 8e9233e574550b5d902a094a969a8293bb9613bb Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Tue, 25 Jan 2022 16:19:32 +0800 Subject: [PATCH 20/20] update test result with no evaluation --- HugeCTR/dlrm/README.md | 46 ++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/HugeCTR/dlrm/README.md b/HugeCTR/dlrm/README.md index 86b29d79..9636915a 100644 --- a/HugeCTR/dlrm/README.md +++ b/HugeCTR/dlrm/README.md @@ -82,19 +82,20 @@ one GPU | gpu | batch_size | embedding_vec_size | latency(ms) | memory_usage(MB) | | ---- | ---------- | ------------------ | ----------- | ---------------- | -| n1g1 | 32768 | 2 | 51.340 | 4,256 | -| n1g1 | 32768 | 8 | 51.467 | 5,288 | -| n1g1 | 32768 | 32 | 53.179 | 9,428 | -| n1g1 | 32768 | 128 | 65.284 | 25,966 | +| n1g1 | 32768 | 2 | 41.150 | 4,252 | +| n1g1 | 32768 | 8 | 41.396 | 5,286 | +| n1g1 | 32768 | 32 | 42.675 | 9,422 | +| n1g1 | 32768 | 128 | 51.944 | 25,962 | eight GPUs | gpu | batch_size | embedding_vec_size | latency(ms) | memory_usage(MB) | | ---- | ---------- | ------------------ | ----------- | ---------------- | -| n1g8 | 32768 | 2 | 106.358 | 2,112 | -| n1g8 | 32768 | 8 | 112.861 | 2,180 | -| n1g8 | 32768 | 32 | 102.251 | 2,452 | -| n1g8 | 32768 | 128 | 298.217 | 3,540 | +| n1g8 | 32768 | 2 | 66.358 | 2,108 | +| n1g8 | 32768 | 8 | 69.207 | 2,176 | +| n1g8 | 32768 | 32 | 64.588 | 2,448 | +| n1g8 | 32768 | 128 | 201.402 | 3,536 | +| n1g8 | 65536 | 128 | 357.724 | 5,406 | #### batch size @@ -102,25 +103,26 @@ one GPU | gpu | batch_size | embedding_vec_size | latency(ms) | memory_usage(MB) | 
| ---- | ---------- | ------------------ | ----------- | ---------------- | -| n1g1 | 16 | 128 | 0.687 | 17,890 | -| n1g1 | 64 | 128 | 0.785 | 17,910 | -| n1g1 | 256 | 128 | 1.206 | 17,942 | -| n1g1 | 1024 | 128 | 2.699 | 18,138 | -| n1g1 | 4096 | 128 | 8.860 | 18,900 | -| n1g1 | 16384 | 128 | 33.391 | 21,930 | -| n1g1 | 32768 | 128 | 65.386 | 25,966 | +| n1g1 | 16 | 128 | 0.508 | 17,884 | +| n1g1 | 64 | 128 | 0.608 | 17,902 | +| n1g1 | 256 | 128 | 0.957 | 17,940 | +| n1g1 | 1024 | 128 | 2.159 | 18,134 | +| n1g1 | 4096 | 128 | 7.060 | 18,896 | +| n1g1 | 16384 | 128 | 26.455 | 21,928 | +| n1g1 | 32768 | 128 | 51.935 | 25,962 | eight GPUs | gpu | batch_size | embedding_vec_size | latency(ms) | memory_usage(MB) | | ---- | ---------- | ------------------ | ----------- | ---------------- | -| n1g8 | 16 | 128 | 1.152 | 1,670 | -| n1g8 | 64 | 128 | 1.410 | 1,680 | -| n1g8 | 256 | 128 | 2.781 | 1,700 | -| n1g8 | 1024 | 128 | 10.886 | 1,738 | -| n1g8 | 4096 | 128 | 52.476 | 1,910 | -| n1g8 | 16384 | 128 | 173.699 | 2,608 | -| n1g8 | 32768 | 128 | 296.878 | 3,540 | +| n1g8 | 16 | 128 | 0.859 | 1,666 | +| n1g8 | 64 | 128 | 1.136 | 1,674 | +| n1g8 | 256 | 128 | 2.262 | 1,692 | +| n1g8 | 1024 | 128 | 7.232 | 1,732 | +| n1g8 | 4096 | 128 | 31.997 | 1,906 | +| n1g8 | 16384 | 128 | 112.966 | 2,604 | +| n1g8 | 32768 | 128 | 201.806 | 3,536 | +| n1g8 | 65536 | 128 | 357.724 | 5,406 |