diff --git a/HugeCTR/dlrm/README.md b/HugeCTR/dlrm/README.md
index 2a699497..9636915a 100644
--- a/HugeCTR/dlrm/README.md
+++ b/HugeCTR/dlrm/README.md
@@ -1,15 +1,17 @@
+
+
 # NVIDIA HugeCTR DLRM Benchmark Test
+
 This folder holds NVIDIA HugeCTR DLRM Benchmark Test scripts, tools and reports.

 You can refer to [HugeCTR User Guide](https://github.com/NVIDIA/HugeCTR/blob/master/docs/hugectr_user_guide.md) for additional information.

-## folder structure

 ## Benchmark Test Cases

-This report summarized HugeCTR test on 1 nodes with 8 x Tesla V100 on Dec 2021
+This report summarizes HugeCTR tests on 1 node with 8 x Tesla V100 in Jan 2022.

 ### Test Environment

-- 1 nodes with Tesla V100-SXM2-16GB x 8
+- 1 node with Tesla V100-SXM2-32GB x 8
 - InfiniBand 100 Gb/sec (4X EDR), Mellanox Technologies MT27700 Family
 - Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz ($ cat /proc/cpuinfo | grep name | cut -f2 -d: | uniq -c*)
 - Memory 384G ($ cat /proc/meminfo)
@@ -19,17 +21,17 @@ This report summarized HugeCTR test on 1 nodes with 8 x Tesla V100 on Dec 2021
 - `nvidia-smi topo -m`

 ```
-        GPU0    GPU1    GPU2    GPU3    GPU4    GPU5    GPU6    GPU7    mlx5_0  mlx5_1  CPU Affinity    NUMA Affinity
-GPU0     X      NV1     NV2     NV1     SYS     SYS     SYS     NV2     NODE    SYS     0-23,48-71      0
-GPU1    NV1      X      NV1     NV2     SYS     SYS     NV2     SYS     NODE    SYS     0-23,48-71      0
-GPU2    NV2     NV1      X      NV2     SYS     NV1     SYS     SYS     PIX     SYS     0-23,48-71      0
-GPU3    NV1     NV2     NV2      X      NV1     SYS     SYS     SYS     PIX     SYS     0-23,48-71      0
-GPU4    SYS     SYS     SYS     NV1      X      NV2     NV2     NV1     SYS     NODE    24-47,72-95     1
-GPU5    SYS     SYS     NV1     SYS     NV2      X      NV1     NV2     SYS     NODE    24-47,72-95     1
-GPU6    SYS     NV2     SYS     SYS     NV2     NV1      X      NV1     SYS     PIX     24-47,72-95     1
-GPU7    NV2     SYS     SYS     SYS     NV1     NV2     NV1      X      SYS     PIX     24-47,72-95     1
-mlx5_0  NODE    NODE    PIX     PIX     SYS     SYS     SYS     SYS      X      SYS
-mlx5_1  SYS     SYS     SYS     SYS     NODE    NODE    PIX     PIX     SYS      X
+        GPU0    GPU1    GPU2    GPU3    GPU4    GPU5    GPU6    GPU7    mlx5_0  mlx5_1  CPU Affinity    NUMA Affinity
+GPU0     X      NV1     NV2     NV1     SYS     SYS     SYS     NV2     NODE    SYS     0-23,48-71      0
+GPU1    NV1      X      NV1     NV2     SYS     SYS     NV2     SYS     NODE    SYS     0-23,48-71      0
+GPU2    NV2     NV1      X      NV2     SYS     NV1     SYS     SYS     PIX     SYS     0-23,48-71      0
+GPU3    NV1     NV2     NV2      X      NV1     SYS     SYS     SYS     PIX     SYS     0-23,48-71      0
+GPU4    SYS     SYS     SYS     NV1      X      NV2     NV2     NV1     SYS     NODE    24-47,72-95     1
+GPU5    SYS     SYS     NV1     SYS     NV2      X      NV1     NV2     SYS     NODE    24-47,72-95     1
+GPU6    SYS     NV2     SYS     SYS     NV2     NV1      X      NV1     SYS     PIX     24-47,72-95     1
+GPU7    NV2     SYS     SYS     SYS     NV1     NV2     NV1      X      SYS     PIX     24-47,72-95     1
+mlx5_0  NODE    NODE    PIX     PIX     SYS     SYS     SYS     SYS      X      SYS
+mlx5_1  SYS     SYS     SYS     SYS     NODE    NODE    PIX     PIX     SYS      X

 Legend:
@@ -66,9 +68,61 @@ command: bash dlrm.sh
 | decay_start | 0 |
 | decay_steps | 1 |
 | decay_power | 2 |
-| end_lr', | 0 |
+| end_lr | 0 |
+
+### Baseline Running Log
+
+See `baseline_log_info.csv`.
+
+### Test Results
+
+#### embedding size
+
+one GPU
+
+| gpu  | batch_size | embedding_vec_size | latency(ms) | memory_usage(MB) |
+| ---- | ---------- | ------------------ | ----------- | ---------------- |
+| n1g1 | 32768      | 2                  | 41.150      | 4,252            |
+| n1g1 | 32768      | 8                  | 41.396      | 5,286            |
+| n1g1 | 32768      | 32                 | 42.675      | 9,422            |
+| n1g1 | 32768      | 128                | 51.944      | 25,962           |
+
+eight GPUs
+
+| gpu  | batch_size | embedding_vec_size | latency(ms) | memory_usage(MB) |
+| ---- | ---------- | ------------------ | ----------- | ---------------- |
+| n1g8 | 32768      | 2                  | 66.358      | 2,108            |
+| n1g8 | 32768      | 8                  | 69.207      | 2,176            |
+| n1g8 | 32768      | 32                 | 64.588      | 2,448            |
+| n1g8 | 32768      | 128                | 201.402     | 3,536            |
+| n1g8 | 65536      | 128                | 357.724     | 5,406            |
+
+#### batch size
+
+one GPU
+
+| gpu  | batch_size | embedding_vec_size | latency(ms) | memory_usage(MB) |
+| ---- | ---------- | ------------------ | ----------- | ---------------- |
+| n1g1 | 16         | 128                | 0.508       | 17,884           |
+| n1g1 | 64         | 128                | 0.608       | 17,902           |
+| n1g1 | 256        | 128                | 0.957       | 17,940           |
+| n1g1 | 1024       | 128                | 2.159       | 18,134           |
+| n1g1 | 4096       | 128                | 7.060       | 18,896           |
+| n1g1 | 16384      | 128                | 26.455      | 21,928           |
+| n1g1 | 32768      | 128                | 51.935      | 25,962           |
+
+eight GPUs
+
+| gpu  | batch_size | embedding_vec_size | latency(ms) | memory_usage(MB) |
+| ---- | ---------- | ------------------ | ----------- | ---------------- |
+| n1g8 | 16         | 128                | 0.859       | 1,666            |
+| n1g8 | 64         | 128                | 1.136       | 1,674            |
+| n1g8 | 256        | 128                | 2.262       | 1,692            |
+| n1g8 | 1024       | 128                | 7.232       | 1,732            |
+| n1g8 | 4096       | 128                | 31.997      | 1,906            |
+| n1g8 | 16384      | 128                | 112.966     | 2,604            |
+| n1g8 | 32768      | 128                | 201.806     | 3,536            |
+| n1g8 | 65536      | 128                | 357.724     | 5,406            |
-### baseline 运行log
-见baseline_log_info.csv
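
For context, the warmup and polynomial-decay fields in the parameter table above, together with the `lr` and `warmup_steps` values set in `dlrm_bsz_test.sh` further down, are the knobs HugeCTR exposes through `hugectr.CreateSolver`. A minimal sketch of how they fit together; the non-schedule arguments are illustrative and the actual call in dlrm.py may differ:

```python
import hugectr

# Sketch only: schedule values come from the parameter table; lr and warmup_steps come
# from dlrm_bsz_test.sh; vvgpu/batchsize are whatever an individual run in the tables uses.
solver = hugectr.CreateSolver(
    vvgpu=[[0, 1, 2, 3, 4, 5, 6, 7]],  # [[0]] for the n1g1 runs
    batchsize=32768,
    lr=0.5,
    warmup_steps=1000,
    decay_start=0,
    decay_steps=1,
    decay_power=2.0,
    end_lr=0.0,
    repeat_dataset=True,
)
```
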
diff --git a/HugeCTR/dlrm/dlrm.py b/HugeCTR/dlrm/dlrm.py
index cba254d8..cd68dce5 100644
--- a/HugeCTR/dlrm/dlrm.py
+++ b/HugeCTR/dlrm/dlrm.py
@@ -53,7 +53,7 @@ def DLRM(args):
     model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                                  bottom_names = ["relu2"],
                                  top_names = ["fc3"],
-                                 num_output=128))
+                                 num_output=args.embedding_vec_size))
     model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                                  bottom_names = ["fc3"],
                                  top_names = ["relu3"]))
@@ -126,7 +126,7 @@ def _print_args(args):
     print("=".ljust(66, "="))
     print(
         "Running {}: gpu_num_per_node = {}, num_nodes = {}.".format(
-            "HugeCTR-WDL", args.gpu_num_per_node, args.num_nodes
+            "HugeCTR-DLRM", args.gpu_num_per_node, args.num_nodes
         )
     )
     print("=".ljust(66, "="))
diff --git a/HugeCTR/dlrm/dlrm_bsz_test.sh b/HugeCTR/dlrm/dlrm_bsz_test.sh
new file mode 100644
index 00000000..d339a5af
--- /dev/null
+++ b/HugeCTR/dlrm/dlrm_bsz_test.sh
@@ -0,0 +1,24 @@
+max_iter=12000
+warmup_steps=1000
+lr=0.5
+for bsz in 16 64 256 1024 4096 16384 32768
+do
+    for ngpu in 1 8
+    do
+        test_case=dlrm_test_n1g${ngpu}_bsz${bsz}
+        mem_usage_file=${test_case}.mem
+
+        python gpu_memory_usage.py 1> log/$mem_usage_file 2>&1
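
The remainder of the diff shows the tail of the log-parsing script: the end of `extract_info_from_file` (earlier in that function, not shown in this excerpt, `result_dict` is presumably filled with the run's arguments and `latencies` collects the per-interval timings that HugeCTR prints every `loss_print_every_n_iter` iterations) and the `__main__` block that turns the parsed runs into the latency report. The report writer calls three helpers defined earlier in the same script — `write_line`, `value_format` and `extract_mem_info` — and its own header presumably imports `argparse`, `glob`, `os` and a `median` implementation. A minimal sketch of plausible helper implementations, inferred from the call sites and from the output format of gpu_memory_usage.py; this is not necessarily the repository's exact code:

```python
import os

# Hypothetical helper sketches, reconstructed from how the script below calls them.

def write_line(f, cells, separator=',', start_end=False):
    # Write one row; with separator='|' and start_end=True this produces a
    # Markdown table row such as "|a|b|c|".
    cells = ['', *cells, ''] if start_end else list(cells)
    f.write(separator.join(str(c) for c in cells))
    f.write('\n')

def value_format(value):
    # Floats (latency) get 3 decimals, ints (memory, batch size) get thousands
    # separators as in the README tables, everything else passes through unchanged.
    if isinstance(value, float):
        return '{:.3f}'.format(value)
    if isinstance(value, int):
        return '{:,}'.format(value)
    return str(value)

def extract_mem_info(mem_file):
    # The .mem file is the captured stdout of gpu_memory_usage.py; its last line is
    # "max device0 memory usage is: <bytes>". Convert that byte count to MB.
    if not os.path.isfile(mem_file):
        return 'NA'
    with open(mem_file) as f:
        for line in f:
            if line.startswith('max device0 memory usage is:'):
                return round(int(line.strip().split(' ')[-1]) / 1024 / 1024)
    return 'NA'
```
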
+        if len(ss) > 3 and ss[1] == 'Iter:' and '[INFO]' in ss[0]:
+            if int(ss[2].strip()) != start_iter:
+                latencies.append(float(ss[5].strip()[:-1]))
+    if loss_print_every_n_iter > 0:
+        result_dict['latency(ms)'] = 1000 * sum(latencies) / len(latencies) / loss_print_every_n_iter
+    mem = extract_mem_info(log_file[:-3] + 'mem')
+    result_dict['memory_usage(MB)'] = mem
+    return result_dict
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="flags for HugeCTR WDL")
+    parser.add_argument("--benchmark_log_dir", type=str, required=True)
+    parser.add_argument("--start_iter", type=int, default=1000)
+    # parser.add_argument("--end_iter", type=int, default=1100)
+    args = parser.parse_args()
+
+    logs_list = sorted(glob.glob(os.path.join(args.benchmark_log_dir, "*.log")), key=os.path.getmtime)
+    #logs_list = sorted(logs_list)
+    chunk_list = {}
+    for log_file in logs_list:
+        test_result = extract_info_from_file(log_file, args.start_iter)
+        print(test_result)
+        json_file = os.path.basename(log_file)[:-4]
+        # print(json_file)
+        test_result['log_file'] = json_file
+        if json_file not in chunk_list.keys():
+            chunk_list[json_file] = []
+        chunk_list[json_file].append(test_result)
+    result_list = []
+    for log_name, chunk in chunk_list.items():
+        latency_list = []
+        for single_result in chunk:
+            latency_list.append(single_result['latency(ms)'])
+        tmp_chunk = chunk[0]
+        tmp_chunk['gpu'] = 'n{}g{}'.format(tmp_chunk['num_nodes'], tmp_chunk['gpu_num_per_node'])
+        tmp_chunk['latency(ms)'] = median(latency_list)
+        result_list.append(tmp_chunk)
+    #with open(os.path.join(args.benchmark_log_dir, 'latency_reprot.md'), 'w') as f:
+    report_file = args.benchmark_log_dir + '_latency_report.md'
+    with open(report_file, 'w') as f:
+        titles = ['log_file', 'gpu', 'batch_size', 'embedding_vec_size', 'latency(ms)', 'memory_usage(MB)']
+        write_line(f, titles, '|', True)
+        write_line(f, ['----' for _ in titles], '|', True)
+        for result in result_list:
+            if 'latency(ms)' not in result.keys():
+                print(result['log_file'], 'is not complete!')
+                continue
+            cells = [value_format(result[title]) for title in titles]
+            write_line(f, cells, '|', True)
diff --git a/HugeCTR/dlrm/gpu_memory_usage.py b/HugeCTR/dlrm/gpu_memory_usage.py
new file mode 100644
index 00000000..3f985b65
--- /dev/null
+++ b/HugeCTR/dlrm/gpu_memory_usage.py
@@ -0,0 +1,25 @@
+import time
+from pynvml import *
+
+nvmlInit()
+handle = nvmlDeviceGetHandleByIndex(0)
+running = True
+
+mem_threshold = 32*1024*1024
+state = 'init' #'Detecting'
+
+device0_max_used_mem = 0
+# Poll device 0 once per second: wait until used memory rises above mem_threshold
+# (training has started), then track the peak until it drops back below (training exited).
+while running == True:
+    time.sleep(1)
+    info = nvmlDeviceGetMemoryInfo(handle)
+    if state == 'init':
+        if info.used > mem_threshold:
+            state = 'Detecting'
+    elif state == 'Detecting':
+        if info.used < mem_threshold:
+            running = False
+        else:
+            device0_max_used_mem = max(device0_max_used_mem, info.used)
+
+nvmlShutdown()
+print('max device0 memory usage is:', device0_max_used_mem)
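
One unit detail when relating gpu_memory_usage.py's output to the memory_usage(MB) columns in the README tables: pynvml's `nvmlDeviceGetMemoryInfo` reports byte counts, so the peak value printed above is presumably divided by 1024*1024 later, during log parsing. A quick check of the raw units (assumes pynvml is installed and a GPU is visible):

```python
from pynvml import (nvmlInit, nvmlShutdown, nvmlDeviceGetHandleByIndex,
                    nvmlDeviceGetMemoryInfo)

nvmlInit()
info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(0))
# total, used and free are byte counts, not MB
print('used bytes:', info.used, '-> MB:', info.used / 1024 / 1024)
nvmlShutdown()
```
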