From 001a525e2eadb913cb2a0603e06d70f52f9abead Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Mon, 7 Feb 2022 14:00:19 +0800 Subject: [PATCH 1/4] add batchsize and embsize testing scripts --- OneFlow/ClickThroughRate/DLRM/bsz_test.sh | 58 +++++++++++++++++++ .../ClickThroughRate/DLRM/emb_size_test.sh | 58 +++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 OneFlow/ClickThroughRate/DLRM/bsz_test.sh create mode 100644 OneFlow/ClickThroughRate/DLRM/emb_size_test.sh diff --git a/OneFlow/ClickThroughRate/DLRM/bsz_test.sh b/OneFlow/ClickThroughRate/DLRM/bsz_test.sh new file mode 100644 index 00000000..4bcc9bf1 --- /dev/null +++ b/OneFlow/ClickThroughRate/DLRM/bsz_test.sh @@ -0,0 +1,58 @@ +rm core.* +MASTER_ADDR=127.0.0.1 +NUM_NODES=1 +NODE_RANK=0 +# DATA_DIR=/dataset/wdl_ofrecord/ofrecord +dataset_format=ofrecord +DATA_DIR=/tank/dataset/criteo_kaggle/dlrm_$dataset_format +EMBD_SIZE=33762577 # 33762578 +emb_size=16 + +# test: 3274330 +# val: 3274328 +# train: 39291958 +eval_batch_size=327432 +eval_batchs=$(( 3274330 / eval_batch_size )) +# export CUDA_VISIBLE_DEVICES=1 +export ONEFLOW_DEBUG_MODE=True + +for DEVICE_NUM_PER_NODE in 1 8 +do + for BATHSIZE in 16 64 256 1024 4096 16384 65536 + do + test_case=BATHSIZE_test_n1g${DEVICE_NUM_PER_NODE} + log_file=${test_case}.log + mem_file=${test_case}.mem + + python gpu_memory_usage.py 1> log/$mem_file 2>&1 log/$mem_file 2>&1 Date: Mon, 7 Feb 2022 17:37:15 +0800 Subject: [PATCH 2/4] update --- OneFlow/ClickThroughRate/DLRM/bsz_test.sh | 2 +- .../ClickThroughRate/DLRM/emb_size_test.sh | 2 +- .../ClickThroughRate/DLRM/gpu_memory_usage.py | 25 ++ OneFlow/ClickThroughRate/DLRM/train.py | 265 ++++++++++++++++++ 4 files changed, 292 insertions(+), 2 deletions(-) create mode 100644 OneFlow/ClickThroughRate/DLRM/gpu_memory_usage.py create mode 100644 OneFlow/ClickThroughRate/DLRM/train.py diff --git a/OneFlow/ClickThroughRate/DLRM/bsz_test.sh b/OneFlow/ClickThroughRate/DLRM/bsz_test.sh index 4bcc9bf1..38e7d336 100644 --- a/OneFlow/ClickThroughRate/DLRM/bsz_test.sh +++ b/OneFlow/ClickThroughRate/DLRM/bsz_test.sh @@ -50,8 +50,8 @@ do --data_part_num 256 \ --data_part_name_suffix_length 5 \ --execution_mode 'graph' \ - # --model_load_dir /tank/model_zoo/dlrm_baseline_params_emb$emb_size \ --test_name 'train_graph_conisitent_'$DEVICE_NUM_PER_NODE'gpu' | tee log/${test_case}.log + # --model_load_dir /tank/model_zoo/dlrm_baseline_params_emb$emb_size \ # --dataset_format torch \ # --model_load_dir /tank/xiexuan/dlrm/initial_parameters \ done diff --git a/OneFlow/ClickThroughRate/DLRM/emb_size_test.sh b/OneFlow/ClickThroughRate/DLRM/emb_size_test.sh index e88353e3..fdcffcce 100644 --- a/OneFlow/ClickThroughRate/DLRM/emb_size_test.sh +++ b/OneFlow/ClickThroughRate/DLRM/emb_size_test.sh @@ -50,8 +50,8 @@ do --data_part_num 256 \ --data_part_name_suffix_length 5 \ --execution_mode 'graph' \ - # --model_load_dir /tank/model_zoo/dlrm_baseline_params_emb$emb_size \ --test_name 'train_graph_conisitent_'$DEVICE_NUM_PER_NODE'gpu' | tee log/${test_case}.log + # --model_load_dir /tank/model_zoo/dlrm_baseline_params_emb$emb_size \ # --dataset_format torch \ # --model_load_dir /tank/xiexuan/dlrm/initial_parameters \ done diff --git a/OneFlow/ClickThroughRate/DLRM/gpu_memory_usage.py b/OneFlow/ClickThroughRate/DLRM/gpu_memory_usage.py new file mode 100644 index 00000000..3b036ebe --- /dev/null +++ b/OneFlow/ClickThroughRate/DLRM/gpu_memory_usage.py @@ -0,0 +1,25 @@ +import time +from pynvml import * + +nvmlInit() +handle = 
nvmlDeviceGetHandleByIndex(0) +running = True + +mem_threshold = 32*1024*1024 +state = 'init' #'Detecting' + +device0_max_used_mem = 0 +while running == True: + time.sleep(1) + info = nvmlDeviceGetMemoryInfo(handle) + if state == 'init': + if info.used > mem_threshold: + state = 'Detecting' + elif state == 'Detecting': + if info.used < mem_threshold: + running = False + else: + device0_max_used_mem = max(device0_max_used_mem, info.used) + +nvmlShutdown() +print('max device0 memory usage is:', device0_max_used_mem) diff --git a/OneFlow/ClickThroughRate/DLRM/train.py b/OneFlow/ClickThroughRate/DLRM/train.py new file mode 100644 index 00000000..81c7bed5 --- /dev/null +++ b/OneFlow/ClickThroughRate/DLRM/train.py @@ -0,0 +1,265 @@ +import oneflow as flow +import os +import sys +import pickle + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) +) +import numpy as np +from sklearn.metrics import roc_auc_score +from config import get_args +from models.data import make_data_loader +from models.dlrm import make_dlrm_module +from lr_scheduler import make_lr_scheduler +from oneflow.nn.parallel import DistributedDataParallel as DDP +from graph import DLRMValGraph, DLRMTrainGraph +import warnings +import utils.logger as log +from utils.auc_calculater import calculate_auc_from_dir + + +class Trainer(object): + def __init__(self): + args = get_args() + self.args = args + self.save_path = args.model_save_dir + self.save_init = args.save_initial_model + self.save_model_after_each_eval = args.save_model_after_each_eval + self.eval_after_training = args.eval_after_training + self.dataset_format = args.dataset_format + self.execution_mode = args.execution_mode + self.max_iter = args.max_iter + self.loss_print_every_n_iter = args.loss_print_every_n_iter + self.ddp = args.ddp + if self.ddp == 1 and self.execution_mode == "graph": + warnings.warn( + """when ddp is True, the execution_mode can only be eager, but it is graph""", + UserWarning, + ) + self.execution_mode = "eager" + self.is_consistent = args.is_consistent + self.rank = flow.env.get_rank() + self.world_size = flow.env.get_world_size() + self.cur_iter = 0 + self.eval_interval = args.eval_interval + self.eval_batchs = args.eval_batchs + self.init_logger() + self.train_dataloader = make_data_loader(args, "train", self.is_consistent, self.dataset_format) + self.val_dataloader = make_data_loader(args, "val", self.is_consistent, self.dataset_format) + self.dlrm_module = make_dlrm_module(args) + if self.is_consistent: + self.dlrm_module.to_consistent(flow.env.all_device_placement("cuda"), flow.sbp.broadcast) + self.dlrm_module.embedding.set_model_parallel(flow.env.all_device_placement("cuda")) + else: + self.dlrm_module.to("cuda") + self.init_model() + # self.opt = flow.optim.Adam( + self.opt = flow.optim.SGD( + self.dlrm_module.parameters(), lr=args.learning_rate + ) + self.lr_scheduler = make_lr_scheduler(args, self.opt) + if args.loss_scale_policy == "static": + self.grad_scaler = flow.amp.StaticGradScaler(1024) + else: + self.grad_scaler = flow.amp.GradScaler( + init_scale=1073741824, + growth_factor=2.0, + backoff_factor=0.5, + growth_interval=2000, + ) + + self.loss = flow.nn.BCELoss(reduction="none").to("cuda") + if self.execution_mode == "graph": + self.eval_graph = DLRMValGraph( + self.dlrm_module, self.val_dataloader, args.use_fp16 + ) + self.train_graph = DLRMTrainGraph( + self.dlrm_module, self.train_dataloader, self.loss, self.opt, + self.lr_scheduler, self.grad_scaler, args.use_fp16 + ) + + def 
init_model(self): + args = self.args + if args.model_load_dir != "": + self.load_state_dict() + if self.ddp: + self.dlrm_module = DDP(self.dlrm_module) + if self.save_init and args.model_save_dir != "": + self.save("initial_checkpoint") + + def init_logger(self): + print_ranks = [0] + self.train_logger = log.make_logger(self.rank, print_ranks) + self.train_logger.register_metric("iter", log.IterationMeter(), "iter: {}/{}") + self.train_logger.register_metric("loss", log.AverageMeter(), "loss: {:.16f}", True) + self.train_logger.register_metric("latency", log.LatencyMeter(), "latency(ms): {:.16f}", True) + + self.val_logger = log.make_logger(self.rank, print_ranks) + self.val_logger.register_metric("iter", log.IterationMeter(), "iter: {}/{}") + self.val_logger.register_metric("auc", log.IterationMeter(), "eval_auc: {}") + + def meter( + self, + loss=None, + do_print=False, + ): + self.train_logger.meter("iter", (self.cur_iter, self.max_iter)) + if loss is not None: + self.train_logger.meter("loss", loss) + self.train_logger.meter("latency") + if do_print: + self.train_logger.print_metrics() + + def meter_train_iter(self, loss): + do_print = ( + self.cur_iter % self.loss_print_every_n_iter == 0 + ) + self.meter( + loss=loss, + do_print=do_print, + ) + + def meter_eval(self, auc): + self.val_logger.meter("iter", (self.cur_iter, self.max_iter)) + if auc is not None: + self.val_logger.meter("auc", auc) + self.val_logger.print_metrics() + + + def load_state_dict(self): + print(f"Loading model from {self.args.model_load_dir}") + if self.is_consistent: + state_dict = flow.load(self.args.model_load_dir, consistent_src_rank=0) + elif self.rank == 0: + state_dict = flow.load(self.args.model_load_dir) + else: + return + self.dlrm_module.load_state_dict(state_dict, strict=False) + + def save(self, subdir): + if self.save_path is None or self.save_path == '': + return + save_path = os.path.join(self.save_path, subdir) + if self.rank == 0: + print(f"Saving model to {save_path}") + state_dict = self.dlrm_module.state_dict() + if self.is_consistent: + flow.save(state_dict, save_path, consistent_dst_rank=0) + elif self.rank == 0: + flow.save(state_dict, save_path) + else: + return + + def __call__(self): + self.train() + + def train(self): + self.dlrm_module.train() + for _ in range(self.max_iter): + self.cur_iter += 1 + loss = self.train_one_step() + + loss = tol(loss) + + self.meter_train_iter(loss) + + if self.eval_interval > 0 and self.cur_iter % self.eval_interval == 0: + self.eval(self.save_model_after_each_eval) + if self.eval_after_training: + self.eval(True) + if self.args.eval_save_dir != '' and self.rank == 0: + calculate_auc_from_dir(self.args.eval_save_dir) + + def eval(self, save_model=False): + if self.eval_batchs <= 0: + return + self.dlrm_module.eval() + labels = [] + preds = [] + for _ in range(self.eval_batchs): + if self.execution_mode == "graph": + pred, label = self.eval_graph() + else: + pred, label = self.inference() + label_ = label.numpy().astype(np.float32) + labels.append(label_) + preds.append(pred.numpy()) + if self.args.eval_save_dir != '': + if self.rank == 0: + pf = os.path.join(self.args.eval_save_dir, f'iter_{self.cur_iter}.pkl') + with open(pf, 'wb') as f: + obj = {'labels': labels, 'preds': preds, 'iter': self.cur_iter} + pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL) + auc = roc_auc_score(label_, pred.numpy()) + # auc = 'nc' + else: + labels = np.concatenate(labels, axis=0) + preds = np.concatenate(preds, axis=0) + auc = roc_auc_score(labels, preds) + 
self.meter_eval(auc) + if save_model: + sub_save_dir = f"iter_{self.cur_iter}_val_auc_{auc}" + self.save(sub_save_dir) + self.dlrm_module.train() + + def inference(self): + ( + labels, + dense_fields, + sparse_fields, + ) = self.val_dataloader() + labels = labels.to("cuda") + dense_fields = dense_fields.to("cuda") + sparse_fields = sparse_fields.to("cuda") + with flow.no_grad(): + predicts = self.dlrm_module( + dense_fields, sparse_fields + ) + return predicts, labels + + def forward(self): + ( + labels, + dense_fields, + sparse_fields, + ) = self.train_dataloader() + labels = labels.to("cuda") + dense_fields = dense_fields.to("cuda") + sparse_fields = sparse_fields.to("cuda") + predicts = self.dlrm_module(dense_fields, sparse_fields) + loss = self.loss(predicts, labels) + reduce_loss = flow.mean(loss) + return reduce_loss + + def train_eager(self): + loss = self.forward() + loss.backward() + self.opt.step() + self.opt.zero_grad() + return loss + + def train_one_step(self): + self.dlrm_module.train() + if self.execution_mode == "graph": + train_loss = self.train_graph() + else: + train_loss = self.train_eager() + return train_loss + + +def tol(tensor, pure_local=True): + """ to local """ + if tensor.is_consistent: + if pure_local: + tensor = tensor.to_local() + else: + tensor = tensor.to_consistent(sbp=flow.sbp.broadcast).to_local() + + return tensor + + +if __name__ == "__main__": + flow.boxing.nccl.enable_all_to_all(True) + trainer = Trainer() + trainer() From ca6d64d470233bf19e27222cc4b229e89220a610 Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Tue, 8 Feb 2022 14:19:35 +0800 Subject: [PATCH 3/4] modify test scripts --- OneFlow/ClickThroughRate/DLRM/bsz_test.sh | 52 +--- OneFlow/ClickThroughRate/DLRM/dlrm_test.sh | 57 ++++ .../ClickThroughRate/DLRM/emb_size_test.sh | 56 +--- OneFlow/ClickThroughRate/DLRM/train.py | 265 ------------------ 4 files changed, 63 insertions(+), 367 deletions(-) create mode 100644 OneFlow/ClickThroughRate/DLRM/dlrm_test.sh delete mode 100644 OneFlow/ClickThroughRate/DLRM/train.py diff --git a/OneFlow/ClickThroughRate/DLRM/bsz_test.sh b/OneFlow/ClickThroughRate/DLRM/bsz_test.sh index 38e7d336..139ce7ab 100644 --- a/OneFlow/ClickThroughRate/DLRM/bsz_test.sh +++ b/OneFlow/ClickThroughRate/DLRM/bsz_test.sh @@ -1,58 +1,10 @@ -rm core.* -MASTER_ADDR=127.0.0.1 -NUM_NODES=1 -NODE_RANK=0 -# DATA_DIR=/dataset/wdl_ofrecord/ofrecord -dataset_format=ofrecord -DATA_DIR=/tank/dataset/criteo_kaggle/dlrm_$dataset_format -EMBD_SIZE=33762577 # 33762578 +test_name=bsz_test emb_size=16 -# test: 3274330 -# val: 3274328 -# train: 39291958 -eval_batch_size=327432 -eval_batchs=$(( 3274330 / eval_batch_size )) -# export CUDA_VISIBLE_DEVICES=1 -export ONEFLOW_DEBUG_MODE=True - for DEVICE_NUM_PER_NODE in 1 8 do for BATHSIZE in 16 64 256 1024 4096 16384 65536 do - test_case=BATHSIZE_test_n1g${DEVICE_NUM_PER_NODE} - log_file=${test_case}.log - mem_file=${test_case}.mem - - python gpu_memory_usage.py 1> log/$mem_file 2>&1 log/$mem_file 2>&1 log/$mem_file 2>&1 0 and self.cur_iter % self.eval_interval == 0: - self.eval(self.save_model_after_each_eval) - if self.eval_after_training: - self.eval(True) - if self.args.eval_save_dir != '' and self.rank == 0: - calculate_auc_from_dir(self.args.eval_save_dir) - - def eval(self, save_model=False): - if self.eval_batchs <= 0: - return - self.dlrm_module.eval() - labels = [] - preds = [] - for _ in range(self.eval_batchs): - if self.execution_mode == "graph": - pred, label = self.eval_graph() - else: - pred, label = self.inference() 
- label_ = label.numpy().astype(np.float32) - labels.append(label_) - preds.append(pred.numpy()) - if self.args.eval_save_dir != '': - if self.rank == 0: - pf = os.path.join(self.args.eval_save_dir, f'iter_{self.cur_iter}.pkl') - with open(pf, 'wb') as f: - obj = {'labels': labels, 'preds': preds, 'iter': self.cur_iter} - pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL) - auc = roc_auc_score(label_, pred.numpy()) - # auc = 'nc' - else: - labels = np.concatenate(labels, axis=0) - preds = np.concatenate(preds, axis=0) - auc = roc_auc_score(labels, preds) - self.meter_eval(auc) - if save_model: - sub_save_dir = f"iter_{self.cur_iter}_val_auc_{auc}" - self.save(sub_save_dir) - self.dlrm_module.train() - - def inference(self): - ( - labels, - dense_fields, - sparse_fields, - ) = self.val_dataloader() - labels = labels.to("cuda") - dense_fields = dense_fields.to("cuda") - sparse_fields = sparse_fields.to("cuda") - with flow.no_grad(): - predicts = self.dlrm_module( - dense_fields, sparse_fields - ) - return predicts, labels - - def forward(self): - ( - labels, - dense_fields, - sparse_fields, - ) = self.train_dataloader() - labels = labels.to("cuda") - dense_fields = dense_fields.to("cuda") - sparse_fields = sparse_fields.to("cuda") - predicts = self.dlrm_module(dense_fields, sparse_fields) - loss = self.loss(predicts, labels) - reduce_loss = flow.mean(loss) - return reduce_loss - - def train_eager(self): - loss = self.forward() - loss.backward() - self.opt.step() - self.opt.zero_grad() - return loss - - def train_one_step(self): - self.dlrm_module.train() - if self.execution_mode == "graph": - train_loss = self.train_graph() - else: - train_loss = self.train_eager() - return train_loss - - -def tol(tensor, pure_local=True): - """ to local """ - if tensor.is_consistent: - if pure_local: - tensor = tensor.to_local() - else: - tensor = tensor.to_consistent(sbp=flow.sbp.broadcast).to_local() - - return tensor - - -if __name__ == "__main__": - flow.boxing.nccl.enable_all_to_all(True) - trainer = Trainer() - trainer() From 17d4ef01f89cfb72a6193e1df728897f46edff0f Mon Sep 17 00:00:00 2001 From: ccddyy416 <1482342831@qq.com> Date: Fri, 11 Feb 2022 14:20:13 +0800 Subject: [PATCH 4/4] extract log info --- .../DLRM/extract_info_from_log.py | 119 ++++++++++++++++++ .../DLRM/extract_info_from_log.sh | 1 + 2 files changed, 120 insertions(+) create mode 100644 OneFlow/ClickThroughRate/DLRM/extract_info_from_log.py create mode 100644 OneFlow/ClickThroughRate/DLRM/extract_info_from_log.sh diff --git a/OneFlow/ClickThroughRate/DLRM/extract_info_from_log.py b/OneFlow/ClickThroughRate/DLRM/extract_info_from_log.py new file mode 100644 index 00000000..5dbeeb16 --- /dev/null +++ b/OneFlow/ClickThroughRate/DLRM/extract_info_from_log.py @@ -0,0 +1,119 @@ +import argparse +import os +import glob +from statistics import median + + + + +def write_line(f, lst, separator=',', start_end=False): + lst = ['', *lst, ''] if start_end else lst + f.write(separator.join(lst)) + f.write('\n') + + +def value_format(value): + if isinstance(value, float): + return '{:.3f}'.format(value) + elif isinstance(value, int): + return f'{value:,}' + else: + return str(value) + + +def extract_mem_info(mem_file): + if not os.path.isfile(mem_file): + return 'NA' + + with open(mem_file, 'r') as f: + for line in f.readlines(): + ss = line.split(' ') + if len(ss) < 5: + continue + if ss[0] == 'max': + return int(float(ss[-1].strip()) / 1024 /1024) + return 'NA' + +def extract_info_from_file_for_models(log_file): + ''' +[rank:0] iter: 100/1200, 
loss: 0.0831875279545784, latency(ms): 81.5818255022168159 | 2021-12-01 13:19:02.625 +[rank:0] iter: 200/1200, loss: 0.0780148208141327, latency(ms): 2.2327776625752449 | 2021-12-01 13:19:02.848 +... +[rank:0] iter: 1200/1200, loss: 0.0711858719587326, latency(ms): 2.3108293302357197 | 2021-12-01 13:19:05.145 + ''' + # extract info from file name + result_dict = {} + with open(log_file, 'r') as f: + latencies = [] + for line in f.readlines(): + ss = line.strip().split(' ') + if ss[0] in ['num_nodes', 'batch_size', 'batch_size_per_proc', 'vocab_size','embedding_vec_size']: + result_dict[ss[0]] = ss[2].strip() + elif len(ss) > 6 and ss[1] == 'iter:' and ss[3] == 'loss:': + latencies.append(float(ss[6].strip())) + + result_dict['gpu_num_per_node'] = int(int(result_dict['batch_size']) / int(result_dict['batch_size_per_proc'])) + result_dict['num_nodes'] = 1 + + if len(latencies) > 2: + latencies.pop(0) + latencies.pop(-1) + + if len(latencies) > 0: + result_dict['latency(ms)'] = sum(latencies) / len(latencies) + else: + result_dict['latency(ms)'] = 'NA' + + mem = extract_mem_info(log_file[:-3] + 'mem') + result_dict['memory_usage(MB)'] = mem + return result_dict + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="flags for OneFlow wide&deep") + parser.add_argument("--benchmark_log_dir", type=str, required=True) + parser.add_argument("--repo", type=str, default='benchmark', help='benchmark or models') + args = parser.parse_args() + + logs_list = sorted(glob.glob(os.path.join(args.benchmark_log_dir, "*.log")), key=os.path.getmtime) + #logs_list = sorted(logs_list) + chunk_list = {} + for log_file in logs_list: + if args.repo == 'benchmark': + test_result = extract_info_from_file(log_file) + else: + test_result = extract_info_from_file_for_models(log_file) + + print(test_result) + json_file = os.path.basename(log_file)[:-4] + # json_file = os.path.basename(log_file)[:-13] + print(json_file) + test_result['log_file'] = json_file + if json_file not in chunk_list.keys(): + chunk_list[json_file] = [] + chunk_list[json_file].append(test_result) + result_list = [] + for log_name, chunk in chunk_list.items(): + latency_list = [] + for single_result in chunk: + if 'latency(ms)' in single_result: + latency_list.append(single_result['latency(ms)']) + tmp_chunk = chunk[0] + tmp_chunk['gpu'] = 'n{}g{}'.format(tmp_chunk['num_nodes'], tmp_chunk['gpu_num_per_node']) + if len(latency_list): + tmp_chunk['latency(ms)'] = median(latency_list) + result_list.append(tmp_chunk) + else: + print('latency is not calculated in ', log_name) + #with open(os.path.join(args.benchmark_log_dir, 'latency_reprot.md'), 'w') as f: + report_file = args.benchmark_log_dir + '_latency_report.md' + with open(report_file, 'w') as f: + titles = ['log_file', 'gpu', 'batch_size', 'vocab_size','embedding_vec_size', 'latency(ms)', 'memory_usage(MB)'] + write_line(f, titles, '|', True) + write_line(f, ['----' for _ in titles], '|', True) + for result in result_list: + if 'latency(ms)' not in result.keys(): + print(result['log_file'], 'is not complete!') + continue + cells = [value_format(result[title]) for title in titles] + write_line(f, cells, '|', True) diff --git a/OneFlow/ClickThroughRate/DLRM/extract_info_from_log.sh b/OneFlow/ClickThroughRate/DLRM/extract_info_from_log.sh new file mode 100644 index 00000000..09a2ae18 --- /dev/null +++ b/OneFlow/ClickThroughRate/DLRM/extract_info_from_log.sh @@ -0,0 +1 @@ +python extract_info_from_log.py --benchmark_log_dir ./log --repo models
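Note: the latency figures in the generated report are parsed from training-log lines like the ones quoted in the extract_info_from_file_for_models() docstring above. Below is a minimal standalone sketch of that parsing rule, for illustration only; it is not part of any patch, reuses only the sample lines from the docstring, and assumes the same space-separated field layout:

# Sketch of the latency-parsing rule in extract_info_from_log.py:
# split each "[rank:0] iter: ..." line on spaces, read field 6 as latency(ms),
# drop the first (warm-up) and last samples, then average the rest.
sample_lines = [
    "[rank:0] iter: 100/1200, loss: 0.0831875279545784, latency(ms): 81.5818255022168159 | 2021-12-01 13:19:02.625",
    "[rank:0] iter: 200/1200, loss: 0.0780148208141327, latency(ms): 2.2327776625752449 | 2021-12-01 13:19:02.848",
    "[rank:0] iter: 1200/1200, loss: 0.0711858719587326, latency(ms): 2.3108293302357197 | 2021-12-01 13:19:05.145",
]

latencies = []
for line in sample_lines:
    ss = line.strip().split(' ')
    if len(ss) > 6 and ss[1] == 'iter:' and ss[3] == 'loss:':
        latencies.append(float(ss[6]))

if len(latencies) > 2:          # same trimming as the script
    latencies.pop(0)
    latencies.pop(-1)

print(sum(latencies) / len(latencies) if latencies else 'NA')   # -> 2.2327776625752449

Running extract_info_from_log.sh after a sweep applies this rule to every *.log file under ./log, pairs each result with the peak GPU memory recorded in the matching *.mem file, and writes the summary table to <benchmark_log_dir>_latency_report.md.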