Skip to content
4 changes: 4 additions & 0 deletions LanguageModeling/BERT/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ def get_parser(parser=None):
help='use use fp16 or not')
parser.add_argument('--use_xla', type=str2bool, nargs='?', const=True,
help='Whether to use use xla')
parser.add_argument("--num_accumulation_steps", type=int, default=1,
help='Number of accumulation steps before gradient update, Global batch size = num_accumulation_steps * train_batch_size')
parser.add_argument("--optimizer_type", type=str, default="adam",
help="Optimizer used for training - LAMB or ADAM")

# log and resore/save
parser.add_argument("--loss_print_every_n_iter", type=int, default=10, required=False,
Expand Down
28 changes: 25 additions & 3 deletions LanguageModeling/BERT/run_pretraining.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,35 @@
parser.add_argument("--data_part_num", type=int, default=32, help="data part number in dataset")
parser.add_argument("--iter_num", type=int, default=1144000, help="total iterations to run")
parser.add_argument("--batch_size_per_device", type=int, default=64)
parser.add_argument("--debug", type=int, default=0)
parser.add_argument("--data_load_random", type=int, default=1)
parser.add_argument("--model_load", type=str, default=None)


args = parser.parse_args()
configs.print_args(args)


# Optional OneFlow debug mode (extra runtime diagnostics).
if args.debug == 1:
    flow.config.enable_debug_mode(True)
    print('Enable Debug !!!!!!!')

# Toggle shuffling for the OFRecord reader; `random_tmp` is consumed by
# BertDecoder below (both random_shuffle and shuffle_after_epoch).
if args.data_load_random == 1:
    random_tmp=True
    print('Enable random loading of data !!!!!!!')
else:
    random_tmp=False
    print('Disable random loading of data !!!!!!!')

# Global per-step batch size across all nodes and devices
# (gradient accumulation is NOT folded in here; see Metric setup in main).
batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device


def BertDecoder(data_dir, batch_size, data_part_num, seq_length, max_predictions_per_seq):
ofrecord = flow.data.ofrecord_reader(data_dir,
batch_size=batch_size,
data_part_num=data_part_num,
random_shuffle = True,
shuffle_after_epoch=True)
random_shuffle = random_tmp,
shuffle_after_epoch=random_tmp)
blob_confs = {}
def _blob_conf(name, shape, dtype=flow.int32):
blob_confs[name] = flow.data.OFRecordRawDecoder(ofrecord, name, shape=shape, dtype=dtype)
Expand Down Expand Up @@ -104,8 +121,13 @@ def main():

snapshot = Snapshot(args.model_save_dir, args.model_load_dir)


if args.model_load != None:
flow.load_variables(flow.checkpoint.get(args.model_load))

print('num_accumulation_steps:', args.num_accumulation_steps)
metric = Metric(desc='train', print_steps=args.loss_print_every_n_iter,
batch_size=batch_size, keys=['total_loss', 'mlm_loss', 'nsp_loss'])
batch_size=batch_size * args.num_accumulation_steps, keys=['total_loss', 'mlm_loss', 'nsp_loss'])
for step in range(args.iter_num):
PretrainJob().async_get(metric.metric_cb(step))
#PretrainJob().async_get(metric.metric_cb(step, epoch=3))
Expand Down
46 changes: 46 additions & 0 deletions LanguageModeling/BERT/run_pretraining_adam.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Launch single-GPU BERT pretraining with the (default) Adam optimizer.
BENCH_ROOT_DIR=/path/to/OneFlow-Benchmark/LanguageModeling/BERT
OUTPUT_DIR=/DATA/disk1/of_output

DATA_DIR=/DATA/disk1/bert/wiki_seq_len_128

BZ=48                        # batch size per device
ITER_NUM=1000000             # total training iterations
max_seq_length=128
max_predictions_per_seq=20

# Fresh log directory for this run.
of_log_dir=$OUTPUT_DIR/bert_master/of
rm -rf ${of_log_dir}
mkdir -p ${of_log_dir}
rm -rf core.*

export PYTHONUNBUFFERED=1
export ONEFLOW_DEBUG_MODE=True
export GLOG_v=3
export CUDA_VISIBLE_DEVICES=6
# NOTE: --seq_length and --max_predictions_per_seq now reference the
# variables defined above; previously the values were hard-coded below,
# leaving those variables unused (and silently ignored when edited).
python3 $BENCH_ROOT_DIR/run_pretraining.py \
  --gpu_num_per_node=1 \
  --num_nodes=1 \
  --learning_rate=1.25e-5 \
  --warmup_proportion=0.01 \
  --weight_decay_rate=0.01 \
  --batch_size_per_device=${BZ} \
  --iter_num=${ITER_NUM} \
  --loss_print_every_n_iter=1 \
  --seq_length=${max_seq_length} \
  --use_fp16 \
  --max_predictions_per_seq=${max_predictions_per_seq} \
  --num_hidden_layers=12 \
  --num_attention_heads=12 \
  --num_accumulation_steps=1 \
  --max_position_embeddings=512 \
  --type_vocab_size=2 \
  --vocab_size=30522 \
  --attention_probs_dropout_prob=0.1 \
  --hidden_dropout_prob=0.1 \
  --hidden_size_per_head=64 \
  --data_part_num=64 \
  --data_dir=$DATA_DIR \
  --log_dir=${of_log_dir} \
  --model_save_every_n_iter=50000 \
  --model_save_dir=${of_log_dir}
47 changes: 47 additions & 0 deletions LanguageModeling/BERT/run_pretraining_lamb.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Launch 8-GPU BERT pretraining with the LAMB optimizer and large
# gradient accumulation (global batch = 8 * BZ * num_accumulation_steps).
BENCH_ROOT_DIR=/path/to/OneFlow-Benchmark/LanguageModeling/BERT
OUTPUT_DIR=/DATA/disk1/of_output

DATA_DIR=/DATA/disk1/bert/wiki_seq_len_128

BZ=16                        # batch size per device
ITER_NUM=1000000             # total training iterations
max_seq_length=128
max_predictions_per_seq=20

# Fresh log directory for this run.
of_log_dir=$OUTPUT_DIR/bert_master/of
rm -rf ${of_log_dir}
mkdir -p ${of_log_dir}
rm -rf core.*

export PYTHONUNBUFFERED=1
export ONEFLOW_DEBUG_MODE=True
export GLOG_v=3

# NOTE: --seq_length and --max_predictions_per_seq now reference the
# variables defined above; previously the values were hard-coded below,
# leaving those variables unused (and silently ignored when edited).
python3 $BENCH_ROOT_DIR/run_pretraining.py \
  --gpu_num_per_node=8 \
  --num_nodes=1 \
  --learning_rate=1e-4 \
  --warmup_proportion=0.01 \
  --weight_decay_rate=0.01 \
  --batch_size_per_device=${BZ} \
  --iter_num=${ITER_NUM} \
  --loss_print_every_n_iter=1 \
  --seq_length=${max_seq_length} \
  --use_fp16 \
  --optimizer_type="lamb" \
  --max_predictions_per_seq=${max_predictions_per_seq} \
  --num_hidden_layers=12 \
  --num_attention_heads=12 \
  --num_accumulation_steps=512 \
  --max_position_embeddings=512 \
  --type_vocab_size=2 \
  --vocab_size=30522 \
  --attention_probs_dropout_prob=0.1 \
  --hidden_dropout_prob=0.1 \
  --hidden_size_per_head=64 \
  --data_part_num=64 \
  --data_dir=$DATA_DIR \
  --log_dir=${of_log_dir} \
  --model_save_every_n_iter=50000 \
  --model_save_dir=${of_log_dir}
74 changes: 74 additions & 0 deletions LanguageModeling/BERT/run_squad.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Finetune and evaluate BERT on SQuAD (v1.1 or v2.0).
BENCH_ROOT_DIR=/home/oyy/workspace/OneFlow-Benchmark/LanguageModeling/BERT
# pretrained model dir
PRETRAINED_MODEL=/DATA/disk1/of_output/uncased_L-12_H-768_A-12_oneflow

# squad ofrecord dataset dir
DATA_ROOT=/DATA/disk1/of_output/bert/of_squad

# `vocab.txt` dir
REF_ROOT_DIR=/DATA/disk1/of_output/uncased_L-12_H-768_A-12

# `evaluate-v*.py` and `dev-v*.json` dir
SQUAD_TOOL_DIR=/DATA/disk1/of_output/bert/of_squad

# Dataset version is the first CLI argument; defaults to v2.0.
# $db_version is quoted so an empty/odd value cannot break the [ ] tests.
db_version=${1:-"v2.0"}
if [ "$db_version" = "v1.1" ]; then
  train_example_num=88614
  eval_example_num=10833
  version_2_with_negative="False"
elif [ "$db_version" = "v2.0" ]; then
  train_example_num=131944
  eval_example_num=12232
  version_2_with_negative="True"
else
  echo "db_version must be 'v1.1' or 'v2.0'"
  exit 1  # non-zero status so callers can detect the bad argument
fi

train_data_dir=$DATA_ROOT/train-$db_version
eval_data_dir=$DATA_ROOT/dev-$db_version
LOGFILE=./bert_fp_training.log
export PYTHONUNBUFFERED=1
export ONEFLOW_DEBUG_MODE=True
export CUDA_VISIBLE_DEVICES=7
# finetune and eval SQuAD,
# `predictions.json` will be saved to folder `./squad_output`
python3 $BENCH_ROOT_DIR/run_squad.py \
  --model=SQuAD \
  --do_train=True \
  --do_eval=True \
  --gpu_num_per_node=1 \
  --learning_rate=3e-5 \
  --batch_size_per_device=16 \
  --eval_batch_size_per_device=16 \
  --num_epoch=3 \
  --use_fp16 \
  --version_2_with_negative=$version_2_with_negative \
  --loss_print_every_n_iter=20 \
  --do_lower_case=True \
  --seq_length=384 \
  --num_hidden_layers=12 \
  --num_attention_heads=12 \
  --max_position_embeddings=512 \
  --type_vocab_size=2 \
  --vocab_size=30522 \
  --attention_probs_dropout_prob=0.1 \
  --hidden_dropout_prob=0.1 \
  --hidden_size_per_head=64 \
  --train_data_dir=$train_data_dir \
  --train_example_num=$train_example_num \
  --eval_data_dir=$eval_data_dir \
  --eval_example_num=$eval_example_num \
  --log_dir=./log \
  --model_load_dir=${PRETRAINED_MODEL} \
  --save_last_snapshot=True \
  --model_save_dir=./squad_snapshots \
  --vocab_file=$REF_ROOT_DIR/vocab.txt \
  --predict_file=$SQUAD_TOOL_DIR/dev-${db_version}.json \
  --output_dir=./squad_output 2>&1 | tee ${LOGFILE}


# evaluate predictions.json to get metrics
python3 $SQUAD_TOOL_DIR/evaluate-${db_version}.py \
  $SQUAD_TOOL_DIR/dev-${db_version}.json \
  ./squad_output/predictions.json
12 changes: 12 additions & 0 deletions LanguageModeling/BERT/tools/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
## Oneflow BERT automatic test tools
Automatically execute the BERT model with different parameters and analyze the results.
### Dependent environment
- matplotlib
```
pip install matplotlib
```

### Features
- `analysis.py`: Analyze the log file to extract the total loss, MLM loss, NSP loss, throughput, and GPU memory usage
- `result_analysis.py`: Analyze the running results of the two versions and output reports
- `stitching_pic.py`: Stitch multiple pictures together
63 changes: 63 additions & 0 deletions LanguageModeling/BERT/tools/analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import argparse
import re
import json

from ctypes import *



def collect_loss(log_file, gpu_num):
    """Parse a BERT pretraining log and collect per-step metrics.

    Args:
        log_file: path to the training log produced by run_pretraining.py.
        gpu_num: number of GPU memory readings expected per nvidia-smi dump.

    Returns:
        Tuple of five lists: (total_loss, mlm_loss, nsp_loss, throughput,
        memory). Values are kept as strings exactly as they appear in the
        log, so the JSON written by main() keeps its original format.
    """
    print("loss : ", log_file)

    # `(\d+(?:\.\d+)?)` also accepts single-digit / integer values such as
    # "7"; the previous pattern `\d+\.?\d+` required at least two digits.
    pattern = re.compile(
        r"step:\s*(\d+)\s*,\s*total_loss:\s*(\d+(?:\.\d+)?)\s*,"
        r"\s*mlm_loss:\s*(\d+(?:\.\d+)?)\s*,\s*nsp_loss:\s*(\d+(?:\.\d+)?)\s*,"
        r"\s*throughput:\s*(\d+(?:\.\d+)?)\s*"
    )

    total_loss = []
    mlm_loss = []
    nsp_loss = []
    throughput = []
    memory = []

    # `with` guarantees the log file is closed; the original opened the
    # file and never closed it.
    with open(log_file, "r") as f:
        for line in f:
            if line.split(':')[0] == 'step':
                match = pattern.match(line)
                if match:
                    total_loss.append(match.group(2))
                    mlm_loss.append(match.group(3))
                    nsp_loss.append(match.group(4))
                    throughput.append(match.group(5))
            # nvidia-smi output captured as a bytes repr in the log, e.g.
            # "b'memory.used [MiB]\n1234 MiB\n...'"; pull one reading per GPU.
            if line.split(' [MiB]\\n')[0] == 'b\'memory.used':
                str_tmp = line.split(' [MiB]\\n')[1]
                for i in range(gpu_num):
                    memory.append(str_tmp.split(' MiB\\n')[i])

    return total_loss, mlm_loss, nsp_loss, throughput, memory


def main():
    """CLI entry point: parse a training log with collect_loss() and dump
    the collected metric series to --out_file as a JSON document."""
    parser = argparse.ArgumentParser(description="collect GPU device memory usage")
    parser.add_argument("--log_file", type=str, default=None)
    parser.add_argument("--mem_file", type=str, default=None)
    parser.add_argument("--out_file", type=str, default=None)
    parser.add_argument("--gpu_num", type=int, default=1)
    args = parser.parse_args()

    # collect_loss returns the series in this fixed order; pair them with
    # their JSON keys instead of assigning one by one.
    keys = ('total_loss', 'mlm_loss', 'nsp_loss', 'throughput', 'memory')
    values = collect_loss(args.log_file, args.gpu_num)
    out = dict(zip(keys, values))

    with open(args.out_file, 'w') as f:
        f.write(json.dumps(out))


if __name__ == "__main__":
    main()
63 changes: 63 additions & 0 deletions LanguageModeling/BERT/tools/gpu_memory_usage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import time
import argparse
import pynvml


class Device(object):
    """Tracks the peak memory usage of one GPU via a small state machine.

    States: INIT -> DETECTING once usage rises above the threshold, then
    DETECTING -> STOP once usage falls back below it.
    """

    class Status:
        INIT = "INIT"
        DETECTING = "DETECTING"
        STOP = "STOP"

    # A device is considered "in use" once it holds more than 32 MiB.
    start_detecting_mem_threshold = 32 * 1024 * 1024

    def __init__(self, handle):
        self.handle = handle
        self.status = self.Status.INIT
        self.max_mem_usage = 0

    def update(self):
        """Poll current memory usage and advance the state machine.

        Returns False exactly once, on the DETECTING -> STOP transition;
        True otherwise. Raises ValueError if called after STOP.
        """
        used = pynvml.nvmlDeviceGetMemoryInfo(self.handle).used
        threshold = self.start_detecting_mem_threshold

        if self.status == self.Status.STOP:
            raise ValueError("detecting is stop")

        if self.status == self.Status.INIT:
            # Waiting for the workload to start allocating memory.
            if used > threshold:
                self.status = self.Status.DETECTING
            return True

        if self.status == self.Status.DETECTING:
            if used < threshold:
                # Workload released its memory: detection is finished.
                self.status = self.Status.STOP
                return False
            self.max_mem_usage = max(self.max_mem_usage, used)
            return True

        raise ValueError("invalid status")


def main():
    """Poll every GPU until each has finished its start/stop memory cycle,
    then print one peak-usage figure (MiB) per device.

    Polls at a fixed interval (-n seconds) across -g devices.
    """
    parser = argparse.ArgumentParser(description="collect GPU device memory usage")
    parser.add_argument("-g", type=int, default=1, help="number of gpu devices")
    parser.add_argument("-n", type=float, default=1, help="metrics rate")
    args = parser.parse_args()

    pynvml.nvmlInit()
    n_gpus = args.g
    devices = [Device(pynvml.nvmlDeviceGetHandleByIndex(i)) for i in range(n_gpus)]

    # Poll only devices that are still detecting. The original looped over
    # every device each pass, so once one device reached STOP its next
    # update() raised ValueError while other devices were still running.
    active = list(devices)
    while active:
        time.sleep(args.n)
        active = [device for device in active if device.update()]

    pynvml.nvmlShutdown()
    for device in devices:
        max_mem_usage_mbytes = device.max_mem_usage / 1024 / 1024
        print(f"{max_mem_usage_mbytes:.2f}")


if __name__ == "__main__":
    main()

Loading