Commit a0ccdcc

Refactor bert generation to optimize RAM usage
1 parent 859f0e1 commit a0ccdcc

File tree

1 file changed: +11 -5 lines changed


src/team_comm_tools/utils/check_embeddings.py

Lines changed: 11 additions & 5 deletions
@@ -402,16 +402,21 @@ def generate_bert(chat_data, output_path, message_col, batch_size=64):
     print(f"Generating RoBERTa sentiments...")
 
     messages = chat_data[message_col].tolist()
-    batch_sentiments_df = pd.DataFrame()
+    # batch_sentiments_df = pd.DataFrame()
+    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
 
+    first = True
     for i in tqdm(range(0, len(messages), batch_size)):
         batch = messages[i:i + batch_size]
         batch_df = get_sentiment(batch)
-        batch_sentiments_df = pd.concat([batch_sentiments_df, batch_df], ignore_index=True)
+        batch_df.to_csv(output_path, mode='a', header=first, index=False)
+        first = False
+        # batch_sentiments_df = pd.concat([batch_sentiments_df, batch_df], ignore_index=True)
 
+    # batch_sentiments_df = pd.concat(batch_sentiments_lst, ignore_index=True)
     # Create directories along the path if they don't exist
-    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
-    batch_sentiments_df.to_csv(output_path, index=False)
+
+    # batch_sentiments_df.to_csv(output_path, index=False)
 
 def get_sentiment(texts):
     """
@@ -432,7 +437,8 @@ def get_sentiment(texts):
         return pd.DataFrame(np.nan, index=texts_series.index, columns=['positive_bert', 'negative_bert', 'neutral_bert'])
 
     encoded = tokenizer(non_null_non_empty_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
-    output = model_bert(**encoded)
+    with torch.no_grad():
+        output = model_bert(**encoded)
 
     scores = output[0].detach().numpy()
     scores = softmax(scores, axis=1)
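
The second change wraps the forward pass in torch.no_grad(), which stops PyTorch from building the autograd graph during inference; activations that would only be kept for backpropagation are never stored, which lowers memory use without changing the scores. A self-contained sketch of the pattern follows; the checkpoint name is an assumption used for illustration, not necessarily the model that check_embeddings.py loads.

# Sketch of inference under torch.no_grad() with a generic Hugging Face
# sequence-classification sentiment model (checkpoint name is an assumption).
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_bert = AutoModelForSequenceClassification.from_pretrained(model_name)

texts = ["great work, team!", "this is a disaster", "the meeting is at noon"]
encoded = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')

# no_grad() skips building the autograd graph, so activations needed only for
# backpropagation are never kept; identical outputs, lower memory at inference.
with torch.no_grad():
    output = model_bert(**encoded)

scores = softmax(output[0].numpy(), axis=1)  # one probability row per text
print(scores)

Because tensors produced under no_grad() do not require gradients, .numpy() can be called on the logits directly; the .detach() in the original code still works, it is just no longer strictly necessary.
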
