From c12172cb843c151c016252686832b5920b54b387 Mon Sep 17 00:00:00 2001
From: 41ow1ives <77601251@daum.net>
Date: Mon, 20 May 2024 13:46:51 +0000
Subject: [PATCH] FIX: Correct handling of unused 'subset' parameter in fasttext_filter (#54)

---
 dataverse/etl/quality/language.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dataverse/etl/quality/language.py b/dataverse/etl/quality/language.py
index d125204..29056df 100644
--- a/dataverse/etl/quality/language.py
+++ b/dataverse/etl/quality/language.py
@@ -59,8 +59,8 @@ def load_fasttext(
     return _FastText(model_path=str(fasttext_path))
 
 
-def language_predict_fasttext(row, model, top_k: int = 1, score_rounding: int = 2):
-    text = row["text"].replace("\n", "")
+def language_predict_fasttext(row, subset, model, top_k: int = 1, score_rounding: int = 2):
+    text = row[subset].replace("\n", "")
     labels, scores = model.predict(text, k=top_k)
     labels = [label.replace("__label__", "") for label in labels]
 
@@ -135,6 +135,7 @@ def quality___language___fasttext_filter(
     data = data.mapPartitions(
         functools.partial(
             language_predict_fasttext_by_partition,
+            subset=subset,
             top_k=top_k,
             score_rounding=score_rounding,
         )