caisa-lab · shahzeb171 · May 27, 2024
diff --git a/data/gsm/mistral/mistral_gsm_response.csv b/data/gsm/mistral/mistral_gsm_response.csv
diff --git a/script/gsm/flan_gsm.py b/script/gsm/flan_gsm.py
@@ -11,7 +11,7 @@
 from jsonformer import Jsonformer
 
 from config import access_token, DIR_PATH
-from utils import get_questions_and_answer_from_dataset
+from utils import get_noisy_questions_and_answer_from_dataset
 
 tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
 model = T5ForConditionalGeneration.from_pretrained(
@@ -26,8 +26,8 @@
     },
 }
 
-csv_file = f"{DIR_PATH}/data/gsm/train_preprocessed.csv"
-questions, ground_truths = get_questions_and_answer_from_dataset(csv_file)
+csv_file = f"{DIR_PATH}/data/noisy_datasets/gsm8k_noisy_punct_10.csv"
+questions, ground_truths = get_noisy_questions_and_answer_from_dataset(csv_file)
 
 
 output_file = (

diff --git a/script/gsm/mistral_gsm.py b/script/gsm/mistral_gsm.py
@@ -16,7 +16,7 @@
 from jsonformer import Jsonformer
 
 from config import access_token, DIR_PATH
-from utils import get_questions_and_answer_from_dataset
+from utils import get_noisy_questions_and_answer_from_dataset
 
 access_token = access_token
 model_name = "mistralai/Mistral-7B-v0.1"
@@ -46,8 +46,8 @@
     },
 }
 
-csv_file = f"{DIR_PATH}/data/gsm/train_preprocessed.csv"
-questions, ground_truths = get_questions_and_answer_from_dataset(csv_file)
+csv_file = f"{DIR_PATH}/data/noisy_datasets/gsm8k_noisy_punct_10.csv"
+questions, ground_truths = get_noisy_questions_and_answer_from_dataset(csv_file)
 
 
 output_file = (

diff --git a/script/gsm/mistral_instruct_gsm.py b/script/gsm/mistral_instruct_gsm.py
@@ -15,7 +15,7 @@
 from jsonformer import Jsonformer
 
 from config import access_token, DIR_PATH
-from utils import get_questions_and_answer_from_dataset
+from utils import get_noisy_questions_and_answer_from_dataset
 
 
 access_token = access_token
@@ -46,8 +46,8 @@
     },
 }
 
-csv_file = f"{DIR_PATH}/data/gsm/train_preprocessed.csv"
-questions, ground_truths = get_questions_and_answer_from_dataset(csv_file)
+csv_file = f"{DIR_PATH}/data/noisy_datasets/gsm8k_noisy_punct_10.csv"
+questions, ground_truths = get_noisy_questions_and_answer_from_dataset(csv_file)
 
 
 output_file = f"{DIR_PATH}/data/gsm/mistral_instruct/mistral_instruct_gsm_response.csv"

diff --git a/script/gsm/mistral_math_gsm.py b/script/gsm/mistral_math_gsm.py
@@ -15,7 +15,7 @@
 from jsonformer import Jsonformer
 
 from config import access_token, DIR_PATH
-from utils import get_questions_and_answer_from_dataset
+from utils import get_noisy_questions_and_answer_from_dataset
 
 
 access_token = access_token
@@ -45,8 +45,8 @@
     },
 }
 
-csv_file = f"{DIR_PATH}/data/gsm/train_preprocessed.csv"
-questions, ground_truths = get_questions_and_answer_from_dataset(csv_file)
+csv_file = f"{DIR_PATH}/data/noisy_datasets/gsm8k_noisy_punct_10.csv"
+questions, ground_truths = get_noisy_questions_and_answer_from_dataset(csv_file)
 
 #TODO: Change to relative path
 output_file = f"{DIR_PATH}/data/gsm/mistral_math/mistral_math_gsm_response.csv"

diff --git a/script/gsm/utils.py b/script/gsm/utils.py
@@ -78,3 +78,12 @@ def safe_convert_llm_to_int(value):
     )
 
     print("Accuracy saved to accuracy.csv.")
+
+
+
+def get_noisy_questions_and_answer_from_dataset(csv_file_path):
+    """Read a CSV and return its "noisy_questions" and "numeric_answer" columns as lists."""
+    frame = pd.read_csv(csv_file_path)
+    noisy_questions = frame["noisy_questions"].tolist()
+    numeric_answers = frame["numeric_answer"].tolist()
+    return noisy_questions, numeric_answers