From ca03dcb43eeb02b0fdddb80f535c14ff7eb1f320 Mon Sep 17 00:00:00 2001
From: Wahesh
Date: Fri, 31 May 2024 14:10:51 +0545
Subject: [PATCH 1/6] Walrus with Redis DB: clear the Redis database before
 indexing and handle exceptions

- Added code to clear the Redis database using flushdb() before starting
  the indexing process.
- Included the document ID in the key and within the document content.
- Wrapped the search-result processing in a try/except block to handle
  potential exceptions such as connection errors and key errors.
- Modified the print statements to display the school name fetched from
  Redis along with the document content and ID.

---
 2024-05_school_mapping/walrus.php | 63 +++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 2024-05_school_mapping/walrus.php

diff --git a/2024-05_school_mapping/walrus.php b/2024-05_school_mapping/walrus.php
new file mode 100644
index 0000000..19ef087
--- /dev/null
+++ b/2024-05_school_mapping/walrus.php
@@ -0,0 +1,63 @@
+import csv
+from walrus import Database
+import re
+import redis
+
+
+# Initialize the database and create the search index.
+redis_client = redis.StrictRedis()
+
+# Flush the entire database to clear all keys.
+redis_client.flushdb()
+
+# Initialize the database.
+db = Database()
+
+search_index = db.Index('app-search')
+phonetic_index = db.Index('phonetic-search', metaphone=True)
+
+# Read the data from the first TSV file and index it.
+with open(r"..//school_list_A.tsv", 'r', encoding='utf-8') as file1:
+    reader = csv.DictReader(file1, delimiter='\t')
+    print("Starting...")
+
+    for row in reader:
+        doc_id = row['school_id']
+        content = row['velthuis']
+
+        # Store the raw name under the bare ID so the matching step can fetch it back.
+        redis_client.set(doc_id, content)
+
+        document_key = f"{doc_id}_search"  # Modify the document key based on your requirements
+        document = {'content': content, 'id': doc_id}  # Include the 'id' inside the document
+        search_index.add(document_key, **document)
+
+        document_key = f"{doc_id}_phonetic"  # Modify the document key based on your requirements
+        phonetic_index.add(document_key, **document)
+        print(doc_id, content)
+print("Ending...")
+
+
+def sanitize_query(query):
+    # Remove special characters
+    sanitized_query = re.sub(r'[^\w\s]', '', query)
+    return sanitized_query
+
+
+# Read the list of schools from the second TSV file.
+with open(r"..//school_list_B.tsv", 'r', encoding='utf-8') as file2:
+    reader = csv.DictReader(file2, delimiter='\t')
+    print("Starting to match")
+    for row in reader:
+        school_name = row['name']
+        print("trying to match", school_name)
+
+        sanitized_school_name = sanitize_query(school_name)
+        try:
+
+            for document in search_index.search(sanitized_school_name):
+                doc_id = document['id']
+                school_name_redis = redis_client.get(doc_id).decode('utf-8')  # Assuming school name is stored as UTF-8 string
+                print(f"School Name: {school_name_redis}, Content: {document['content']}, ID: {doc_id}")
+
+            for document in phonetic_index.search(sanitized_school_name):
+                doc_id = document['id']
+                school_name_redis = redis_client.get(doc_id).decode('utf-8')  # Assuming school name is stored as UTF-8 string
+                print(f"School Name: {school_name_redis}, Content: {document['content']}, ID: {doc_id}")
+        except:
+            print("error matching", school_name)
+            print("resuming next", school_name)
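
Note: the add()/search() calls in this patch follow walrus's full-text Index
API. A minimal, self-contained sketch of the same round trip; the index name
and document values are made up for illustration, and a local Redis on the
default port is assumed:

    from walrus import Database

    db = Database()  # connects to localhost:6379 by default
    index = db.Index('demo-search')

    # The first argument is the document key; the keyword arguments become
    # the stored, searchable fields (mirroring search_index.add() above).
    index.add('school-1_search', content='shree kalika secondary school', id='school-1')

    # search() yields each matching document as a dict of its stored fields.
    for doc in index.search('kalika'):
        print(doc['id'], doc['content'])
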
From f93ecca8cb59914777ee6cb87d6b6858d9e89db9 Mon Sep 17 00:00:00 2001
From: Wahesh
Date: Fri, 31 May 2024 14:11:07 +0545
Subject: [PATCH 2/6] Rename walrus.php to walrus.py

---
 2024-05_school_mapping/{walrus.php => walrus.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename 2024-05_school_mapping/{walrus.php => walrus.py} (100%)

diff --git a/2024-05_school_mapping/walrus.php b/2024-05_school_mapping/walrus.py
similarity index 100%
rename from 2024-05_school_mapping/walrus.php
rename to 2024-05_school_mapping/walrus.py

From bff6da8c63e5938daaf87dd94f08d07101a295c4 Mon Sep 17 00:00:00 2001
From: Wahesh
Date: Fri, 31 May 2024 14:11:55 +0545
Subject: [PATCH 3/6] Update walrus.py

---
 2024-05_school_mapping/walrus.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/2024-05_school_mapping/walrus.py b/2024-05_school_mapping/walrus.py
index 19ef087..168ea98 100644
--- a/2024-05_school_mapping/walrus.py
+++ b/2024-05_school_mapping/walrus.py
@@ -7,7 +7,8 @@
 # Initialize the database and create the search index.
 redis_client = redis.StrictRedis()
 
-# Flush the entire database to clear all keys.
+# Flush the entire database to clear all keys.
+# There were multiple earlier runs, and the stale keys confused Redis.
 redis_client.flushdb()
 
 # Initialize the database.
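
Note: flushdb() wipes every key in the selected Redis database, including
anything stored there by other tools. If that ever becomes a problem, a
gentler reset is to namespace this script's keys and delete only those. A
sketch using redis-py's scan_iter(); the "school-mapping:" prefix is a
made-up example:

    import redis

    redis_client = redis.StrictRedis()

    # Delete only the keys this script owns, leaving the rest of the
    # database untouched.
    for key in redis_client.scan_iter(match="school-mapping:*"):
        redis_client.delete(key)
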
From c1bf0acd0e8501e308f16549885cf688a47dea03 Mon Sep 17 00:00:00 2001
From: Wahesh
Date: Fri, 31 May 2024 14:57:46 +0545
Subject: [PATCH 4/6] Update walrus.py

---
 2024-05_school_mapping/walrus.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/2024-05_school_mapping/walrus.py b/2024-05_school_mapping/walrus.py
index 168ea98..c3104ef 100644
--- a/2024-05_school_mapping/walrus.py
+++ b/2024-05_school_mapping/walrus.py
@@ -49,16 +49,23 @@ def sanitize_query(query):
 
         sanitized_school_name = sanitize_query(school_name)
         try:
-
+            match_found = False
+
             for document in search_index.search(sanitized_school_name):
                 doc_id = document['id']
                 school_name_redis = redis_client.get(doc_id).decode('utf-8')  # Assuming school name is stored as UTF-8 string
                 print(f"School Name: {school_name_redis}, Content: {document['content']}, ID: {doc_id}")
+                match_found = True
 
             for document in phonetic_index.search(sanitized_school_name):
                 doc_id = document['id']
                 school_name_redis = redis_client.get(doc_id).decode('utf-8')  # Assuming school name is stored as UTF-8 string
                 print(f"School Name: {school_name_redis}, Content: {document['content']}, ID: {doc_id}")
-        except:
-            print("error matching", school_name)
-            print("resuming next", school_name)
+                match_found = True
+
+            if match_found:
+                # Record the match: store the file-B school name under the last matched file-A ID.
+                redis_client.set(f"matched_{doc_id}", school_name)
+        except Exception as e:
+            print(f"Error matching {school_name}: {e}")
+            print("Resuming with the next school name")
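
Note: the matches recorded by this patch land in Redis as plain string keys
named matched_<doc_id>. A small sketch for pulling them back out after a
run, assuming the same local Redis instance as the script:

    import redis

    redis_client = redis.StrictRedis()

    # Scan for the matched_* keys written during matching and print each pair.
    for key in redis_client.scan_iter(match="matched_*"):
        doc_id = key.decode('utf-8')[len("matched_"):]
        school_name = redis_client.get(key).decode('utf-8')
        print(doc_id, "->", school_name)
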
From eb89d7ce95c51d6dc0b91a100a39cde26d90f59f Mon Sep 17 00:00:00 2001
From: Wahesh
Date: Fri, 31 May 2024 16:55:58 +0545
Subject: [PATCH 5/6] Create fuxxy.py

Took a long time to execute. Will find a way to speed this up.

---
 2024-05_school_mapping/fuxxy.py | 68 +++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 2024-05_school_mapping/fuxxy.py

diff --git a/2024-05_school_mapping/fuxxy.py b/2024-05_school_mapping/fuxxy.py
new file mode 100644
index 0000000..610c7d3
--- /dev/null
+++ b/2024-05_school_mapping/fuxxy.py
@@ -0,0 +1,68 @@
+import pandas as pd
+from fuzzywuzzy import fuzz
+import time
+import multiprocessing
+
+# Define the function for comparing names
+def compare_names(a_name, b_names):
+    matches = []
+    for b_name in b_names:
+        score = fuzz.partial_ratio(a_name, b_name)
+        if score >= 1:
+            match = {"school_a": a_name, "school_b": b_name, "confidence_score": score}
+            matches.append(match)
+    return matches
+
+# Define the function for processing comparisons
+def process_comparisons(a_names, b_names_list):
+    matches = []
+    for a_name in a_names:
+        matches.extend(compare_names(a_name, b_names_list))
+    return matches
+
+# Define file paths (replace with your actual paths)
+file_a_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\Book3.csv"
+file_b_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\school_list_B.tsv"
+output_file_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\school_matches.csv"
+
+# Read data using pandas.read_csv
+try:
+    df_a = pd.read_csv(file_a_path)
+    df_b = pd.read_csv(file_b_path, sep="\t")
+except FileNotFoundError:
+    print("Error: One or both files not found. Please check the paths.")
+    exit()
+
+# Handle case-sensitivity issues (optional)
+df_a["velthuis2"] = df_a["velthuis2"].str.lower()
+df_b["name"] = df_b["name"].str.lower()
+
+# Split df_b into chunks for multiprocessing; max(1, ...) guards against
+# tiny inputs where len(df_b) < cpu_count().
+chunk_size = max(1, len(df_b) // multiprocessing.cpu_count())
+b_chunks = [df_b[i:i+chunk_size] for i in range(0, len(df_b), chunk_size)]
+
+# Start timer
+start_time = time.time()
+
+# Create a multiprocessing pool
+pool = multiprocessing.Pool()
+
+# Process comparisons using multiprocessing. Pass each chunk's name column:
+# iterating a DataFrame directly yields column labels, not rows.
+matches = []
+for match in pool.starmap(process_comparisons, [(df_a["velthuis2"], chunk["name"]) for chunk in b_chunks]):
+    matches.extend(match)
+
+# Close the pool
+pool.close()
+pool.join()
+
+# Print elapsed time
+elapsed_time = time.time() - start_time
+print(f"Elapsed time: {elapsed_time:.2f} seconds")
+
+# Create DataFrame from matches
+df_matches = pd.DataFrame(matches)
+
+# Write DataFrame to CSV file
+df_matches.to_csv(output_file_path, index=False)
+print(f"Matches with confidence scores written to: {output_file_path}")
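
Note: two likely reasons this version ran slowly or unreliably: on Windows a
multiprocessing.Pool must be created under an if __name__ == "__main__":
guard, and fuzzywuzzy falls back to pure Python when the optional
python-Levenshtein package is not installed. Independent of multiprocessing,
fuzzywuzzy's process helper can scan a whole candidate list with a score
cutoff in one call. A self-contained sketch with made-up names; the cutoff
of 80 is only an example:

    from fuzzywuzzy import fuzz, process

    # Stand-ins for df_a["velthuis2"] and df_b["name"].
    a_names = ["shree kalika ma v", "janata pra v"]
    b_names = ["shree kalika mavi", "janata pravi", "saraswati mavi"]

    for a_name in a_names:
        # extractOne() returns the best-scoring (choice, score) pair, or
        # None if nothing clears the cutoff.
        best = process.extractOne(a_name, b_names, scorer=fuzz.partial_ratio, score_cutoff=80)
        if best is not None:
            b_name, score = best
            print(f"{a_name} -> {b_name} (score {score})")
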
From 444f9c0cb04690c026ab1a24683202a3c2346e6f Mon Sep 17 00:00:00 2001
From: Wahesh
Date: Fri, 31 May 2024 17:38:26 +0545
Subject: [PATCH 6/6] Update fuxxy.py

Removed multiprocessing and matched using fuzz.partial_ratio with the
match threshold set to 100. Got good results.

---
 2024-05_school_mapping/fuxxy.py | 90 +++++++++++++++++----------------
 1 file changed, 46 insertions(+), 44 deletions(-)

diff --git a/2024-05_school_mapping/fuxxy.py b/2024-05_school_mapping/fuxxy.py
index 610c7d3..d21b2b7 100644
--- a/2024-05_school_mapping/fuxxy.py
+++ b/2024-05_school_mapping/fuxxy.py
@@ -1,31 +1,18 @@
 import pandas as pd
 from fuzzywuzzy import fuzz
 import time
-import multiprocessing
-
-# Define the function for comparing names
-def compare_names(a_name, b_names):
-    matches = []
-    for b_name in b_names:
-        score = fuzz.partial_ratio(a_name, b_name)
-        if score >= 1:
-            match = {"school_a": a_name, "school_b": b_name, "confidence_score": score}
-            matches.append(match)
-    return matches
-
-# Define the function for processing comparisons
-def process_comparisons(a_names, b_names_list):
-    matches = []
-    for a_name in a_names:
-        matches.extend(compare_names(a_name, b_names_list))
-    return matches
 
 # Define file paths (replace with your actual paths)
 file_a_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\Book3.csv"
 file_b_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\school_list_B.tsv"
 output_file_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\school_matches.csv"
 
-# Read data using pandas.read_csv
+# Define the school-name column names (replace with the actual names)
+school_column_name_a = "velthuis2"  # In Excel, the name column of TSV A was broken into chunks so that only names are matched.
+school_column_name_b = "name"
+
+# Read data using pandas.read_csv (assuming TSV format)
 try:
     df_a = pd.read_csv(file_a_path)
     df_b = pd.read_csv(file_b_path, sep="\t")
@@ -34,35 +21,50 @@ def process_comparisons(a_names, b_names_list):
     exit()
 
 # Handle case-sensitivity issues (optional)
-df_a["velthuis2"] = df_a["velthuis2"].str.lower()
-df_b["name"] = df_b["name"].str.lower()
-
-# Split df_b into chunks for multiprocessing; max(1, ...) guards against
-# tiny inputs where len(df_b) < cpu_count().
-chunk_size = max(1, len(df_b) // multiprocessing.cpu_count())
-b_chunks = [df_b[i:i+chunk_size] for i in range(0, len(df_b), chunk_size)]
+df_a[school_column_name_a] = df_a[school_column_name_a].str.lower()
+df_b[school_column_name_b] = df_b[school_column_name_b].str.lower()
 
-# Start timer
+# Create an empty list to store matches with confidence scores
+matches = []
 start_time = time.time()
 
-# Create a multiprocessing pool
-pool = multiprocessing.Pool()
+# Get the total number of comparisons to estimate the remaining time
+total_comparisons = len(df_a) * len(df_b)
+processed_comparisons = 0
 
-# Process comparisons using multiprocessing. Pass each chunk's name column:
-# iterating a DataFrame directly yields column labels, not rows.
-matches = []
-for match in pool.starmap(process_comparisons, [(df_a["velthuis2"], chunk["name"]) for chunk in b_chunks]):
-    matches.extend(match)
+# Iterate through each school name in df_a
+for a_id, a_name in zip(df_a['school_id'], df_a[school_column_name_a]):
+    # if processed_comparisons >= 100:
+    #     break
+    for b_id, b_name in zip(df_b['school_id'], df_b[school_column_name_b]):
+        processed_comparisons += 1
 
-# Close the pool
-pool.close()
-pool.join()
+        # Calculate the partial ratio score
+        score = fuzz.partial_ratio(a_name, b_name)
 
-# Print elapsed time
-elapsed_time = time.time() - start_time
-print(f"Elapsed time: {elapsed_time:.2f} seconds")
+        # Minimum score threshold: 100 keeps only exact substring matches (adjust as needed)
+        if score >= 100:
+            match = {"school_a_id": a_id, "school_a": a_name, "school_b_id": b_id, "school_b": b_name, "confidence_score": score}
+            matches.append(match)
+
+        if processed_comparisons % 100 == 1:  # Adjust the reporting frequency as needed
+            elapsed_time = time.time() - start_time
+            avg_time_per_comparison = elapsed_time / processed_comparisons
+            remaining_comparisons = total_comparisons - processed_comparisons
+            estimated_remaining_time = remaining_comparisons * avg_time_per_comparison
+            remaining_hours = int(estimated_remaining_time // 3600)
+            remaining_minutes = int((estimated_remaining_time % 3600) // 60)
+            remaining_seconds = int(estimated_remaining_time % 60)
 
-# Create DataFrame from matches
-df_matches = pd.DataFrame(matches)
+            print(f"Processed {processed_comparisons}/{total_comparisons} comparisons")
+            print(f"Estimated remaining time: {remaining_hours} hours, {remaining_minutes} minutes, {remaining_seconds} seconds")
 
-# Write DataFrame to CSV file
-df_matches.to_csv(output_file_path, index=False)
-print(f"Matches with confidence scores written to: {output_file_path}")
+# Create a DataFrame from the matches list
+if matches:
+    df_matches = pd.DataFrame(matches)
 
+    # Write the DataFrame to a CSV file
+    df_matches.to_csv(output_file_path, index=False)
+    print(f"Matches with confidence scores written to: {output_file_path}")
+else:
+    print("No matches found between the two files.")
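
Note: a partial_ratio of 100, the cutoff used above, effectively means the
shorter string occurs in the longer one as an exact substring (after the
lower-casing done earlier in the script). Two toy cases with made-up names:

    from fuzzywuzzy import fuzz

    # 100: "kalika" is an exact substring of the longer name.
    print(fuzz.partial_ratio("kalika", "shree kalika secondary school"))

    # Below 100: no window of the longer string matches "kalika mavi" exactly.
    print(fuzz.partial_ratio("kalika mavi", "shree kalika ma v"))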