"""Fuzzy-match school names between two lists and write scored matches to CSV.

Compares every name in file A against every name in file B with
fuzzywuzzy's partial_ratio, keeping pairs at or above the threshold.
"""
import sys
import time

import pandas as pd
from fuzzywuzzy import fuzz

# File paths (replace with your actual paths).
file_a_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\Book3.csv"
file_b_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\school_list_B.tsv"
output_file_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\school_matches.csv"

# School-name column names (replace with actual names).
# From excel: the column is broken down into multiple chunks, only for TSV A,
# so that we can match names only.
school_column_name_a = "velthuis2"
school_column_name_b = "name"

# Read data (file B is TSV).
try:
    df_a = pd.read_csv(file_a_path)
    df_b = pd.read_csv(file_b_path, sep="\t")
except FileNotFoundError:
    # BUG FIX: the original used bare exit(), which is injected by the
    # `site` module and not guaranteed to exist; sys.exit is the supported
    # API and also prints the message for us.
    sys.exit("Error: One or both files not found. Please check the paths.")

# Normalize case so the fuzzy comparison is case-insensitive.
df_a[school_column_name_a] = df_a[school_column_name_a].str.lower()
df_b[school_column_name_b] = df_b[school_column_name_b].str.lower()

matches = []  # accumulated match records with confidence scores
start_time = time.time()

# Total number of comparisons, used to estimate remaining time.
total_comparisons = len(df_a) * len(df_b)
processed_comparisons = 0

# PERF: materialize df_b's (id, name) pairs once instead of re-zipping the
# two Series on every outer iteration.
b_pairs = list(zip(df_b["school_id"], df_b[school_column_name_b]))

for a_id, a_name in zip(df_a["school_id"], df_a[school_column_name_a]):
    for b_id, b_name in b_pairs:
        processed_comparisons += 1

        score = fuzz.partial_ratio(a_name, b_name)

        # Minimum score threshold (adjust as needed); 100 keeps only
        # perfect partial matches.
        if score >= 100:
            matches.append({
                "school_a_id": a_id,
                "school_a": a_name,
                "school_b_id": b_id,
                "school_b": b_name,
                "confidence_score": score,
            })

        if processed_comparisons % 100 == 1:  # adjust reporting frequency as needed
            elapsed_time = time.time() - start_time
            avg_time_per_comparison = elapsed_time / processed_comparisons
            remaining_comparisons = total_comparisons - processed_comparisons
            estimated_remaining_time = remaining_comparisons * avg_time_per_comparison
            # Same truncating hh:mm:ss split as int(//3600), int(%3600//60), int(%60).
            remaining_hours, rem = divmod(int(estimated_remaining_time), 3600)
            remaining_minutes, remaining_seconds = divmod(rem, 60)

            print(f"Processed {processed_comparisons}/{total_comparisons} comparisons")
            print(f"Estimated remaining time: {remaining_hours} hours, {remaining_minutes} minutes, {remaining_seconds} seconds")

# Write the matches (if any) to CSV.
if matches:
    pd.DataFrame(matches).to_csv(output_file_path, index=False)
    print(f"Matches with confidence scores written to: {output_file_path}")
else:
    print("No matches found between the two files.")
"""Build walrus full-text and phonetic search indexes from school list A."""
import csv
import re

import redis
from walrus import Database

# Fresh Redis connection; wipe all keys left over from earlier runs —
# there were multiple runs, and stale keys confused the indexes.
redis_client = redis.StrictRedis()
redis_client.flushdb()

db = Database()

# One plain full-text index plus a metaphone index for phonetic matching.
search_index = db.Index('app-search')
phonetic_index = db.Index('phonetic-search', metaphone=True)


def sanitize_query(query):
    """Return *query* with every non-alphanumeric, non-whitespace char removed."""
    return re.sub(r'[^\w\s]', '', query)


# Read school list A and add each row to both indexes.
with open(r"..//school_list_A.tsv", 'r', encoding='utf-8') as file1:
    rows = csv.DictReader(file1, delimiter='\t')
    print("Starting...")

    for record in rows:
        doc_id = record['school_id']
        content = record['velthuis']
        # Carry the id inside the document so search hits can report it.
        document = {'content': content, 'id': doc_id}

        # Distinct key suffixes keep the two indexes' entries apart.
        search_index.add(f"{doc_id}_search", **document)
        phonetic_index.add(f"{doc_id}_phonetic", **document)
        print(doc_id, content)

print("Ending...")
+with open(r"..//school_list_B.tsv", 'r',encoding='utf-8') as file2: + reader = csv.DictReader(file2, delimiter='\t') + print("Starting to match") + for row in reader: + school_name = row['name'] + print("trying to match", school_name) + + sanitized_school_name = sanitize_query(school_name) + try: + match_found = False + + for document in search_index.search(school_name): + doc_id = document['id'] + school_name_redis = redis_client.get(doc_id).decode('utf-8') # Assuming school name is stored as UTF-8 string + print(f"School Name: {school_name_redis}, Content: {document['content']}, ID: {doc_id}") + match_found = True + + for document in phonetic_index.search(school_name): + doc_id = document['id'] + school_name_redis = redis_client.get(doc_id).decode('utf-8') # Assuming school name is stored as UTF-8 string + print(f"School Name: {school_name_redis}, Content: {document['content']}, ID: {doc_id}") + match_found = True + + if match_found: + # Add the school ID from the TSV file into the Redis table as a new key-value pair. + redis_client.set(f"matched_{doc_id}", school_name) + except Exception as e: + print(f"Error matching {school_name}: {e}") + print(f"Resuming next school name")