From ca03dcb43eeb02b0fdddb80f535c14ff7eb1f320 Mon Sep 17 00:00:00 2001
From: Wahesh
Date: Fri, 31 May 2024 14:10:51 +0545
Subject: [PATCH 1/6] Walrus with Redis DB: clear the Redis database before
 indexing and handle exceptions

- Added code to clear the Redis database using flushdb() before starting
  the indexing process.
- Included the document ID in the key and within the document content.
- Wrapped the search-result processing in a try/except block to handle
  potential exceptions such as connection errors and key errors.
- Modified the print statements to display the school name fetched from
  Redis along with the document content and ID.

---
 2024-05_school_mapping/walrus.php | 63 +++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 2024-05_school_mapping/walrus.php

diff --git a/2024-05_school_mapping/walrus.php b/2024-05_school_mapping/walrus.php
new file mode 100644
index 0000000..19ef087
--- /dev/null
+++ b/2024-05_school_mapping/walrus.php
@@ -0,0 +1,63 @@
+import csv
+from walrus import Database
+import re
+import redis
+
+
+# Initialize the database and create the search index.
+redis_client = redis.StrictRedis()
+
+# Flush the entire database to clear all keys.
+redis_client.flushdb()
+
+# Initialize the database.
+db = Database()
+
+search_index = db.Index('app-search')
+phonetic_index = db.Index('phonetic-search', metaphone=True)
+
+# Read the data from the first TSV file and index it.
+with open(r"..//school_list_A.tsv", 'r', encoding='utf-8') as file1:
+    reader = csv.DictReader(file1, delimiter='\t')
+    print("Starting...")
+
+    for row in reader:
+        doc_id = row['school_id']
+        content = row['velthuis']
+
+        # Store the raw name under the bare ID so the matching step can fetch it back.
+        redis_client.set(doc_id, content)
+
+        document_key = f"{doc_id}_search"  # Modify the document key based on your requirements
+        document = {'content': content, 'id': doc_id}  # Include the 'id' inside the document
+        search_index.add(document_key, **document)
+
+        document_key = f"{doc_id}_phonetic"  # Modify the document key based on your requirements
+        phonetic_index.add(document_key, **document)
+        print(doc_id, content)
+print("Ending...")
+
+
+def sanitize_query(query):
+    # Remove special characters
+    sanitized_query = re.sub(r'[^\w\s]', '', query)
+    return sanitized_query
+
+
+# Read the list of schools from the second TSV file.
+with open(r"..//school_list_B.tsv", 'r', encoding='utf-8') as file2:
+    reader = csv.DictReader(file2, delimiter='\t')
+    print("Starting to match")
+    for row in reader:
+        school_name = row['name']
+        print("trying to match", school_name)
+
+        sanitized_school_name = sanitize_query(school_name)
+        try:
+
+            for document in search_index.search(sanitized_school_name):
+                doc_id = document['id']
+                school_name_redis = redis_client.get(doc_id).decode('utf-8')  # Assuming school name is stored as UTF-8 string
+                print(f"School Name: {school_name_redis}, Content: {document['content']}, ID: {doc_id}")
+
+            for document in phonetic_index.search(sanitized_school_name):
+                doc_id = document['id']
+                school_name_redis = redis_client.get(doc_id).decode('utf-8')  # Assuming school name is stored as UTF-8 string
+                print(f"School Name: {school_name_redis}, Content: {document['content']}, ID: {doc_id}")
+        except:
+            print("error matching", school_name)
+            print("resuming next", school_name)
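
Note: the add()/search() calls in this patch follow walrus's full-text Index
API. A minimal, self-contained sketch of the same round trip; the index name
and document values are made up for illustration, and a local Redis on the
default port is assumed:

    from walrus import Database

    db = Database()  # connects to localhost:6379 by default
    index = db.Index('demo-search')

    # The first argument is the document key; the keyword arguments become
    # the stored, searchable fields (mirroring search_index.add() above).
    index.add('school-1_search', content='shree kalika secondary school', id='school-1')

    # search() yields each matching document as a dict of its stored fields.
    for doc in index.search('kalika'):
        print(doc['id'], doc['content'])
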
From f93ecca8cb59914777ee6cb87d6b6858d9e89db9 Mon Sep 17 00:00:00 2001
From: Wahesh
Date: Fri, 31 May 2024 14:11:07 +0545
Subject: [PATCH 2/6] Rename walrus.php to walrus.py

---
 2024-05_school_mapping/{walrus.php => walrus.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename 2024-05_school_mapping/{walrus.php => walrus.py} (100%)

diff --git a/2024-05_school_mapping/walrus.php b/2024-05_school_mapping/walrus.py
similarity index 100%
rename from 2024-05_school_mapping/walrus.php
rename to 2024-05_school_mapping/walrus.py

From bff6da8c63e5938daaf87dd94f08d07101a295c4 Mon Sep 17 00:00:00 2001
From: Wahesh
Date: Fri, 31 May 2024 14:11:55 +0545
Subject: [PATCH 3/6] Update walrus.py

---
 2024-05_school_mapping/walrus.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/2024-05_school_mapping/walrus.py b/2024-05_school_mapping/walrus.py
index 19ef087..168ea98 100644
--- a/2024-05_school_mapping/walrus.py
+++ b/2024-05_school_mapping/walrus.py
@@ -7,7 +7,8 @@
 # Initialize the database and create the search index.
 redis_client = redis.StrictRedis()
 
-# Flush the entire database to clear all keys.
+# Flush the entire database to clear all keys.
+# There were multiple earlier runs, and the stale keys confused Redis.
 redis_client.flushdb()
 
 # Initialize the database.
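
Note: flushdb() wipes every key in the selected Redis database, including
anything stored there by other tools. If that ever becomes a problem, a
gentler reset is to namespace this script's keys and delete only those. A
sketch using redis-py's scan_iter(); the "school-mapping:" prefix is a
made-up example:

    import redis

    redis_client = redis.StrictRedis()

    # Delete only the keys this script owns, leaving the rest of the
    # database untouched.
    for key in redis_client.scan_iter(match="school-mapping:*"):
        redis_client.delete(key)
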
From c1bf0acd0e8501e308f16549885cf688a47dea03 Mon Sep 17 00:00:00 2001
From: Wahesh
Date: Fri, 31 May 2024 14:57:46 +0545
Subject: [PATCH 4/6] Update walrus.py

---
 2024-05_school_mapping/walrus.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/2024-05_school_mapping/walrus.py b/2024-05_school_mapping/walrus.py
index 168ea98..c3104ef 100644
--- a/2024-05_school_mapping/walrus.py
+++ b/2024-05_school_mapping/walrus.py
@@ -49,16 +49,23 @@ def sanitize_query(query):
 
         sanitized_school_name = sanitize_query(school_name)
         try:
-
+            match_found = False
+
             for document in search_index.search(sanitized_school_name):
                 doc_id = document['id']
                 school_name_redis = redis_client.get(doc_id).decode('utf-8')  # Assuming school name is stored as UTF-8 string
                 print(f"School Name: {school_name_redis}, Content: {document['content']}, ID: {doc_id}")
+                match_found = True
 
             for document in phonetic_index.search(sanitized_school_name):
                 doc_id = document['id']
                 school_name_redis = redis_client.get(doc_id).decode('utf-8')  # Assuming school name is stored as UTF-8 string
                 print(f"School Name: {school_name_redis}, Content: {document['content']}, ID: {doc_id}")
-        except:
-            print("error matching", school_name)
-            print("resuming next", school_name)
+                match_found = True
+
+            if match_found:
+                # Record the match: store the file-B school name under the last matched file-A ID.
+                redis_client.set(f"matched_{doc_id}", school_name)
+        except Exception as e:
+            print(f"Error matching {school_name}: {e}")
+            print("Resuming with the next school name")
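
Note: the matches recorded by this patch land in Redis as plain string keys
named matched_<doc_id>. A small sketch for pulling them back out after a
run, assuming the same local Redis instance as the script:

    import redis

    redis_client = redis.StrictRedis()

    # Scan for the matched_* keys written during matching and print each pair.
    for key in redis_client.scan_iter(match="matched_*"):
        doc_id = key.decode('utf-8')[len("matched_"):]
        school_name = redis_client.get(key).decode('utf-8')
        print(doc_id, "->", school_name)
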
From eb89d7ce95c51d6dc0b91a100a39cde26d90f59f Mon Sep 17 00:00:00 2001
From: Wahesh
Date: Fri, 31 May 2024 16:55:58 +0545
Subject: [PATCH 5/6] Create fuxxy.py

Took a long time to execute. Will find a way to speed this up.

---
 2024-05_school_mapping/fuxxy.py | 68 +++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 2024-05_school_mapping/fuxxy.py

diff --git a/2024-05_school_mapping/fuxxy.py b/2024-05_school_mapping/fuxxy.py
new file mode 100644
index 0000000..610c7d3
--- /dev/null
+++ b/2024-05_school_mapping/fuxxy.py
@@ -0,0 +1,68 @@
+import pandas as pd
+from fuzzywuzzy import fuzz
+import time
+import multiprocessing
+
+# Define the function for comparing names
+def compare_names(a_name, b_names):
+    matches = []
+    for b_name in b_names:
+        score = fuzz.partial_ratio(a_name, b_name)
+        if score >= 1:
+            match = {"school_a": a_name, "school_b": b_name, "confidence_score": score}
+            matches.append(match)
+    return matches
+
+# Define the function for processing comparisons
+def process_comparisons(a_names, b_names_list):
+    matches = []
+    for a_name in a_names:
+        matches.extend(compare_names(a_name, b_names_list))
+    return matches
+
+# Define file paths (replace with your actual paths)
+file_a_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\Book3.csv"
+file_b_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\school_list_B.tsv"
+output_file_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\school_matches.csv"
+
+# Read data using pandas.read_csv
+try:
+    df_a = pd.read_csv(file_a_path)
+    df_b = pd.read_csv(file_b_path, sep="\t")
+except FileNotFoundError:
+    print("Error: One or both files not found. Please check the paths.")
+    exit()
+
+# Handle case-sensitivity issues (optional)
+df_a["velthuis2"] = df_a["velthuis2"].str.lower()
+df_b["name"] = df_b["name"].str.lower()
+
+# Split df_b into chunks for multiprocessing; max(1, ...) guards against
+# tiny inputs where len(df_b) < cpu_count().
+chunk_size = max(1, len(df_b) // multiprocessing.cpu_count())
+b_chunks = [df_b[i:i+chunk_size] for i in range(0, len(df_b), chunk_size)]
+
+# Start timer
+start_time = time.time()
+
+# Create a multiprocessing pool
+pool = multiprocessing.Pool()
+
+# Process comparisons using multiprocessing. Pass each chunk's name column:
+# iterating a DataFrame directly yields column labels, not rows.
+matches = []
+for match in pool.starmap(process_comparisons, [(df_a["velthuis2"], chunk["name"]) for chunk in b_chunks]):
+    matches.extend(match)
+
+# Close the pool
+pool.close()
+pool.join()
+
+# Print elapsed time
+elapsed_time = time.time() - start_time
+print(f"Elapsed time: {elapsed_time:.2f} seconds")
+
+# Create DataFrame from matches
+df_matches = pd.DataFrame(matches)
+
+# Write DataFrame to CSV file
+df_matches.to_csv(output_file_path, index=False)
+print(f"Matches with confidence scores written to: {output_file_path}")
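
Note: two likely reasons this version ran slowly or unreliably: on Windows a
multiprocessing.Pool must be created under an if __name__ == "__main__":
guard, and fuzzywuzzy falls back to pure Python when the optional
python-Levenshtein package is not installed. Independent of multiprocessing,
fuzzywuzzy's process helper can scan a whole candidate list with a score
cutoff in one call. A self-contained sketch with made-up names; the cutoff
of 80 is only an example:

    from fuzzywuzzy import fuzz, process

    # Stand-ins for df_a["velthuis2"] and df_b["name"].
    a_names = ["shree kalika ma v", "janata pra v"]
    b_names = ["shree kalika mavi", "janata pravi", "saraswati mavi"]

    for a_name in a_names:
        # extractOne() returns the best-scoring (choice, score) pair, or
        # None if nothing clears the cutoff.
        best = process.extractOne(a_name, b_names, scorer=fuzz.partial_ratio, score_cutoff=80)
        if best is not None:
            b_name, score = best
            print(f"{a_name} -> {b_name} (score {score})")
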
From 444f9c0cb04690c026ab1a24683202a3c2346e6f Mon Sep 17 00:00:00 2001
From: Wahesh
Date: Fri, 31 May 2024 17:38:26 +0545
Subject: [PATCH 6/6] Update fuxxy.py

Removed multiprocessing and matched using fuzz.partial_ratio with the
match threshold set to 100. Got good results.

---
 2024-05_school_mapping/fuxxy.py | 90 +++++++++++++++++----------------
 1 file changed, 46 insertions(+), 44 deletions(-)

diff --git a/2024-05_school_mapping/fuxxy.py b/2024-05_school_mapping/fuxxy.py
index 610c7d3..d21b2b7 100644
--- a/2024-05_school_mapping/fuxxy.py
+++ b/2024-05_school_mapping/fuxxy.py
@@ -1,31 +1,18 @@
 import pandas as pd
 from fuzzywuzzy import fuzz
 import time
-import multiprocessing
-
-# Define the function for comparing names
-def compare_names(a_name, b_names):
-    matches = []
-    for b_name in b_names:
-        score = fuzz.partial_ratio(a_name, b_name)
-        if score >= 1:
-            match = {"school_a": a_name, "school_b": b_name, "confidence_score": score}
-            matches.append(match)
-    return matches
-
-# Define the function for processing comparisons
-def process_comparisons(a_names, b_names_list):
-    matches = []
-    for a_name in a_names:
-        matches.extend(compare_names(a_name, b_names_list))
-    return matches
 
 # Define file paths (replace with your actual paths)
 file_a_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\Book3.csv"
 file_b_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\school_list_B.tsv"
 output_file_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\school_matches.csv"
 
-# Read data using pandas.read_csv
+# Define the school-name column names (replace with the actual names)
+school_column_name_a = "velthuis2"  # In Excel, the name column of TSV A was broken into chunks so that only names are matched.
+school_column_name_b = "name"
+
+# Read data using pandas.read_csv (assuming TSV format)
 try:
     df_a = pd.read_csv(file_a_path)
     df_b = pd.read_csv(file_b_path, sep="\t")
@@ -34,35 +21,50 @@ def process_comparisons(a_names, b_names_list):
     exit()
 
 # Handle case-sensitivity issues (optional)
-df_a["velthuis2"] = df_a["velthuis2"].str.lower()
-df_b["name"] = df_b["name"].str.lower()
-
-# Split df_b into chunks for multiprocessing; max(1, ...) guards against
-# tiny inputs where len(df_b) < cpu_count().
-chunk_size = max(1, len(df_b) // multiprocessing.cpu_count())
-b_chunks = [df_b[i:i+chunk_size] for i in range(0, len(df_b), chunk_size)]
+df_a[school_column_name_a] = df_a[school_column_name_a].str.lower()
+df_b[school_column_name_b] = df_b[school_column_name_b].str.lower()
 
-# Start timer
+# Create an empty list to store matches with confidence scores
+matches = []
 start_time = time.time()
 
-# Create a multiprocessing pool
-pool = multiprocessing.Pool()
+# Get the total number of comparisons to estimate the remaining time
+total_comparisons = len(df_a) * len(df_b)
+processed_comparisons = 0
 
-# Process comparisons using multiprocessing. Pass each chunk's name column:
-# iterating a DataFrame directly yields column labels, not rows.
-matches = []
-for match in pool.starmap(process_comparisons, [(df_a["velthuis2"], chunk["name"]) for chunk in b_chunks]):
-    matches.extend(match)
+# Iterate through each school name in df_a
+for a_id, a_name in zip(df_a['school_id'], df_a[school_column_name_a]):
+    # if processed_comparisons >= 100:
+    #     break
+    for b_id, b_name in zip(df_b['school_id'], df_b[school_column_name_b]):
+        processed_comparisons += 1
 
-# Close the pool
-pool.close()
-pool.join()
+        # Calculate the partial ratio score
+        score = fuzz.partial_ratio(a_name, b_name)
 
-# Print elapsed time
-elapsed_time = time.time() - start_time
-print(f"Elapsed time: {elapsed_time:.2f} seconds")
+        # Minimum score threshold: 100 keeps only exact substring matches (adjust as needed)
+        if score >= 100:
+            match = {"school_a_id": a_id, "school_a": a_name, "school_b_id": b_id, "school_b": b_name, "confidence_score": score}
+            matches.append(match)
+
+        if processed_comparisons % 100 == 1:  # Adjust the reporting frequency as needed
+            elapsed_time = time.time() - start_time
+            avg_time_per_comparison = elapsed_time / processed_comparisons
+            remaining_comparisons = total_comparisons - processed_comparisons
+            estimated_remaining_time = remaining_comparisons * avg_time_per_comparison
+            remaining_hours = int(estimated_remaining_time // 3600)
+            remaining_minutes = int((estimated_remaining_time % 3600) // 60)
+            remaining_seconds = int(estimated_remaining_time % 60)
 
-# Create DataFrame from matches
-df_matches = pd.DataFrame(matches)
+            print(f"Processed {processed_comparisons}/{total_comparisons} comparisons")
+            print(f"Estimated remaining time: {remaining_hours} hours, {remaining_minutes} minutes, {remaining_seconds} seconds")
 
-# Write DataFrame to CSV file
-df_matches.to_csv(output_file_path, index=False)
-print(f"Matches with confidence scores written to: {output_file_path}")
+# Create a DataFrame from the matches list
+if matches:
+    df_matches = pd.DataFrame(matches)
 
+    # Write the DataFrame to a CSV file
+    df_matches.to_csv(output_file_path, index=False)
+    print(f"Matches with confidence scores written to: {output_file_path}")
+else:
+    print("No matches found between the two files.")
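
Note: a partial_ratio of 100, the cutoff used above, effectively means the
shorter string occurs in the longer one as an exact substring (after the
lower-casing done earlier in the script). Two toy cases with made-up names:

    from fuzzywuzzy import fuzz

    # 100: "kalika" is an exact substring of the longer name.
    print(fuzz.partial_ratio("kalika", "shree kalika secondary school"))

    # Below 100: no window of the longer string matches "kalika mavi" exactly.
    print(fuzz.partial_ratio("kalika mavi", "shree kalika ma v"))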