From c5f4afba09753b348e204daf9b0129f694266345 Mon Sep 17 00:00:00 2001
From: Cybertechnnp <115475827+Cybertechnnp@users.noreply.github.com>
Date: Tue, 4 Jun 2024 07:57:41 +0545
Subject: [PATCH] school_mapping.py

Description

This pull request adds a script (`school_mapping.py`) that matches schools
from Source A to Source B by fuzzy-matching transliterated school names
within the same district ID.

Changes Made

- Added `school_mapping.py`, which:
  - Loads school data from `data/school_list_A.tsv` and `data/school_list_B.tsv`.
  - Transliterates Devanagari text to Romanized text using the Velthuis method.
  - Matches schools based on transliterated names and district IDs using the
    RapidFuzz library.
  - Saves the matching results to `school_mapping_results.csv`.

Assumptions

- District mapping data is provided in `data/jilla.tsv` with Devanagari
  district names.
- The fuzzy matching threshold is set to 70.

This approach compares transliterated names only within the same district,
which reduces false matches. Open to feedback and suggestions for
improvements.

Contributor

This contribution was made by Bimal Bhandari.
---
 school_mapping.py | 145 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 school_mapping.py

diff --git a/school_mapping.py b/school_mapping.py
new file mode 100644
index 0000000..dd6ea07
--- /dev/null
+++ b/school_mapping.py
@@ -0,0 +1,145 @@
+"""Match schools from Source A to Source B within the same district.
+
+School names in Source A are transliterated from Devanagari to Velthuis
+romanization and fuzzy-matched (RapidFuzz ``token_sort_ratio``) against the
+current and former names of the Source B schools in the same district.
+"""
+
+import pandas as pd
+from rapidfuzz import fuzz, process
+from indic_transliteration import sanscript
+
+# Source B columns holding former school names; only those present are used.
+OLD_NAME_COLUMNS = ['old_name1', 'old_name2', 'old_name3']
+
+# Column order of the result, kept stable even when nothing matches.
+RESULT_COLUMNS = [
+    'school_id_a', 'school_id_b', 'match_score', 'school_name_a',
+    'school_name_b', 'district_id_a', 'district_a', 'district_b',
+    'confidence', 'all_matches',
+]
+
+
+def normalize_text(text):
+    """Return ``text`` lower-cased and stripped, for case-insensitive comparison."""
+    return str(text).lower().strip()
+
+
+def transliterate_text(text):
+    """Transliterate Devanagari ``text`` to Romanized text (Velthuis method).
+
+    Non-string input (for example NaN from an empty cell) yields ``''``
+    instead of raising; such rows are skipped by ``match_schools``.
+    """
+    if not isinstance(text, str):
+        return ''
+    return sanscript.transliterate(text, sanscript.DEVANAGARI, sanscript.VELTHUIS)
+
+
+def match_schools(source_a, source_b, district_mapping_a, district_mapping_b, threshold=70):
+    """Fuzzy-match each Source A school to a Source B school in its district.
+
+    Args:
+        source_a: DataFrame with ``school_id``, ``school``, ``velthuis`` and
+            ``district1`` columns.
+        source_b: DataFrame with ``school_id``, ``name``, ``district`` and
+            ``district_id`` columns, plus any of ``OLD_NAME_COLUMNS``.
+        district_mapping_a: dict from Source A district name to district id.
+        district_mapping_b: not used; kept so existing calls keep working.
+        threshold: lowest ``token_sort_ratio`` score that counts as a match.
+
+    Returns:
+        DataFrame with ``RESULT_COLUMNS``, one row per accepted match.
+    """
+    old_name_columns = [col for col in OLD_NAME_COLUMNS if col in source_b.columns]
+    matches = []
+
+    for _, row in source_a.iterrows():
+        velthuis_name = row['velthuis']
+        district_a = row['district1']
+        district_id_a = district_mapping_a.get(district_a)
+        if district_id_a is None or not isinstance(velthuis_name, str) or not velthuis_name:
+            continue
+
+        # Only Source B schools in the same district are candidates.
+        possible_matches = source_b[source_b['district_id'] == district_id_a]
+
+        # Current names first, then former names row by row. `owners` keeps the
+        # position of the candidate row behind each name, so the winning row is
+        # looked up directly instead of searching for the name again.
+        names = []
+        owners = []
+        for position, name in enumerate(possible_matches['name']):
+            if isinstance(name, str):
+                names.append(name)
+                owners.append(position)
+        old_names = possible_matches[old_name_columns].itertuples(index=False)
+        for position, row_old_names in enumerate(old_names):
+            for name in row_old_names:
+                if isinstance(name, str):
+                    names.append(name)
+                    owners.append(position)
+
+        # The processor lower-cases both sides: `velthuis` is lower-cased, the
+        # Source B names are not, and RapidFuzz >= 3.0 does no preprocessing
+        # by default, so the comparison would otherwise be case-sensitive.
+        best_match = process.extractOne(
+            velthuis_name, names,
+            scorer=fuzz.token_sort_ratio, processor=normalize_text,
+        )
+        if best_match is None or best_match[1] < threshold:
+            continue
+
+        best_match_row = possible_matches.iloc[owners[best_match[2]]]
+        all_matches = process.extract(
+            velthuis_name, names,
+            scorer=fuzz.token_sort_ratio, processor=normalize_text,
+        )
+
+        matches.append({
+            'school_id_a': row['school_id'],
+            'school_id_b': best_match_row['school_id'],
+            'match_score': best_match[1],
+            'school_name_a': row['school'],
+            'school_name_b': best_match_row['name'],
+            'district_id_a': district_id_a,
+            'district_a': district_a,
+            'district_b': best_match_row['district'],
+            'confidence': best_match[1],
+            'all_matches': all_matches,
+        })
+
+    return pd.DataFrame(matches, columns=RESULT_COLUMNS)
+
+
+# Load data
+source_a = pd.read_csv('data/school_list_A.tsv', sep='\t')
+source_b = pd.read_csv('data/school_list_B.tsv', sep='\t')
+jilla = pd.read_csv('data/jilla.tsv', sep='\t')
+
+# Clean and normalize data
+source_a['velthuis'] = source_a['school'].apply(transliterate_text).str.lower().str.strip()
+source_a['district1'] = source_a['district1'].str.lower().str.strip()
+source_b['district'] = source_b['district'].str.lower().str.strip()
+
+# District name -> district id for Source B
+district_mapping_b = (
+    source_b[['district', 'district_id']]
+    .drop_duplicates()
+    .set_index('district')['district_id']
+    .to_dict()
+)
+
+# District name -> district id for Source A, read from jilla.tsv (assuming its
+# 'जिल्ला' column holds the Devanagari district name). Keys are normalized the
+# same way as `district1` so that the lookups in match_schools can hit.
+district_mapping_a = {
+    normalize_text(name): district_id
+    for name, district_id in zip(jilla['जिल्ला'], jilla['district_id'])
+    if pd.notna(name)
+}
+
+# Run the matching and save the results to a CSV file
+matched_schools = match_schools(source_a, source_b, district_mapping_a, district_mapping_b)
+matched_schools.to_csv('school_mapping_results.csv', index=False)
+
+print(f"Total matches found: {len(matched_schools)}")