import pandas as pd
from fuzzywuzzy import process, fuzz
# Read the three input CSV files.
# The original dataset is read with pandas' default header handling; the two
# reference lists are read with header=None and an explicit column name.
df_original = pd.read_csv('path_to_your_original_dataset.csv')
df_27000 = pd.read_csv(
    'path_to_27000_universities.csv',
    names=['university'],
    header=None,
)
df_855 = pd.read_csv(
    'path_to_855_accurate_universities.csv',
    names=['accurate_university'],
    header=None,
)
# Preprocess the data: Lowercase, strip whitespace
def preprocess_name(name):
    """Return *name* lowercased with leading/trailing whitespace removed.

    Args:
        name: A university name. Non-string values (for example ``None`` or
            a float NaN standing in for a missing cell) are accepted.

    Returns:
        The normalized string, or *name* itself, unchanged, when it is not a
        string.
    """
    # Missing cells are not strings and have no .lower(); pass them through
    # untouched instead of raising AttributeError.
    if not isinstance(name, str):
        return name
    return name.lower().strip()
# Add a normalized copy of each name column (via preprocess_name) next to the raw one.
df_27000['cleaned_university'] = df_27000['university'].map(preprocess_name)
df_855['cleaned_accurate_university'] = df_855['accurate_university'].map(preprocess_name)
# Collect the distinct normalized names from each list.
unique_27000_universities = pd.unique(df_27000['cleaned_university'])
accurate_universities = pd.unique(df_855['cleaned_accurate_university'])
# Use fuzzy matching to create a mapping to the most probable correct name
def create_standardized_mapping(unique_names, accurate_names, threshold=90):
    """Map every name to its best-matching accurate name, or to itself.

    Each string in *unique_names* is compared against *accurate_names* with
    ``process.extractOne`` using the ``fuzz.token_sort_ratio`` scorer. The
    best candidate is used when its score is at least *threshold*; otherwise
    the name is kept as-is.

    Args:
        unique_names: Iterable of names to standardize.
        accurate_names: Iterable of reference names to match against.
        threshold: Minimum score for a fuzzy match to be accepted.

    Returns:
        A dict with one entry per element of *unique_names*. Non-string
        names, names that already appear verbatim in *accurate_names*, and
        all names when *accurate_names* is empty map to themselves.
    """
    # Materialize once so a one-shot iterable is not exhausted by the first
    # lookup, and so emptiness can be tested whatever the container type.
    candidates = list(accurate_names)
    exact_names = set(candidates)

    standardized_mapping = {}
    for name in unique_names:
        # No fuzzy search is needed for missing/non-string values, for names
        # that are already a reference name, or when there is nothing to
        # compare against.
        if not isinstance(name, str) or not candidates or name in exact_names:
            standardized_mapping[name] = name
            continue

        best_match = process.extractOne(name, candidates, scorer=fuzz.token_sort_ratio)
        if best_match and best_match[1] >= threshold:
            standardized_mapping[name] = best_match[0]
        else:
            standardized_mapping[name] = name
    return standardized_mapping
# Build the cleaned-name -> standardized-name mapping for the 27,000-name list.
standardized_mapping = create_standardized_mapping(
    unique_27000_universities,
    accurate_universities,
)
# Record the standardized name for every row of the 27,000-name DataFrame.
df_27000['standardized_university'] = df_27000['cleaned_university'].map(standardized_mapping)
# Pair each cleaned name with its standardized name in a plain dict
# (for a repeated cleaned name, the last row's value is the one kept).
standardized_dict = dict(
    zip(df_27000['cleaned_university'], df_27000['standardized_university'])
)
# Normalize the original dataset's names the same way as the reference lists.
df_original['cleaned_university'] = df_original['university'].map(preprocess_name)
# Look each cleaned name up in the dict; names without an entry come back as
# NaN and fall back to their own cleaned form.
looked_up_names = df_original['cleaned_university'].map(standardized_dict)
df_original['standardized_university'] = looked_up_names.fillna(
    df_original['cleaned_university']
)
# The cleaned column was only an intermediate step; remove it in place.
del df_original['cleaned_university']
# Write the result without the index column.
df_original.to_csv('cleaned_universities_dataset.csv', index=False)
# Show the first rows of the result.
print(df_original.head())