Hahah - GitHub Issues


import pandas as pd
from fuzzywuzzy import process, fuzz

# Read the three input CSVs. The two reference lists are read as headerless
# files, so their single column is named explicitly.
df_original = pd.read_csv('path_to_your_original_dataset.csv')
df_27000 = pd.read_csv(
    'path_to_27000_universities.csv',
    names=['university'],
    header=None,
)
df_855 = pd.read_csv(
    'path_to_855_accurate_universities.csv',
    names=['accurate_university'],
    header=None,
)

# Preprocess the data: Lowercase, strip whitespace
def preprocess_name(name):
    """Normalize a university name for comparison.

    Args:
        name: Raw cell value. Normally a string; a missing value
            (``None``, ``NaN``, ``pd.NA``) is also accepted because
            ``pd.read_csv`` produces ``NaN`` for empty cells.

    Returns:
        The lowercased name with surrounding whitespace removed, or an
        empty string for a missing value.
    """
    if isinstance(name, str):
        return name.lower().strip()
    # Missing cells arrive as float NaN, which has no .lower(); map them to
    # an empty name instead of raising AttributeError mid-pipeline.
    if pd.isna(name):
        return ""
    # Any other scalar (e.g. a numeric cell) is normalized via its text form.
    return str(name).lower().strip()

# Full list: add a normalized name column, then collect its distinct values
df_27000['cleaned_university'] = df_27000['university'].map(preprocess_name)
unique_27000_universities = df_27000['cleaned_university'].unique()

# Reference list: same treatment, giving the candidate names to match against
df_855['cleaned_accurate_university'] = df_855['accurate_university'].map(preprocess_name)
accurate_universities = df_855['cleaned_accurate_university'].unique()

# Use fuzzy matching to create a mapping to the most probable correct name
def create_standardized_mapping(unique_names, accurate_names, threshold=90):
    """Map each name to its closest canonical name, or to itself.

    Args:
        unique_names: Iterable of names to standardize.
        accurate_names: Iterable of canonical names to match against.
        threshold: Minimum ``fuzz.token_sort_ratio`` score (0-100) a fuzzy
            match needs in order to replace the input name.

    Returns:
        dict keyed by every input name. The value is the canonical name when
        the input is already canonical or fuzzy-matches one at or above
        ``threshold``; otherwise it is the input name unchanged.
    """
    # Materialize once: the candidates are scanned for every name, and the
    # set gives O(1) detection of names that are already canonical.
    candidates = list(accurate_names)
    canonical = set(candidates)

    standardized_mapping = {}
    for name in unique_names:
        # An exact hit needs no fuzzy scan. It also cannot be displaced by a
        # different candidate that merely ties at 100 under token sorting
        # (e.g. the same words in another order). With no candidates at all
        # there is nothing to match, so the name is kept as-is.
        if name in canonical or not candidates:
            standardized_mapping[name] = name
            continue

        best_match = process.extractOne(
            name, candidates, scorer=fuzz.token_sort_ratio
        )
        if best_match and best_match[1] >= threshold:
            standardized_mapping[name] = best_match[0]
        else:
            standardized_mapping[name] = name

    return standardized_mapping

# Build the cleaned-name -> standardized-name mapping for the 27,000-row list
standardized_mapping = create_standardized_mapping(
    unique_names=unique_27000_universities,
    accurate_names=accurate_universities,
)

# Attach the standardized name to every row of the 27,000-row DataFrame
df_27000['standardized_university'] = df_27000['cleaned_university'].map(standardized_mapping)

# Pair the two columns up into a plain dict for fast lookup
standardized_dict = dict(
    zip(df_27000['cleaned_university'], df_27000['standardized_university'])
)

# Standardize the original dataset: normalize each name, look it up, and fall
# back to the normalized name when the lookup has no entry for it
df_original['cleaned_university'] = df_original['university'].apply(preprocess_name)
looked_up = df_original['cleaned_university'].map(standardized_dict)
df_original['standardized_university'] = looked_up.fillna(df_original['cleaned_university'])

# The normalized helper column is not part of the output
del df_original['cleaned_university']

# Write the result without the index column, then preview the first rows
df_original.to_csv('cleaned_universities_dataset.csv', index=False)
print(df_original.head())
SteveHong1901 / UCL-STAT0006-Data-Analysis-Project-1

Hahah #4