SteveHong1901 / UCL-STAT0006-Data-Analysis-Project-1

0 stars 0 forks source link

Hahaa2 #8

Open SteveHong1901 opened 1 week ago

SteveHong1901 commented 1 week ago

from fuzzywuzzy import fuzz, process

def create_standardized_mapping(unique_names, accurate_names, threshold=90):
    """Map each name in *unique_names* to its best fuzzy match.

    Every name is looked up in *accurate_names* via ``process.extractOne``
    with ``fuzz.token_sort_ratio`` as the scorer.

    Args:
        unique_names: Iterable of names to standardize.
        accurate_names: Candidate names passed to ``process.extractOne``.
        threshold: Minimum score (inclusive) for a match to be accepted.

    Returns:
        A list with one dict per input name, in input order, holding the
        keys ``'original_name'``, ``'matched_name'`` and ``'score'``. When
        no match is returned or its score is below *threshold*, the name
        is mapped to itself and the score is recorded as 0.
    """

    def _resolve(candidate):
        # Look up the single best-scoring candidate for this name.
        hit = process.extractOne(
            candidate,
            accurate_names,
            scorer=fuzz.token_sort_ratio,
        )
        # A falsy result means no usable match; otherwise index 1 is the score.
        accepted = bool(hit) and hit[1] >= threshold
        if not accepted:
            # Fall back to the name itself with a zero score.
            return {'original_name': candidate, 'matched_name': candidate, 'score': 0}
        return {'original_name': candidate, 'matched_name': hit[0], 'score': hit[1]}

    return [_resolve(candidate) for candidate in unique_names]

# Demo: standardize a few sample names against a reference list.
unique_names = [
    "Jon",
    "Johann",
    "Jane",
]
accurate_names = [
    "John",
    "Johnny",
    "Jane",
]
threshold = 80

standardized_mapping = create_standardized_mapping(
    unique_names,
    accurate_names,
    threshold=threshold,
)

# Show the mapping as a table (one row per input name).
import pandas as pd

df = pd.DataFrame(standardized_mapping)
print(df)