SteveHong1901 / UCL-STAT0006-Data-Analysis-Project-1

0 stars 0 forks source link

Yay #5

Open SteveHong1901 opened 1 week ago

SteveHong1901 commented 1 week ago

import itertools
import re

import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample data: a toy corpus of institution names deliberately containing many
# near-duplicate pairs (full name vs. abbreviation vs. suffix form, e.g.
# "University of Oxford" / "Oxford University") to exercise the
# normalization + clustering pipeline below.
data = {
    'entity_name': [
        "University of Oxford", "Oxford University", "Jesus College, Oxford", "Harvard University", 
        "Harvard College", "Department of Medicine, Harvard", "Accountant Institute", "CFA", 
        "University of Texas", "University of Sussex", "MIT", "Massachusetts Institute of Technology",
        "Stanford University", "Stanford", "Stanford School of Medicine", "California Institute of Technology",
        "Caltech", "Princeton University", "Princeton", "Yale University", "Yale", "University of Cambridge", 
        "Cambridge University", "Cambridge College", "UCLA", "University of California Los Angeles", 
        "UC Berkeley", "University of California Berkeley", "Columbia University", "Columbia College",
        "New York University", "NYU", "University of Pennsylvania", "UPenn", "Penn State University", 
        "University of Southern California", "USC", "Georgia Institute of Technology", "Georgia Tech",
        "University of Michigan", "UMich", "University of Chicago", "UChicago", "Duke University", 
        "Duke", "Northwestern University", "Northwestern", "University of Washington", "UW", 
        "University of Illinois", "UIUC", "University of Florida", "UF", "Texas A&M University", 
        "Texas A&M", "Vanderbilt University", "Vanderbilt", "University of Notre Dame", "Notre Dame",
        "University of Virginia", "UVA", "University of North Carolina", "UNC", "University of Georgia", 
        "UGA", "University of Miami", "UMiami", "Ohio State University", "Ohio State", "University of Arizona", 
        "UArizona", "University of Colorado Boulder", "CU Boulder", "University of Utah", "Utah", 
        "University of Missouri", "Mizzou", "University of Kansas", "KU", "University of Kentucky", "UK", 
        "University of Tennessee", "UT", "Purdue University", "Purdue", "University of Maryland", 
        "UMD", "University of South Carolina", "UofSC", "University of Alabama", "UA", 
        "University of Oklahoma", "OU", "University of Nebraska", "UNL", "University of Iowa", "UIowa",
        "University of Arkansas", "UArk", "University of Mississippi", "Ole Miss", "University of Nevada", 
        "UNR", "University of Nevada Las Vegas", "UNLV", "University of Louisville", "UofL", 
        "University of Cincinnati", "UCincinnati", "University of Houston", "UH", "University of Pittsburgh"
    ]
}

# One row per raw entity name; all derived columns are added to this frame.
df = pd.DataFrame(data)

# Preprocessing: strip punctuation, casing, and generic institution wording so
# that variants such as "University of Oxford" / "Oxford University" normalize
# to the same token set before vectorization.
common_phrases = ["University of", "College of", "Institute of", "Department of", "School of"]

# Standalone generic words ("university", "college", ...) derived from the
# phrase list above. Needed because the phrases only cover the "X of" prefix
# form; suffix forms like "Oxford University" would otherwise survive intact.
_generic_words = frozenset(phrase.split()[0].lower() for phrase in common_phrases)

def preprocess(name):
    """Normalize an institution name for fuzzy matching.

    Removes punctuation, lowercases, strips the phrases in ``common_phrases``
    (matched on word boundaries only) and any standalone generic institution
    word, then collapses leftover runs of whitespace.

    Args:
        name: Raw entity name string.

    Returns:
        The cleaned, lowercase name with generic wording removed.
    """
    name = re.sub(r'[^\w\s]', '', name).lower()  # drop punctuation, lowercase
    for phrase in common_phrases:
        # \b anchors prevent removing a phrase embedded inside other words,
        # and substituting a space (not '') avoids fusing adjacent words.
        name = re.sub(r'\b' + re.escape(phrase.lower()) + r'\b', ' ', name)
    # Drop suffix-style generic words and collapse whitespace in one pass.
    return ' '.join(word for word in name.split() if word not in _generic_words)

# Normalize every raw name once, up front.
df['cleaned_name'] = [preprocess(raw) for raw in df['entity_name']]

# Generate power set of words
def generate_power_set(text):
    """Return every non-empty combination of the words in *text*.

    Combinations are emitted smallest-size first, each joined back into a
    single space-separated string (e.g. "a b" -> ["a", "b", "a b"]).
    """
    words = text.split()
    combos = itertools.chain.from_iterable(
        itertools.combinations(words, size) for size in range(1, len(words) + 1)
    )
    return [' '.join(combo) for combo in combos]

# Expand each cleaned name into all its word subsets, then flatten the
# subsets into one space-separated document for TF-IDF.
df['power_set'] = [generate_power_set(name) for name in df['cleaned_name']]
df['all_subsets'] = [' '.join(subsets) for subsets in df['power_set']]

# Vectorize the subset-expanded names using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['all_subsets'])

# Pairwise cosine similarity between all entity documents
similarity_matrix = cosine_similarity(X)

# Convert similarity to a distance matrix for DBSCAN. Clip at zero:
# cosine_similarity can exceed 1.0 by ~1e-16 due to floating-point error,
# and DBSCAN's precomputed-metric validation raises on negative distances.
distance_matrix = np.clip(1.0 - similarity_matrix, 0.0, None)

# Apply DBSCAN clustering (eps/min_samples tuned for this toy corpus;
# label -1 marks noise points that joined no cluster)
clustering = DBSCAN(eps=0.5, min_samples=2, metric='precomputed')
clusters = clustering.fit_predict(distance_matrix)

df['cluster'] = clusters

# Pick a canonical label for a cluster: its most frequent raw name.
def get_most_frequent_name(cluster, df):
    """Return the most common ``entity_name`` within *cluster*.

    Ties resolve to whichever name ``value_counts`` ranks first; returns
    ``None`` when the cluster has no members in *df*.
    """
    members = df.loc[df['cluster'] == cluster, 'entity_name']
    return None if members.empty else members.value_counts().idxmax()

# Attach a canonical name per row; DBSCAN noise points (label -1) get None.
df['canonical_name'] = [
    get_most_frequent_name(label, df) if label != -1 else None
    for label in df['cluster']
]

# Display results
print(df[['entity_name', 'cleaned_name', 'power_set', 'all_subsets', 'cluster', 'canonical_name']])