# Open issue -- opened by SteveHong1901 6 days ago
import pandas as pd
# Toy one-hot frame: one column per raw industry label, one row per record.
data = {
    'Computers & Software': [1, 0, 0],
    'IT Services': [0, 1, 0],
    'Communications': [0, 0, 1],
    'Real Estate Management & Development': [1, 0, 0],
    'Real Estate': [0, 1, 0],
    'Banks': [0, 0, 1],
    'Metals & Mining': [1, 0, 0],
    'Textiles, Apparel & Luxury Goods': [0, 1, 0],
    'Biotechnology': [0, 0, 1],
    'Education': [1, 0, 0],
}
df = pd.DataFrame(data)

# Sector name -> raw industry column names that belong to that sector.
industries_dict = {
    "TMT": [
        "Computers & Software",
        "IT Services",
        "Communications",
        "Media",
        "Internet",
        "Semiconductors & Semiconductor Equipment",
    ],
    "REGAL": [
        "Real Estate Management & Development",
        "Real Estate",
        "Leisure Equipment & Products",
        "Hotels, Restaurants & Leisure",
        "Real Estate Investment Trusts",
        "Sports & Entertainment",
    ],
    "FIG": [
        "Banks",
        "Insurance",
        "Diversified Financial Services",
        "Consumer Finance",
        "Asset Management",
        "Finance / Banking / Investment",
        "Hedge Funds",
        "Venture Capital",
    ],
    "Industrials": [
        "Metals & Mining",
        "Energy",
        "Utilities (Electric, Water, Gas)",
        "Road & Rail",
        "Shipping / Packaging / Distribution",
        "Construction & Engineering",
        "Oil, Gas & Consumable Fuels",
        "Commercial Airlines",
        "Chemicals",
        "Paper & Forest Products",
        "Industrial Conglomerates",
        "Aerospace & Defense",
        "Business Services",
        "Manufacturing",
        "Automobiles",
    ],
    "Consumers": [
        "Textiles, Apparel & Luxury Goods",
        "Retails",
        "Beverages",
        "Food Products",
    ],
    "Healthcare": [
        "Biotechnology",
        "Pharmaceuticals",
        "Health Care Providers & Services",
    ],
    "Other": [
        "Education",
        "Non-Profit & Social Organizations",
        "Legal Services",
        "Professional Services / Accounting / Consulting",
    ],
}

# Role group -> job-title column names. Some titles are listed under more
# than one group (e.g. 'Administrator', 'President').
role_categories = {
    'Executive & Leadership': [
        'Partner',
        'Administrator',
        'Executive',
        'Chairman',
        'President',
        'Vice Chairman',
        'General Manager',
        'Chairman and CEO',
        'CEO',
        'Chief Information Officer',
        'Co-President',
        'Vice President',
        'CEO and President',
        'Honorary Chairman',
        'Managing Partner',
        'Supervisory Board Member',
        'Chairperson',
        'Managing Director',
        'Deputy Managing Director',
        'Deputy Prime Minister',
        'Executive Director',
        'Senior Vice President',
        'Chief Investment Officer',
        'Co-Chairman',
        'Co-CEO',
        'CFO',
        'Chairwoman',
        'Managing Member',
        'Chief Risk Officer',
        'COO',
        'Deputy Chairman',
        'Chief Compliance Officer',
        'Executive Vice President',
        'Chief Scientific Officer',
        'Chief Technology Officer',
        'President Commissioner',
        'Chairman, CEO and President',
        'Executive Chairman',
    ],
    'Board & Committees': [
        'Board Member',
        'Supervisory Board Member',
        'Trustee',
        'Committee Member',
        'Council Member',
        'Advisory Board Member',
    ],
    'Creative & Media': [
        'Creative Director',
        'Actress',
        'Musician',
        'Film Director',
        'Photographer',
        'Artist',
        'Anchor',
        'Journalist',
    ],
    'Professional & Technical': [
        'Professional Athlete',
        'Architect',
        'Researcher',
        'Consultant',
        'Advisor',
        'Specialist',
        'Technician',
        'Engineer',
        'Designer',
        'Doctor',
        'Lawyer',
        'Attorney',
        'Professor',
        'Academician',
        'Editor',
        'Driver',
        'Pilot',
    ],
    'Administrative & Support': [
        'Administrator',
        'Secretary',
        'Assistant',
        'Counsel',
        'Agent',
        'Trainee',
        'Intern',
        'Fellow',
        'Volunteer',
    ],
    'Business & Operations': [
        'Trader',
        'Dealer',
        'Representative',
        'Owner',
        'Manager',
        'Supervisor',
        'Head',
        'Investor',
        'Portfolio Manager',
        'Department Head',
        'Lead Designer',
        'Lead Researcher',
        'Head of Department',
    ],
    'Special Roles & Titles': [
        'King',
        'President',
        'Honorary Trustee',
        'None',
        'Legal Representative',
        'Entrepreneur in Residence',
    ],
}

# Interest group -> interest column names. Some interests are listed under
# more than one group (e.g. 'Engineering', 'Philanthropy').
categories = {
    'Arts & Entertainment': [
        'Music',
        'Art',
        'Film',
        'Stage and Theater',
        'Games',
        'Fashion',
        'Collectibles',
        'Card Games',
    ],
    'Sports & Recreation': [
        'Sports',
        'Gambling',
        'Outdoors',
        'Extreme Sports',
        'Boating',
    ],
    'Food & Beverages': [
        'Beverages',
        'Food',
        'Alcoholic Beverages',
        'Smoking',
    ],
    'Business & Finance': [
        'Real Estate',
        'Finance',
        'Business',
        'Economics',
        'Cryptocurrencies',
    ],
    'Health & Wellness': [
        'Health and Wellness',
        'Philanthropy',
        'Human Rights',
    ],
    'Education & Knowledge': [
        'Public Speaking',
        'Writing',
        'Reading',
        'Science',
        'Engineering',
        'Languages',
        'Philosophy',
        'Mathematics',
        'Psychology',
    ],
    'Technology & Innovation': [
        'Technology',
        'Engineering',
    ],
    'Social & Community': [
        'Family',
        'Networking',
        'Religion',
        'Philanthropy',
        'Human Rights',
        'Culture and Heritage',
    ],
    'Travel & Adventure': [
        'Travel',
        'Aviation',
        'Vehicles',
        'Luxury Lifestyle',
        'Nightlife',
    ],
    'Professional & Career': [
        'Politics',
        'Law',
        'Journalism',
        'Media',
        'Education',
        'History',
        'Military',
    ],
    'Lifestyle & Hobbies': [
        'Environment',
        'Design',
        'Architecture',
        'Horticulture',
        'Agriculture',
        'Other',
        'Firearms',
    ],
}
def merge_and_drop_columns(df, category_dict):
    """Collapse groups of columns into one column per category, in place.

    For every ``category -> [column names]`` entry, the listed columns that
    exist in ``df`` are reduced row-wise with ``max`` into a column named
    after the category, and the source columns are then removed. Categories
    with no matching column are skipped.

    Membership is decided against the columns ``df`` had on entry, so a
    category created by this call is never mistaken for a source column of
    another category, and a category that shares its name with one of its
    own source columns is kept rather than dropped.

    Args:
        df: Frame to modify; it is mutated in place.
        category_dict: Mapping of new column name to the names of the
            columns it should absorb. A name may appear under several
            categories.

    Returns:
        The same ``df`` object, for call chaining.
    """
    # Snapshot first: the frame's columns change once merged columns are added.
    original_columns = set(df.columns)

    merged = {}
    consumed = []
    for category, items in category_dict.items():
        present = [item for item in items if item in original_columns]
        if present:
            merged[category] = df[present].max(axis=1)
            consumed.extend(present)

    # De-duplicate (a source column may feed several categories) and keep any
    # column whose name is itself a merged category.
    to_drop = [column for column in dict.fromkeys(consumed) if column not in merged]
    df.drop(columns=to_drop, inplace=True)

    for category, values in merged.items():
        df[category] = values
    return df
# Collapse the raw columns one mapping at a time: industries first, then
# roles, then interests. Each pass replaces the member columns it finds with
# a single column per group.
# NOTE(review): `industries_dict` has a group named "Other" and `categories`
# lists 'Other' as a member of 'Lifestyle & Hobbies', so the last pass can
# absorb the industry "Other" column into the hobby column -- confirm whether
# that is intended.
for _mapping in (industries_dict, role_categories, categories):
    df = merge_and_drop_columns(df, _mapping)

print(df)
import pandas as pd
# Sample frame holding one wealth figure per row.
data = {
    'wealth': [
        1_500_000,
        25_000_000,
        75_000_000,
        150_000_000,
        5_000_000_000,
    ],
}
df = pd.DataFrame(data)

# Bucket edges and the label for each consecutive pair of edges.
bins = [
    0,
    20_000_000,
    50_000_000,
    100_000_000,
    1_000_000_000,
    float('inf'),
]
labels = ['0-20m', '20m-50m', '50m-100m', '100m-1bn', '>1bn']

# right=False makes every bucket closed on the left and open on the right,
# so a value sitting exactly on an edge falls into the higher bucket.
wealth_buckets = pd.cut(df['wealth'], bins=bins, labels=labels, right=False)
df['wealth_bucket'] = wealth_buckets

print(df)
import numpy as np
import pandas as pd
from kmodes.kmodes import KModes
from sklearn.metrics import silhouette_score
# Cluster the rows of `df` with k-modes and report a silhouette score.
# Assumes the dataset is already loaded in a pandas DataFrame named 'df', e.g.
# df = pd.read_csv('your_dataset.csv')

# Take the feature matrix before the 'Cluster' column is added below, so the
# labels are never part of the features.
data = df.values

# Number of clusters (example value; choose one that suits the dataset).
num_clusters = 5

# Fit k-modes and get one cluster label per row.
km = KModes(n_clusters=num_clusters, init='Huang', n_init=5, verbose=1)
clusters = km.fit_predict(data)

# Attach the labels to the original DataFrame.
df['Cluster'] = clusters

# The silhouette score is computed from a precomputed Hamming distance matrix.
from sklearn.metrics import pairwise_distances

# pairwise_distances needs numeric input, while k-modes data is usually
# categorical (strings or mixed dtypes). Replace every column with integer
# codes first: equal values share a code, so the Hamming distance (share of
# differing columns) is the same as on the raw values.
encoded = np.column_stack([pd.factorize(column)[0] for column in data.T])
distance_matrix = pairwise_distances(encoded, metric='hamming')

# NOTE: silhouette_score raises ValueError unless the number of distinct
# labels is between 2 and n_samples - 1.
silhouette_avg = silhouette_score(distance_matrix, clusters, metric='precomputed')
print(f'Silhouette Score: {silhouette_avg}')