SteveHong1901 / UCL-STAT0006-Data-Analysis-Project-1

0 stars 0 forks source link

Rr #15

Open SteveHong1901 opened 1 month ago

SteveHong1901 commented 1 month ago

import pandas as pd

# Toy table of 0/1 flags; every column is named "<group>_<level>"
data = dict(
    inds_work=[1, 0, 0, 1],
    inds_study=[0, 1, 0, 1],
    pos_manager=[1, 0, 1, 0],
    pos_intern=[0, 1, 1, 0],
    hobby_reading=[1, 0, 1, 1],
    hobby_sports=[0, 1, 0, 0],
)

# Columns appear in the insertion order of the mapping above
df = pd.DataFrame.from_dict(data)

# Function to create categorical variables
def create_categorical(df, prefix):
    """Collapse the 0/1 indicator columns sharing ``prefix`` into one label column.

    The labels are stored in ``df`` under ``prefix[:-1] + '_category'`` (for
    example ``'inds_'`` gives ``'inds_category'``). For each row the label is
    the name, minus the leading ``prefix``, of the first indicator column equal
    to 1. Rows in which no indicator is set receive ``None``.

    Args:
        df: DataFrame holding the indicator columns. It is modified in place.
        prefix: Leading text shared by the indicator columns, including its
            trailing separator character (e.g. ``'inds_'``).

    Returns:
        The same ``df`` object, with the label column added or overwritten.
    """
    category_col = prefix[:-1] + '_category'
    # The output column itself starts with ``prefix``; skip it so that a
    # repeated call does not treat the labels as one more indicator.
    indicator_cols = [
        col for col in df.columns
        if col.startswith(prefix) and col != category_col
    ]
    if not indicator_cols or len(df) == 0:
        # Nothing to collapse; idxmax over an empty selection would fail.
        df[category_col] = None
        return df

    is_set = df[indicator_cols].eq(1)
    first_set = is_set.idxmax(axis=1)
    has_any = is_set.any(axis=1)
    # idxmax names the first column even when the whole row is False, so rows
    # without a set indicator are mapped to None explicitly. Slicing strips
    # only the leading prefix, whereas str.replace removes every occurrence.
    df[category_col] = [
        name[len(prefix):] if found else None
        for name, found in zip(first_set, has_any)
    ]
    return df

# Create categorical variables
prefixes = ('inds_', 'pos_', 'hobby_')
for prefix in prefixes:
    df = create_categorical(df, prefix)

# Drop original binary columns.
# The generated '<group>_category' columns begin with the same prefixes as the
# indicators, so they are excluded by name; a bare prefix match would delete
# them as well and leave a frame with no columns.
category_columns = {prefix[:-1] + '_category' for prefix in prefixes}
binary_columns = [
    col for col in df.columns
    if col.startswith(prefixes) and col not in category_columns
]
df.drop(columns=binary_columns, inplace=True)

print(df)
SteveHong1901 commented 1 month ago

import pandas as pd

# Sample indicator table, listed as (column name, 0/1 values) pairs
indicator_columns = [
    ('inds_work', [1, 0, 0, 1]),
    ('inds_study', [0, 1, 0, 1]),
    ('pos_manager', [1, 0, 1, 0]),
    ('pos_intern', [0, 1, 1, 0]),
    ('hobby_reading', [1, 0, 1, 1]),
    ('hobby_sports', [0, 1, 0, 0]),
]
data = dict(indicator_columns)

# One DataFrame column per entry, in the order listed above
df = pd.DataFrame(data=data)

# Function to create categorical variables
def create_categorical(df, prefix):
    """Collapse the 0/1 indicator columns sharing ``prefix`` into one label column.

    The labels are stored in ``df`` under ``prefix[:-1] + '_category'``. For
    each row the label is the name, minus the leading ``prefix``, of the first
    indicator column whose value equals 1; rows with no such column get
    ``None``.

    Args:
        df: DataFrame holding the indicator columns. It is modified in place.
        prefix: Leading text shared by the indicator columns, including its
            trailing separator character (e.g. ``'pos_'``).

    Returns:
        The same ``df`` object, with the label column added or overwritten.
    """
    categorical_col = prefix[:-1] + '_category'
    # Keep the real column names instead of rebuilding them from a stripped
    # suffix: stripping with str.replace and re-prepending the prefix yields a
    # name that does not exist when the prefix occurs twice in a column name.
    # The output column shares the prefix, so it is skipped on repeated calls.
    indicator_cols = [
        col for col in df.columns
        if col.startswith(prefix) and col != categorical_col
    ]

    def get_category(row):
        for col in indicator_cols:
            if row[col] == 1:
                # Remove only the leading prefix from the column name
                return col[len(prefix):]
        return None

    df[categorical_col] = df.apply(get_category, axis=1)
    return df

# Create categorical variables
prefixes = ('inds_', 'pos_', 'hobby_')
for prefix in prefixes:
    df = create_categorical(df, prefix)

# Drop original binary columns.
# Each generated '<group>_category' column starts with the same prefix as the
# indicators it summarises, so it is excluded by name here; filtering on the
# prefix alone would drop the results together with the inputs.
category_columns = {prefix[:-1] + '_category' for prefix in prefixes}
binary_columns = [
    col for col in df.columns
    if col.startswith(prefixes) and col not in category_columns
]
df.drop(columns=binary_columns, inplace=True)

# Display the resulting DataFrame
print(df)
SteveHong1901 commented 1 month ago

import pandas as pd

# Example dataframe with binary variables
data = dict(
    inds_1=[0, 1, 0],
    inds_2=[1, 0, 1],
    pos_1=[0, 1, 1],
    pos_2=[1, 0, 0],
    hobby_1=[0, 0, 1],
    hobby_2=[1, 1, 0],
)

df = pd.DataFrame(data)


def _flag_pattern(row):
    """Render one row of flags as text, e.g. the values 0 and 1 become '0_1'."""
    return '_'.join(row.astype(str))


# Locate the indicator columns of each group by their name prefix
inds_columns = [name for name in df.columns if name.startswith('inds_')]
pos_columns = [name for name in df.columns if name.startswith('pos_')]
hobby_columns = [name for name in df.columns if name.startswith('hobby_')]

# Summarise each group in a single column holding the row's flag pattern
for combined_name, group_columns in (
    ('industry', inds_columns),
    ('position', pos_columns),
    ('hobby', hobby_columns),
):
    df[combined_name] = df[group_columns].apply(_flag_pattern, axis=1)

# Drop the original binary columns
df.drop(columns=inds_columns + pos_columns + hobby_columns, inplace=True)

# Display the dataframe
print(df)
SteveHong1901 commented 1 month ago

import pandas as pd

# Example table of 0/1 flags; columns are named "<group>_<number>"
data = dict(
    inds_1=[0, 1, 0],
    inds_2=[1, 0, 1],
    pos_1=[0, 1, 1],
    pos_2=[1, 0, 0],
    hobby_1=[0, 0, 1],
    hobby_2=[1, 1, 0],
)

# Columns appear in the insertion order of the mapping above
df = pd.DataFrame.from_dict(data)

# Function to determine the categorical value
def get_category(row, prefix):
    """Return the label of the first indicator in ``row`` that equals 1.

    Only entries whose index label starts with ``prefix`` are considered, in
    index order. The label returned is the index label with the leading
    ``prefix`` removed.

    Args:
        row: A pandas Series whose index labels are column names.
        prefix: Leading text that identifies the indicator entries to scan.

    Returns:
        The label of the first matching entry equal to 1, or the string
        ``'None'`` (text, not the ``None`` object) when no such entry exists.
    """
    for col in row.index:
        # Non-string labels cannot carry the prefix, so they are skipped
        if isinstance(col, str) and col.startswith(prefix) and row[col] == 1:
            # Slice off the leading prefix only; str.replace would also remove
            # later occurrences of the prefix inside the label.
            return col[len(prefix):]
    return 'None'

# Build one label column per indicator group by scanning each row
for new_column, flag_prefix in (
    ('industry', 'inds_'),
    ('position', 'pos_'),
    ('hobby', 'hobby_'),
):
    df[new_column] = df.apply(get_category, axis=1, prefix=flag_prefix)

# The 0/1 inputs are no longer needed once the labels exist
binary_columns = [
    name for name in df.columns
    if name.startswith(('inds_', 'pos_', 'hobby_'))
]
df.drop(columns=binary_columns, inplace=True)

# Show the result
print(df)
SteveHong1901 commented 1 month ago

import pandas as pd
from kmodes.kmodes import KModes
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Assume 'data' is your DataFrame with mixed numerical and categorical data
# NOTE(review): k-modes compares values for equality, so numeric columns are
# presumably treated as categories as well - confirm this is intended
# (k-prototypes is the usual choice for genuinely mixed data).
data = pd.read_csv('your_dataset.csv')

# Convert categorical columns to string type
for col in data.select_dtypes(include=['object', 'category']).columns:
    data[col] = data[col].astype(str)

# silhouette_score computes numeric pairwise distances, so the string columns
# cannot be passed to it directly. Per-column integer codes preserve exactly
# the equal / not-equal relationships that the Hamming distance relies on.
encoded_data = pd.DataFrame(
    {col: pd.factorize(data[col])[0] for col in data.columns},
    index=data.index,
)

# Fit once per candidate k and record both diagnostics from the same model.
# KModes starts from random initialisations here, so fitting separately for
# each diagnostic would compare the curves of two unrelated sets of models.
cost = []
silhouette_avg = []
K = range(1, 10)
for num_clusters in K:
    kmode = KModes(n_clusters=num_clusters, init="Huang", n_init=5, verbose=1)
    cluster_labels = kmode.fit_predict(data)
    cost.append(kmode.cost_)
    # The silhouette is undefined for a single cluster, so it starts at k = 2
    if num_clusters >= 2:
        silhouette_avg.append(
            silhouette_score(encoded_data, cluster_labels, metric='hamming')
        )

# Elbow method to find the optimal number of clusters
plt.figure(figsize=(10, 5))
plt.plot(K, cost, 'bx-')
plt.xlabel('Number of clusters')
plt.ylabel('Cost')
plt.title('Elbow Method For Optimal k')
plt.show()

# Silhouette method to find the optimal number of clusters
plt.figure(figsize=(10, 5))
plt.plot(range(2, 10), silhouette_avg, 'bx-')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.title('Silhouette Method For Optimal k')
plt.show()
SteveHong1901 commented 1 month ago

import numpy as np
import pandas as pd
from kmodes.kmodes import KModes
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import silhouette_samples, silhouette_score

# Tiny two-feature categorical example
data = pd.DataFrame(dict(
    feature1=['A', 'B', 'A', 'C'],
    feature2=['X', 'X', 'Y', 'Z'],
))

# Cluster the rows with k-modes (two clusters, best of five initialisations)
km = KModes(n_clusters=2, init='Huang', n_init=5, verbose=1)
clusters = km.fit_predict(data)

# Keep the assignment next to the features; 'cluster' becomes the last column
data['cluster'] = clusters

# Integer-encode every feature column (all columns but the trailing 'cluster')
# so the silhouette calculation can work on numeric arrays. The fitted encoders
# are kept per column so the codes can be mapped back to the original values.
feature_columns = list(data.columns[:-1])
label_encoders = {col: LabelEncoder() for col in feature_columns}
encoded_data = data.copy()
for col, le in label_encoders.items():
    encoded_data[col] = le.fit_transform(data[col])

# Custom Hamming distance function
def hamming_distance(a, b):
    """Return the mean of the elementwise ``a != b`` comparison.

    For two equal-length NumPy arrays this is the fraction of positions at
    which they differ (0.0 = identical, 1.0 = different everywhere).
    """
    differs = a != b
    return np.mean(differs)

# Compute the silhouette score using Hamming distance
def silhouette_score_hamming(X, labels):
    """Return the mean silhouette coefficient of a clustering under Hamming distance.

    For sample ``i``, ``a(i)`` is its mean distance to the *other* members of
    its own cluster and ``b(i)`` is the smallest mean distance to the members
    of any other cluster. The coefficient is ``(b - a) / max(a, b)``. It is
    taken as 0 for a sample that is alone in its cluster and when both ``a``
    and ``b`` are 0.

    Args:
        X: Array-like of shape (n_samples, n_features) holding encoded
            categorical values; a 1-D input is treated as one feature.
        labels: Array-like of length n_samples with the cluster of each sample.

    Returns:
        The average coefficient over all samples, as a NumPy float.

    Raises:
        ValueError: If ``labels`` contains fewer than two distinct clusters.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)
    if X.ndim == 1:
        X = X[:, np.newaxis]
    n_samples = X.shape[0]

    cluster_ids = np.unique(labels)
    if cluster_ids.size < 2:
        raise ValueError(
            'silhouette score needs at least 2 clusters, got %d' % cluster_ids.size
        )

    # Pairwise Hamming distances, accumulated one feature at a time so that
    # only (n_samples, n_samples) arrays are ever held in memory.
    distances = np.zeros((n_samples, n_samples))
    for feature in X.T:
        distances += feature[:, np.newaxis] != feature[np.newaxis, :]
    distances /= X.shape[1]

    same_cluster = labels[:, np.newaxis] == labels[np.newaxis, :]
    cluster_sizes = same_cluster.sum(axis=1)
    has_peers = cluster_sizes > 1

    # a(i): the self-distance is 0, so the within-cluster sum already excludes
    # it; dividing by (size - 1) averages over the other members only.
    intra_totals = np.where(same_cluster, distances, 0.0).sum(axis=1)
    a = np.zeros(n_samples)
    a[has_peers] = intra_totals[has_peers] / (cluster_sizes[has_peers] - 1)

    # b(i): nearest foreign cluster by mean distance
    b = np.full(n_samples, np.inf)
    for cluster in cluster_ids:
        members = labels == cluster
        mean_to_cluster = distances[:, members].mean(axis=1)
        mean_to_cluster[members] = np.inf  # a sample's own cluster never counts
        b = np.minimum(b, mean_to_cluster)

    scale = np.maximum(a, b)
    silhouette_samples_values = np.zeros(n_samples)
    # Leave the coefficient at 0 where it is undefined (singleton cluster, or
    # a == b == 0) instead of producing 1 or NaN.
    defined = has_peers & (scale > 0)
    silhouette_samples_values[defined] = (b[defined] - a[defined]) / scale[defined]
    silhouette_avg = np.mean(silhouette_samples_values)
    return silhouette_avg

# Score the clustering on the encoded features.
# 'cluster' is the last column of encoded_data; every column before it is a feature.
encoded_features = encoded_data.iloc[:, :-1]
cluster_column = encoded_data['cluster']
silhouette_avg = silhouette_score_hamming(encoded_features.values, cluster_column.values)
print(f'Silhouette Score: {silhouette_avg}')

# Plotting dependencies
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project the encoded features onto two principal components for plotting
pca = PCA(n_components=2)
principal_components = pca.fit_transform(encoded_features)
data_pca = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
data_pca['cluster'] = clusters

# Scatter of the projection, coloured by cluster
plt.figure(figsize=(8, 6))
sns.scatterplot(data=data_pca, x='PC1', y='PC2', hue='cluster', palette='viridis')
plt.title('2D PCA of Clusters')
plt.show()

# Number of rows assigned to each cluster
plt.figure(figsize=(8, 6))
sns.countplot(data=data, x='cluster')
plt.title('Cluster Counts')
plt.show()
SteveHong1901 commented 1 month ago

import pandas as pd

# Example holdings: one row per client, one numeric column per asset
data = dict(
    ClientID=[1, 2, 3],
    Asset1=[100, 200, 150],
    Asset2=[300, 150, 50],
    Asset3=[250, 50, 100],
    # ... up to Asset26
    Asset26=[50, 75, 80],
)

df = pd.DataFrame(data)

# Split the identifier off so that only asset columns remain in df;
# the IDs keep the row index and are re-attached to the result later.
client_ids = df.pop('ClientID')

# Function to get top 3 asset names
def get_top_assets_names(row, n=3):
    """Return the index labels of the ``n`` largest values in ``row``.

    Args:
        row: A pandas Series mapping asset names (index) to amounts (values).
        n: How many labels to return; defaults to 3.

    Returns:
        A list of labels ordered from the largest value downwards. It is
        shorter than ``n`` when ``row`` has fewer than ``n`` entries. Entries
        with equal values keep their original relative order.
    """
    # A stable sort makes the order of tied values deterministic; the default
    # algorithm gives no such guarantee.
    sorted_assets = row.sort_values(ascending=False, kind='mergesort').head(n)
    return sorted_assets.index.tolist()

# Rank the assets of every client; 'expand' spreads each returned list of
# names over separate columns, one per rank.
top_asset_names = df.apply(get_top_assets_names, axis=1, result_type='expand')

# Name the rank columns
rank_columns = ['Top1_Asset', 'Top2_Asset', 'Top3_Asset']
top_asset_names.columns = rank_columns

# Put the client identifier back as the first column (aligned on the row index)
top_asset_names.insert(loc=0, column='ClientID', value=client_ids)

# Show the result
print(top_asset_names)