
Dimensionality reduction - victor owen hurtado zapana #1


owen322hz commented 3 weeks ago
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

Reading the file

import os
import kagglehub

# Download the latest version of the dataset
path = kagglehub.dataset_download("youssefaboelwafa/clustering-penguins-species")
csv_path = os.path.join(path, 'penguins.csv')

# Load the CSV into a DataFrame before inspecting it
dataset = pd.read_csv(csv_path)
dataset.isnull().sum()
dataset = dataset.dropna(axis=0)
dataset.info()
dataset.describe().T
cat_cols = dataset.select_dtypes(include=['object']).columns
num_cols = dataset.select_dtypes(include=np.number).columns.tolist()
print("Categorical Variables:")
print(cat_cols)
print("Numerical Variables:")
print(num_cols)

Univariate analysis

def univariate_statistics(data_column):
    print(f"Column {data_column.name}")
    print(f"With Skew = {round(data_column.skew(), 2)}")
    plt.figure(figsize= (10,5))
    plt.subplot(1,2,1)
    data_column.hist()
    plt.ylabel('count')
    plt.subplot(1, 2, 2)
    sns.boxplot(x=data_column)
    plt.show()
for column in num_cols:
    univariate_statistics(dataset[column])
q4 = dataset['flipper_length_mm'].quantile(0.998)
# the 0.998 quantile = 1814.307999; values above it become NaN and are dropped later
dataset['flipper_length_mm'] = dataset['flipper_length_mm'].loc[dataset['flipper_length_mm'] < q4]
univariate_statistics(dataset["flipper_length_mm"])
# Trim the lower tail the same way, below the 0.0025 quantile
q1 = dataset['flipper_length_mm'].quantile(0.0025)
dataset['flipper_length_mm'] = dataset['flipper_length_mm'].loc[dataset['flipper_length_mm'] > q1]
univariate_statistics(dataset["flipper_length_mm"])
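
For comparison, a minimal sketch of the standard 1.5×IQR fence as an alternative to the fixed quantile cutoffs used above (not part of the original submission, and not applied to the pipeline):

# Alternative outlier rule (not applied): count values outside the 1.5*IQR fence
q_low, q_high = dataset['flipper_length_mm'].quantile([0.25, 0.75])
iqr = q_high - q_low
within_fence = dataset['flipper_length_mm'].between(q_low - 1.5 * iqr, q_high + 1.5 * iqr)
print(f"Values outside the IQR fence: {(~within_fence).sum()}")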
dataset["sex"].value_counts()
dataset["sex"] = dataset["sex"].loc[dataset["sex"] != "."]
dataset["sex"].value_counts()
dataset['sex'] = dataset['sex'].replace({"MALE": 0, "FEMALE": 1})
dataset = dataset.dropna(axis=0)
sns.pairplot(data=dataset.drop(['sex'], axis=1))
plt.show()
plt.figure(figsize=(10, 5))
sns.heatmap(dataset.drop(['sex'],axis=1).corr(), annot = True, vmin = -1, vmax = 1)
plt.show()
dataset = dataset.reset_index(drop=True)

# Standardize the features, then fit a full PCA to inspect explained variance
standardized_values = StandardScaler().fit_transform(dataset.values)
pca = PCA()
pca.fit(standardized_values)
variance = np.round(pca.explained_variance_ratio_ * 100, decimals=1)

plt.figure(figsize = (10,5))
plt.plot(range(1, len(variance)+1), variance.cumsum(), marker = "o", linestyle = "--")
plt.grid()
plt.ylabel("Percentage Cumulative of Explained Variance")
plt.xlabel("Number of Components")
plt.title("Explained Variance by Component")
plt.show()
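
As a cross-check (a sketch, not part of the original; the 90% threshold is an illustrative choice), the component count can also be read off the cumulative curve programmatically:

# Smallest number of components whose cumulative explained variance reaches 90%
cumulative = variance.cumsum()
n_components_90 = int(np.argmax(cumulative >= 90) + 1)
print(f"Components needed for >= 90% variance: {n_components_90}")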
# Refit with 4 components and project the data onto them
pca = PCA(n_components=4)
pca.fit(standardized_values)
scores_pca = pca.transform(standardized_values)
# Elbow method: within-cluster sum of squares (inertia) for k = 1..29
w_sum_sq = []
for i in range(1, 30):
    kmeans_pca = KMeans(n_clusters=i, init="k-means++", random_state=42)
    kmeans_pca.fit(scores_pca)
    w_sum_sq.append(kmeans_pca.inertia_)
plt.figure(figsize = (10,5))
plt.plot(range(1,30), w_sum_sq, marker = "o")
plt.title("Cluster using PCA Scores")
plt.ylabel("Within-Cluster Sum-of-Squares")
plt.xlabel("Number of clusters")
plt.show()
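
The elbow read-off is visual; as a complementary check (a sketch, not part of the original), the silhouette score over the same PCA scores can corroborate the cluster count:

from sklearn.metrics import silhouette_score

# Silhouette needs k >= 2; higher scores mean better-separated clusters
for k in range(2, 8):
    labels = KMeans(n_clusters=k, init="k-means++", random_state=42).fit_predict(scores_pca)
    print(f"k={k}: silhouette={silhouette_score(scores_pca, labels):.3f}")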
# Fit the final model with 5 clusters (chosen from the elbow plot)
kmeans_pca = KMeans(n_clusters=5, init="k-means++", random_state=42)
kmeans_pca.fit(scores_pca)
dataset_pca = pd.concat([dataset, pd.DataFrame(scores_pca)], axis=1)

dataset_pca.columns.values[-4:] = ["component_1", "component_2", "component_3", "component_4"]

dataset_pca["pca_labels"] = kmeans_pca.labels_

dataset_pca["segment"] = dataset_pca["pca_labels"].map({0:"Cluster 1", 1:"Cluster 2", 2:"Cluster 3", 3:"Cluster 4", 4:"Cluster 5"})

# Keep only the four principal-component columns
dataset_pca_components = dataset_pca.iloc[:, 5:9]

dataset_pca_components['segment'] = dataset_pca['segment']
sns.pairplot(dataset_pca_components, hue='segment')

components = pca.components_

loading_scores = pd.DataFrame(components.T, columns=[f'PC{i+1}' for i in range(components.shape[0])], index=dataset.columns)

loading_scores_abs = loading_scores.abs().sort_values(by='PC1', ascending=False)
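
Optionally (a sketch, not in the original), the signed loadings can be shown as a heatmap to make the feature-to-component influence easier to read:

plt.figure(figsize=(8, 4))
sns.heatmap(loading_scores, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('PCA Loading Scores')
plt.show()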

df_with_segments = dataset.copy()
df_with_segments['segment'] = dataset_pca['segment']

features_to_plot = dataset.columns

# Warn about any requested column that is missing before plotting
columns_to_plot = list(features_to_plot) + ['segment']
missing = [column for column in columns_to_plot if column not in df_with_segments.columns]
if missing:
    print(f"Missing columns: {missing}")
sns.pairplot(df_with_segments[columns_to_plot], hue='segment', palette='viridis')
plt.show()
print("Influence of features to components:")
print(loading_scores_abs)

print("Top contributing features for PCA:")
print(loading_scores_abs['PC1'])
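
Finally, a short sketch (not in the original) that profiles each segment by the mean of the original features, which helps give the clusters descriptive labels:

# Mean of each original feature per segment, to characterize the clusters
print(df_with_segments.groupby('segment').mean().round(2))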