Open sarthakforwet opened 3 weeks ago
import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error, r2_score import matplotlib.pyplot as plt import seaborn as sns
def load_data(file_path):
    """Read a CSV file into a DataFrame and report that loading succeeded.

    Args:
        file_path: Path (or file-like object) handed to ``pandas.read_csv``.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    frame = pd.read_csv(file_path)
    print("Data successfully loaded.")
    return frame
def exploratory_data_analysis(data, target_variable):
    """Print summary views of ``data`` and plot its correlations and target distribution.

    Prints the first rows, the ``DataFrame.info()`` report and descriptive
    statistics, then shows a correlation heatmap followed by a histogram
    (with KDE) of the target column.

    Args:
        data: pandas DataFrame to inspect.
        target_variable: Name of the column in ``data`` whose distribution
            is plotted.
    """
    print("First 5 rows:\n", data.head())

    # DataFrame.info() writes its report itself and returns None, so it must
    # not be passed to print(): that would emit the report before the label
    # and then print a stray "None".
    print("Data information:")
    data.info()

    print("Descriptive statistics:\n", data.describe())

    # Check correlation
    # Only numeric/boolean columns are correlated: DataFrame.corr() raises on
    # string/object columns in pandas >= 2.0.
    numeric_data = data.select_dtypes(include=["number", "bool"])
    plt.figure(figsize=(10, 8))
    sns.heatmap(numeric_data.corr(), annot=True, cmap="coolwarm")
    plt.title("Correlation Matrix")
    plt.show()

    # Target variable distribution
    sns.histplot(data[target_variable], kde=True)
    plt.title(f"Distribution of Target Variable: {target_variable}")
    plt.show()
def preprocess_data(data, target_variable):
    """Separate features from the target, split into train/test, and standardize.

    Args:
        data: pandas DataFrame holding both the features and the target column.
        target_variable: Name of the target column; every other column is
            treated as a feature.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` where the feature
        parts have been passed through ``StandardScaler``.
    """
    target = data[target_variable]
    features = data.drop(columns=[target_variable])

    # Hold out 20% of the rows for testing; the fixed seed keeps the split
    # reproducible between runs.
    train_features, test_features, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=42,
    )

    # The scaler is fitted on the training features only and then reused
    # as-is for the test features.
    standardizer = StandardScaler()
    X_train = standardizer.fit_transform(train_features)
    X_test = standardizer.transform(test_features)

    print("Data preprocessing complete.")
    return X_train, X_test, y_train, y_test
def train_model(X_train, y_train):
    """Fit a ``LinearRegression`` on the training data and return it.

    Args:
        X_train: Training features.
        y_train: Training target values.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    print("Model trained successfully.")
    return regressor
def evaluate_model(model, X_test, y_test):
    """Report MSE and R^2 for ``model`` on the test set and plot its predictions.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features passed to ``model.predict``.
        y_test: True target values for the test rows.
    """
    predictions = model.predict(X_test)

    # Compute both metrics before printing anything.
    mean_sq_error = mean_squared_error(y_test, predictions)
    r_squared = r2_score(y_test, predictions)

    print(f"Mean Squared Error (MSE): {mean_sq_error}")
    print(f"R^2 Score: {r_squared}")

    # Scatter of true values (x axis) against predicted values (y axis).
    plt.scatter(y_test, predictions, alpha=0.7)
    plt.xlabel("Actual Values")
    plt.ylabel("Predictions")
    plt.title("Actual Values vs. Predictions")
    plt.show()
def main(file_path, target_variable):
    """Run the full pipeline: load, explore, preprocess, train, evaluate.

    Args:
        file_path: Location of the CSV dataset, forwarded to ``load_data``.
        target_variable: Name of the column to predict.
    """
    dataset = load_data(file_path)
    exploratory_data_analysis(dataset, target_variable)

    splits = preprocess_data(dataset, target_variable)
    train_features, test_features, train_target, test_target = splits

    fitted_model = train_model(train_features, train_target)
    evaluate_model(fitted_model, test_features, test_target)
file_path = "path_to_your_dataset.csv" target_variable = "your_target_variable_name" main(file_path, target_variable) Explanation: Load the dataset: Reads data from a CSV file. Exploratory Data Analysis (EDA): Displays initial data, statistics, and correlation matrix for understanding relationships. Data Preprocessing: Splits the dataset into training and testing sets and applies standardization. Train the Model: Trains a linear regression model on the training data. Evaluate the Model: Calculates Mean Squared Error and R^2 Score and plots predictions vs. actual values for model performance.
@DhanushNehru, I would like to add the following script -
A Python Script to perform data analysis and processing for a Regression task given a dataset and the target variable as input.