import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
# Load the dataset
# NOTE(review): Kaggle download links usually require an authenticated
# session, so this URL may not return raw CSV -- confirm, or point `url` at a
# locally downloaded copy of the file.
url = "https://www.kaggle.com/datasets/kartik2112/fraud-detection/download"
df = pd.read_csv(url)
# Explore the data: first rows, schema, summary statistics, missing values.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print(df.describe())
print(df.isnull().sum())
# Visualize the distribution of the target variable
sns.countplot(x='isFraud', data=df)
plt.title('Distribution of Fraudulent and Non-Fraudulent Transactions')
plt.show()
# Data Preprocessing
# Assuming 'isFraud' is the target variable and the rest are features.
# NOTE(review): StandardScaler needs numeric input -- confirm that every
# remaining column is numeric, or encode/drop the others first.
X = df.drop('isFraud', axis=1)
y = df['isFraud']

# Split the data into training and testing sets *before* scaling, so that no
# statistics from the test rows leak into the scaler. Stratifying on y keeps
# the class ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Normalize/Standardize the data: fit on the training split only, then apply
# the same transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Function to evaluate model performance
def _positive_class_scores(model, X_test, y_pred):
    """Return continuous positive-class scores for ROC AUC when available.

    Falls back to the hard predictions ``y_pred`` for models that expose
    neither ``predict_proba`` nor ``decision_function``.
    """
    if hasattr(model, "predict_proba"):
        # Column 1 holds the probability of the second (greater) class label.
        return model.predict_proba(X_test)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(X_test)
    return y_pred


def evaluate_model(model, X_test, y_test):
    """Print classification metrics for a fitted model on a test set.

    Args:
        model: Fitted estimator providing ``predict``; ``predict_proba`` or
            ``decision_function`` is used for ROC AUC when present.
        X_test: Feature matrix to predict on.
        y_test: True labels corresponding to ``X_test``.

    Returns:
        None. Accuracy, precision, recall, F1, ROC AUC, the confusion matrix
        and the classification report are printed to stdout.
    """
    y_pred = model.predict(X_test)
    # ROC AUC is a ranking metric: feeding it hard 0/1 labels collapses the
    # curve to a single threshold, so score-based input is used instead.
    y_score = _positive_class_scores(model, X_test, y_pred)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_score))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
# Train Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
print("Logistic Regression Performance:")
evaluate_model(log_reg, X_test, y_test)
# Train Decision Tree
# A fixed seed keeps the fitted tree (and its reported metrics) reproducible
# from run to run.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
print("Decision Tree Performance:")
evaluate_model(decision_tree, X_test, y_test)
# Train Random Forest
# A fixed seed keeps the bootstrap samples and feature subsets (and therefore
# the reported metrics) reproducible from run to run.
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)
print("Random Forest Performance:")
evaluate_model(random_forest, X_test, y_test)