import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
# Load the dataset
# NOTE(review): this points at a Kaggle download page. Kaggle downloads
# normally need an authenticated session, so read_csv may receive something
# other than CSV here -- confirm, or point `url` at a local copy of the data.
url = "https://www.kaggle.com/datasets/hijest/genre-classification-dataset-imdb/download"
df = pd.read_csv(url)

# Explore the data
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())
print(df.isnull().sum())
# Text Preprocessing function

# Maps every ASCII punctuation character to a space. Replacing (rather than
# deleting) keeps words separated when punctuation was the only delimiter,
# e.g. "end.Start" or "well-known".
_PUNCTUATION_TO_SPACE = str.maketrans(
    string.punctuation, " " * len(string.punctuation)
)


def preprocess_text(text):
    """Return a normalized copy of *text* for vectorization.

    The text is lowercased, ASCII punctuation is replaced by spaces, and runs
    of whitespace are collapsed to a single space with the ends trimmed.

    Args:
        text: The raw text. Any non-string value (for example ``None`` or a
            NaN produced by a missing cell) is treated as empty.

    Returns:
        The cleaned string; ``""`` when *text* is not a string.
    """
    if not isinstance(text, str):
        return ""
    without_punctuation = text.lower().translate(_PUNCTUATION_TO_SPACE)
    # split() with no argument drops leading/trailing whitespace and treats
    # any whitespace run as one separator.
    return " ".join(without_punctuation.split())
# Clean every plot summary before vectorizing.
df['clean_plot'] = df['plot'].apply(preprocess_text)

# Target variable
y = df['genre']

# Split the data into training and testing sets.
# The split is done on the cleaned text, before vectorizing, so that the
# TF-IDF vocabulary and IDF weights below are learned from training rows only
# and nothing about the test rows leaks into the features.
plots_train, plots_test, y_train, y_test = train_test_split(
    df['clean_plot'], y, test_size=0.3, random_state=42
)

# Convert text data into numerical features using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = tfidf_vectorizer.fit_transform(plots_train)
X_test = tfidf_vectorizer.transform(plots_test)

# Feature matrix for the whole corpus, kept so that any code referring to `X`
# continues to work; it uses the vectorizer fitted on the training rows.
X = tfidf_vectorizer.transform(df['clean_plot'])
# Function to evaluate model performance
def evaluate_model(model, X_test, y_test):
    """Print classification metrics for *model* on a held-out set.

    Predictions are computed once with ``model.predict(X_test)`` and compared
    against *y_test*. Accuracy, weighted precision, weighted recall, weighted
    F1, the confusion matrix and the classification report are printed to
    stdout.

    Args:
        model: A fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: True labels corresponding to *X_test*.

    Returns:
        None. All output goes to stdout.
    """
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", recall_score(y_test, y_pred, average='weighted'))
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
# Train and evaluate each classifier on the same train/test split.
# The estimators keep their module-level names so they remain available
# after this section runs.
nb_model = MultinomialNB()
log_reg = LogisticRegression(max_iter=200)
svm_model = SVC(kernel='linear')

for model_label, model in (
    ("Naive Bayes", nb_model),
    ("Logistic Regression", log_reg),
    ("Support Vector Machine", svm_model),
):
    model.fit(X_train, y_train)
    print(f"{model_label} Performance:")
    evaluate_model(model, X_test, y_test)