"""Train and compare three SMS spam classifiers on TF-IDF features.

The script reads ``spam.csv`` from the current working directory, keeps the
``v1`` (label) and ``v2`` (message text) columns, encodes the labels as
0 (ham) / 1 (spam), holds out 20% of the rows for testing, and then fits a
Multinomial Naive Bayes, a Logistic Regression and a Support Vector Machine
model on TF-IDF vectors of the training messages. For each model the test-set
accuracy and the per-class classification report are printed.
"""
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv('spam.csv', encoding='latin-1')
df = df[['v1', 'v2']]  # Keep only 'v1' (labels) and 'v2' (SMS text)
df.columns = ['label', 'text']  # Rename columns for clarity

# Data preprocessing
# Convert labels to binary (0 for ham, 1 for spam). Any label other than
# 'ham' or 'spam' becomes NaN under Series.map.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Splitting the data into training and testing sets
# (80% train / 20% test; random_state fixed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Feature extraction using TF-IDF
# The vocabulary and IDF weights are fitted on the training messages only;
# the test messages are transformed with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize classifiers
classifiers = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Support Vector Machine': SVC(),
}

# Train and evaluate each classifier
for clf_name, clf in classifiers.items():
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    # target_names follows the label encoding above: 0 -> ham, 1 -> spam.
    report = classification_report(y_test, y_pred, target_names=['ham', 'spam'])
    # Report the metrics; without this the computed values are discarded on
    # the next loop iteration.
    print(f'{clf_name}')
    print(f'Accuracy: {accuracy:.4f}')
    print(report)