Open fabriziosalmi opened 4 months ago
import json

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
class AIDetector:
    """
    A machine learning model to detect AI-generated content in news articles.

    The model is trained to predict which rewritten version (the "version
    number") a piece of text corresponds to.

    Attributes:
        pipeline: A pipeline consisting of TF-IDF vectorization and logistic regression.
        le: A LabelEncoder to encode the version numbers as integers.
    """

    # Prefix of the JSON keys that hold rewritten versions ('version_1', ...).
    _VERSION_PREFIX = 'version_'

    def __init__(self):
        self.pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer(stop_words='english')),
            ('clf', LogisticRegression(max_iter=10000)),
        ])
        self.le = LabelEncoder()

    @classmethod
    def _numbered_versions(cls, data):
        """Return ``(number, text)`` pairs for every ``version_<N>`` key, sorted by N."""
        prefix = cls._VERSION_PREFIX
        numbered = []
        for key, value in data.items():
            if not key.startswith(prefix):
                continue
            suffix = key[len(prefix):]
            # Ignore keys such as 'version_notes' that merely share the prefix.
            if suffix.isdecimal():
                numbered.append((int(suffix), value))
        numbered.sort(key=lambda pair: pair[0])
        return numbered

    def load_dataset(self, json_files):
        """
        Load the dataset from JSON files.

        Each file must contain an 'original' key plus any number of
        'version_<N>' keys. Keys that do not match that pattern are ignored
        instead of being counted as versions.

        Args:
            json_files (list): A list of JSON files containing the dataset.

        Returns:
            pd.DataFrame: A Pandas DataFrame with the columns 'original',
            'rewritten' and 'version', one row per rewritten version.

        Raises:
            KeyError: If a file has no 'original' key.
        """
        dataset = []
        for file in json_files:
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            original_text = data['original']
            dataset.extend(
                (original_text, rewritten, number)
                for number, rewritten in self._numbered_versions(data)
            )
        return pd.DataFrame(dataset, columns=['original', 'rewritten', 'version'])

    def train(self, X, y, param_grid=None, cv=5):
        """
        Train the model on the dataset.

        The label encoder is fitted on all of ``y``. A grid search is then run
        on an 80% training split, and the best estimator replaces
        ``self.pipeline``.

        Args:
            X (pd.Series): The rewritten news articles.
            y (pd.Series): The version numbers of the rewritten articles.
            param_grid (dict or list of dict, optional): Grid passed to
                GridSearchCV. Defaults to the built-in grid below.
            cv (int): Number of cross-validation folds. Defaults to 5.

        Returns:
            self: The trained model.
        """
        y_encoded = self.le.fit_transform(y)
        # Only the training portion takes part in the search. The held-out 20%
        # is discarded here; callers can reproduce it with
        # train_test_split(X, y, test_size=0.2, random_state=42).
        X_train, _, y_train, _ = train_test_split(
            X, y_encoded, test_size=0.2, random_state=42
        )
        if param_grid is None:
            shared = {
                'vectorizer__max_df': [0.5, 0.75, 1.0],
                # Integer 1 means "at least one document", i.e. no lower cut-off.
                # Integer 0 is rejected by recent scikit-learn releases.
                'vectorizer__min_df': [1, 0.1, 0.5],
                'clf__C': [0.1, 1, 10],
            }
            # The default 'lbfgs' solver does not support L1, so the L1 branch
            # of the grid switches to 'saga', which does.
            param_grid = [
                dict(shared, clf__penalty=['l2']),
                dict(shared, clf__penalty=['l1'], clf__solver=['saga']),
            ]
        grid_search = GridSearchCV(self.pipeline, param_grid, cv=cv, scoring='f1_macro')
        grid_search.fit(X_train, y_train)
        self.pipeline = grid_search.best_estimator_
        return self

    def predict(self, text):
        """
        Predict the version number of a rewritten news article.

        Args:
            text (str): The rewritten news article.

        Returns:
            The predicted version number, decoded back to the label space the
            model was trained on.
        """
        # Run the whole pipeline on a one-element batch. The result is already
        # 1-D, so it goes to the encoder as is.
        encoded = self.pipeline.predict([text])
        return self.le.inverse_transform(encoded)[0]

    def evaluate(self, X, y):
        """
        Evaluate the model on the test set.

        Args:
            X (pd.Series): The rewritten news articles.
            y (pd.Series): The version numbers of the rewritten articles.

        Returns:
            tuple: A tuple containing the accuracy, classification report, and confusion matrix.
        """
        # The pipeline predicts encoded labels. Decode them so they are
        # compared with `y` in the same label space.
        y_pred = self.le.inverse_transform(self.pipeline.predict(X))
        accuracy = accuracy_score(y, y_pred)
        report = classification_report(y, y_pred)
        matrix = confusion_matrix(y, y_pred)
        return accuracy, report, matrix

    def save(self, filename):
        """
        Save the trained model to a file.

        The pipeline and the fitted label encoder are stored together so that
        a loaded detector can decode its predictions.

        Args:
            filename (str): The filename to save the model to.
        """
        joblib.dump({'pipeline': self.pipeline, 'label_encoder': self.le}, filename)

    def load(self, filename):
        """
        Load a trained model from a file.

        Accepts files written by ``save`` (a dict with the keys 'pipeline' and
        'label_encoder') and older files that contain only the pipeline. In
        the second case the current label encoder is left unchanged.

        Args:
            filename (str): The filename to load the model from.

        Returns:
            self: The loaded model.
        """
        # SECURITY: joblib files are pickles. Only load files from a trusted source.
        payload = joblib.load(filename)
        if isinstance(payload, dict):
            self.pipeline = payload['pipeline']
            self.le = payload['label_encoder']
        else:
            self.pipeline = payload
        return self
Documentation:
Class: AIDetector
Attributes:
pipeline: A pipeline consisting of TF-IDF vectorization and logistic regression.
le: A LabelEncoder to encode the version numbers as integers.
Methods:
load_dataset: Load the dataset from JSON files.
train: Train the model on the dataset.
predict: Predict the version number of a rewritten news article.
evaluate: Evaluate the model on the test set.
save: Save the trained model to a file.
load: Load a trained model from a file.
Example Usage:
# Create a detector with an untrained TF-IDF + logistic regression pipeline.
ai_detector = AIDetector()
# `json_files` is not defined in this snippet; supply the list of dataset file paths.
dataset = ai_detector.load_dataset(json_files)
# Features are the rewritten texts; labels are their version numbers.
X = dataset['rewritten']
y = dataset['version']
ai_detector.train(X, y)
new_text = "This is a rewritten news article."
# The value printed as "AI score" is whatever predict() returns for the text.
ai_score = ai_detector.predict(new_text)
print("AI score:", ai_score)
# Persist the model, then load it into a fresh instance and predict again.
ai_detector.save('ai_detector_model.joblib')
loaded_ai_detector = AIDetector().load('ai_detector_model.joblib')
loaded_ai_score = loaded_ai_detector.predict(new_text)
print("Loaded AI score:", loaded_ai_score)
Improvements:
Added a LabelEncoder to encode the version numbers as integers, which is required for logistic regression.
Updated the predict_ai_score function to use the inverse_transform method to convert the predicted integer back to the original version number.
Documentation for the PoC:
Dataset:
The dataset consists of JSON files, each containing an original news article and its rewritten versions.
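Each JSON file has the following structure: {"original": "<original article>", "version_1": "<first rewritten version>", "version_2": "<second rewritten version>", ...}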
Machine Learning Pipeline:
The pipeline consists of two stages: TF-IDF vectorization and logistic regression.
The TF-IDF vectorizer converts the text data into numerical features.
The logistic regression model predicts the version number of the rewritten article.
Hyperparameter Tuning:
max_df and min_df for the TF-IDF vectorizer.
C and penalty for the logistic regression model.
Evaluation Metrics:
Model Deployment:
Example Usage:
The predict_ai_score function takes in a rewritten news article text and returns the predicted version number.