model to score content - Githubissues

fabriziosalmi / UglyFeed

Retrieve, aggregate, filter, evaluate, rewrite and serve RSS feeds using Large Language Models for fun, research and learning purposes

GNU Affero General Public License v3.0

119 stars 3 forks source link

import json

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Load dataset
def load_dataset(json_files):
    """Build a DataFrame of (original, rewritten, version) rows from JSON files.

    Each file must hold a JSON object with an ``original`` key and any number
    of ``version_<N>`` keys, where ``<N>`` is a decimal number. Keys that do
    not follow that pattern are ignored, so extra metadata in a file does not
    break loading.

    Args:
        json_files: Iterable of paths to the JSON files.

    Returns:
        pd.DataFrame: Columns ``original``, ``rewritten`` and ``version``,
        one row per rewritten version, ordered by file and then by version
        number.

    Raises:
        KeyError: If a file has no ``original`` key.
    """
    prefix = 'version_'
    rows = []
    for path in json_files:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        original_text = data['original']
        # Read the version number from the key itself instead of deriving it
        # from the number of keys, which breaks as soon as a file carries any
        # key other than 'original' and 'version_<N>'.
        versions = sorted(
            (int(key[len(prefix):]), key)
            for key in data
            if key.startswith(prefix) and key[len(prefix):].isdecimal()
        )
        rows.extend((original_text, data[key], number) for number, key in versions)
    return pd.DataFrame(rows, columns=['original', 'rewritten', 'version'])

# Load dataset (assuming json files are in a list called json_files)
dataset = load_dataset(json_files)

# Features are the rewritten texts; the target is the version number
X = dataset['rewritten']
y = dataset['version']

# Map the version numbers to consecutive integer class labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Create a pipeline with TF-IDF vectorization and logistic regression.
# The 'saga' solver is used because the default 'lbfgs' solver does not
# support the 'l1' penalty that is part of the search grid below.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(solver='saga', max_iter=10000))
])

# Define hyperparameter tuning space
param_grid = {
    'vectorizer__max_df': [0.5, 0.75, 1.0],
    # An integer min_df is an absolute document count and must be >= 1;
    # the float entries are proportions of documents.
    'vectorizer__min_df': [1, 0.1, 0.5],
    'clf__C': [0.1, 1, 10],
    'clf__penalty': ['l1', 'l2']
}

# Perform hyperparameter tuning using GridSearchCV
# (5-fold cross-validation, candidates ranked by macro-averaged F1)
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

# Get the best-performing model and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Evaluate the best model on the test set
# (both y_test and y_pred are in the encoded label space)
y_pred = best_model.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))
print("Test classification report:")
print(classification_report(y_test, y_pred))
print("Test confusion matrix:")
print(confusion_matrix(y_test, y_pred))

# Save the trained model to a file
# NOTE: only the pipeline is persisted; the label encoder `le` is not.
joblib.dump(best_model, 'ai_detector_model.joblib')

# Load the saved model and use it to make predictions on new data
def predict_ai_score(text):
    """Predict the version label of a single text with the saved model.

    The pipeline is loaded from 'ai_detector_model.joblib' on every call.
    Decoding relies on the module-level label encoder ``le``, which is not
    stored in the model file and therefore must already be fitted in the
    running process.

    Args:
        text (str): The rewritten news article.

    Returns:
        The predicted version number, decoded back to its original label.
    """
    loaded_model = joblib.load('ai_detector_model.joblib')
    # The pipeline vectorizes and classifies in one call; predict returns a
    # 1-D array holding one encoded label per input text.
    prediction = loaded_model.predict([text])
    # Hand the 1-D array straight to the encoder (wrapping it in a list
    # would turn it into a 2-D input) and unwrap the single decoded label.
    return le.inverse_transform(prediction)[0]


# Example usage:
new_text = "This is a rewritten news article."
ai_score = predict_ai_score(new_text)
print("AI score:", ai_score)

Improvements:

Added a LabelEncoder to map the version numbers to consecutive integer class labels before training the logistic regression model.
Changed the predict_ai_score function to use the inverse_transform method to convert the predicted integer back to the original version number.
Improved code readability by adding whitespace and using consistent naming conventions.
Added comments to explain the code and make it easier to understand.

Documentation for the PoC:

Dataset:

The dataset consists of JSON files, each containing an original news article and its rewritten versions.

Each JSON file has the following structure:

{
"original": "Original news article text",
"version_1": "Rewritten version 1 text",
"version_2": "Rewritten version 2 text",
...
}

Machine Learning Pipeline:

The pipeline consists of two stages: TF-IDF vectorization and logistic regression.
The TF-IDF vectorizer converts the text data into numerical features.
The logistic regression model predicts the version number of the rewritten article.

Hyperparameter Tuning:

The hyperparameter tuning space includes the following parameters:
- max_df and min_df for the TF-IDF vectorizer.
- C and penalty for the logistic regression model.
The GridSearchCV algorithm performs a grid search over the hyperparameter space to find the best combination of hyperparameters.

Evaluation Metrics:

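During hyperparameter tuning, candidate models are compared using the macro-averaged F1 score with 5-fold cross-validation. The best model is then evaluated on the held-out test set using accuracy, a classification report, and a confusion matrix.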

Model Deployment:

The trained model is saved to a file using joblib.
The saved model can be loaded and used to make predictions on new data.

Example Usage:

The predict_ai_score function takes in a rewritten news article text and returns the predicted version number.
The predicted version number can be used to train a separate model to detect AI-generated content.

import json

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder


class AIDetector:
    """
    A machine learning model to detect AI-generated content in news articles.

    Attributes:
        pipeline: A pipeline consisting of TF-IDF vectorization and logistic regression.
        le: A LabelEncoder to encode the version numbers as integers.
        best_params_: The hyperparameters selected by the grid search (set by train).
        X_test_: The held-out texts from the split made in train (set by train).
        y_test_: The original-label targets matching X_test_ (set by train).
    """

    def __init__(self):
        # The 'saga' solver is used because the default 'lbfgs' solver does
        # not support the 'l1' penalty that train() includes in its grid.
        self.pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer(stop_words='english')),
            ('clf', LogisticRegression(solver='saga', max_iter=10000))
        ])
        self.le = LabelEncoder()

    def load_dataset(self, json_files):
        """
        Load the dataset from JSON files.

        Each file must hold a JSON object with an ``original`` key and any
        number of ``version_<N>`` keys, where ``<N>`` is a decimal number.
        Other keys are ignored.

        Args:
            json_files (list): A list of JSON files containing the dataset.

        Returns:
            pd.DataFrame: A Pandas DataFrame with the columns ``original``,
            ``rewritten`` and ``version``, one row per rewritten version.

        Raises:
            KeyError: If a file has no ``original`` key.
        """
        prefix = 'version_'
        rows = []
        for path in json_files:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            original_text = data['original']
            # Take the version number from the key itself rather than from
            # the number of keys, so unrelated keys do not break loading.
            versions = sorted(
                (int(key[len(prefix):]), key)
                for key in data
                if key.startswith(prefix) and key[len(prefix):].isdecimal()
            )
            rows.extend((original_text, data[key], number) for number, key in versions)
        return pd.DataFrame(rows, columns=['original', 'rewritten', 'version'])

    def train(self, X, y):
        """
        Train the model on the dataset.

        Fits the label encoder, holds out 20% of the data, and runs a
        5-fold grid search (macro-averaged F1) on the remaining 80%. The
        best estimator replaces ``self.pipeline``; the held-out split is
        kept in ``X_test_`` / ``y_test_`` so it can be passed to evaluate().

        Args:
            X (pd.Series): The rewritten news articles.
            y (pd.Series): The version numbers of the rewritten articles.

        Returns:
            self: The trained model.
        """
        y_encoded = self.le.fit_transform(y)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_encoded, test_size=0.2, random_state=42
        )
        param_grid = {
            'vectorizer__max_df': [0.5, 0.75, 1.0],
            # An integer min_df is an absolute document count and must be >= 1.
            'vectorizer__min_df': [1, 0.1, 0.5],
            'clf__C': [0.1, 1, 10],
            'clf__penalty': ['l1', 'l2']
        }
        grid_search = GridSearchCV(self.pipeline, param_grid, cv=5, scoring='f1_macro')
        grid_search.fit(X_train, y_train)
        self.pipeline = grid_search.best_estimator_
        self.best_params_ = grid_search.best_params_
        # Keep the held-out data (targets decoded to the original labels).
        self.X_test_ = X_test
        self.y_test_ = self.le.inverse_transform(y_test)
        return self

    def predict(self, text):
        """
        Predict the version number of a rewritten news article.

        Args:
            text (str): The rewritten news article.

        Returns:
            int: The predicted version number.
        """
        # predict returns a 1-D array with one encoded label per input text;
        # pass it to the encoder as-is and unwrap the single decoded label.
        prediction = self.pipeline.predict([text])
        return self.le.inverse_transform(prediction)[0]

    def evaluate(self, X, y):
        """
        Evaluate the model on the test set.

        Args:
            X (pd.Series): The rewritten news articles.
            y (pd.Series): The version numbers of the rewritten articles.

        Returns:
            tuple: A tuple containing the accuracy, classification report, and confusion matrix.
        """
        # The pipeline predicts encoded labels; decode them so they are
        # compared with `y` in the same (original version number) space.
        y_pred = self.le.inverse_transform(self.pipeline.predict(X))
        accuracy = accuracy_score(y, y_pred)
        report = classification_report(y, y_pred)
        matrix = confusion_matrix(y, y_pred)
        return accuracy, report, matrix

    def save(self, filename):
        """
        Save the trained model to a file.

        The pipeline and the label encoder are stored together, because the
        encoder is needed to decode predictions after loading.

        Args:
            filename (str): The filename to save the model to.
        """
        joblib.dump({'pipeline': self.pipeline, 'label_encoder': self.le}, filename)

    def load(self, filename):
        """
        Load a trained model from a file.

        Accepts files written by save() as well as files that contain only
        a pipeline; in the latter case the label encoder is left unchanged.

        Args:
            filename (str): The filename to load the model from.

        Returns:
            self: The loaded model.
        """
        payload = joblib.load(filename)
        if isinstance(payload, dict):
            self.pipeline = payload['pipeline']
            self.le = payload['label_encoder']
        else:
            # File holds a bare pipeline without a label encoder.
            self.pipeline = payload
        return self

# Train a detector (json_files is assumed to be a list of dataset JSON paths)
ai_detector = AIDetector()
dataset = ai_detector.load_dataset(json_files)
X = dataset['rewritten']
y = dataset['version']
ai_detector.train(X, y)

# Predict with the freshly trained detector
new_text = "This is a rewritten news article."
ai_score = ai_detector.predict(new_text)
print("AI score:", ai_score)

# Save the detector, load it into a new instance, and predict again
ai_detector.save('ai_detector_model.joblib')
loaded_ai_detector = AIDetector().load('ai_detector_model.joblib')
loaded_ai_score = loaded_ai_detector.predict(new_text)
print("Loaded AI score:", loaded_ai_score)

fabriziosalmi / UglyFeed

model to score content #15