fabriziosalmi / UglyFeed

Retrieve, aggregate, filter, evaluate, rewrite and serve RSS feeds using Large Language Models for fun, research and learning purposes
GNU Affero General Public License v3.0
119 stars 3 forks source link

model to score content #15

Open fabriziosalmi opened 4 months ago

fabriziosalmi commented 4 months ago
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import joblib
from sklearn.preprocessing import LabelEncoder

# Load dataset
def load_dataset(json_files):
    """
    Build a DataFrame of (original, rewritten, version) rows from JSON files.

    Each file must hold a JSON object with an 'original' entry and any number
    of 'version_<N>' entries (N a decimal integer). One row is produced per
    'version_<N>' entry, ordered by N within each file. Keys that are neither
    'original' nor 'version_<N>' are ignored.

    Args:
        json_files (list): Paths of the JSON files to read.

    Returns:
        pd.DataFrame: Columns 'original', 'rewritten' and 'version'; empty
        (but with those columns) when no rows were found.

    Raises:
        KeyError: If a file has no 'original' entry.
    """
    # Imported locally because the module's import block does not provide json.
    import json

    prefix = 'version_'
    rows = []
    for file in json_files:
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        original_text = data['original']
        # Select the version entries by key instead of assuming that every
        # key other than 'original' is a contiguous 'version_<N>'.
        versions = {
            int(key[len(prefix):]): value
            for key, value in data.items()
            if key.startswith(prefix) and key[len(prefix):].isdecimal()
        }
        rows.extend(
            (original_text, versions[number], number)
            for number in sorted(versions)
        )
    return pd.DataFrame(rows, columns=['original', 'rewritten', 'version'])

# Load dataset (assuming json files are in a list called json_files)
dataset = load_dataset(json_files)

# Features are the rewritten texts; the target is each rewrite's version number
X = dataset['rewritten']
y = dataset['version']

# Map the version numbers to consecutive integer class labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Create a pipeline with TF-IDF vectorization and logistic regression.
# The 'saga' solver is chosen because the grid below tries both the 'l1' and
# 'l2' penalties, and the default 'lbfgs' solver does not support 'l1'.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(solver='saga', max_iter=10000, random_state=42)),
])

# Define hyperparameter tuning space
param_grid = {
    'vectorizer__max_df': [0.5, 0.75, 1.0],
    # An integer min_df is an absolute document count and must be >= 1;
    # 1 keeps every term, which is what the former value 0 was meant to do.
    'vectorizer__min_df': [1, 0.1, 0.5],
    'clf__C': [0.1, 1, 10],
    'clf__penalty': ['l1', 'l2'],
}

# Perform hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

# Get the best-performing model and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Evaluate the best model on the test set (labels here are the encoded ones)
y_pred = best_model.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))
print("Test classification report:")
print(classification_report(y_test, y_pred))
print("Test confusion matrix:")
print(confusion_matrix(y_test, y_pred))

# Save the trained model to a file
joblib.dump(best_model, 'ai_detector_model.joblib')

# Load the saved model and use it to make predictions on new data
def predict_ai_score(text):
    """
    Predict the version label of a single text with the saved model.

    The model is read from 'ai_detector_model.joblib' on every call, and the
    module-level LabelEncoder `le` converts the encoded class back to the
    original version number.

    Args:
        text (str): The text to classify.

    Returns:
        The version label that `le` maps the predicted class to.
    """
    # NOTE: joblib files are pickle-based; only load files you trust.
    loaded_model = joblib.load('ai_detector_model.joblib')
    # The pipeline applies the vectorizer and the classifier in one call.
    prediction = loaded_model.predict([text])
    # `prediction` is already a 1-D array with one element, so it is passed
    # as-is; wrapping it in another list would hand the encoder a 2-D array.
    return le.inverse_transform(prediction)[0]

# Demo: classify one sample sentence with the saved model and show the label.
new_text = "This is a rewritten news article."
ai_score = predict_ai_score(text=new_text)
print("AI score:", ai_score)

Improvements:

Documentation for the PoC:

Dataset:

Hyperparameter Tuning:

Evaluation Metrics:

Model Deployment:

Example Usage:

fabriziosalmi commented 4 months ago
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import joblib
from sklearn.preprocessing import LabelEncoder

class AIDetector:
    """
    A text classifier that predicts which rewrite version a text belongs to.

    Attributes:
        pipeline: A pipeline consisting of TF-IDF vectorization and logistic regression.
        le: A LabelEncoder to encode the version numbers as integers.
    """

    def __init__(self):
        # 'saga' is used because train() searches over both the 'l1' and 'l2'
        # penalties, and the default 'lbfgs' solver does not support 'l1'.
        self.pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer(stop_words='english')),
            ('clf', LogisticRegression(solver='saga', max_iter=10000, random_state=42)),
        ])
        self.le = LabelEncoder()

    def load_dataset(self, json_files):
        """
        Load the dataset from JSON files.

        Each file must hold a JSON object with an 'original' entry and any
        number of 'version_<N>' entries (N a decimal integer). One row is
        produced per 'version_<N>' entry, ordered by N within each file;
        other keys are ignored.

        Args:
            json_files (list): A list of JSON files containing the dataset.

        Returns:
            pd.DataFrame: A Pandas DataFrame with the columns 'original',
            'rewritten' and 'version'.

        Raises:
            KeyError: If a file has no 'original' entry.
        """
        # Imported locally because the module's import block does not provide json.
        import json

        prefix = 'version_'
        rows = []
        for file in json_files:
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            original_text = data['original']
            # Select the version entries by key instead of assuming that every
            # key other than 'original' is a contiguous 'version_<N>'.
            versions = {
                int(key[len(prefix):]): value
                for key, value in data.items()
                if key.startswith(prefix) and key[len(prefix):].isdecimal()
            }
            rows.extend(
                (original_text, versions[number], number)
                for number in sorted(versions)
            )
        return pd.DataFrame(rows, columns=['original', 'rewritten', 'version'])

    def train(self, X, y):
        """
        Train the model on the dataset.

        Fits the label encoder on `y`, sets aside 20% of the data, and runs a
        5-fold grid search (macro F1) on the remaining 80%. The best estimator
        replaces `self.pipeline`.

        Args:
            X (pd.Series): The rewritten news articles.
            y (pd.Series): The version numbers of the rewritten articles.

        Returns:
            self: The trained model.
        """
        y_encoded = self.le.fit_transform(y)
        # The held-out 20% is excluded from the search but not used further
        # in this method.
        X_train, _X_test, y_train, _y_test = train_test_split(
            X, y_encoded, test_size=0.2, random_state=42
        )

        param_grid = {
            'vectorizer__max_df': [0.5, 0.75, 1.0],
            # An integer min_df is an absolute document count and must be
            # >= 1; 1 keeps every term, as the former value 0 intended.
            'vectorizer__min_df': [1, 0.1, 0.5],
            'clf__C': [0.1, 1, 10],
            'clf__penalty': ['l1', 'l2'],
        }

        grid_search = GridSearchCV(self.pipeline, param_grid, cv=5, scoring='f1_macro')
        grid_search.fit(X_train, y_train)

        self.pipeline = grid_search.best_estimator_
        return self

    def predict(self, text):
        """
        Predict the version number of a rewritten news article.

        Args:
            text (str): The rewritten news article.

        Returns:
            int: The predicted version number.
        """
        # The pipeline applies the vectorizer and the classifier in one call.
        prediction = self.pipeline.predict([text])
        # `prediction` is already 1-D; wrapping it in another list would hand
        # the encoder a 2-D array.
        return self.le.inverse_transform(prediction)[0]

    def evaluate(self, X, y):
        """
        Evaluate the model on a labelled set.

        Args:
            X (pd.Series): The rewritten news articles.
            y (pd.Series): The version numbers of the rewritten articles.

        Returns:
            tuple: A tuple containing the accuracy, classification report, and confusion matrix.
        """
        # The pipeline was fitted on encoded labels, so its predictions are
        # decoded back to version numbers before being compared with `y`.
        y_pred = self.le.inverse_transform(self.pipeline.predict(X))
        accuracy = accuracy_score(y, y_pred)
        report = classification_report(y, y_pred)
        matrix = confusion_matrix(y, y_pred)
        return accuracy, report, matrix

    def save(self, filename):
        """
        Save the trained model to a file.

        The pipeline and the fitted label encoder are stored together so that
        a detector restored with load() can decode its predictions.

        Args:
            filename (str): The filename to save the model to.
        """
        joblib.dump({'pipeline': self.pipeline, 'label_encoder': self.le}, filename)

    def load(self, filename):
        """
        Load a trained model from a file.

        Accepts files written by save() as well as older files that contain
        only the pipeline; for the latter the label encoder is left as it is.

        Args:
            filename (str): The filename to load the model from.

        Returns:
            self: The loaded model.
        """
        # NOTE: joblib files are pickle-based; only load files you trust.
        payload = joblib.load(filename)
        if isinstance(payload, dict):
            self.pipeline = payload['pipeline']
            self.le = payload['label_encoder']
        else:
            # Legacy format: the file holds the bare pipeline.
            self.pipeline = payload
        return self

Documentation:

Class: AIDetector

Attributes:

Methods:

Example Usage:

# Train a detector on the rewritten texts and their version numbers.
ai_detector = AIDetector()
dataset = ai_detector.load_dataset(json_files)
X, y = dataset['rewritten'], dataset['version']
ai_detector.train(X, y)

# Classify one sample sentence with the freshly trained detector.
new_text = "This is a rewritten news article."
ai_score = ai_detector.predict(new_text)
print("AI score:", ai_score)

# Persist the detector, then restore it into a new instance and classify again.
ai_detector.save('ai_detector_model.joblib')

loaded_ai_detector = AIDetector()
loaded_ai_detector.load('ai_detector_model.joblib')
loaded_ai_score = loaded_ai_detector.predict(new_text)
print("Loaded AI score:", loaded_ai_score)