project-violet / violet

Monorepo for Project Violet
402 stars 41 forks source link

Article Recommendation 모델링 #531

Open violet-dev opened 1 month ago

violet-dev commented 1 month ago

In this case, Implicit Matrix Factorization using methods like Alternating Least Squares (ALS) can work well. Below is a PyTorch-based collaborative filtering model that handles implicit feedback using interaction data (e.g., watch_duration or a binary signal of whether a user watched a work).

import torch
import torch.nn as nn
import torch.optim as optim
import json
from torch.utils.data import Dataset, DataLoader

# Custom Dataset for loading the data
class InteractionDataset(Dataset):
    """
    Map-style dataset over a sequence of interaction records.

    Each record is indexed with the keys 'user_id', 'work_id' and
    'watch_duration'. The training target is binary: 1.0 when
    'watch_duration' is greater than zero, otherwise 0.0.
    """

    def __init__(self, data):
        # Keep a reference to the records; tensors are built lazily per item.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        Build the training triple for record ``idx``.
        :param idx: Index into the underlying record sequence
        :return: (user_id, work_id, feedback) as long, long and float32 scalar tensors
        """
        record = self.data[idx]

        # Any positive watch time counts as a positive interaction.
        if record['watch_duration'] > 0:
            watched = 1
        else:
            watched = 0

        user = torch.tensor(record['user_id'], dtype=torch.long)
        work = torch.tensor(record['work_id'], dtype=torch.long)
        label = torch.tensor(watched, dtype=torch.float32)
        return user, work, label

# Matrix Factorization Model
class MatrixFactorizationModel(nn.Module):
    """
    Plain matrix-factorization scorer.

    Holds one embedding table for users and one for works; the score of a
    (user, work) pair is the dot product of their embedding vectors.
    """

    def __init__(self, num_users, num_works, embedding_dim=50):
        """
        :param num_users: Number of rows in the user embedding table
        :param num_works: Number of rows in the work embedding table
        :param embedding_dim: Length of each embedding vector
        """
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.work_embedding = nn.Embedding(num_works, embedding_dim)

    def forward(self, user_ids, work_ids):
        """
        Score each (user, work) pair.
        :param user_ids: Tensor of user indices
        :param work_ids: Tensor of work indices, paired element-wise with user_ids
        :return: Tensor of scores, reduced over dimension 1 of the looked-up embeddings
        """
        user_vectors = self.user_embedding(user_ids)
        work_vectors = self.work_embedding(work_ids)
        # Element-wise product summed over dim 1 == per-row dot product.
        return torch.sum(user_vectors * work_vectors, dim=1)

# Load Data from JSON
def load_data(json_file):
    """
    Load interaction records from a JSON file.
    :param json_file: Path of the JSON file to read
    :return: The decoded JSON document (whatever ``json.load`` produces for the file)
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding,
    # which breaks on non-ASCII content under non-UTF-8 locales.
    with open(json_file, 'r', encoding='utf-8') as f:
        return json.load(f)

# Training Function
def train(model, dataloader, criterion, optimizer, epochs=10):
    """
    Run a basic training loop and print the summed batch loss per epoch.
    :param model: Module called as ``model(user_ids, work_ids)``
    :param dataloader: Iterable yielding (user_ids, work_ids, feedback) batches
    :param criterion: Loss callable applied to (predictions, feedback)
    :param optimizer: Optimizer stepping the model's parameters
    :param epochs: Number of passes over ``dataloader``
    """

    def run_batch(batch):
        # One optimisation step; returns the batch loss as a Python float.
        users, works, targets = batch
        optimizer.zero_grad()
        batch_loss = criterion(model(users, works), targets)
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item()

    model.train()
    for epoch in range(epochs):
        # The reported value is the sum of batch losses, not their mean.
        total_loss = 0
        for batch in dataloader:
            total_loss += run_batch(batch)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

# Predict Function
def predict(model, user_id, work_id):
    """
    Score a single (user, work) pair with the model in eval mode.
    :param model: Module called as ``model(user_ids, work_ids)``
    :param user_id: User ID to score
    :param work_id: Work ID to score
    :return: The model output for the pair as a Python number
    """
    model.eval()
    # The model works on batches, so wrap each ID in a one-element tensor.
    user_batch = torch.tensor([user_id])
    work_batch = torch.tensor([work_id])
    with torch.no_grad():
        score = model(user_batch, work_batch)
    return score.item()

# Hyperparameters
embedding_dim = 50
batch_size = 32
learning_rate = 0.001
epochs = 10

# Read the raw interaction records
data = load_data('interactions.json')

# Embedding tables are indexed by raw ID, so each table needs max(id) + 1 rows
# (this is the table size, not the count of distinct IDs)
num_users = 1 + max(d['user_id'] for d in data)
num_works = 1 + max(d['work_id'] for d in data)

# Wrap the records in a shuffled, batched loader
dataset = InteractionDataset(data)
dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

# Model, loss and optimizer
model = MatrixFactorizationModel(
    num_users=num_users,
    num_works=num_works,
    embedding_dim=embedding_dim,
)
criterion = nn.MSELoss()  # squared error against the 0/1 feedback target
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Fit the model
train(model, dataloader, criterion, optimizer, epochs)

# Score one example pair
user_id, work_id = 1, 100
predicted_feedback = predict(model, user_id, work_id)
print(f"Predicted implicit feedback for User {user_id} and Work {work_id}: {predicted_feedback:.4f}")
[
  {
    "user_id": 1,
    "work_id": 100,
    "watch_duration": 300  // in seconds, for example
  },
  {
    "user_id": 2,
    "work_id": 101,
    "watch_duration": 0  // User did not watch this work
  },
  ...
]

Explanation of Changes: Implicit Feedback: Instead of predicting ratings, we predict a score based on implicit signals like watch_duration. We convert this into a binary signal (1 if the user watched, 0 if not).

Model: Similar to before, but now trained to predict whether a user would interact with a work rather than predicting an explicit rating.

Loss Function: Mean Squared Error (MSE) is used to minimize the error between the predicted and actual interaction signals. For implicit feedback, other loss functions like Binary Cross-Entropy (BCE) could also be used if you're focusing strictly on binary interactions.

Implicit Feedback Assumptions: Users who interact with an item (watch a work) have a higher probability of being interested in similar works. Absence of interaction is treated as "unknown" or weakly negative feedback. Scalability: You can further scale this approach by experimenting with larger embedding dimensions or more sophisticated techniques such as BPR (Bayesian Personalized Ranking) or WARP (Weighted Approximate-Rank Pairwise) loss, which are specifically designed for implicit feedback.

This approach should work well for your dataset, where explicit rating data is not available.

scalability는 딱히 필요한지는 모르겠다

[
  {
    "user_id": 1,
    "work_id": 100,
    "watch_duration": 300,  // watched 300 seconds
    "duration": 600  // total length of work is 600 seconds
  },
  {
    "user_id": 2,
    "work_id": 101,
    "watch_duration": 0,  // user didn't watch
    "duration": 1200  // work is 1200 seconds long
  },
  ...
]
# Custom Dataset for loading the data
class InteractionDataset(Dataset):
    """
    Map-style dataset over a sequence of interaction records.

    Each record is indexed with the keys 'user_id', 'work_id',
    'watch_duration' and 'duration'. The training target is the watched
    fraction ``watch_duration / duration``, or 0 when 'duration' is not
    positive.
    """

    def __init__(self, data):
        # Keep a reference to the records; tensors are built lazily per item.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        Build the training triple for record ``idx``.
        :param idx: Index into the underlying record sequence
        :return: (user_id, work_id, watch_fraction) as long, long and float32 scalar tensors
        """
        record = self.data[idx]

        # Guard against division by zero for works with no recorded length.
        total_length = record['duration']
        if total_length > 0:
            watch_fraction = record['watch_duration'] / total_length
        else:
            watch_fraction = 0

        user = torch.tensor(record['user_id'], dtype=torch.long)
        work = torch.tensor(record['work_id'], dtype=torch.long)
        target = torch.tensor(watch_fraction, dtype=torch.float32)
        return user, work, target
def recommend_works_for_user(model, user_id, num_works, top_k=5):
    """
    Recommend the top K works for a given user based on model predictions.

    Every index in ``range(num_works)`` is scored and treated as a work ID,
    so the result can include IDs in that range that never occur in the
    training data.

    :param model: The trained model, called as ``model(user_ids, work_ids)``
    :param user_id: The ID of the user for whom to generate recommendations
    :param num_works: Total number of available works
    :param top_k: The number of top recommendations to return; clamped to num_works
    :return: A list of at most top_k work_ids, highest score first
    """
    model.eval()

    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    top_k = min(top_k, num_works)
    if top_k <= 0:
        return []

    # Build the inputs on the model's own device so a model moved to GPU
    # does not fail with a device mismatch.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    work_ids = torch.arange(num_works, device=model_device)  # all candidate work IDs
    user_ids = torch.full(
        (num_works,), user_id, dtype=torch.long, device=model_device
    )  # user_id repeated once per candidate

    # Predict scores for this user for all works
    with torch.no_grad():
        predicted_scores = model(user_ids, work_ids)

    # Positions in work_ids equal the work IDs themselves (it is an arange).
    _, top_k_indices = torch.topk(predicted_scores, top_k)
    return top_k_indices.tolist()

# Example usage: print the top-5 recommendations for user 1
user_id, top_k = 1, 5
num_works = num_works  # no-op: reuses the table size computed when the data was loaded

recommended_works = recommend_works_for_user(model, user_id, num_works, top_k)
print(f"Top {top_k} recommended works for User {user_id}: {recommended_works}")
violet-dev commented 1 month ago
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import json
from torch.utils.data import Dataset, DataLoader
from torchviz import make_dot

from torch.utils.tensorboard import SummaryWriter

# TensorBoard writer shared by the training code below (library-default log location)
writer = SummaryWriter()

# Report CUDA availability, then train on the first GPU when there is one
print(torch.cuda.is_available())
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    # To force CPU regardless of availability, always take this branch
    device = torch.device("cpu")
print(device)

# Custom Dataset for loading the data
class InteractionDataset(Dataset):
    """
    Map-style dataset over a sequence of interaction records.

    Each record is indexed with the keys "user_id", "work_id" and
    "watch_duration". The training target is binary: 1.0 when
    "watch_duration" is greater than zero, otherwise 0.0.
    """

    def __init__(self, data):
        # Keep a reference to the records; tensors are built lazily per item.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        Build the training triple for record ``idx``.
        :param idx: Index into the underlying record sequence
        :return: (user_id, work_id, feedback) as long, long and float32 scalar tensors
        """
        record = self.data[idx]

        # NOTE: an alternative target, the watched fraction
        # (watch_duration / duration), was tried here and is disabled;
        # the binary watched / not-watched signal is used instead.
        if record["watch_duration"] > 0:
            watched = 1
        else:
            watched = 0

        user = torch.tensor(record["user_id"], dtype=torch.long)
        work = torch.tensor(record["work_id"], dtype=torch.long)
        label = torch.tensor(watched, dtype=torch.float32)
        return (user, work, label)

# Matrix Factorization Model
class MatrixFactorizationModel(nn.Module):
    """
    Plain matrix-factorization scorer.

    Holds one embedding table for users and one for works; the score of a
    (user, work) pair is the dot product of their embedding vectors.
    """

    def __init__(self, num_users, num_works, embedding_dim=50):
        """
        :param num_users: Number of rows in the user embedding table
        :param num_works: Number of rows in the work embedding table
        :param embedding_dim: Length of each embedding vector
        """
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.work_embedding = nn.Embedding(num_works, embedding_dim)

    def forward(self, user_ids, work_ids):
        """
        Score each (user, work) pair.
        :param user_ids: Tensor of user indices
        :param work_ids: Tensor of work indices, paired element-wise with user_ids
        :return: Tensor of scores, reduced over dimension 1 of the looked-up embeddings
        """
        user_vectors = self.user_embedding(user_ids)
        work_vectors = self.work_embedding(work_ids)
        # Element-wise product summed over dim 1 == per-row dot product.
        return torch.sum(user_vectors * work_vectors, dim=1)

class EnhancedMatrixFactorizationModel(nn.Module):
    """
    Matrix factorization with additive per-user and per-work bias terms.

    The score of a (user, work) pair is the dot product of their embedding
    vectors plus the user's bias plus the work's bias.
    """

    def __init__(self, num_users, num_works, embedding_dim=50):
        """
        :param num_users: Number of rows in the user embedding and bias tables
        :param num_works: Number of rows in the work embedding and bias tables
        :param embedding_dim: Length of each embedding vector
        """
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.work_embedding = nn.Embedding(num_works, embedding_dim)
        # Biases are stored as one-column embedding tables.
        self.user_bias = nn.Embedding(num_users, 1)
        self.work_bias = nn.Embedding(num_works, 1)

    def forward(self, user_ids, work_ids):
        """
        Score each (user, work) pair.
        :param user_ids: Tensor of user indices
        :param work_ids: Tensor of work indices, paired element-wise with user_ids
        :return: Tensor of scores (dot product + user bias + work bias)
        """
        user_vectors = self.user_embedding(user_ids)
        work_vectors = self.work_embedding(work_ids)

        # squeeze() without a dim removes every size-1 axis, so a batch of one
        # becomes a 0-d bias; broadcasting in the sum below restores the shape.
        user_offset = torch.squeeze(self.user_bias(user_ids))
        work_offset = torch.squeeze(self.work_bias(work_ids))

        interaction = torch.sum(user_vectors * work_vectors, dim=1)
        return interaction + user_offset + work_offset

# Load Data from JSON
def load_data(json_file):
    """
    Load interaction records from a JSON file.
    :param json_file: Path of the JSON file to read
    :return: The decoded JSON document (whatever ``json.load`` produces for the file)
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding,
    # which breaks on non-ASCII content under non-UTF-8 locales.
    with open(json_file, "r", encoding="utf-8") as f:
        return json.load(f)

def load_checkpoint(filepath, model, optimizer):
    """
    Restore model and optimizer state from a checkpoint file.

    The file must hold a dict with the keys "epoch", "model_state_dict",
    "optimizer_state_dict" and "loss".

    :param filepath: Path of the checkpoint file
    :param model: Model whose state dict is overwritten in place
    :param optimizer: Optimizer whose state dict is overwritten in place
    :return: (epoch, loss) as stored in the checkpoint
    """
    # Map stored tensors onto the device the model currently lives on, so a
    # checkpoint written on a GPU can still be loaded on a CPU-only machine.
    first_param = next(model.parameters(), None)
    map_location = first_param.device if first_param is not None else None

    checkpoint = torch.load(filepath, map_location=map_location)
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    epoch = checkpoint["epoch"]
    loss = checkpoint["loss"]
    print(f"Checkpoint loaded from {filepath}")
    return epoch, loss

def save_checkpoint(model, optimizer, epoch, loss, filepath):
    """
    Write the training state to ``filepath`` with ``torch.save``.
    :param model: Model whose state dict is stored
    :param optimizer: Optimizer whose state dict is stored
    :param epoch: Epoch value recorded in the checkpoint
    :param loss: Loss value recorded in the checkpoint
    :param filepath: Destination path of the checkpoint file
    """
    # Assemble the payload key by key, in the order it is read back.
    state = {}
    state["epoch"] = epoch
    state["model_state_dict"] = model.state_dict()
    state["optimizer_state_dict"] = optimizer.state_dict()
    state["loss"] = loss

    torch.save(state, filepath)
    print(f"Checkpoint saved to {filepath}")

# Training Function
def train(model, dataloader, criterion, optimizer, epochs=10, start_epoch=0):
    """
    Train the model, logging batch losses to TensorBoard and checkpointing
    after every epoch.

    Relies on the module-level ``device``, ``writer``, ``tqdm`` and
    ``save_checkpoint``.

    :param model: Module called as ``model(user_ids, work_ids)``
    :param dataloader: Iterable yielding (user_ids, work_ids, feedback) batches
    :param criterion: Loss callable applied to (predictions, feedback)
    :param optimizer: Optimizer stepping the model's parameters
    :param epochs: Exclusive upper bound of the epoch range
    :param start_epoch: First epoch index to run (for resuming)
    """
    model.train()

    # TensorBoard step counter. Each batch gets its own step; logging every
    # batch of an epoch at step == epoch stacks the points on one x value.
    # When resuming, start where the earlier epochs would have ended.
    try:
        global_step = start_epoch * len(dataloader)
    except TypeError:  # dataloader without a defined length
        global_step = 0

    for epoch in range(start_epoch, epochs):
        total_loss = 0

        with tqdm(dataloader) as tepoch:
            tepoch.set_description(f"Epoch {epoch+1}")
            for user_ids, work_ids, feedback in tepoch:
                # Move the batch to the same device as the model.
                user_ids, work_ids, feedback = (
                    user_ids.to(device),
                    work_ids.to(device),
                    feedback.to(device),
                )
                optimizer.zero_grad()
                predictions = model(user_ids, work_ids)
                loss = criterion(predictions, feedback)
                loss.backward()
                optimizer.step()

                # Read the scalar once and reuse it for all reporting.
                loss_value = loss.item()
                writer.add_scalar("Loss/train", loss_value, global_step)
                global_step += 1
                total_loss += loss_value
                tepoch.set_postfix(loss=loss_value)
            # total_loss is the sum of batch losses for this epoch, not the mean.
            save_checkpoint(
                model, optimizer, epoch, total_loss, f"checkpoint_epoch_{epoch}.pth"
            )
            print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

# Predict Function
def predict(model, user_id, work_id):
    """
    Score a single (user, work) pair with the model in eval mode.

    Inputs are moved to the module-level ``device`` before the call.

    :param model: Module called as ``model(user_ids, work_ids)``
    :param user_id: User ID to score
    :param work_id: Work ID to score
    :return: The model output for the pair as a Python number
    """
    model.eval()
    with torch.no_grad():
        # The model works on batches, so wrap each ID in a one-element tensor.
        user_batch = torch.tensor([user_id]).to(device)
        work_batch = torch.tensor([work_id]).to(device)
        score = model(user_batch, work_batch)
    return score.item()

def recommend_works_for_user(model, user_id, num_works, top_k=5):
    """
    Recommend the top K works for a given user based on model predictions.

    Every index in ``range(num_works)`` is scored and treated as a work ID,
    so the result can include IDs in that range that never occur in the
    training data.

    :param model: The trained model, called as ``model(user_ids, work_ids)``
    :param user_id: The ID of the user for whom to generate recommendations
    :param num_works: Total number of available works
    :param top_k: The number of top recommendations to return; clamped to num_works
    :return: A list of at most top_k work_ids, highest score first
    """
    model.eval()

    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    top_k = min(top_k, num_works)
    if top_k <= 0:
        return []

    # Build the inputs on the model's own device; CPU tensors fed to a model
    # that was moved to GPU fail with a device mismatch.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    work_ids = torch.arange(num_works, device=model_device)  # all candidate work IDs
    user_ids = torch.full(
        (num_works,), user_id, dtype=torch.long, device=model_device
    )  # user_id repeated once per candidate

    # Predict scores for this user for all works
    with torch.no_grad():
        predicted_scores = model(user_ids, work_ids)

    # Positions in work_ids equal the work IDs themselves (it is an arange).
    _, top_k_indices = torch.topk(predicted_scores, top_k)
    return top_k_indices.tolist()

def recommend_works_for_user2(model, user_id, valid_work_ids, top_k=5):
    """
    Recommend the top K works for a given user based on model predictions, using only valid work IDs.
    :param model: The trained model, called as ``model(user_ids, work_ids)``
    :param user_id: The ID of the user for whom to generate recommendations
    :param valid_work_ids: List of valid work IDs from the dataset
    :param top_k: The number of top recommendations to return; clamped to len(valid_work_ids)
    :return: A list of at most top_k work_ids, highest score first
    """
    model.eval()

    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    num_candidates = len(valid_work_ids)
    top_k = min(top_k, num_candidates)
    if top_k <= 0:
        return []

    # Build the inputs on the model's own device; CPU tensors fed to a model
    # that was moved to GPU fail with a device mismatch.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    work_ids = torch.tensor(
        valid_work_ids, dtype=torch.long, device=model_device
    )  # candidate work IDs as a tensor
    user_ids = torch.full(
        (num_candidates,), user_id, dtype=torch.long, device=model_device
    )  # user_id repeated once per candidate

    # Predict scores for this user for all valid works
    with torch.no_grad():
        predicted_scores = model(user_ids, work_ids)

    # topk yields positions within valid_work_ids; map them back to the IDs.
    _, top_k_indices = torch.topk(predicted_scores, top_k)
    return work_ids[top_k_indices].tolist()

def recommend_works_for_user3(model, user_id, valid_work_ids, top_k=5):
    """
    Recommend up to top K works for a user, restricted to the given work IDs.
    :param model: The trained model, called as ``model(user_ids, work_ids)``
    :param user_id: The ID of the user for whom to generate recommendations
    :param valid_work_ids: List of candidate work IDs
    :param top_k: The number of top recommendations to return; clamped to len(valid_work_ids)
    :return: A list of at most top_k work_ids, highest score first
    """
    model.eval()

    num_candidates = len(valid_work_ids)
    top_k = min(top_k, num_candidates)  # Adjust top_k if necessary
    if top_k <= 0:
        # Nothing to rank; an empty list would also become a float tensor,
        # which cannot be used as an index tensor.
        return []

    # Build the inputs on the model's own device; CPU tensors fed to a model
    # that was moved to GPU fail with a device mismatch.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    work_ids = torch.tensor(valid_work_ids, dtype=torch.long, device=model_device)
    user_ids = torch.full(
        (num_candidates,), user_id, dtype=torch.long, device=model_device
    )

    with torch.no_grad():
        predicted_scores = model(user_ids, work_ids)

    # topk yields positions within valid_work_ids; map them back to the IDs.
    _, top_k_indices = torch.topk(predicted_scores, top_k)
    return work_ids[top_k_indices].tolist()

# Hyperparameters
embedding_dim = 5
batch_size = 256 * 32
learning_rate = 0.001
epochs = 10

# Read the raw interaction records
data = load_data("data.json")

# Embedding tables are indexed by raw ID, so each table needs max(id) + 1 rows
# (this is the table size, not the count of distinct IDs)
num_users = 1 + max(d["user_id"] for d in data)
num_works = 1 + max(d["work_id"] for d in data)

print(f"Num users: {num_users}")
print(f"Num works: {num_works}")

# Wrap the records in a batched loader; shuffling is turned off here, so
# batches follow the order of the records in the file
dataset = InteractionDataset(data)
dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size)

# Model (with bias terms), loss and optimizer
model = EnhancedMatrixFactorizationModel(
    num_users=num_users,
    num_works=num_works,
    embedding_dim=embedding_dim,
).to(device)
criterion = nn.MSELoss()  # squared error against the 0/1 feedback target
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Fit the model from scratch. To resume instead, obtain start_epoch from
# load_checkpoint("checkpoint_epoch_9.pth", model, optimizer).
start_epoch = 0
train(model, dataloader, criterion, optimizer, epochs, start_epoch)

# Record the model graph in TensorBoard using a small sample batch
user_ids = torch.tensor([1, 2, 3]).to(device)
work_ids = torch.tensor([4, 5, 6]).to(device)
writer.add_graph(model, [user_ids, work_ids])
writer.flush()

#
# Prediction
#

# Score one example pair
user_id, work_id = 1, 101
predicted_feedback = predict(model, user_id, work_id)
print(
    f"Predicted implicit feedback for User {user_id} and Work {work_id}: {predicted_feedback:.4f}"
)

# Top-5 over every index in range(num_works)
user_id, top_k = 1, 5
num_works = num_works  # no-op: reuses the table size computed above

recommended_works = recommend_works_for_user(model, user_id, num_works, top_k)
print(f"Top {top_k} recommended works for User {user_id}: {recommended_works}")

# Top-5 restricted to an explicit candidate list
user_id = 1
valid_work_ids = [100, 101, 102, 103]  # replace with the work_ids actually present in the dataset
top_k = 5

# Print the raw score of each candidate for user 1
for vv in valid_work_ids:
    predicted_feedback = predict(model, 1, vv)
    print(
        f"Predicted implicit feedback for User {1} and Work {vv}: {predicted_feedback:.4f}"
    )

recommended_works = recommend_works_for_user3(model, user_id, valid_work_ids, top_k)
print(f"Top {top_k} recommended works for User {user_id}: {recommended_works}")

# Sample batch for the autograd-graph visualization
user_ids = torch.tensor([1, 2, 3]).to(device)
work_ids = torch.tensor([4, 5, 6]).to(device)

# Forward pass (with gradients enabled) so there is a graph to draw
output = model(user_ids, work_ids)

# Render the graph with torchviz
dot = make_dot(
    output,
    params=dict(model.named_parameters()),
    show_attrs=True,
    show_saved=True,
)
dot.render("model_architecture", format="dot")