from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import json
from torch.utils.data import Dataset, DataLoader
from torchviz import make_dot
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()
print(torch.cuda.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
print(device)
# Custom Dataset for loading the data
class InteractionDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        # Compute watch percentage as implicit feedback
        # watch_percentage = (
        #     item["watch_duration"] / item["duration"] if item["duration"] > 0 else 0
        # )
        # return (
        #     torch.tensor(item["user_id"], dtype=torch.long),
        #     torch.tensor(item["work_id"], dtype=torch.long),
        #     torch.tensor(watch_percentage, dtype=torch.float32),
        # )
        # We treat watch_duration as implicit feedback (1 if watched, 0 if not)
        implicit_feedback = 1 if item["watch_duration"] > 0 else 0
        return (
            torch.tensor(item["user_id"], dtype=torch.long),
            torch.tensor(item["work_id"], dtype=torch.long),
            torch.tensor(implicit_feedback, dtype=torch.float32),
        )
# Matrix Factorization Model
class MatrixFactorizationModel(nn.Module):
    def __init__(self, num_users, num_works, embedding_dim=50):
        super(MatrixFactorizationModel, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.work_embedding = nn.Embedding(num_works, embedding_dim)

    def forward(self, user_ids, work_ids):
        user_embeds = self.user_embedding(user_ids)
        work_embeds = self.work_embedding(work_ids)
        dot_product = (user_embeds * work_embeds).sum(dim=1)  # Dot product
        return dot_product
class EnhancedMatrixFactorizationModel(nn.Module):
    def __init__(self, num_users, num_works, embedding_dim=50):
        super(EnhancedMatrixFactorizationModel, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.work_embedding = nn.Embedding(num_works, embedding_dim)
        self.user_bias = nn.Embedding(num_users, 1)
        self.work_bias = nn.Embedding(num_works, 1)

    def forward(self, user_ids, work_ids):
        user_embeds = self.user_embedding(user_ids)
        work_embeds = self.work_embedding(work_ids)
        # squeeze(-1) rather than a bare squeeze(): squeeze() collapses a
        # batch of size 1 to a 0-d tensor and breaks the addition below
        user_biases = self.user_bias(user_ids).squeeze(-1)
        work_biases = self.work_bias(work_ids).squeeze(-1)
        dot_product = (user_embeds * work_embeds).sum(dim=1)
        prediction = dot_product + user_biases + work_biases
        return prediction
# Load Data from JSON
def load_data(json_file):
    with open(json_file, "r") as f:
        data = json.load(f)
    return data
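# A hypothetical example of the expected data.json shape (a list of
# interaction records; the field values here are made up for illustration,
# only user_id, work_id, and watch_duration are read by the Dataset above):
# [
#   {"user_id": 0, "work_id": 101, "watch_duration": 1320, "duration": 1440},
#   {"user_id": 1, "work_id": 5, "watch_duration": 0, "duration": 2700}
# ]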
def load_checkpoint(filepath, model, optimizer):
    checkpoint = torch.load(filepath)
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    epoch = checkpoint["epoch"]
    loss = checkpoint["loss"]
    print(f"Checkpoint loaded from {filepath}")
    return epoch, loss


def save_checkpoint(model, optimizer, epoch, loss, filepath):
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "loss": loss,
    }
    torch.save(checkpoint, filepath)
    print(f"Checkpoint saved to {filepath}")
# Training Function
def train(model, dataloader, criterion, optimizer, epochs=10, start_epoch=0):
    model.train()
    for epoch in range(start_epoch, epochs):
        total_loss = 0
        with tqdm(dataloader) as tepoch:
            tepoch.set_description(f"Epoch {epoch+1}")
            for step, (user_ids, work_ids, feedback) in enumerate(tepoch):
                user_ids, work_ids, feedback = (
                    user_ids.to(device),
                    work_ids.to(device),
                    feedback.to(device),
                )
                optimizer.zero_grad()
                predictions = model(user_ids, work_ids)
                loss = criterion(predictions, feedback)
                # Log with a global step; logging every batch at the same
                # epoch value stacks all of an epoch's points on one x value
                writer.add_scalar(
                    "Loss/train", loss.item(), epoch * len(dataloader) + step
                )
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                tepoch.set_postfix(loss=loss.item())
        save_checkpoint(
            model, optimizer, epoch, total_loss, f"checkpoint_epoch_{epoch}.pth"
        )
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")
# Predict Function
def predict(model, user_id, work_id):
    model.eval()
    with torch.no_grad():
        prediction = model(
            torch.tensor([user_id]).to(device), torch.tensor([work_id]).to(device)
        )
    return prediction.item()
def recommend_works_for_user(model, user_id, num_works, top_k=5):
    """
    Recommend the top K works for a given user based on model predictions.

    :param model: The trained model
    :param user_id: The ID of the user for whom to generate recommendations
    :param num_works: Total number of available works
    :param top_k: The number of top recommendations to return
    :return: A list of top K work_ids
    """
    model.eval()
    # Score every possible work for this user; move the inputs to the
    # model's device so this also works when training on CUDA
    work_ids = torch.arange(num_works).to(device)
    user_ids = torch.tensor([user_id] * num_works).to(device)
    with torch.no_grad():
        predicted_scores = model(user_ids, work_ids)
    # Get the top K work IDs with the highest predicted scores
    _, top_k_indices = torch.topk(predicted_scores, top_k)
    recommended_work_ids = top_k_indices.tolist()  # Convert indices to a list
    return recommended_work_ids
def recommend_works_for_user2(model, user_id, valid_work_ids, top_k=5):
    """
    Recommend the top K works for a given user based on model predictions,
    using only valid work IDs.

    :param model: The trained model
    :param user_id: The ID of the user for whom to generate recommendations
    :param valid_work_ids: List of valid work IDs from the dataset
    :param top_k: The number of top recommendations to return
    :return: A list of top K work_ids
    """
    model.eval()
    # Convert the valid work IDs to a tensor and repeat user_id for all of
    # them, on the same device as the model
    work_ids = torch.tensor(valid_work_ids).to(device)
    user_ids = torch.tensor([user_id] * len(valid_work_ids)).to(device)
    # Predict scores for this user for all valid works
    with torch.no_grad():
        predicted_scores = model(user_ids, work_ids)
    # Map the top K indices back to actual work IDs
    _, top_k_indices = torch.topk(predicted_scores, top_k)
    recommended_work_ids = work_ids[top_k_indices].tolist()
    return recommended_work_ids
def recommend_works_for_user3(model, user_id, valid_work_ids, top_k=5):
    model.eval()
    work_ids = torch.tensor(valid_work_ids).to(device)
    user_ids = torch.tensor([user_id] * len(valid_work_ids)).to(device)
    with torch.no_grad():
        predicted_scores = model(user_ids, work_ids)
    top_k = min(top_k, len(valid_work_ids))  # Adjust top_k if necessary
    _, top_k_indices = torch.topk(predicted_scores, top_k)
    recommended_work_ids = work_ids[top_k_indices].tolist()
    return recommended_work_ids
# Hyperparameters
embedding_dim = 5
batch_size = 256 * 32
learning_rate = 0.001
epochs = 10
# Load the dataset from JSON
data = load_data("data.json")
# Determine the number of unique users and works
num_users = max([d["user_id"] for d in data]) + 1
num_works = max([d["work_id"] for d in data]) + 1
print(f"Num users: {num_users}")
print(f"Num works: {num_works}")
# Create DataLoader
dataset = InteractionDataset(data)
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
# Initialize Model, Loss Function, and Optimizer
model = EnhancedMatrixFactorizationModel(
    num_users=num_users, num_works=num_works, embedding_dim=embedding_dim
).to(device)
criterion = nn.MSELoss() # Mean Squared Error for implicit feedback
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train the model
# start_epoch, last_loss = load_checkpoint("checkpoint_epoch_9.pth", model, optimizer)
start_epoch = 0
train(model, dataloader, criterion, optimizer, epochs, start_epoch)
user_ids = torch.tensor([1, 2, 3]).to(device)
work_ids = torch.tensor([4, 5, 6]).to(device)
writer.add_graph(model, [user_ids, work_ids])
writer.flush()
#
# Prediction
#
# Example Prediction
user_id = 1
work_id = 101
predicted_feedback = predict(model, user_id, work_id)
print(
    f"Predicted implicit feedback for User {user_id} and Work {work_id}: {predicted_feedback:.4f}"
)
# Example usage
user_id = 1
top_k = 5  # Get top 5 recommendations
# num_works computed above is the total number of available works
recommended_works = recommend_works_for_user(model, user_id, num_works, top_k)
print(f"Top {top_k} recommended works for User {user_id}: {recommended_works}")
# Example usage
user_id = 1
# Actual list of available work_ids from your dataset
valid_work_ids = [100, 101, 102, 103]
top_k = 5 # Get top 5 recommendations
for vv in valid_work_ids:
    predicted_feedback = predict(model, 1, vv)
    print(
        f"Predicted implicit feedback for User 1 and Work {vv}: {predicted_feedback:.4f}"
    )
recommended_works = recommend_works_for_user3(model, user_id, valid_work_ids, top_k)
print(f"Top {top_k} recommended works for User {user_id}: {recommended_works}")
# Create dummy input
user_ids = torch.tensor([1, 2, 3]).to(device)
work_ids = torch.tensor([4, 5, 6]).to(device)
# Forward pass to get the output
output = model(user_ids, work_ids)
# Generate the visualization
dot = make_dot(
    output, params=dict(model.named_parameters()), show_attrs=True, show_saved=True
)
dot.render("model_architecture", format="dot")
In this case, Implicit Matrix Factorization using methods like Alternating Least Squares (ALS) can work well. The code above is a PyTorch-based collaborative filtering model that handles implicit feedback using interaction data (e.g., watch_duration or a binary signal of whether a user watched a work).
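For reference, here is a minimal sketch of the closed-form per-user update that implicit ALS alternates over (following the Hu, Koren, and Volinsky formulation); the function name and tensor shapes are hypothetical, and it is not wired into the SGD-trained code above:

import torch

def als_user_update(item_factors, confidence, preference, reg=0.1):
    # Solve (Y^T C Y + reg*I) x = Y^T C p for one user's factor vector x,
    # where C holds per-item confidence weights and p is the 0/1 preference
    Y = item_factors            # (num_works, k)
    C = torch.diag(confidence)  # (num_works, num_works)
    A = Y.T @ C @ Y + reg * torch.eye(Y.shape[1])
    b = Y.T @ C @ preference
    return torch.linalg.solve(A, b)  # (k,)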
Explanation of Changes:
Implicit Feedback: Instead of predicting ratings, we predict a score based on implicit signals like watch_duration, which we convert into a binary signal (1 if the user watched, 0 if not).
Model: Similar to before, but now trained to predict whether a user would interact with a work rather than predicting an explicit rating.
Loss Function: Mean Squared Error (MSE) is used to minimize the error between the predicted and actual interaction signals. For implicit feedback, other loss functions such as Binary Cross-Entropy (BCE) could also be used if you're focusing strictly on binary interactions, as sketched below.
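A minimal sketch of that BCE variant, treating the model's raw dot-product score as a logit (a drop-in replacement for the criterion defined above, assuming feedback stays a float 0/1 tensor and nn is the torch.nn import from the top of the script):

# BCEWithLogitsLoss applies the sigmoid itself, so the model is unchanged
criterion = nn.BCEWithLogitsLoss()
# The training loop stays the same: loss = criterion(predictions, feedback)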
Implicit Feedback Assumptions: Users who interact with an item (watch a work) have a higher probability of being interested in similar works. The absence of an interaction is treated as "unknown" or weakly negative feedback.
Scalability: You can scale this approach further by experimenting with larger embedding dimensions or more sophisticated techniques such as BPR (Bayesian Personalized Ranking) or WARP (Weighted Approximate-Rank Pairwise) loss, which are designed specifically for implicit feedback; a BPR sketch follows below.
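For illustration only, a minimal sketch of a BPR loss over this model; the sampling of pos_work_ids / neg_work_ids is assumed to happen elsewhere, and this is not wired into the training loop above:

import torch.nn.functional as F

def bpr_loss(model, user_ids, pos_work_ids, neg_work_ids):
    # Score a watched (positive) work against a sampled unwatched (negative)
    # work for the same user, and push the positive score above the negative
    pos_scores = model(user_ids, pos_work_ids)
    neg_scores = model(user_ids, neg_work_ids)
    return -F.logsigmoid(pos_scores - neg_scores).mean()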
This approach should work well for your dataset, where explicit rating data is not available.
I'm not sure scalability is really needed here, though.