un0o7 opened this issue 1 year ago
Thank you for reporting these results. I believe @leopoldwhite and @whr000001 are looking into this now.
Hi, thank you for your interest in our work. Could you please provide more detailed information about how you train and test BotRGCN and how you preprocess the dataset? Thanks.
```python
import torch
from torch.utils.data import Dataset
from torch import nn
from torch_geometric.nn import RGCNConv
import torch.nn.functional as F
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
import random
import numpy as np
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(2026)
class Twibot22(Dataset):
def __init__(
self,
root='./Data/',
device='cpu',
):
self.root = root
self.device = device
def train_val_test_mask(self):
train_idx = torch.load(self.root + 'train_idx.pt')
val_idx = torch.load(self.root + 'val_idx.pt')
test_idx = torch.load(self.root + 'test_idx.pt')
return train_idx, val_idx, test_idx
def dataloader(self):
labels = torch.load(self.root + 'label.pt').to(self.device)
des_tensor = torch.load(self.root + 'des_tensor.pt').to(self.device)
tweets_tensor = torch.load(self.root + 'tweets_tensor.pt').to(
self.device)
num_prop = torch.load(self.root + 'num_properties_tensor.pt').to(
self.device)
category_prop = torch.load(self.root + 'cat_properties_tensor.pt').to(
self.device)
edge_index = torch.load(self.root + 'edge_index.pt').to(self.device)
edge_type = torch.load(self.root + 'edge_type.pt').to(self.device)
train_idx, val_idx, test_idx = self.train_val_test_mask()
return des_tensor, tweets_tensor, num_prop, category_prop, edge_index, edge_type, labels, train_idx, val_idx, test_idx
class BotRGCN(nn.Module):
def __init__(self,
des_size=768,
tweet_size=768,
num_prop_size=5,
cat_prop_size=3,
embedding_dimension=128,
dropout=0.3):
super(BotRGCN, self).__init__()
self.dropout = dropout
self.linear_relu_des = nn.Sequential(
nn.Linear(des_size, int(embedding_dimension / 4)), nn.LeakyReLU())
self.linear_relu_tweet = nn.Sequential(
nn.Linear(tweet_size, int(embedding_dimension / 4)),
nn.LeakyReLU())
self.linear_relu_num_prop = nn.Sequential(
nn.Linear(num_prop_size, int(embedding_dimension / 4)),
nn.LeakyReLU())
self.linear_relu_cat_prop = nn.Sequential(
nn.Linear(cat_prop_size, int(embedding_dimension / 4)),
nn.LeakyReLU())
self.linear_relu_input = nn.Sequential(
nn.Linear(embedding_dimension, embedding_dimension),
nn.LeakyReLU())
self.rgcn = RGCNConv(embedding_dimension,
embedding_dimension,
num_relations=2)
self.linear_relu_output1 = nn.Sequential(
nn.Linear(embedding_dimension, embedding_dimension),
nn.LeakyReLU())
self.linear_output2 = nn.Linear(embedding_dimension, 2)
def forward(self, des, tweet, num_prop, cat_prop, edge_index, edge_type):
d = self.linear_relu_des(des)
t = self.linear_relu_tweet(tweet)
n = self.linear_relu_num_prop(num_prop)
c = self.linear_relu_cat_prop(cat_prop)
x = torch.cat((d, t, n, c), dim=1)
x = self.linear_relu_input(x)
x = self.rgcn(x, edge_index, edge_type)
x = F.dropout(x, p=self.dropout, training=self.training)
x = self.rgcn(x, edge_index, edge_type)
x = self.linear_relu_output1(x)
x = self.linear_output2(x)
return x
def accuracy(output, labels):
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)
def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.kaiming_uniform_(m.weight)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
embedding_size, dropout, lr, weight_decay = 32, 0.1, 1e-2, 5e-2
root = './dataset/twibot-22/'
dataset = Twibot22(root=root, device=device)
des_tensor, tweets_tensor, num_prop, category_prop, edge_index, edge_type, labels, train_idx, val_idx, test_idx = dataset.dataloader()
model = BotRGCN(cat_prop_size=3, embedding_dimension=embedding_size).to(device)
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
def train(epoch):
    model.train()
    output = model(des_tensor, tweets_tensor, num_prop, category_prop,
                   edge_index, edge_type)
    loss_train = loss(output[train_idx], labels[train_idx])
    acc_train = accuracy(output[train_idx], labels[train_idx])
    acc_val = accuracy(output[val_idx], labels[val_idx])
    optimizer.zero_grad()
    loss_train.backward()
    optimizer.step()
    print(
        'Epoch: {:04d}'.format(epoch + 1),
        'loss_train: {:.4f}'.format(loss_train.item()),
        'acc_train: {:.4f}'.format(acc_train.item()),
        'acc_val: {:.4f}'.format(acc_val.item()),
    )
    return acc_train, loss_train
def test():
    model.eval()
    output = model(des_tensor, tweets_tensor, num_prop, category_prop,
                   edge_index, edge_type)
    loss_test = loss(output[test_idx], labels[test_idx])
    acc_test = accuracy(output[test_idx], labels[test_idx])
    output = output.max(1)[1].to('cpu').detach().numpy()
    label = labels.to('cpu').detach().numpy()
    f1 = f1_score(label[test_idx], output[test_idx])
    precision = precision_score(label[test_idx], output[test_idx])
    recall = recall_score(label[test_idx], output[test_idx])
    fpr, tpr, thresholds = roc_curve(label[test_idx],
                                     output[test_idx],
                                     pos_label=1)
    Auc = auc(fpr, tpr)
    print(
        "Test set results:",
        "test_loss= {:.4f}".format(loss_test.item()),
        "test_accuracy= {:.4f}".format(acc_test.item()),
        "precision= {:.4f}".format(precision.item()),
        "recall= {:.4f}".format(recall.item()),
        "f1_score= {:.4f}".format(f1.item()),
        # "mcc= {:.4f}".format(mcc.item()),
        "auc= {:.4f}".format(Auc.item()),
    )
model.apply(init_weights)
epochs = 200
for epoch in range(epochs):
    train(epoch)
test()
```
I think your model and training code are the same as this. Could you please provide more detailed information about your processed dataset, such as how many users are in the train/val/test sets and how they are split?
I follow the same split provided in split.csv. There are 1,000,000 users in total: 700,000 for training, 200,000 for validation, and 100,000 for testing.
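For concreteness, this is roughly how the split file can be turned into the index tensors that `train_val_test_mask()` loads. This is only a minimal sketch; the column names `id`/`split` and the split value `valid` are my assumptions about split.csv, so adjust them if the released file differs:

```python
# Sketch of building train/val/test index tensors from split.csv.
# Assumptions: split.csv has columns 'id' and 'split' with values
# 'train' / 'valid' / 'test', and node indices follow the user order
# in user.json (the same order used when saving label.pt).
import torch
import pandas as pd

user = pd.read_json('user.json')
uid_index = {uid: idx for idx, uid in enumerate(user['id'].values)}

split = pd.read_csv('split.csv')
train_idx = torch.tensor([uid_index[u] for u in split[split['split'] == 'train']['id']])
val_idx = torch.tensor([uid_index[u] for u in split[split['split'] == 'valid']['id']])
test_idx = torch.tensor([uid_index[u] for u in split[split['split'] == 'test']['id']])

torch.save(train_idx, './Data/train_idx.pt')
torch.save(val_idx, './Data/val_idx.pt')
torch.save(test_idx, './Data/test_idx.pt')
```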
We used your code to train on our processed data, but we cannot reproduce your performance. This may be due to differences in preprocessing. Could you please provide your preprocessing code? I believe @leopoldwhite is looking into this now.
Due to my limited computing resources, I split the tweet files into three parts (0-2, 3-5, 6-8) and then merge them. I think that may be where the problem lies.
```python
import torch
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import ijson
user = pd.read_json('user.json')
user_idx = user['id']
uid_index = {uid: index for index, uid in enumerate(user_idx.values)}
# id_tweet={i:[] for i in range(len(user_idx))}
id_tweet = [{'index': i, 'tweets': []} for i in range(len(user_idx))]
for i in range(9):
name = 'tweet_' + str(i) + '.json'
    # stream the tweet file with ijson to keep memory usage low
    user_tweets = ijson.items(open(name, 'r'), 'item')
print("load " + name + " succ")
for each in tqdm(user_tweets):
uid = 'u' + str(each['author_id'])
text = each['text']
try:
index = uid_index[uid]
id_tweet[index]['tweets'].append(text)
except KeyError:
continue
if i == 2:
json.dump(id_tweet, open('./id_tweet0.json', 'w'))
id_tweet = [{'index': i, 'tweets': []} for i in range(len(user_idx))]
if i == 5:
json.dump(id_tweet, open('./id_tweet1.json', 'w'))
id_tweet = [{'index': i, 'tweets': []} for i in range(len(user_idx))]
print("succ")
json.dump(id_tweet, open('./id_tweet2.json', 'w'))
```
And then I merge these three parts into id_tweet.json.
```python
import torch
import ijson
from transformers import pipeline
import json
from tqdm import tqdm
tweet1_path = "id_tweet0.json"
tweet2_path = "id_tweet1.json"
tweet3_path = "id_tweet2.json"
tweets1 = ijson.items(open(tweet1_path, 'r'), "item")
tweets2 = ijson.items(open(tweet2_path, 'r'), "item")
tweets3 = ijson.items(open(tweet3_path, 'r'), "item")
count = 0
id_tweet = {i: [] for i in range(250000)}
for i, (tweet1, tweet2,
tweet3) in tqdm(enumerate(zip(tweets1, tweets2, tweets3))):
if i % 250000 == 0 and i != 0:
json.dump(id_tweet, open('./id_tweet' + str(count) + '.json', 'w'))
id_tweet = {i: [] for i in range(i, i + 250000)}
count += 1
temp = tweet1['tweets'] + tweet2['tweets'] + tweet3['tweets']
id_tweet[i].append(temp)
json.dump(id_tweet, open('./id_tweet' + str(count) + '.json', 'w'))
# re-open the iterators: ijson generators are single-pass and were consumed
# by the chunked dump above
tweets1 = ijson.items(open(tweet1_path, 'r'), "item")
tweets2 = ijson.items(open(tweet2_path, 'r'), "item")
tweets3 = ijson.items(open(tweet3_path, 'r'), "item")
id_tweet = [[] for i in range(1000000)]
for i, (tweet1, tweet2,
        tweet3) in tqdm(enumerate(zip(tweets1, tweets2, tweets3))):
temp = tweet1['tweets'] + tweet2['tweets'] + tweet3['tweets']
temp = temp[:20]
id_tweet[i].extend(temp)
json.dump(id_tweet, open('./id_tweet.json', 'w'))
```
The last part is the same as yours.
```python
import torch
from tqdm import tqdm
import numpy as np
from transformers import pipeline
import os
import pandas as pd
import json
import ijson
user_tweets = ijson.items(open("id_tweet.json", 'r'), 'item')
feature_extract = pipeline('feature-extraction',
model='roberta-base',
tokenizer='roberta-base',
device=1,
padding=True,
truncation=True,
max_length=50,
add_special_tokens=True)
def tweets_embedding():
print('Running feature2 embedding')
path = "./tweets_tensor.pt"
if True:
tweets_list = []
for i, each_user_tweets in enumerate(user_tweets):
if i % 1000 == 0:
print(i)
if len(each_user_tweets) == 0:
total_each_person_tweets = torch.zeros(768)
else:
for j in range(len(each_user_tweets)):
each_tweet = each_user_tweets[j]
if each_tweet is None:
total_word_tensor = torch.zeros(768)
else:
each_tweet_tensor = torch.tensor(
feature_extract(each_tweet))
for k, each_word_tensor in enumerate(
each_tweet_tensor[0]):
if k == 0:
total_word_tensor = each_word_tensor
else:
total_word_tensor += each_word_tensor
total_word_tensor /= each_tweet_tensor.shape[1]
if j == 0:
total_each_person_tweets = total_word_tensor
elif j == 20:
break
else:
total_each_person_tweets += total_word_tensor
if (j == 20):
total_each_person_tweets /= 20
else:
total_each_person_tweets /= len(each_user_tweets)
tweets_list.append(total_each_person_tweets)
tweet_tensor = torch.stack(tweets_list)
torch.save(tweet_tensor, path)
else:
tweets_tensor = torch.load(path)
print('Finished')
tweets_embedding()
```
I followed the same preprocessing method on TwiBot-22. However, when I run BotRGCN on it, the results are better than yours. Could you provide your processed TwiBot-22 dataset so that I can test on it?
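If sharing the full processed dataset is difficult, comparing summary statistics of the saved tensors on both sides might already narrow down where the preprocessing diverges. A minimal sketch (the directory is a placeholder; the file names follow the Twibot22 loader above):

```python
# Sketch: print shape / mean / std of each saved feature tensor so two
# preprocessed copies of the dataset can be compared side by side.
# The root directory is a placeholder; file names follow the loader above.
import torch

root = './dataset/twibot-22/'

def summarize(path):
    t = torch.load(path)
    return tuple(t.shape), t.float().mean().item(), t.float().std().item()

for name in ['des_tensor.pt', 'tweets_tensor.pt',
             'num_properties_tensor.pt', 'cat_properties_tensor.pt']:
    print(name, 'shape/mean/std:', summarize(root + name))

labels = torch.load(root + 'label.pt')
print('label distribution:', torch.bincount(labels.long()))
```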