un0o7 opened this issue 1 year ago
Thank you for reporting these results. I believe @leopoldwhite and @whr000001 are looking into this now.
Hi, thank you for your interest in our work. Could you please provide more detailed information about how you train and test BotRGCN and how you preprocess the dataset? Thanks.
```python
import torch
from torch.utils.data import Dataset
from torch import nn
from torch_geometric.nn import RGCNConv
import torch.nn.functional as F
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
import random
import numpy as np
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(2026)
class Twibot22(Dataset):
def __init__(
self,
root='./Data/',
device='cpu',
):
self.root = root
self.device = device
def train_val_test_mask(self):
train_idx = torch.load(self.root + 'train_idx.pt')
val_idx = torch.load(self.root + 'val_idx.pt')
test_idx = torch.load(self.root + 'test_idx.pt')
return train_idx, val_idx, test_idx
def dataloader(self):
labels = torch.load(self.root + 'label.pt').to(self.device)
des_tensor = torch.load(self.root + 'des_tensor.pt').to(self.device)
tweets_tensor = torch.load(self.root + 'tweets_tensor.pt').to(
self.device)
num_prop = torch.load(self.root + 'num_properties_tensor.pt').to(
self.device)
category_prop = torch.load(self.root + 'cat_properties_tensor.pt').to(
self.device)
edge_index = torch.load(self.root + 'edge_index.pt').to(self.device)
edge_type = torch.load(self.root + 'edge_type.pt').to(self.device)
train_idx, val_idx, test_idx = self.train_val_test_mask()
return des_tensor, tweets_tensor, num_prop, category_prop, edge_index, edge_type, labels, train_idx, val_idx, test_idx
class BotRGCN(nn.Module):
def __init__(self,
des_size=768,
tweet_size=768,
num_prop_size=5,
cat_prop_size=3,
embedding_dimension=128,
dropout=0.3):
super(BotRGCN, self).__init__()
self.dropout = dropout
self.linear_relu_des = nn.Sequential(
nn.Linear(des_size, int(embedding_dimension / 4)), nn.LeakyReLU())
self.linear_relu_tweet = nn.Sequential(
nn.Linear(tweet_size, int(embedding_dimension / 4)),
nn.LeakyReLU())
self.linear_relu_num_prop = nn.Sequential(
nn.Linear(num_prop_size, int(embedding_dimension / 4)),
nn.LeakyReLU())
self.linear_relu_cat_prop = nn.Sequential(
nn.Linear(cat_prop_size, int(embedding_dimension / 4)),
nn.LeakyReLU())
self.linear_relu_input = nn.Sequential(
nn.Linear(embedding_dimension, embedding_dimension),
nn.LeakyReLU())
self.rgcn = RGCNConv(embedding_dimension,
embedding_dimension,
num_relations=2)
self.linear_relu_output1 = nn.Sequential(
nn.Linear(embedding_dimension, embedding_dimension),
nn.LeakyReLU())
self.linear_output2 = nn.Linear(embedding_dimension, 2)
def forward(self, des, tweet, num_prop, cat_prop, edge_index, edge_type):
d = self.linear_relu_des(des)
t = self.linear_relu_tweet(tweet)
n = self.linear_relu_num_prop(num_prop)
c = self.linear_relu_cat_prop(cat_prop)
x = torch.cat((d, t, n, c), dim=1)
x = self.linear_relu_input(x)
x = self.rgcn(x, edge_index, edge_type)
x = F.dropout(x, p=self.dropout, training=self.training)
x = self.rgcn(x, edge_index, edge_type)
x = self.linear_relu_output1(x)
x = self.linear_output2(x)
return x
def accuracy(output, labels):
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)
def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.kaiming_uniform_(m.weight)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
embedding_size, dropout, lr, weight_decay = 32, 0.1, 1e-2, 5e-2
root = './dataset/twibot-22/'
dataset = Twibot22(root=root, device=device)
des_tensor, tweets_tensor, num_prop, category_prop, edge_index, edge_type, labels, train_idx, val_idx, test_idx = dataset.dataloader()
model = BotRGCN(cat_prop_size=3, embedding_dimension=embedding_size).to(device)
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
def train(epoch):
    model.train()
    output = model(des_tensor, tweets_tensor, num_prop, category_prop,
                   edge_index, edge_type)
    loss_train = loss(output[train_idx], labels[train_idx])
    acc_train = accuracy(output[train_idx], labels[train_idx])
    acc_val = accuracy(output[val_idx], labels[val_idx])
    optimizer.zero_grad()
    loss_train.backward()
    optimizer.step()
    print(
        'Epoch: {:04d}'.format(epoch + 1),
        'loss_train: {:.4f}'.format(loss_train.item()),
        'acc_train: {:.4f}'.format(acc_train.item()),
        'acc_val: {:.4f}'.format(acc_val.item()),
    )
    return acc_train, loss_train
def test():
    model.eval()
    output = model(des_tensor, tweets_tensor, num_prop, category_prop,
                   edge_index, edge_type)
    loss_test = loss(output[test_idx], labels[test_idx])
    acc_test = accuracy(output[test_idx], labels[test_idx])
    output = output.max(1)[1].to('cpu').detach().numpy()
    label = labels.to('cpu').detach().numpy()
    f1 = f1_score(label[test_idx], output[test_idx])
    precision = precision_score(label[test_idx], output[test_idx])
    recall = recall_score(label[test_idx], output[test_idx])
    fpr, tpr, thresholds = roc_curve(label[test_idx],
                                     output[test_idx],
                                     pos_label=1)
    Auc = auc(fpr, tpr)
    print(
        "Test set results:",
        "test_loss= {:.4f}".format(loss_test.item()),
        "test_accuracy= {:.4f}".format(acc_test.item()),
        "precision= {:.4f}".format(precision.item()),
        "recall= {:.4f}".format(recall.item()),
        "f1_score= {:.4f}".format(f1.item()),
        # "mcc= {:.4f}".format(mcc.item()),
        "auc= {:.4f}".format(Auc.item()),
    )
model.apply(init_weights)
epochs = 200
for epoch in range(epochs):
    train(epoch)
test()
```
I think your model and training code are the same as this. Could you please provide more detailed information about your processed dataset, such as how many users are in the train/val/test sets and how they are split?
I follow the same split provided in split.csv. There are 1,000,000 users in total: 700,000 for training, 200,000 for validation, and 100,000 for testing.
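For concreteness, this is roughly how the split file can be turned into the index tensors that `train_val_test_mask()` loads. This is only a minimal sketch; the column names `id`/`split` and the split value `valid` are my assumptions about split.csv, so adjust them if the released file differs:

```python
# Sketch of building train/val/test index tensors from split.csv.
# Assumptions: split.csv has columns 'id' and 'split' with values
# 'train' / 'valid' / 'test', and node indices follow the user order
# in user.json (the same order used when saving label.pt).
import torch
import pandas as pd

user = pd.read_json('user.json')
uid_index = {uid: idx for idx, uid in enumerate(user['id'].values)}

split = pd.read_csv('split.csv')
train_idx = torch.tensor([uid_index[u] for u in split[split['split'] == 'train']['id']])
val_idx = torch.tensor([uid_index[u] for u in split[split['split'] == 'valid']['id']])
test_idx = torch.tensor([uid_index[u] for u in split[split['split'] == 'test']['id']])

torch.save(train_idx, './Data/train_idx.pt')
torch.save(val_idx, './Data/val_idx.pt')
torch.save(test_idx, './Data/test_idx.pt')
```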
We used your code to train on our processed data, but we cannot reproduce your performance. This may be due to differences in preprocessing. Could you please provide your preprocessing code? I believe @leopoldwhite is looking into this now.
Due to my limited computing resources, I split the tweet files into three parts (0-2, 3-5, 6-8) and then merge them. I think that may be where the problem lies.
```python
import torch
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import ijson
user = pd.read_json('user.json')
user_idx = user['id']
uid_index = {uid: index for index, uid in enumerate(user_idx.values)}
# id_tweet={i:[] for i in range(len(user_idx))}
id_tweet = [{'index': i, 'tweets': []} for i in range(len(user_idx))]
for i in range(9):
name = 'tweet_' + str(i) + '.json'
    # stream the tweet file with ijson to keep memory usage low
    user_tweets = ijson.items(open(name, 'r'), 'item')
print("load " + name + " succ")
for each in tqdm(user_tweets):
uid = 'u' + str(each['author_id'])
text = each['text']
try:
index = uid_index[uid]
id_tweet[index]['tweets'].append(text)
except KeyError:
continue
if i == 2:
json.dump(id_tweet, open('./id_tweet0.json', 'w'))
id_tweet = [{'index': i, 'tweets': []} for i in range(len(user_idx))]
if i == 5:
json.dump(id_tweet, open('./id_tweet1.json', 'w'))
id_tweet = [{'index': i, 'tweets': []} for i in range(len(user_idx))]
print("succ")
json.dump(id_tweet, open('./id_tweet2.json', 'w'))
```
And then I merge these three parts into id_tweet.json.
```python
import torch
import ijson
from transformers import pipeline
import json
from tqdm import tqdm
tweet1_path = "id_tweet0.json"
tweet2_path = "id_tweet1.json"
tweet3_path = "id_tweet2.json"
tweets1 = ijson.items(open(tweet1_path, 'r'), "item")
tweets2 = ijson.items(open(tweet2_path, 'r'), "item")
tweets3 = ijson.items(open(tweet3_path, 'r'), "item")
count = 0
id_tweet = {i: [] for i in range(250000)}
for i, (tweet1, tweet2,
tweet3) in tqdm(enumerate(zip(tweets1, tweets2, tweets3))):
if i % 250000 == 0 and i != 0:
json.dump(id_tweet, open('./id_tweet' + str(count) + '.json', 'w'))
id_tweet = {i: [] for i in range(i, i + 250000)}
count += 1
temp = tweet1['tweets'] + tweet2['tweets'] + tweet3['tweets']
id_tweet[i].append(temp)
json.dump(id_tweet, open('./id_tweet' + str(count) + '.json', 'w'))
# re-open the iterators: ijson generators are single-pass and were consumed
# by the chunked dump above
tweets1 = ijson.items(open(tweet1_path, 'r'), "item")
tweets2 = ijson.items(open(tweet2_path, 'r'), "item")
tweets3 = ijson.items(open(tweet3_path, 'r'), "item")
id_tweet = [[] for i in range(1000000)]
for i, (tweet1, tweet2,
        tweet3) in tqdm(enumerate(zip(tweets1, tweets2, tweets3))):
temp = tweet1['tweets'] + tweet2['tweets'] + tweet3['tweets']
temp = temp[:20]
id_tweet[i].extend(temp)
json.dump(id_tweet, open('./id_tweet.json', 'w'))
```
The last part is the same as yours.
```python
import torch
from tqdm import tqdm
import numpy as np
from transformers import pipeline
import os
import pandas as pd
import json
import ijson
user_tweets = ijson.items(open("id_tweet.json", 'r'), 'item')
feature_extract = pipeline('feature-extraction',
model='roberta-base',
tokenizer='roberta-base',
device=1,
padding=True,
truncation=True,
max_length=50,
add_special_tokens=True)
def tweets_embedding():
print('Running feature2 embedding')
path = "./tweets_tensor.pt"
if True:
tweets_list = []
for i, each_user_tweets in enumerate(user_tweets):
if i % 1000 == 0:
print(i)
if len(each_user_tweets) == 0:
total_each_person_tweets = torch.zeros(768)
else:
for j in range(len(each_user_tweets)):
each_tweet = each_user_tweets[j]
if each_tweet is None:
total_word_tensor = torch.zeros(768)
else:
each_tweet_tensor = torch.tensor(
feature_extract(each_tweet))
for k, each_word_tensor in enumerate(
each_tweet_tensor[0]):
if k == 0:
total_word_tensor = each_word_tensor
else:
total_word_tensor += each_word_tensor
total_word_tensor /= each_tweet_tensor.shape[1]
if j == 0:
total_each_person_tweets = total_word_tensor
elif j == 20:
break
else:
total_each_person_tweets += total_word_tensor
if (j == 20):
total_each_person_tweets /= 20
else:
total_each_person_tweets /= len(each_user_tweets)
tweets_list.append(total_each_person_tweets)
tweet_tensor = torch.stack(tweets_list)
torch.save(tweet_tensor, path)
else:
tweets_tensor = torch.load(path)
print('Finished')
tweets_embedding()
```
I followed the same preprocessing method on TwiBot-22. However, when I run BotRGCN on it, the results are better than yours. Could you provide your processed TwiBot-22 dataset so that I can test on it?
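If sharing the full processed dataset is difficult, comparing summary statistics of the saved tensors on both sides might already narrow down where the preprocessing diverges. A minimal sketch (the directory is a placeholder; the file names follow the Twibot22 loader above):

```python
# Sketch: print shape / mean / std of each saved feature tensor so two
# preprocessed copies of the dataset can be compared side by side.
# The root directory is a placeholder; file names follow the loader above.
import torch

root = './dataset/twibot-22/'

def summarize(path):
    t = torch.load(path)
    return tuple(t.shape), t.float().mean().item(), t.float().std().item()

for name in ['des_tensor.pt', 'tweets_tensor.pt',
             'num_properties_tensor.pt', 'cat_properties_tensor.pt']:
    print(name, 'shape/mean/std:', summarize(root + name))

labels = torch.load(root + 'label.pt')
print('label distribution:', torch.bincount(labels.long()))
```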