worldbank / REaLTabFormer

A suite of auto-regressive and Seq2Seq (sequence-to-sequence) transformer models for tabular and relational synthetic data generation.
https://worldbank.github.io/REaLTabFormer/
MIT License
212 stars 24 forks source link

RuntimeError: Error(s) in loading state_dict for GPT2LMHeadModel: size mismatch for transformer.wte.weight #69

Open akefhabbal-qu opened 8 months ago

akefhabbal-qu commented 8 months ago

Hello, I am running into the following error:

Loading best model from rtf_checkpoint_df_attack_1\checkpoint-16700 (score: 0.2342948168516159).
{'train_runtime': 17269.7271, 'train_samples_per_second': 11.581, 'train_steps_per_second': 11.581, 'train_loss': 0.32848167264184286, 'epoch': 0.43}
  9%|████████████▍                                                                                                                                   | 17200/200000 [4:47:49<50:59:00,  1.00s/it] 
24832it [4:17:57,  1.60it/s]
Generated 0 invalid samples out of total 24832 samples generated. Sampling efficiency is: 100.0000%
Saving not-best model...
Saving model checkpoint to rtf_checkpoint_df_attack_1/not-best-disc-model
Configuration saved in rtf_checkpoint_df_attack_1/not-best-disc-model\config.json
Model weights saved in rtf_checkpoint_df_attack_1/not-best-disc-model\pytorch_model.bin
Saving model checkpoint to rtf_checkpoint_df_attack_1/mean-best-disc-model
Configuration saved in rtf_checkpoint_df_attack_1/mean-best-disc-model\config.json
Model weights saved in rtf_checkpoint_df_attack_1/mean-best-disc-model\pytorch_model.bin
Critic round: 5,                     sensitivity_threshold: 0.037334211283614145,                         val_sensitivity: 0.03887054875529969,                             val_sensitivities: [0.0419297680412371, 0.04109859684963174, 0.039277251184834126, 0.038307805992207455, 0.038934558566315954, 0.03859360301034807, 0.03871600558959475, 0.03866033471452402, 0.03731334248123336, 0.042400975505172314, 0.03819643000293937, 0.03678979171679901, 0.04004629012032763, 0.033784073835587035, 0.03900940371874332]
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
Loading model from rtf_checkpoint_df_attack_1\checkpoint-5000.
Traceback (most recent call last):
  File "C:\Users\Qatar University\OneDrive - Qatar University\Project - IDS\Codes\Imbalance\from_paper.py", line 142, in <module>
    rtf_model_df1.fit(df_attack_1)
  File "C:\Users\Qatar University\AppData\Local\Programs\Python\Python310\lib\site-packages\realtabformer\realtabformer.py", line 458, in fit
    trainer = self._train_with_sensitivity(
  File "C:\Users\Qatar University\AppData\Local\Programs\Python\Python310\lib\site-packages\realtabformer\realtabformer.py", line 710, in _train_with_sensitivity
    trainer.train(resume_from_checkpoint=True)
  File "C:\Users\Qatar University\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 1490, in train
    self._load_from_checkpoint(resume_from_checkpoint)
  File "C:\Users\Qatar University\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 1972, in _load_from_checkpoint
    load_result = model.load_state_dict(state_dict, False)
  File "C:\Users\Qatar University\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 2153, in load_state_dict
    raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for GPT2LMHeadModel:
        size mismatch for transformer.wte.weight: copying a param with shape torch.Size([1552, 768]) from checkpoint, the shape in current model is torch.Size([1985, 768]).
        size mismatch for lm_head.weight: copying a param with shape torch.Size([1552, 768]) from checkpoint, the shape in current model is torch.Size([1985, 768]).

Here is my code:

import pandas as pd
import json
import time
from IPython.display import display, clear_output
from numba import cuda
import os
import numpy as np

'data augmentation imports'
from realtabformer import REaLTabFormer

# Query the active CUDA context through numba as a quick hardware sanity check.
ctx = cuda.current_context()
meminfo = ctx.get_memory_info()
device_name = cuda.gpus[0].name.decode()
# The variable name is kept for compatibility, but the value comes from
# numba's CUDA *driver* module, so it is reported as the driver version.
cudnn_version = cuda.cudadrv.driver.get_version()

# Print the GPU information
print(
    f'GPU: {device_name}\nAvailable GPU memory: {meminfo[0] / 1024**3:.1f} GB\nCUDA driver version: {cudnn_version}')

print("Loading the dataset...\n")
file_path = '.././dataset/output/sampled_dataset_1000.csv'
# file_path = '.././dataset/output/Friday-02-03-2018_TrafficForML_CICFlowMeter.csv'
df_train = pd.read_csv(file_path, low_memory=False)
df_train = df_train.convert_dtypes()

# Try to turn every column into a numeric dtype; columns that cannot be
# parsed (for example free-text labels) are reported and left unchanged.
for column in df_train.columns:
    try:
        df_train[column] = pd.to_numeric(df_train[column])
    except (ValueError, TypeError):
        print(f"Could not convert column {column} to numeric")

# Show every row when printing, so the full dtype listing is visible.
pd.set_option('display.max_rows', None)
print(df_train.dtypes)

# For every Label, compute how many rows it is short of the per-label
# target used below (1010 rows).
print("Counting missing rows per Label:\n")
counts_dict = {
    attack: 1010 - df_train[df_train["Label"] == attack].shape[0]
    for attack in df_train["Label"].unique()
}

print("Rows to be augmented by Label:\n")
for key, value in sorted(counts_dict.items(), key=lambda item: item[1], reverse=True):
    print(f"{key:<22} {value:>10}")

# 'Benign' rows are excluded: that class is not augmented.
df_attack = df_train[df_train["Label"] != "Benign"]

# Memory footprint (MBytes) before the dtype conversion, for the report below.
size_before = df_attack.memory_usage(deep=True).sum() / 1024**2

# Keep every column that is not a pandas string column.
df_numeric = df_attack.select_dtypes(exclude=["string[python]"])

# Downcast the numeric columns to the smallest unsigned integer type that fits.
try:
    df_numeric = df_numeric.apply(pd.to_numeric, downcast="unsigned")
except ValueError:
    print("Could not convert all numeric columns to unsigned integer type")

# Turn +/-inf into NaN and drop the rows that contain any NaN. A new frame is
# built instead of mutating in place, so no view of df_attack is modified.
df_numeric = df_numeric.replace([np.inf, -np.inf], np.nan).dropna()

# Print the dtypes before and after the conversion side by side.
print(pd.concat([df_attack.dtypes, df_numeric.dtypes],
      axis=1, keys=["Before", "After"]))

# Swap the original numeric columns for the converted ones. The inner join
# keeps only rows that survived dropna(); the default outer join would bring
# the dropped rows back with NaN in every numeric column.
df_attack = pd.concat(
    [df_attack.drop(columns=df_numeric.columns), df_numeric],
    axis=1,
    join="inner",
)

# Memory footprint (MBytes) after the conversion.
size_after = df_attack.memory_usage(deep=True).sum() / 1024**2

print(
    f"\nSize of df_attack before data conversion: {size_before:.2f} MBytes, after: {size_after:.2f} MBytes")

# Show the first rows; display() is required for the styled preview to be
# rendered when this is not the last expression of a notebook cell.
display(df_attack.head(10).style.set_properties(**{'text-align': 'left'}))

# Split df_attack by Label into two dataframes, one per model trained below.
# The Hulk label is spelled "DoS attacks-Hulk", matching the label table and
# the counts_df1 dictionary later in this script; a misspelled label here
# silently removes that class from the training data.
# NOTE(review): "DoS attacks-GoldenEye" is listed in the label table later in
# this script but belongs to neither group - confirm that this is intended.
df_attack_1 = df_attack[df_attack["Label"].isin(["DDOS attack-HOIC",
                                                 "DDoS attacks-LOIC-HTTP",
                                                 "DoS attacks-Hulk",
                                                 "Bot",
                                                 "FTP-BruteForce",
                                                 "SSH-Bruteforce"])]

df_attack_2 = df_attack[df_attack["Label"].isin(["Infilteration",
                                                 "DoS attacks-SlowHTTPTest",
                                                 "DoS attacks-Slowloris",
                                                 "DDOS attack-LOIC-UDP",
                                                 "Brute Force -Web",
                                                 "Brute Force -XSS",
                                                 "SQL Injection"])]

print("-----------------------")
print("df_attack_1")
# Rows per Label in df_attack_1
display(df_attack_1['Label'].value_counts().to_frame(
).style.set_properties(**{'text-align': 'left'}))
# Total number of rows and memory footprint of df_attack_1
print(f"Number of lines: {df_attack_1['Label'].value_counts().sum()}")
print(
    f"\nSize: {df_attack_1.memory_usage(deep=True).sum() / 1024**2:.2f} MBytes")
print("-----------------------\n")
print("df_attack_2")
# Rows per Label in df_attack_2
display(df_attack_2['Label'].value_counts().to_frame(
).style.set_properties(**{'text-align': 'left'}))
# Total number of rows and memory footprint of df_attack_2
print(f"Number of lines: {df_attack_2['Label'].value_counts().sum()}")
print(f"Size: {df_attack_2.memory_usage(deep=True).sum() / 1024**2:.2f} MBytes")
print("-----------------------")

# REaLTabFormer instantiation: non-relational (tabular) data variant.
#
# NOTE: the traceback above shows fit() calling
# trainer.train(resume_from_checkpoint=True), which loads a checkpoint found
# in checkpoints_dir. A checkpoint left there by a run on different data has
# a different vocabulary size and fails with the state_dict size-mismatch
# error, so point checkpoints_dir at an empty/fresh directory per dataset.
rtf_model_df1 = REaLTabFormer(model_type="tabular",
                              batch_size=1,
                              gradient_accumulation_steps=1,
                              checkpoints_dir="rtf_checkpoint_df_attack_1",
                              logging_steps=100,
                              random_state=42,
                              epochs=10,
                              train_size=0.8,                    # 80% of the data for training, 20% for validation
                              numeric_max_len=12
                              )

# Note: the number of bootstrap samples influences the required RAM, before optimization

# Train the df_attack_1 model and save it. Saving happens only after a
# successful fit, so an untrained model is never written to disk.
if len(df_attack_1) > 0:
    rtf_model_df1.fit(df_attack_1)
    rtf_model_df1.save("rtf_models/")
else:
    print("DataFrame is empty df_attack_1. Skipping fit.")

rtf_model_df2 = REaLTabFormer(model_type="tabular",
                              batch_size=1,
                              gradient_accumulation_steps=1,
                              checkpoints_dir="rtf_checkpoint_df_attack_2_v2",
                              logging_steps=100,
                              random_state=42,
                              epochs=10,
                              train_size=0.8,                    # 80% of the data for training, 20% for validation
                              numeric_max_len=12
                              )

# Train the df_attack_2 model and save it (same rule: save only after fit).
if len(df_attack_2) > 0:
    rtf_model_df2.fit(df_attack_2)
    rtf_model_df2.save("rtf_models/")
else:
    print("DataFrame is empty df_attack_2. Skipping fit.")

# Load the saved df_attack_1 model (id000016979230192943532032)
rtf_model_df1 = REaLTabFormer.load_from_dir(
    path="rtf_models/id000016979230192943532032")

def sample_by_category(model, category_dict, columns, batch_size=20000,
                       max_batches=None):
    """
    Samples from the model until every category has exactly the requested
    number of rows.

    The model is sampled in batches; from each batch only the rows whose
    'Label' equals the current category are kept, and the last batch is
    trimmed so the category never exceeds its target.

    :param model: the model to sample from; it must expose
        ``sample(n_samples=...)`` returning a dataframe with a 'Label' column
    :param category_dict: a dictionary with the number of samples to generate for each category
    :param columns: the columns to include in the output dataframe; they come
        first in the result, followed by any extra columns the model produced
    :param batch_size: number of rows requested from the model per batch
    :param max_batches: optional cap on the number of batches drawn for a
        single category; ``None`` (default) keeps sampling until the target
        is reached
    :return: a dataframe with the samples
    :raises RuntimeError: if ``max_batches`` batches were drawn for a category
        and its target is still not reached
    """
    frames = []
    for category, n_samples in category_dict.items():
        collected = 0
        batches = 0
        last_report = time.time()
        while collected < n_samples:
            if max_batches is not None and batches >= max_batches:
                raise RuntimeError(
                    f"Only {collected}/{n_samples} samples generated for "
                    f"category {category!r} after {batches} batches")
            batches += 1

            samples = model.sample(n_samples=batch_size)
            # select only the samples with the desired category
            category_samples = samples[samples['Label'] == category]
            # keep only as many rows as are still missing for this category,
            # so earlier batches are taken into account
            remaining = n_samples - collected
            if len(category_samples) > remaining:
                category_samples = category_samples.sample(n=remaining)
            if len(category_samples) > 0:
                frames.append(category_samples)
                collected += len(category_samples)

            # report progress at most once every 5 seconds
            if time.time() - last_report > 5:
                clear_output(wait=True)
                print(f"{collected}/{n_samples}", end="\r")
                last_report = time.time()

    if not frames:
        return pd.DataFrame(columns=columns)

    # Concatenate once at the end instead of growing a dataframe in the loop.
    sampled_df = pd.concat(frames, ignore_index=True)
    known = set(columns)
    ordered = list(columns) + [c for c in sampled_df.columns if c not in known]
    return sampled_df.reindex(columns=ordered)

# Reference table of per-label row counts.
# NOTE(review): presumably the "Rows to be augmented by Label" output of an
# earlier run on the full dataset - confirm.
# SQL Injection              699913
# Brute Force -XSS           699770
# Brute Force -Web           699389
# DDOS attack-LOIC-UDP       698270
# DoS attacks-Slowloris      689010
# DoS attacks-GoldenEye      658492
# DoS attacks-SlowHTTPTest     560110
# Infilteration              538066
# SSH-Bruteforce             512411
# FTP-BruteForce             506640
# Bot                        413809
# DoS attacks-Hulk           238088
# DDoS attacks-LOIC-HTTP     123809
# DDOS attack-HOIC            13988

# Every label handled by rtf_model_df1 gets the same number of synthetic rows.
counts_df1 = dict.fromkeys(
    [
        "DDOS attack-HOIC",
        "DDoS attacks-LOIC-HTTP",
        "DoS attacks-Hulk",
        "Bot",
        "FTP-BruteForce",
        "SSH-Bruteforce",
    ],
    100,
)

# Draw the synthetic rows label by label, then persist them as CSV.
RTB_dataset_df1 = sample_by_category(rtf_model_df1,
                                     counts_df1,
                                     df_train.columns)
RTB_dataset_df1.to_csv("../data/RTB_dataset_df1.csv", index=False)

# SQL Injection              699913
# Brute Force -XSS           699770
# Brute Force -Web           699389
# DDOS attack-LOIC-UDP       698270
# DoS attacks-Slowloris      689010
# DoS attacks-GoldenEye      658492
# DoS attacks-SlowHTTPTest     560110
# Infilteration              538066
# SSH-Bruteforce             512411
# FTP-BruteForce             506640
# Bot                        413809
# DoS attacks-Hulk           238088
# DDoS attacks-LOIC-HTTP     123809
# DDOS attack-HOIC            13988
# Dictionary with the number of samples to be generated with rtf_model_df2
counts_df2 = {"Infilteration": 100,
              "DoS attacks-SlowHTTPTest": 100,
              "DoS attacks-Slowloris": 100,
              "DDOS attack-LOIC-UDP": 8370,
              "Brute Force -Web": 9489,
              "Brute Force -XSS": 9870,
              "SQL Injection": 10013}

# Load the saved df_attack_2 model (id000016980601259525519360)
rtf_model_df2 = REaLTabFormer.load_from_dir(
    path="rtf_models/id000016980601259525519360")

# TEST
# NOTE(review): this second load replaces the model loaded just above, so
# only id000016982514417271404544 is actually used for sampling - confirm
# which of the two saved models is the intended one.
# counts_df2= {"XSS": 87969}
rtf_model_df2 = REaLTabFormer.load_from_dir(path="rtf_models/id000016982514417271404544")

RTB_dataset_df2 = sample_by_category(rtf_model_df2,
                                     counts_df2,
                                     df_train.columns)

# Save RTB_dataset_df2 next to RTB_dataset_df1 in ../data/, which is the
# location the script reads it back from afterwards.
RTB_dataset_df2.to_csv("../data/RTB_dataset_df2.csv", index=False)

# Read both synthetic datasets back from disk.
RTB_dataset_df1 = pd.read_csv("../data/RTB_dataset_df1.csv", low_memory=False)
RTB_dataset_df2 = pd.read_csv("../data/RTB_dataset_df2.csv", low_memory=False)

# Show the per-Label row counts of each synthetic dataset.
display(RTB_dataset_df1['Label'].value_counts().to_frame()
        .style.set_caption("RTB_dataset_df1"))
display(RTB_dataset_df2['Label'].value_counts().to_frame()
        .style.set_caption("RTB_dataset_df2"))

# Merge the two synthetic datasets into one and free the parts.
RTB_dataset = pd.concat([RTB_dataset_df1, RTB_dataset_df2], ignore_index=True)
del RTB_dataset_df1, RTB_dataset_df2

RTB_dataset.to_csv("../data/RTB_dataset.csv", index=False)

# Rows per Label in the synthetic data and in the original training data.
counts_dict_rtb = {
    attack: RTB_dataset[RTB_dataset["Label"] == attack].shape[0]
    for attack in RTB_dataset["Label"].unique()
}
counts_dict_train = {
    attack: df_train[df_train["Label"] == attack].shape[0]
    for attack in df_train["Label"].unique()
}

# Combined count, only for labels present in both datasets.
counts_dict_total = {
    key: counts_dict_rtb[key] + counts_dict_train[key]
    for key in counts_dict_rtb.keys() & counts_dict_train.keys()
}

# Rows above the 100000-per-label cap (negative when below the cap).
counts_excess = {
    key: value - 100000 for key, value in counts_dict_total.items()
}

print(counts_excess)

# Randomly drop the surplus synthetic rows of every label above the cap.
# NOTE(review): sample() raises if the surplus exceeds the number of
# synthetic rows of that label - confirm this cannot happen with real data.
for label, surplus in counts_excess.items():
    if surplus <= 0:
        continue
    surplus_rows = RTB_dataset[RTB_dataset["Label"] == label].sample(
        n=surplus, random_state=42)
    RTB_dataset = RTB_dataset.drop(surplus_rows.index)

# Original rows followed by the remaining synthetic rows.
df_balanced = pd.concat([df_train, RTB_dataset], ignore_index=True)

display(df_balanced['Label'].value_counts().to_frame()
        .style.set_properties(**{'text-align': 'left'}))

df_balanced.to_csv("../data/EdgeIIot_train_100k_RTB_balanced.csv", index=False)

# Binarise the target: 0 for Benign, 1 for every other label.
df_balanced["Label"] = df_balanced["Label"].apply(
    lambda x: 0 if x == "Benign" else 1)

display(df_balanced.describe().T.style.set_properties(
    **{'text-align': 'left'}))