Hello, I am having the following issues:

Loading best model from rtf_checkpoint_df_attack_1\checkpoint-16700 (score: 0.2342948168516159).
{'train_runtime': 17269.7271, 'train_samples_per_second': 11.581, 'train_steps_per_second': 11.581, 'train_loss': 0.32848167264184286, 'epoch': 0.43}
9%|████████████▍ | 17200/200000 [4:47:49<50:59:00, 1.00s/it]
24832it [4:17:57, 1.60it/s]
Generated 0 invalid samples out of total 24832 samples generated. Sampling efficiency is: 100.0000%
Saving not-best model...
Saving model checkpoint to rtf_checkpoint_df_attack_1/not-best-disc-model
Configuration saved in rtf_checkpoint_df_attack_1/not-best-disc-model\config.json
Model weights saved in rtf_checkpoint_df_attack_1/not-best-disc-model\pytorch_model.bin
Saving model checkpoint to rtf_checkpoint_df_attack_1/mean-best-disc-model
Configuration saved in rtf_checkpoint_df_attack_1/mean-best-disc-model\config.json
Model weights saved in rtf_checkpoint_df_attack_1/mean-best-disc-model\pytorch_model.bin
Critic round: 5, sensitivity_threshold: 0.037334211283614145, val_sensitivity: 0.03887054875529969, val_sensitivities: [0.0419297680412371, 0.04109859684963174, 0.039277251184834126, 0.038307805992207455, 0.038934558566315954, 0.03859360301034807, 0.03871600558959475, 0.03866033471452402, 0.03731334248123336, 0.042400975505172314, 0.03819643000293937, 0.03678979171679901, 0.04004629012032763, 0.033784073835587035, 0.03900940371874332]
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
Loading model from rtf_checkpoint_df_attack_1\checkpoint-5000.
Traceback (most recent call last):
File "C:\Users\Qatar University\OneDrive - Qatar University\Project - IDS\Codes\Imbalance\from_paper.py", line 142, in <module>
rtf_model_df1.fit(df_attack_1)
File "C:\Users\Qatar University\AppData\Local\Programs\Python\Python310\lib\site-packages\realtabformer\realtabformer.py", line 458, in fit
trainer = self._train_with_sensitivity(
File "C:\Users\Qatar University\AppData\Local\Programs\Python\Python310\lib\site-packages\realtabformer\realtabformer.py", line 710, in _train_with_sensitivity
trainer.train(resume_from_checkpoint=True)
File "C:\Users\Qatar University\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 1490, in train
self._load_from_checkpoint(resume_from_checkpoint)
File "C:\Users\Qatar University\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 1972, in _load_from_checkpoint
load_result = model.load_state_dict(state_dict, False)
File "C:\Users\Qatar University\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 2153, in load_state_dict
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for GPT2LMHeadModel:
size mismatch for transformer.wte.weight: copying a param with shape torch.Size([1552, 768]) from checkpoint, the shape in current model is torch.Size([1985, 768]).
size mismatch for lm_head.weight: copying a param with shape torch.Size([1552, 768]) from checkpoint, the shape in current model is torch.Size([1985, 768]).
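From the shapes in the error, my assumption (not confirmed) is that checkpoint-5000 was written by an earlier fit() whose data produced a 1552-token vocabulary, while the current data produces 1985 tokens, so resume_from_checkpoint=True resumes from an incompatible model. A minimal sketch of what I would try, deleting the stale checkpoint directory so the next fit() starts fresh (the path is the one from my run):

import shutil
from pathlib import Path

# Remove checkpoints left over from the earlier run so fit() cannot
# resume from a model with a different vocabulary size.
ckpt_dir = Path("rtf_checkpoint_df_attack_1")
if ckpt_dir.exists():
    shutil.rmtree(ckpt_dir)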
Here is my code:
import pandas as pd
import json
import time
from IPython.display import display, clear_output
from numba import cuda
import os
import numpy as np
# Data augmentation imports
from realtabformer import REaLTabFormer
# Get the current CUDA context and GPU information
ctx = cuda.current_context()
meminfo = ctx.get_memory_info()
device_name = cuda.gpus[0].name.decode()
driver_version = cuda.cudadrv.driver.get_version()  # CUDA driver version, not cuDNN
# Print the GPU information
print(
f'GPU: {device_name}\nAvailable GPU memory: {meminfo[0] / 1024**3:.1f} GB\nCUDA driver version: {driver_version}')
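# Hedged alternative (my addition): training runs on PyTorch anyway, so the
# same device info is available from torch, which also exposes the actual
# cuDNN version (numba does not wrap cuDNN).
import torch
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"Free GPU memory: {torch.cuda.mem_get_info()[0] / 1024**3:.1f} GB")
print(f"cuDNN version: {torch.backends.cudnn.version()}")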
print("Loading the dataset...\n")
file_path = '.././dataset/output/sampled_dataset_1000.csv'
# file_path = '.././dataset/output/Friday-02-03-2018_TrafficForML_CICFlowMeter.csv'
df_train = pd.read_csv(file_path, low_memory=False)
df_train = df_train.convert_dtypes()
for column in df_train.columns:
try:
df_train[column] = pd.to_numeric(df_train[column])
except ValueError:
print(f"Could not convert column {column} to numeric")
pass # If conversion fails, keep the column as is
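# Hedged alternative (my addition, slightly different semantics): coerce
# unparseable values to NaN instead of leaving the column untouched, then
# list the columns that contain no numeric values at all.
non_numeric = [c for c in df_train.columns
               if pd.to_numeric(df_train[c], errors="coerce").isna().all()]
print("Fully non-numeric columns:", non_numeric)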
# Set the display option to show all rows
pd.set_option('display.max_rows', None)
print(df_train.dtypes)
# Count how many rows each Label is missing relative to the target (1,010 here; 100,000 in the full run)
print("Counting missing rows per Label:\n")
counts_dict = {}
for attack in df_train["Label"].unique():
counts_dict[attack] = 1010 - \
df_train[df_train["Label"] == attack].shape[0]
print("Rows to be augmented by Label:\n")
for key, value in sorted(counts_dict.items(), key=lambda item: item[1], reverse=True):
print(f"{key:<22} {value:>10}")
# Since the 'Benign' Label already meets the 100,000-row target, we do not consider it for augmentation
df_attack = df_train[df_train["Label"] != "Benign"]
# calculate the size of df_attack before
size_before = df_attack.memory_usage(deep=True).sum() / 1024**2
# select only the columns that are not string type (after convert_dtypes, text columns are string[python])
df_numeric = df_attack.select_dtypes(exclude=["string[python]"])
# downcast all numeric columns to unsigned integer types where possible
try:
df_numeric = df_numeric.apply(pd.to_numeric, downcast="unsigned")
except ValueError:
print("Could not convert all numeric columns to unsigned integer type")
# Replace infinite values with NaN
df_numeric.replace([np.inf, -np.inf], np.nan, inplace=True)
# Drop rows with NaN values (optional; appropriate for this analysis)
df_numeric.dropna(inplace=True)
# print df_numeric data types before and after conversion side by side
print(pd.concat([df_attack.dtypes, df_numeric.dtypes],
axis=1, keys=["Before", "After"]))
# drop all numeric columns from df_attack
df_attack = df_attack.drop(columns=df_numeric.columns)
# concatenate df_attack and the newly converted df_numeric columns
df_attack = pd.concat([df_attack, df_numeric], axis=1)
# calculate the size of df_attack after conversion
size_after = df_attack.memory_usage(deep=True).sum() / 1024**2
# print size before and after conversion
print(
f"\nSize of df_attack before data conversion: {size_before:.2f} MBytes, after: {size_after:.2f} MBytes")
# print head of df_attack
display(df_attack.head(10).style.set_properties(**{'text-align': 'left'}))
# separate df_attack by Label into 2 dataframes with similar number of lines
df_attack_1 = df_attack[df_attack["Label"].isin(["DDOS attack-HOIC",
"DDoS attacks-LOIC-HTTP",
"DoS attacks-HulkP",
"Bot",
"FTP-BruteForce",
"SSH-Bruteforce"])]
df_attack_2 = df_attack[df_attack["Label"].isin(["Infilteration",
"DoS attacks-SlowHTTPTest",
"DoS attacks-Slowloris",
"DDOS attack-LOIC-UDP",
"Brute Force -Web",
"Brute Force -XSS",
"SQL Injection"])]
print("-----------------------")
print("df_attack_1")
# print number of lines per Label for each dataframe and total number of lines for each dataframe
display(df_attack_1['Label'].value_counts().to_frame(
).style.set_properties(**{'text-align': 'left'}))
# print sum of lines for df_attack_1
print(f"Number of lines: {df_attack_1['Label'].value_counts().sum()}")
print(
f"\nSize: {df_attack_1.memory_usage(deep=True).sum() / 1024**2:.2f} MBytes")
print("-----------------------\n")
print("df_attack_2")
# print number of lines per Label for each dataframe and total number of lines for each dataframe
display(df_attack_2['Label'].value_counts().to_frame(
).style.set_properties(**{'text-align': 'left'}))
# print sum of lines for df_attack_2
print(f"Number of lines: {df_attack_2['Label'].value_counts().sum()}")
print(f"Size: {df_attack_2.memory_usage(deep=True).sum() / 1024**2:.2f} MBytes")
print("-----------------------")
# REaLTabFormer instantiation: non-relational (tabular) data variant
rtf_model_df1 = REaLTabFormer(model_type="tabular",
batch_size=1,
gradient_accumulation_steps=1,
checkpoints_dir="rtf_checkpoint_df_attack_1",
logging_steps=100,
random_state=42,
epochs=10,
train_size=0.8, # 80% of the data for training, 20% for validation
numeric_max_len=12
)
# Note: the number of bootstrap samples influences the RAM required (before optimization); see the hedged sketch after the save below
# train df_attack_1 model
if len(df_attack_1) > 0:
rtf_model_df1.fit(df_attack_1)
else:
print("DataFrame is empty df_attack_1. Skipping fit.")
# save model
rtf_model_df1.save("rtf_models/")
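# Hedged sketch (my addition) for the bootstrap note above: fit() accepts a
# num_bootstrap argument (500 by default, if I read the realtabformer source
# correctly; verify against the installed version) that sets how many
# bootstrap samples drive the sensitivity threshold, and hence RAM use.
# rtf_model_df1.fit(df_attack_1, num_bootstrap=100)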
rtf_model_df2 = REaLTabFormer(model_type="tabular",
batch_size=1,
gradient_accumulation_steps=1,
checkpoints_dir="rtf_checkpoint_df_attack_2_v2",
logging_steps=100,
random_state=42,
epochs=10,
train_size=0.8, # 80% of the data for training, 20% for validation
numeric_max_len=12
)
# train df_attack_2 model
if len(df_attack_2) > 0:
rtf_model_df2.fit(df_attack_2)
else:
print("DataFrame is empty df_attack_2. Skipping fit.")
# save model
rtf_model_df2.save("rtf_models/")
# load best df_attack_1 model (id000016979230192943532032)
rtf_model_df1 = REaLTabFormer.load_from_dir(
path="rtf_models/id000016979230192943532032")
def sample_by_category(model, category_dict, columns):
"""
Samples from the model by category.
:param model: the model to sample from
:param category_dict: a dictionary with the number of samples to generate for each category
:param columns: the columns to include in the output dataframe
:return: a dataframe with the samples
"""
sampled_df = pd.DataFrame(columns=columns)
for category, n_samples in category_dict.items():
start_time = time.time()
while len(sampled_df[sampled_df['Label'] == category]) < n_samples:
samples = model.sample(n_samples=20000)
# select only the samples with the desired category
category_samples = samples[samples['Label'] == category]
# discard the excess samples if necessary (trim to the number still
# needed so repeated passes cannot overshoot n_samples)
remaining = n_samples - len(sampled_df[sampled_df['Label'] == category])
if len(category_samples) > remaining:
    category_samples = category_samples.sample(n=remaining)
# add the selected samples to the sampled_df dataframe
sampled_df = pd.concat(
[sampled_df, category_samples], ignore_index=True)
elapsed_time = time.time() - start_time
if elapsed_time > 5:
clear_output(wait=True)
print(
f"{len(sampled_df[sampled_df['Label'] == category])}/{n_samples}", end="\r")
start_time = time.time()
return sampled_df
# SQL Injection 699913
# Brute Force -XSS 699770
# Brute Force -Web 699389
# DDOS attack-LOIC-UDP 698270
# DoS attacks-Slowloris 689010
# DoS attacks-GoldenEye 658492
# DoS attacks-SlowHTTPTest 560110
# Infilteration 538066
# SSH-Bruteforce 512411
# FTP-BruteForce 506640
# Bot 413809
# DoS attacks-Hulk 238088
# DDoS attacks-LOIC-HTTP 123809
# DDOS attack-HOIC 13988
# Dictionary with the number of samples to be generated with rtf_model_df1
counts_df1 = {"DDOS attack-HOIC": 100,
"DDoS attacks-LOIC-HTTP": 100,
"DoS attacks-Hulk": 100,
"Bot": 100,
"FTP-BruteForce": 100,
"SSH-Bruteforce": 100}
# Sample by category for df_attack_1
RTB_dataset_df1 = sample_by_category(
rtf_model_df1, counts_df1, df_train.columns)
# Save RTB_dataset_df1 to csv file
RTB_dataset_df1.to_csv("../data/RTB_dataset_df1.csv", index=False)
# Dictionary with the number of samples to be generated with rtf_model_df2
counts_df2 = {"Infilteration": 100,
"DoS attacks-SlowHTTPTest": 100,
"DoS attacks-Slowloris": 100,
"DDOS attack-LOIC-UDP": 8370,
"Brute Force -Web": 9489,
"Brute Force -XSS": 9870,
"SQL Injection": 10013}
# Load best df_attack_2 model (id000016980601259525519360)
rtf_model_df2 = REaLTabFormer.load_from_dir(
path="rtf_models/id000016980601259525519360")
# TEST
# counts_df2 = {"XSS": 87969}
# Note: this second load replaces the model loaded just above.
rtf_model_df2 = REaLTabFormer.load_from_dir(
path="rtf_models/id000016982514417271404544")
RTB_dataset_df2 = sample_by_category(rtf_model_df2,
counts_df2,
df_train.columns)
# Save RTB_dataset_df2 to csv file (same folder the load below expects)
RTB_dataset_df2.to_csv("../data/RTB_dataset_df2.csv", index=False)
# Load RTB_dataset_df1 and RTB_dataset_df2
RTB_dataset_df1 = pd.read_csv("../data/RTB_dataset_df1.csv", low_memory=False)
RTB_dataset_df2 = pd.read_csv(
"../data/RTB_dataset_df2.csv", low_memory=False)
# Label per dataset
display(RTB_dataset_df1['Label'].value_counts(
).to_frame().style.set_caption("RTB_dataset_df1"))
display(RTB_dataset_df2['Label'].value_counts(
).to_frame().style.set_caption("RTB_dataset_df2"))
# Concatenate RTB_dataset_df1 and RTB_dataset_df2
RTB_dataset = pd.concat([RTB_dataset_df1, RTB_dataset_df2], ignore_index=True)
del RTB_dataset_df1, RTB_dataset_df2
# Save RTB_dataset to csv file
RTB_dataset.to_csv("../data/RTB_dataset.csv", index=False)
# Count the number of rows per Label in RTB_dataset
counts_dict_rtb = {}
for attack in RTB_dataset["Label"].unique():
counts_dict_rtb[attack] = RTB_dataset[RTB_dataset["Label"]
== attack].shape[0]
# Count the number of rows per Label in df_train
counts_dict_train = {}
for attack in df_train["Label"].unique():
counts_dict_train[attack] = df_train[df_train["Label"]
== attack].shape[0]
counts_dict_total = {}
# Sum if key exists in both dictionaries
for key in counts_dict_rtb.keys() & counts_dict_train.keys():
counts_dict_total[key] = counts_dict_rtb[key] + counts_dict_train[key]
counts_excess = {}
# Subtract 100,000 to get the number of excess rows to be removed
for key, value in counts_dict_total.items():
counts_excess[key] = value - 100000
print(counts_excess)
# Remove excess rows from RTB_dataset by selecting a random sample of rows to be removed
for key, value in counts_excess.items():
if value > 0:
RTB_dataset = RTB_dataset.drop(
RTB_dataset[RTB_dataset["Label"] == key].sample(n=value, random_state=42).index)
# Concatenate RTB_dataset and df_train
df_balanced = pd.concat([df_train, RTB_dataset], ignore_index=True)
display(df_balanced['Label'].value_counts().to_frame(
).style.set_properties(**{'text-align': 'left'}))
# Save df_balanced to csv file
df_balanced.to_csv("../data/EdgeIIot_train_100k_RTB_balanced.csv", index=False)
# Fill Label column with 1 if Label is not Benign, 0 otherwise
df_balanced["Label"] = df_balanced["Label"].apply(
lambda x: 1 if x != "Benign" else 0)
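# Hedged equivalent (my addition): the same binarization as the apply above,
# vectorized without a Python-level lambda. Commented out because the labels
# are already 0/1 at this point.
# df_balanced["Label"] = (df_balanced["Label"] != "Benign").astype(int)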
display(df_balanced.describe().T.style.set_properties(
**{'text-align': 'left'}))