Attached are the two requested plots and two plots of the evolution of the validation loss. Perhaps they suggest that the 1e-05 run should be allowed to run even further, with a lower eps; let me know if you think so.
I have also tried a fixed learning rate of 1e-02, but it does not work: the loss explodes, so I have not included it in the graphs.
I have also added the code from the updated script at the bottom, since I could not attach a .py file.
For the learning rate of 1e-05, I had to increase the number of epochs, since the training did not converge within 150 epochs but only around epoch 160. Just let me know if I should change anything.
Zenith resolution plot
Azimuth resolution plot
Zenith validation loss plot
Azimuth validation loss plot
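Regarding the lower eps for a possible follow-up run, this is roughly the change I have in mind. A sketch only: the helper name is just for illustration, and eps=1e-08 is simply PyTorch's Adam default rather than a value from the runs above.

```python
from torch.optim.adam import Adam
from graphnet.models import StandardModel

def build_fixed_lr_model(detector, gnn, task, learning_rate=1e-05, eps=1e-08):
    """Build the same StandardModel as in the script below, but with a
    lower Adam eps (the runs above used eps=1e-03)."""
    return StandardModel(
        detector=detector,
        gnn=gnn,
        tasks=[task],
        optimizer_class=Adam,
        optimizer_kwargs={"lr": learning_rate, "eps": eps},
        scheduler_class=None,  # keep the fixed, global learning rate
    )
```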
Code used:

```python
import os
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import WandbLogger
import torch
from torch.optim.adam import Adam
import pandas as pd
from graphnet.training.loss_functions import VonMisesFisher2DLoss
from graphnet.data.constants import FEATURES, TRUTH
from graphnet.models import StandardModel
from graphnet.models.detector.icecube import IceCubeDeepCore
from graphnet.models.gnn.dynedge import DynEdge
from graphnet.models.graph_builders import KNNGraphBuilder
from graphnet.models.task.reconstruction import ZenithReconstructionWithKappa, AzimuthReconstructionWithKappa
from graphnet.training.callbacks import ProgressBar  # PiecewiseLinearLR no longer needed
from graphnet.training.utils import (
get_predictions,
make_dataloader,
save_results,
)
from graphnet.utilities.logging import get_logger
logger = get_logger()
# Configurations
torch.multiprocessing.set_sharing_strategy("file_system")
# Constants
features = FEATURES.DEEPCORE
truth = TRUTH.DEEPCORE[:-1]
# Make sure W&B output directory exists
WANDB_DIR = "./wandb/"
os.makedirs(WANDB_DIR, exist_ok=True)
def train(config, wandb_logger, learning_rate):
    # Event selections (event numbers) for the train/validation/test splits
    selections_dir = (
        "/groups/icecube/petersen/GraphNetDatabaseRepository/northeren_tracks"
        "/dev_northern_tracks_full_part_1/selections"
    )
    train_selection = pd.read_csv(
        selections_dir + "/benchmark_train_selection.csv"
    ).reset_index(drop=True)["event_no"].ravel().tolist()
    validation_selection = pd.read_csv(
        selections_dir + "/benchmark_validate_selection.csv"
    ).reset_index(drop=True)["event_no"].ravel().tolist()
    test_selection = pd.read_csv(
        selections_dir + "/benchmark_test_selection.csv"
    ).reset_index(drop=True)["event_no"].ravel().tolist()

    # Log configuration to W&B
    wandb_logger.experiment.config.update(config)

    # Common variables
    logger.info(f"features: {features}")
    logger.info(f"truth: {truth}")

    # Dataloaders for the three splits
    training_dataloader = make_dataloader(
        db=config["db"],
        selection=train_selection,
        pulsemaps=config["pulsemap"],
        features=features,
        truth=truth,
        batch_size=config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=True,
    )
    validation_dataloader = make_dataloader(
        db=config["db"],
        selection=validation_selection,
        pulsemaps=config["pulsemap"],
        features=features,
        truth=truth,
        batch_size=config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=False,
    )
    test_dataloader = make_dataloader(
        db=config["db"],
        selection=test_selection,
        pulsemaps=config["pulsemap"],
        features=features,
        truth=truth,
        batch_size=config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=False,
    )

    # Building model
    detector = IceCubeDeepCore(
        graph_builder=KNNGraphBuilder(nb_nearest_neighbours=8),
    )
    gnn = DynEdge(
        nb_inputs=detector.nb_outputs,
        nb_neighbours=config["nb_neighbors"],
        global_pooling_schemes=["min", "max", "mean"],
        add_global_variables_after_pooling=True,
    )
    if config["target"] == "zenith":
        task = ZenithReconstructionWithKappa(
            hidden_size=gnn.nb_outputs,
            target_labels=config["target"],
            loss_function=VonMisesFisher2DLoss(),
        )
    elif config["target"] == "azimuth":
        task = AzimuthReconstructionWithKappa(
            hidden_size=gnn.nb_outputs,
            target_labels=config["target"],
            loss_function=VonMisesFisher2DLoss(),
        )
    model = StandardModel(
        detector=detector,
        gnn=gnn,
        tasks=[task],
        optimizer_class=Adam,
        # Fixed, global learning rate: no scheduler is attached.
        optimizer_kwargs={"lr": learning_rate, "eps": 1e-03},
        scheduler_class=None,
    )

    # Training model
    callbacks = [
        EarlyStopping(
            monitor="val_loss",
            patience=config["patience"],
        ),
        ProgressBar(),
    ]
    trainer = Trainer(
        default_root_dir=f'~/{config["run_name"]}',
        accelerator=config["accelerator"],
        devices=config["devices"],
        max_epochs=config["n_epochs"],
        callbacks=callbacks,
        log_every_n_steps=1,
        logger=wandb_logger,
    )
    try:
        trainer.fit(model, training_dataloader, validation_dataloader)
    except KeyboardInterrupt:
        logger.warning("[ctrl+c] Exiting gracefully.")

    # Predict on test set and save results to file
    results = get_predictions(
        trainer=trainer,
        model=model,
        dataloader=test_dataloader,
        prediction_columns=[config["target"] + "_pred", config["target"] + "_kappa"],
        additional_attributes=[config["target"], "event_no", "energy"],
    )
    save_results(
        config["db"], config["run_name"] + "_test_set", results, config["archive"], model
    )

    # Predict on validation set and save results to file
    results = get_predictions(
        trainer=trainer,
        model=model,
        dataloader=validation_dataloader,
        prediction_columns=[config["target"] + "_pred", config["target"] + "_kappa"],
        additional_attributes=[config["target"], "event_no", "energy"],
    )
    save_results(
        config["db"], config["run_name"] + "_validation_set", results, config["archive"], model
    )
# Main function definition
def main():
    for target in ["azimuth"]:  # alternatively ["zenith"] or ["zenith", "azimuth"]
        pulsemap = "TWSRTHVInIcePulsesIC"
        nb_neighbours = 8
        n_epochs = 1000
        learning_rate = 1e-05  # values tried: 1e-05, 1e-04, 1e-03
        archive = "/groups/icecube/peter/storage/northern_tracks/Output/fixed_learning_rate"
        tag = "Fixed_learning_rate_=_" + str(learning_rate)
        run_name = f"dynedgev2_{tag}_{target}_{pulsemap}_k={nb_neighbours}_epochs={n_epochs}"

        # Initialise Weights & Biases (W&B) run
        wandb_logger = WandbLogger(
            name=run_name,
            project="NortherenTracks_Benchmark",
            entity="graphnet-team",
            save_dir=WANDB_DIR,
            log_model=True,
        )

        # Configuration
        config = {
            "db": "/groups/icecube/petersen/GraphNetDatabaseRepository/northeren_tracks/dev_northern_tracks_full_part_1/data/dev_northern_tracks_full_part_1.db",
            "pulsemap": pulsemap,
            "batch_size": 512,
            "num_workers": 20,
            "accelerator": "gpu",
            "devices": [0],
            "target": target,
            "n_epochs": n_epochs,
            "patience": 10,
            "archive": archive,
            "run_name": run_name,
            "nb_neighbors": nb_neighbours,
        }

        train(config, wandb_logger, learning_rate)
        wandb_logger.finalize("success")

# Main function call
if __name__ == "__main__":
    main()
```
**What is benchmarked**
How are the zenith and azimuthal resolution curves impacted if we replace the learning-rate schedule with a fixed, global learning rate?

**Target variables used for evaluation**
zenith, azimuth

**Step-by-step**
Run the `benchmark.py` training script with the original settings, but with the following modifications: set `n_epochs = 150` and `patience = 10`, and change the model construction so that the learning-rate schedule is replaced by a fixed, global learning rate, where `learning_rate` is a variable.
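Concretely, the change amounts to something like the following in the model construction. The "before" snippet is illustrative: the `PiecewiseLinearLR` milestones and factors follow the pattern of the graphnet example scripts, not the exact values from the original `benchmark.py`.

```python
# Before (illustrative): learning rate governed by a piecewise-linear schedule.
model = StandardModel(
    detector=detector,
    gnn=gnn,
    tasks=[task],
    optimizer_class=Adam,
    optimizer_kwargs={"lr": 1e-03},
    scheduler_class=PiecewiseLinearLR,
    scheduler_kwargs={
        "milestones": [0, len(training_dataloader) / 2, len(training_dataloader) * config["n_epochs"]],
        "factors": [1e-02, 1, 1e-02],
    },
    scheduler_config={"interval": "step"},
)

# After: fixed, global learning rate; no scheduler at all.
model = StandardModel(
    detector=detector,
    gnn=gnn,
    tasks=[task],
    optimizer_class=Adam,
    optimizer_kwargs={"lr": learning_rate, "eps": 1e-03},
    scheduler_class=None,
)
```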
**Benchmark deliverables**
Reply to this issue with two plots and attach the modified training script and any other relevant changes. Each plot is a `matplotlib.pyplot.subplots` figure where the upper panel shows resolution and the lower panel shows the relative improvement (see the attached example figure):

- zenith resolution vs. true energy, with curves for `dynedge baseline` and `dynedge with fixed, global learning rate = x`, and the relative improvement shown in the bottom panel;
- azimuth resolution vs. true energy, with curves for `dynedge baseline` and `dynedge with fixed, global learning rate = x`, and the relative improvement shown in the bottom panel.

The relative improvement of the modification w.r.t. the baseline is given by `relative_improvement = (1 - resolution_modification / resolution_baseline) * 100`.

**Example figure**
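For concreteness, a minimal sketch of how such a two-panel figure could be put together; the energy binning, resolution values, and labels are placeholders, not results from the runs above:

```python
import matplotlib.pyplot as plt
import numpy as np

# Placeholder inputs: per-energy-bin resolutions for the baseline and the
# modified (fixed learning rate) run. In practice these would be computed
# from the saved prediction files.
energy = np.logspace(2, 5, 10)                        # bin centres, hypothetical
resolution_baseline = np.linspace(20.0, 5.0, 10)      # degrees, hypothetical
resolution_modification = np.linspace(22.0, 5.5, 10)  # degrees, hypothetical

# Relative improvement of the modification w.r.t. the baseline, in percent.
relative_improvement = (1 - resolution_modification / resolution_baseline) * 100

# Upper panel: resolution curves; lower panel: relative improvement.
fig, (ax_res, ax_rel) = plt.subplots(2, 1, sharex=True, figsize=(6, 6))
ax_res.plot(energy, resolution_baseline, label="dynedge baseline")
ax_res.plot(energy, resolution_modification, label="dynedge, fixed learning rate = 1e-05")
ax_res.set_xscale("log")
ax_res.set_ylabel("Zenith resolution [deg.]")
ax_res.legend()

ax_rel.plot(energy, relative_improvement)
ax_rel.axhline(0, color="grey", linestyle="--")
ax_rel.set_xlabel("True energy")
ax_rel.set_ylabel("Relative improvement [%]")

fig.savefig("zenith_resolution.png")
```

The azimuth deliverable uses the same layout, with the azimuth resolution curves in the upper panel.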