Open jakubMitura14 opened 2 years ago
Cross-referencing from https://discuss.ray.io/t/checkpointing-errors-on-complex-models/7637/4. While using TuneReportCheckpointCallback I get:

Trial returned a result which did not include the specified metric

When using TuneReportCallback, no such error occurs.
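For quick reference, the only difference between the failing and the working configuration is the callback choice (extracted from the full reproduction below; the metrics dict maps Tune metric names to the names logged by Lightning):

```python
metrics = {"loss": "ptl/val_loss", "acc": "ptl/val_accuracy"}

# Fails with "Trial returned a result which did not include the specified metric":
callbacks = [TuneReportCheckpointCallback(metrics, on="validation_end", filename="checkpointtt")]

# Works:
# callbacks = [TuneReportCallback(metrics, on="validation_end")]
```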
pytorch-lightning 1.6.5, ray 2.0.0, ray-lightning 0.3.0
Full list of Python packages: https://docs.google.com/document/d/1PlgcDDYKZ9qB-7YioISuEOFIfWpTk1xFYFA2dW4yqUo/edit
As indicated in the section of the code marked with stars (***), TuneReportCheckpointCallback gives the error while TuneReportCallback does not.

Minimal working example (the metrics are not found because of the checkpointing callback):

Code:
"""Simple example using RayAccelerator and Ray Tune""" import functools import glob import importlib.util import math import multiprocessing as mp import operator import os import shutil import sys import tempfile import time import warnings from datetime import datetime from functools import partial from glob import glob from os import path as pathOs from os.path import basename, dirname, exists, isdir, join, split from pathlib import Path #from picai_eval.picai_eval import evaluate_case from statistics import mean from typing import List, Optional, Sequence, Tuple, Union import gdown import matplotlib.pyplot as plt import monai import numpy as np import pandas as pd import pytorch_lightning as pl import ray import seaborn as sns import SimpleITK as sitk import torch import torch.nn as nn import torch.nn.functional as F import torchio import torchio as tio import torchmetrics from pl_bolts.datamodules.mnist_datamodule import MNISTDataModule from pytorch_lightning import (Callback, LightningDataModule, LightningModule, Trainer) from pytorch_lightning.strategies import Strategy from ray import air, tune from ray.air import session from ray.tune import CLIReporter from ray.tune.integration.pytorch_lightning import ( TuneReportCallback, TuneReportCheckpointCallback) from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining from ray_lightning import RayShardedStrategy, RayStrategy from ray_lightning.tune import TuneReportCallback, get_tune_resources from report_guided_annotation import extract_lesion_candidates from scipy.ndimage import gaussian_filter from sklearn.model_selection import train_test_split from torch.nn.intrinsic.qat import ConvBnReLU3d from torch.utils.cpp_extension import load from torch.utils.data import DataLoader, Dataset, random_split from torchmetrics import Precision from torchmetrics.functional import precision_recall ray.init(num_cpus=24) data_dir = '/home/sliceruser/mnist' MNISTDataModule(data_dir=data_dir).prepare_data() num_cpus_per_worker=6 test_l_dir = '/home/sliceruser/test_l_dir' class netaA(nn.Module): def __init__(self, config ) -> None: super().__init__() layer_1, layer_2 = config["layer_1"], config["layer_2"] self.model = nn.Sequential( torch.nn.Linear(28 * 28, layer_1), torch.nn.Linear(layer_1, layer_2), torch.nn.Linear(layer_2, 10) ) def forward(self, x): return self.model(x) class LightningMNISTClassifier(pl.LightningModule): def __init__(self, config, data_dir=None): super(LightningMNISTClassifier, self).__init__() self.data_dir = data_dir or os.getcwd() self.lr = config["lr"] self.batch_size = config["batch_size"] self.accuracy = torchmetrics.Accuracy() self.netA= netaA(config) def forward(self, x): batch_size, channels, width, height = x.size() x = x.view(batch_size, -1) x= self.netA(x) x = F.log_softmax(x, dim=1) return x def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=self.lr) def training_step(self, train_batch, batch_idx): x, y = train_batch logits = self.forward(x) loss = F.nll_loss(logits, y.long()) acc = self.accuracy(logits, y) self.log("ptl/train_loss", loss) self.log("ptl/train_accuracy", acc) return loss def validation_step(self, val_batch, batch_idx): x, y = val_batch logits = self.forward(x) loss = F.nll_loss(logits, y.long()) acc = self.accuracy(logits, y) return {"val_loss": loss, "val_accuracy": acc} def validation_epoch_end(self, outputs): avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean() avg_acc = torch.stack([x["val_accuracy"] for x in outputs]).mean() self.log("ptl/val_loss", 
avg_loss) self.log("ptl/val_accuracy", avg_acc) def train_mnist(config, data_dir=None, num_epochs=10, num_workers=1, use_gpu=True, callbacks=None): model = LightningMNISTClassifier(config, data_dir) callbacks = callbacks or [] print(" aaaaaaaaaa ") trainer = pl.Trainer( max_epochs=num_epochs, callbacks=callbacks, progress_bar_refresh_rate=0, strategy=RayStrategy( num_workers=num_workers, use_gpu=use_gpu))#, init_hook=download_data dm = MNISTDataModule( data_dir=data_dir, num_workers=2, batch_size=config["batch_size"]) trainer.fit(model, dm) def tune_mnist(data_dir, num_samples=2, num_epochs=10, num_workers=2, use_gpu=True): config = { "layer_1": tune.choice([32, 64, 128]), "layer_2": tune.choice([64, 128, 256]), "lr": tune.loguniform(1e-4, 1e-1), "batch_size": tune.choice([32, 64, 128]), } metrics = {"loss": "ptl/val_loss", "acc": "ptl/val_accuracy"} #*********************************************** #do not work callbacks = [TuneReportCheckpointCallback(metrics, on="validation_end",filename="checkpointtt")] #works #callbacks = [TuneReportCallback(metrics, on="validation_end")] #*********************************************** trainable = tune.with_parameters( train_mnist, data_dir=data_dir, num_epochs=num_epochs, num_workers=num_workers, use_gpu=use_gpu, callbacks=callbacks) analysis = tune.run( trainable, metric="loss", mode="min", config=config, num_samples=num_samples, resources_per_trial=get_tune_resources( num_workers=num_workers, use_gpu=use_gpu), name="tune_mnist") print("Best hyperparameters found were: ", analysis.best_config) tune_mnist(data_dir)
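One detail worth flagging in the script above: TuneReportCheckpointCallback comes from ray.tune.integration.pytorch_lightning, while the later from ray_lightning.tune import shadows TuneReportCallback with the ray-lightning variant, so the failing and working callbacks come from different packages. Below is a sketch of an all-ray-lightning variant, assuming ray_lightning.tune in 0.3.0 also exports a TuneReportCheckpointCallback (worth verifying against the installed package before relying on it):

```python
# Sketch: take both callbacks from ray-lightning, so that checkpoint
# reporting goes through the same RayStrategy-aware code path.
# Assumption: ray_lightning.tune exports TuneReportCheckpointCallback in 0.3.0.
from ray_lightning.tune import TuneReportCheckpointCallback

metrics = {"loss": "ptl/val_loss", "acc": "ptl/val_accuracy"}
callbacks = [TuneReportCheckpointCallback(
    metrics, on="validation_end", filename="checkpointtt")]
```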
Full error: https://docs.google.com/document/d/17bcUZOQsJZipf0nkRr2uRiDCkekYia5uHbm0niqLyHM/edit?usp=sharing
Hey,
Any updates on this? Were you able to solve it?
No; frankly, because I could not solve it, I migrated to pure Optuna.
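For reference, the "pure Optuna" route mentioned here could look roughly like the sketch below, reusing LightningMNISTClassifier and data_dir from the reproduction above (the objective function and trial count are illustrative, not the exact code that was used):

```python
# Sketch of a plain-Optuna replacement for the Tune setup above.
# Assumes LightningMNISTClassifier and data_dir are defined as in the
# reproduction script; hyperparameter ranges mirror the Tune config.
import optuna
import pytorch_lightning as pl
from pl_bolts.datamodules.mnist_datamodule import MNISTDataModule


def objective(trial):
    config = {
        "layer_1": trial.suggest_categorical("layer_1", [32, 64, 128]),
        "layer_2": trial.suggest_categorical("layer_2", [64, 128, 256]),
        "lr": trial.suggest_float("lr", 1e-4, 1e-1, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [32, 64, 128]),
    }
    model = LightningMNISTClassifier(config, data_dir)
    dm = MNISTDataModule(
        data_dir=data_dir, num_workers=2, batch_size=config["batch_size"])
    trainer = pl.Trainer(max_epochs=10, enable_progress_bar=False)
    trainer.fit(model, dm)
    # Return the metric logged in validation_epoch_end.
    return trainer.callback_metrics["ptl/val_loss"].item()


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=2)
print("Best hyperparameters found were:", study.best_params)
```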