Faulty PyTorch checkpoints saved to s3 by ClearML

erogol commented 2 years ago

When I try to load a checkpoint from S3 that is saved by ClearML I get this error.

In [3]: torch.load("/Users/Downloads/best_model_271.pth.tar")
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-3-77c52233c439> in <module>
----> 1 torch.load("/Users/Downloads/best_model_271.pth.tar")

~/miniforge3/lib/python3.9/site-packages/torch/serialization.py in load(f, map_location, pickle_module, **pickle_load_args)
    598             # reset back to the original position.
    599             orig_position = opened_file.tell()
--> 600             with _open_zipfile_reader(opened_file) as opened_zipfile:
    601                 if _is_torchscript_zip(opened_zipfile):
    602                     warnings.warn("'torch.load' received a zip file that looks like a TorchScript archive"

~/miniforge3/lib/python3.9/site-packages/torch/serialization.py in __init__(self, name_or_buffer)
    240 class _open_zipfile_reader(_opener):
    241     def __init__(self, name_or_buffer) -> None:
--> 242         super(_open_zipfile_reader, self).__init__(torch._C.PyTorchFileReader(name_or_buffer))
    243
    244

RuntimeError: PytorchStreamReader failed reading zip archive: failed finding central directory

However, if I try to load the same checkpoint from the local copy, it works.

I've checked the file sizes and they are also the same. I am not sure what is wrong and any help or pointer would be appreciated.

This is our training library that uses ClearML

https://github.com/coqui-ai/Trainer

And checkpoints are created by

https://github.com/coqui-ai/TTS/

Let me know if you need any further input.

jkhenning commented 2 years ago

Hi @erogol ,

Does this happen every time? Can you share a code sample to reproduce?

erogol commented 2 years ago

If you install the Trainer (https://github.com/coqui-ai/Trainer) and set up ClearML

git clone https://github.com/coqui-ai/Trainer
cd Trainer 
pip install -e .

then

import os
from dataclasses import dataclass, field

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST

from trainer import Trainer, TrainerArgs, TrainerConfig, TrainerModel

@dataclass
class MnistModelConfig(TrainerConfig):
    """Configuration for the MNIST reproduction script.

    Subclasses ``TrainerConfig`` and sets the field defaults below. How each
    field is interpreted is decided by the ``trainer`` package, which is not
    shown here.
    """

    # Optimizer name and learning rate.
    optimizer: str = "Adam"
    lr: float = 0.001
    epochs: int = 5
    # NOTE(review): presumably intervals, in steps, for printing, plotting and
    # checkpoint saving -- confirm against TrainerConfig.
    print_step: int = 1
    plot_step: int = 1
    save_step: int = 1
    # Selects "clearml" as the dashboard logger, the integration this report
    # is about.
    dashboard_logger: str = "clearml"
    # NOTE(review): presumably the project and run names shown by the logger
    # -- confirm against TrainerConfig.
    project_name: str = "pytorch-mnist"
    run_name: str = "test-run"

class MnistModel(TrainerModel):
    """Three-layer fully connected MNIST classifier for the reproduction.

    The network maps a flattened 28 * 28 image through hidden layers of 128
    and 256 units to 10 outputs and returns log-probabilities, which pairs
    with the ``NLLLoss`` criterion returned by :meth:`get_criterion`.
    """

    def __init__(self):
        super().__init__()

        # Each MNIST image is (channels, height, width) = (1, 28, 28), so the
        # first layer takes 28 * 28 input features.
        self.layer_1 = nn.Linear(28 * 28, 128)
        self.layer_2 = nn.Linear(128, 256)
        self.layer_3 = nn.Linear(256, 10)

    def forward(self, x):
        """Return log-softmax scores (over dim 1) for a 4-D batch ``x``."""
        # Unpacking exactly four sizes keeps the requirement that ``x`` is 4-D.
        n_samples, _, _, _ = x.shape

        # (b, 1, 28, 28) -> (b, 1*28*28)
        flat = x.view(n_samples, -1)
        hidden = F.relu(self.layer_1(flat))
        hidden = F.relu(self.layer_2(hidden))
        scores = self.layer_3(hidden)

        return F.log_softmax(scores, dim=1)

    def train_step(self, batch, criterion):
        """Run the model on one ``(inputs, targets)`` batch and score it.

        Returns a pair of dicts: ``{"model_outputs": ...}`` with the model
        output and ``{"loss": ...}`` with the value of ``criterion``.
        """
        inputs, targets = batch
        predictions = self(inputs)
        batch_loss = criterion(predictions, targets)
        outputs = {"model_outputs": predictions}
        losses = {"loss": batch_loss}
        return outputs, losses

    def eval_step(self, batch, criterion):
        """Score one ``(inputs, targets)`` batch; same computation as training.

        Returns a pair of dicts: ``{"model_outputs": ...}`` with the model
        output and ``{"loss": ...}`` with the value of ``criterion``.
        """
        inputs, targets = batch
        predictions = self(inputs)
        batch_loss = criterion(predictions, targets)
        outputs = {"model_outputs": predictions}
        losses = {"loss": batch_loss}
        return outputs, losses

    def get_criterion(self):
        """Return a new negative log-likelihood loss module."""
        return nn.NLLLoss()

    def get_data_loader(self, config, assets, is_eval, data_items, verbose, num_gpus, rank=0):
        """Build a ``DataLoader`` over MNIST with a batch size of 8.

        Only ``is_eval`` is read: the training split is used when it is
        falsy, the other split otherwise. The dataset is downloaded into the
        current working directory. The remaining parameters are accepted but
        unused.
        """
        to_tensor = transforms.ToTensor()
        normalize = transforms.Normalize((0.1307,), (0.3081,))
        pipeline = transforms.Compose([to_tensor, normalize])
        dataset = MNIST(os.getcwd(), train=not is_eval, download=True, transform=pipeline)
        return DataLoader(dataset, batch_size=8)

def test_train_mnist():
    """Train ``MnistModel`` with ``MnistModelConfig`` via ``Trainer.fit``.

    Output is written under the current working directory.
    """
    net = MnistModel()
    args = TrainerArgs()
    config = MnistModelConfig()
    out_dir = os.getcwd()
    Trainer(args, config, model=net, output_path=out_dir).fit()

# Run the reproduction only when this file is executed as a script, not on import.
if __name__ == "__main__":
    test_train_mnist()

erogol commented 2 years ago

I guess I found the reason. I was saving torch files like

    with fsspec.open(path, "wb") as f:
        torch.save(state, f)

If I don't use fsspec, the problem disappears.

jkhenning commented 2 years ago

Nice catch @erogol , any specific reason to use fsspec? I assume it's because somehow the file is opened differently?

erogol commented 2 years ago

Just to support saving to the cloud even if we track the experiment locally.

jkhenning commented 2 years ago

I see. Well, it might be worth understanding the difference between that and the normal flow.

If that's fine with you, I can mark this as a feature request 🙂

allegroai / clearml

Faulty PyTorch checkpoints saved to s3 by ClearML #608