@ubbikk What code do you have before creating the Trainer? It is important that no CUDA functions run before the call to `trainer.fit` happens; this is the major limitation of DDP in notebooks. Could you share the full code so we can make sure this is not the case?
Also, if you're running in a notebook, try restarting the kernel and re-executing all cells in order, assuming you have no CUDA calls beforehand.
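If it helps, a minimal sanity check (a sketch, not from the original thread) is to ask PyTorch directly whether a CUDA context already exists in the notebook process before fitting:

```python
import torch

# torch.cuda.is_initialized() reports whether this process has already
# created a CUDA context. For fork-based strategies such as
# "ddp_notebook", this must still be False when trainer.fit is called.
print(torch.cuda.is_initialized())  # should print False
```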
@justusschock Yes, I always restart the kernel. I've also tried shutting down the notebook and restarting, but the result was the same.
@awaelchli I double-checked my code and there are no `.cuda()` calls. Also, `nvidia-smi` doesn't show any GPU activity before I execute the cell with `Trainer.fit`. The code (the ML part) is pretty straightforward:
```python
import itertools

import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from pytorch_lightning import LightningModule, Trainer
from transformers import AutoModel


class SemV2(torch.nn.Module):
    def __init__(self, params):
        super().__init__()
        self.params = params
        self.model_name = params['model_name']
        self.encoder = self.get_encoder()
        hidden_size = self.encoder.config.hidden_size
        targets_num = params['targets_num']
        self.classifier = torch.nn.Linear(hidden_size, targets_num)
        self.criterion = torch.nn.BCEWithLogitsLoss(reduction='none')

    def get_encoder(self):
        if self.model_name.startswith(('bert', 'roberta')):
            return AutoModel.from_pretrained(self.model_name, add_pooling_layer=False)
        elif self.model_name.startswith('distilbert'):
            return AutoModel.from_pretrained(self.model_name)
        else:
            raise Exception(f'Unsupported model {self.model_name}')

    def forward(self, input_ids, attention_mask, targets, targets_masks, tokens_masks):
        x = self.encoder(input_ids, attention_mask=attention_mask)
        x = x['last_hidden_state']
        logits = self.classifier(x)
        loss = self.criterion(logits, targets.float())
        # loss = attention_mask.unsqueeze(-1)*loss
        loss = loss * tokens_masks
        loss = loss.transpose(0, 1) * targets_masks
        loss = loss.mean()
        return logits, targets, loss

    def infer(self, input_ids, attention_mask):
        x = self.encoder(input_ids, attention_mask=attention_mask)
        x = x['last_hidden_state']
        logits = self.classifier(x)
        return logits

class Estimator(LightningModule):
    def __init__(self, model, params, train_ds, val_ds, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = model
        self.params = params
        self.train_ds = train_ds
        self.val_ds = val_ds
        self.epochs = params['epochs']
        self.batch_size = params['batch_size']
        self.train_losses = []
        self.val_losses = []
        self.train_target = []
        self.train_preds = []
        self.val_target = []
        self.val_preds = []
        self.test_preds = None
        self.pp = []

    def training_step(self, batch, batch_idx):
        logits, target, loss = self.model(*batch)
        logits = logits.detach()
        preds = torch.sigmoid(logits).cpu().numpy()
        logits = logits.cpu().numpy()
        target = target.cpu().numpy()
        loss_val = loss.detach().cpu().item()
        self.train_losses.append(loss_val)
        self.train_target.append(target)
        self.train_preds.append(preds)
        self.log('train/loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        with torch.no_grad():
            logits, target, loss = self.model(*batch)
        logits = logits.detach()
        preds = torch.sigmoid(logits).cpu().numpy()
        logits = logits.cpu().numpy()
        target = target.cpu().numpy()
        loss_val = loss.detach().cpu().item()
        self.val_target.append(target)
        self.val_preds.append(preds)
        self.val_losses.append(loss_val)

    def on_validation_epoch_end(self) -> None:
        # extract_predictions_for_dataset, calc_accuracy and TARGETS_NUM
        # are defined elsewhere in the notebook
        preds = list(itertools.chain(*self.val_preds))
        extract_predictions_for_dataset(self.val_ds.examples, preds, [0.5] * TARGETS_NUM, [0.2] * TARGETS_NUM)
        acc = calc_accuracy(self.val_ds)
        self.val_accuracy = acc
        print(acc)

    def test_step(self, batch, batch_idx):
        pass

    def test_epoch_end(self, outputs) -> None:
        pass

    def on_train_epoch_start(self) -> None:
        self.train_target = []
        self.train_preds = []

    def on_validation_epoch_start(self) -> None:
        self.val_target = []
        self.val_preds = []

    def configure_optimizers(self):
        return Adam(self.model.parameters(), lr=3e-5)

model_name = 'roberta-base'
suffix = model_name.replace('-', '_')
example_per_template = 10
val_sz = 10_000
epochs = 1
batch_size = 16

raw_ds_fp = f'data/raw_ds_{example_per_template}_{suffix}'

params = {
    'epochs': epochs,
    'batch_size': batch_size,
    'targets_num': TARGETS_NUM,
    'model_name': model_name,
}

# rb, SemDataset and collate are defined elsewhere in the notebook
raw_ds = rb(raw_ds_fp)
train_ds = SemDataset(raw_ds[:-val_sz])
val_ds = SemDataset(raw_ds[-val_sz:])
train_loader = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate, shuffle=False)

model = SemV2(params)
estimator = Estimator(model, params, train_ds, val_ds)

trainer = Trainer(
    max_epochs=epochs,
    # precision=16,
    # progress_bar_refresh_rate=20,
    gradient_clip_val=1,
    num_sanity_val_steps=0,
    # terminate_on_nan=True,
    # val_check_interval=1.,
    # overfit_batches=1,
    accelerator="gpu",
    devices=2,
    strategy="ddp_notebook",
    enable_checkpointing=False,
)
trainer.fit(estimator, train_loader, val_loader)
```
@ubbikk I tried but couldn't reproduce the error, given the same versions of lightning and pytorch. Since your code is missing some definitions, I used our bug-report model with the same trainer settings as yours:
```python
import os

import torch
import transformers
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning import LightningModule, Trainer


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("valid_loss", loss)

    def test_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)


def run():
    train_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    val_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    test_data = DataLoader(RandomDataset(32, 64), batch_size=2)

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=os.getcwd(),
        limit_train_batches=1,
        limit_val_batches=1,
        limit_test_batches=1,
        max_epochs=1,
        precision=16,
        gradient_clip_val=1,
        num_sanity_val_steps=0,
        accelerator="gpu",
        enable_checkpointing=False,
        devices=2,
        strategy="ddp_notebook",
    )
    trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)
    trainer.test(model, dataloaders=test_data)


if __name__ == "__main__":
    run()
```
Can you confirm this runs fine?
I'm also a bit confused as to why you are seeing this error message:

```
RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
```
If you were using 1.8.5, you should actually see this error message:

```
RuntimeError: Lightning can't create new processes if CUDA is already initialized. Did you manually call torch.cuda.* functions, have moved the model to the device, or allocated memory on the GPU any other way? Please remove any such calls, or change the selected strategy. You will have to restart the Python kernel.
```

We specifically override the message to give better guidance for the user.
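For context, the first message is stock PyTorch behaviour: once a process has initialized CUDA, a forked child process cannot use CUDA again. A minimal standalone reproduction (hypothetical, not from this issue) would look like:

```python
import torch
import torch.multiprocessing as mp

def worker(rank):
    # The forked child inherits the parent's CUDA state and raises
    # "Cannot re-initialize CUDA in forked subprocess" on first CUDA use.
    torch.zeros(1, device="cuda")

if __name__ == "__main__":
    torch.zeros(1, device="cuda")  # initializes CUDA in the parent
    ctx = mp.get_context("fork")
    p = ctx.Process(target=worker, args=(0,))
    p.start()
    p.join()
```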
Could you once again `print(pytorch_lightning.__version__)` in your code to make absolutely sure you are using 1.8.5?
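For example (assuming the 1.x import name `pytorch_lightning`):

```python
import pytorch_lightning
print(pytorch_lightning.__version__)  # expecting 1.8.5 or 1.8.5.post0
```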
Using your example I was able to find the problem; this line caused the error:

```python
from spacy import displacy
```

It's a module for visualization in Jupyter. I'm not sure, but it seems the problem is that it works by running a web server inside the notebook, and that's incompatible with CUDA.
@awaelchli @justusschock thank you very much!
@ubbikk Glad you were able to find the root cause. Depending on how you use this package, you might get away with importing it locally in the function where you need it. This way, it gets imported after DDP has created its processes, which should then be fine. But that might not fit your use case.
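As a sketch of that suggestion (the function name and render call are illustrative, not from the thread):

```python
def show_dependencies(doc):
    # Deferring the import to call time means displacy (and whatever
    # side effects its import has) only loads after DDP has already
    # created its worker processes.
    from spacy import displacy
    return displacy.render(doc, style="dep", jupyter=True)
```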
Bug description
I'm trying to train on multiple GPUs from Jupyter. I followed https://pytorch-lightning.readthedocs.io/en/stable/accelerators/gpu_intermediate.html#distributed-data-parallel-in-notebooks, but it throws an exception:
How to reproduce the bug
Error messages and logs
Environment
Current environment
```
* CUDA:
    - GPU:
        - Tesla V100-SXM2-16GB
        - Tesla V100-SXM2-16GB
        - Tesla V100-SXM2-16GB
        - Tesla V100-SXM2-16GB
    - available: True
    - version: 11.7
* Lightning:
    - lightning-utilities: 0.4.2
    - pytorch-lightning: 1.8.5.post0
    - torch: 1.13.0
    - torch-model-archiver: 0.5.3b20220226
    - torch-workflow-archiver: 0.2.4b20220513
    - torchaudio: 0.13.0
    - torchmetrics: 0.11.0
    - torchserve: 0.6.0b20220513
    - torchtext: 0.14.0
    - torchvision: 0.14.0
* Packages:
    aiohttp: 3.8.3 - aiosignal: 1.3.1 - aniso8601: 9.0.1 - ansi2html: 1.8.0 - anyio: 3.6.2 - argon2-cffi: 21.3.0 - argon2-cffi-bindings: 21.2.0 - arrow: 1.2.3 - asttokens: 2.1.0 - async-timeout: 4.0.2 - attrs: 22.1.0 - awscli: 1.22.101 - babel: 2.11.0 - backcall: 0.2.0 - backports.functools-lru-cache: 1.6.4 - beautifulsoup4: 4.11.1 - bleach: 5.0.1 - blis: 0.7.8 - bokeh: 2.4.3
    - boto3: 1.21.46 - botocore: 1.24.46 - brotlipy: 0.7.0 - captum: 0.5.0 - catalogue: 2.0.8 - certifi: 2022.9.24 - cffi: 1.15.1 - charset-normalizer: 2.1.1 - click: 8.1.3 - cloudpickle: 2.2.0 - colorama: 0.4.3 - confection: 0.0.3 - contextlib2: 21.6.0 - cryptography: 38.0.3 - cycler: 0.11.0 - cymem: 2.0.7 - dataclasses: 0.8 - debugpy: 1.6.3 - decorator: 5.1.1 - defusedxml: 0.7.1 - dill: 0.3.6
    - docutils: 0.15.2 - dparse: 0.6.2 - entrypoints: 0.4 - executing: 1.2.0 - fastai: 2.1.10 - fastcore: 1.5.27 - fastjsonschema: 2.16.2 - fastprogress: 1.0.3 - filelock: 3.6.0 - flask: 2.2.2 - flask-restful: 0.3.9 - flit-core: 3.8.0 - fonttools: 4.38.0 - frozenlist: 1.3.3 - fsspec: 2022.11.0 - future: 0.18.2 - google-pasta: 0.2.0 - gym: 0.26.2 - gym-notices: 0.0.8 - horovod: 0.26.1
    - huggingface-hub: 0.11.1 - idna: 3.4 - imageio: 2.16.2 - importlib-metadata: 4.13.0 - importlib-resources: 5.10.0 - ipykernel: 6.17.1 - ipython: 8.6.0 - ipython-genutils: 0.2.0 - ipywidgets: 8.0.2 - itsdangerous: 2.1.2 - jedi: 0.18.1 - jinja2: 3.1.2 - jmespath: 1.0.1 - joblib: 1.2.0 - json5: 0.9.5 - jsonschema: 4.17.0 - jupyter-client: 7.4.4 - jupyter-core: 5.0.0 - jupyter-server: 1.23.1
    - jupyterlab: 3.3.4 - jupyterlab-pygments: 0.2.2 - jupyterlab-server: 2.16.2 - jupyterlab-widgets: 3.0.3 - kiwisolver: 1.4.4 - langcodes: 3.3.0 - lightning-utilities: 0.4.2 - llvmlite: 0.39.1 - markupsafe: 2.1.1 - matplotlib: 3.5.3 - matplotlib-inline: 0.1.6 - mistune: 2.0.4 - multidict: 6.0.3 - multiprocess: 0.70.14 - munkres: 1.1.4 - murmurhash: 1.0.9 - nbclassic: 0.4.8 - nbclient: 0.7.0
    - nbconvert: 7.2.4 - nbformat: 5.7.0 - nest-asyncio: 1.5.6 - notebook: 6.4.12 - notebook-shim: 0.2.2 - numba: 0.56.4 - numpy: 1.23.4 - nvgpu: 0.9.0 - packaging: 21.3 - pandas: 1.4.4 - pandocfilters: 1.5.0 - parso: 0.8.3 - pathos: 0.3.0 - pathy: 0.6.2 - patsy: 0.5.3 - pexpect: 4.8.0 - pickleshare: 0.7.5 - pillow: 9.0.1 - pip: 22.3.1 - pkgutil-resolve-name: 1.3.10 - platformdirs: 2.5.2
    - plotly: 5.6.0 - pox: 0.3.2 - ppft: 1.7.6.6 - preshed: 3.0.8 - prometheus-client: 0.15.0 - prompt-toolkit: 3.0.32 - protobuf: 3.20.1 - protobuf3-to-dict: 0.1.5 - psutil: 5.9.4 - ptyprocess: 0.7.0 - pure-eval: 0.2.2 - pyarrow: 10.0.0 - pyasn1: 0.4.8 - pybind11: 2.9.2 - pybind11-global: 2.9.2 - pycparser: 2.21 - pydantic: 1.10.2 - pyfunctional: 1.4.3 - pygame: 2.1.2 - pygments: 2.13.0
    - pynvml: 11.4.1 - pyopenssl: 22.1.0 - pyparsing: 3.0.9 - pyqt5: 5.12.3 - pyqt5-sip: 4.19.18 - pyqtchart: 5.12 - pyqtwebengine: 5.12.1 - pyrsistent: 0.19.2 - pysocks: 1.7.1 - python-dateutil: 2.8.2 - pytorch-lightning: 1.8.5.post0 - pytz: 2022.6 - pyyaml: 5.4.1 - pyzmq: 24.0.1 - regex: 2022.10.31 - requests: 2.28.1 - rsa: 4.7.2 - s3fs: 0.4.2 - s3transfer: 0.5.2 - sagemaker: 2.116.0
    - schema: 0.7.5 - scikit-learn: 1.0 - scipy: 1.8.1 - seaborn: 0.11.2 - send2trash: 1.8.0 - setuptools: 65.5.1 - shap: 0.40.0 - shellingham: 1.5.0 - six: 1.16.0 - sklearn: 0.0.post1 - slicer: 0.0.7 - smart-open: 5.2.1 - smclarify: 0.2 - smdebug-rulesconfig: 1.0.1 - sniffio: 1.3.0 - soupsieve: 2.3.2.post1 - spacy: 3.4.2 - spacy-legacy: 3.0.10 - spacy-loggers: 1.0.3 - srsly: 2.4.5
    - stack-data: 0.6.0 - statsmodels: 0.13.5 - tabulate: 0.9.0 - tenacity: 8.1.0 - tensorboardx: 2.5.1 - termcolor: 2.1.0 - terminado: 0.17.0 - thinc: 8.1.5 - threadpoolctl: 3.1.0 - tinycss2: 1.2.1 - tokenizers: 0.13.2 - toml: 0.10.2 - torch: 1.13.0 - torch-model-archiver: 0.5.3b20220226 - torch-workflow-archiver: 0.2.4b20220513 - torchaudio: 0.13.0 - torchmetrics: 0.11.0
    - torchserve: 0.6.0b20220513 - torchtext: 0.14.0 - torchvision: 0.14.0 - tornado: 6.2 - tqdm: 4.63.2 - traitlets: 5.5.0 - transformers: 4.25.1 - typer: 0.4.2 - typing-extensions: 4.4.0 - unicodedata2: 15.0.0 - urllib3: 1.26.11 - wasabi: 0.10.0 - wcwidth: 0.2.5 - webencodings: 0.5.1 - websocket-client: 1.4.2 - werkzeug: 2.2.2 - wheel: 0.38.4 - widgetsnbextension: 4.0.3 - yarl: 1.8.2 - zipp: 3.10.0
* System:
    - OS: Linux
    - architecture: 64bit, ELF
    - processor: x86_64
    - python: 3.9.13
    - version: #26~20.04.1-Ubuntu SMP Sat Oct 15 03:22:07 UTC 2022
```
More info
I've tried 1.8.5.post0 and 1.8.5.
cc @justusschock @awaelchli