Closed ngocnd2402 closed 1 year ago
Hi @ngocnd2402. This is a general error of Python on Windows. We do not support Windows officially, but if you provide some more information, perhaps we could help. Could you send information on the OS, relevant HW (GPU + CUDA), and Python environment? Can you share the full error stack trace?
I have the same problem. Environment: Windows Server 2022, Torch 1.13.0, Quadro P4000 GPU, Python 3.8, CUDA 12.0.
AttributeError Traceback (most recent call last) Cell In[1], line 1 ----> 1 from super_gradients.training import models 3 yolo_nas_l = models.get("yolo_nas_l", pretrained_weights="coco")
File ~\anaconda3\envs\yolo-nas\lib\site-packages\super_gradients\__init__.py:2 1 from super_gradients.common import init_trainer, is_distributed, object_names ----> 2 from super_gradients.training import losses, utils, datasets_utils, DataAugmentation, Trainer, KDTrainer, QATTrainer 3 from super_gradients.common.registry.registry import ARCHITECTURES 4 from super_gradients.sanity_check import env_sanity_check
File ~\anaconda3\envs\yolo-nas\lib\site-packages\super_gradients\training\__init__.py:2 1 # PACKAGE IMPORTS FOR EXTERNAL USAGE ----> 2 import super_gradients.training.utils.distributed_training_utils as distributed_training_utils 3 from super_gradients.training.datasets import datasets_utils, DataAugmentation 4 from super_gradients.training.sg_trainer import Trainer
File ~\anaconda3\envs\yolo-nas\lib\site-packages\super_gradients\training\utils\distributed_training_utils.py:13 11 from torch.distributed.elastic.multiprocessing import Std 12 from torch.distributed.elastic.multiprocessing.errors import record ---> 13 from torch.distributed.launcher.api import LaunchConfig, elastic_launch 15 from super_gradients.common.environment.ddp_utils import init_trainer 16 from super_gradients.common.data_types.enum import MultiGPUMode
File ~\anaconda3\envs\yolo-nas\lib\site-packages\torch\distributed\launcher\__init__.py:10 1 #!/usr/bin/env/python3 2 3 # Copyright (c) Facebook, Inc. and its affiliates. (...) 6 # This source code is licensed under the BSD-style license found in the 7 # LICENSE file in the root directory of this source tree. ---> 10 from torch.distributed.launcher.api import ( # noqa: F401 11 LaunchConfig, 12 elastic_launch, 13 launch_agent, 14 )
File ~\anaconda3\envs\yolo-nas\lib\site-packages\torch\distributed\launcher\api.py:15 13 import torch.distributed.elastic.rendezvous.registry as rdzv_registry 14 from torch.distributed.elastic import events, metrics ---> 15 from torch.distributed.elastic.agent.server.api import WorkerSpec 16 from torch.distributed.elastic.agent.server.local_elastic_agent import LocalElasticAgent 17 from torch.distributed.elastic.multiprocessing import SignalException, Std
File ~\anaconda3\envs\yolo-nas\lib\site-packages\torch\distributed\elastic\agent\server\__init__.py:40 9 """ 10 The elastic agent is the control plane of torchelastic. It is a process 11 that launches and manages underlying worker processes. The agent is (...) 28 in the same job) to make a collective decision. 29 """ 31 from .api import ( # noqa: F401 32 ElasticAgent, 33 RunResult, (...) 38 WorkerState, 39 ) ---> 40 from .local_elastic_agent import TORCHELASTIC_ENABLE_FILE_TIMER, TORCHELASTIC_TIMER_FILE
File ~\anaconda3\envs\yolo-nas\lib\site-packages\torch\distributed\elastic\agent\server\local_elastic_agent.py:19 16 import uuid 17 from typing import Any, Dict, Optional, Tuple ---> 19 import torch.distributed.elastic.timer as timer 20 from torch.distributed.elastic import events 22 from torch.distributed.elastic.agent.server.api import ( 23 RunResult, 24 SimpleElasticAgent, (...) 27 WorkerState, 28 )
File ~\anaconda3\envs\yolo-nas\lib\site-packages\torch\distributed\elastic\timer\__init__.py:44 42 from .api import TimerClient, TimerRequest, TimerServer, configure, expires # noqa: F401 43 from .local_timer import LocalTimerClient, LocalTimerServer # noqa: F401 ---> 44 from .file_based_local_timer import FileTimerClient, FileTimerServer, FileTimerRequest
File ~\anaconda3\envs\yolo-nas\lib\site-packages\torch\distributed\elastic\timer\file_based_local_timer.py:63
51 def to_json(self) -> str:
52 return json.dumps(
53 {
54 "version": self.version,
(...)
59 },
60 )
---> 63 class FileTimerClient(TimerClient):
64 """
65 Client side of ``FileTimerServer``. This client is meant to be used
66 on the same host that the ``FileTimerServer`` is running on and uses
(...)
79 negative or zero signal will not kill the process.
80 """
81 def __init__(self, file_path: str, signal=signal.SIGKILL) -> None:
File ~\anaconda3\envs\yolo-nas\lib\site-packages\torch\distributed\elastic\timer\file_based_local_timer.py:81, in FileTimerClient()
63 class FileTimerClient(TimerClient):
64 """
65 Client side of ``FileTimerServer``. This client is meant to be used
66 on the same host that the ``FileTimerServer`` is running on and uses
(...)
79 negative or zero signal will not kill the process.
80 """
---> 81 def __init__(self, file_path: str, signal=signal.SIGKILL) -> None:
82 super().__init__()
83 self._file_path = file_path
AttributeError: module 'signal' has no attribute 'SIGKILL'
I solved it by opening the file named in the error (file_based_local_timer.py) and replacing SIGKILL with SIGILL.
cool. many thanks. works
@ngocnd2402, does this solve your issue?
yeah, it works. Thanks <3
Describe the bug
A clear and concise description of what the bug is.
To Reproduce
Steps to reproduce the behavior:
Expected behavior
A clear and concise description of what you expected to happen.
Screenshots
If applicable, add screenshots to help explain your problem.
Environment:
Additional context
Add any other context about the problem here.