jina-ai / jina

☁️ Build multimodal AI applications with cloud-native stack
https://docs.jina.ai
Apache License 2.0

CUDA_VISIBLE_DEVICES=RR & env={"CUDA_VISIBLE_DEVICES":"RR"} do not work #5653

Closed · ruanrz closed 1 year ago

ruanrz commented 1 year ago

I tried to deploy multiple replicas with multiple GPUs, but CUDA_VISIBLE_DEVICES=RR and env={"CUDA_VISIBLE_DEVICES":"RR"} do not work as the documentation says.

Code

# CUDA_VISIBLE_DEVICES=RR JINA_MP_START_METHOD=spawn python test_flow.py

from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler
from diffusers import DPMSolverMultistepScheduler
from jina import Executor, requests, Flow
import torch
import time

class ZRExecutor(Executor):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        print('torch.cuda.device_count()', torch.cuda.device_count())
        print('torch.cuda.current_device()', torch.cuda.current_device())
        before_load = torch.cuda.memory_allocated() / 1024 / 1024
        print('before load model:',before_load)
        model_path = "#######"
        lms = EulerAncestralDiscreteScheduler(
            beta_start=0.00085, 
            beta_end=0.012, 
            beta_schedule="scaled_linear"
        )

        pipe = DiffusionPipeline.from_pretrained(
            model_path,
            cache_dir="./huggingface",
            resume_download=True,
            custom_pipeline="lpw_stable_diffusion",
            torch_dtype=torch.float16,
            scheduler=lms,
            use_auth_token="#######",
            safety_checker=None
            )

        pipe.to("cuda")

        print('after load model:', torch.cuda.memory_allocated() / 1024 / 1024)
        # before_load is already in MiB, so convert bytes to MiB before subtracting
        print('used memory:', torch.cuda.memory_allocated() / 1024 / 1024 - before_load)

def main():
    f = Flow().add(uses=ZRExecutor, name='testens', replicas=3, env={"CUDA_VISIBLE_DEVICES": "RR"})
    with f:
        f.block()

if __name__ == '__main__': 
    main()

It raises this error:

ERROR  testens/rep-0@778717 RuntimeError('No CUDA GPUs are available') during <class 'jina.serve.runtimes.worker.WorkerRuntime'> initialization                                 [02/03/23 17:26:04]
        add "--quiet-error" to suppress the exception details
       Traceback (most recent call last):
         File "/root/envs/(***)/lib/python3.8/site-packages/jina/orchestrate/pods/__init__.py", line 76, in run
           runtime = runtime_cls(
         File "/root/envs/(***)/lib/python3.8/site-packages/jina/serve/runtimes/worker/__init__.py", line 36, in __init__
           super().__init__(args, **kwargs)
         File "/root/envs/(***)/lib/python3.8/site-packages/jina/serve/runtimes/asyncio.py", line 88, in __init__
           self._loop.run_until_complete(self.async_setup())
         File "/root/envs/(***)/lib/python3.8/asyncio/base_events.py", line 616, in run_until_complete
           return future.result()
         File "/root/envs/(***)/lib/python3.8/site-packages/jina/serve/runtimes/worker/__init__.py", line 101, in async_setup
           self._request_handler = WorkerRequestHandler(
         File "/root/envs/(***)/lib/python3.8/site-packages/jina/serve/runtimes/worker/request_handling.py", line 49, in __init__
           self._load_executor(
         File "/root/envs/(***)/lib/python3.8/site-packages/jina/serve/runtimes/worker/request_handling.py", line 140, in _load_executor
           self._executor: BaseExecutor = BaseExecutor.load_config(
         File "/root/envs/(***)/lib/python3.8/site-packages/jina/jaml/__init__.py", line 760, in load_config
           obj = JAML.load(tag_yml, substitute=False, runtime_args=runtime_args)
         File "/root/envs/(***)/lib/python3.8/site-packages/jina/jaml/__init__.py", line 174, in load
           r = yaml.load(stream, Loader=get_jina_loader_with_runtime(runtime_args))
         File "/root/envs/(***)/lib/python3.8/site-packages/yaml/__init__.py", line 81, in load
           return loader.get_single_data()
         File "/root/envs/(***)/lib/python3.8/site-packages/yaml/constructor.py", line 51, in get_single_data
           return self.construct_document(node)
         File "/root/envs/(***)/lib/python3.8/site-packages/yaml/constructor.py", line 55, in construct_document
           data = self.construct_object(node)
         File "/root/envs/(***)/lib/python3.8/site-packages/yaml/constructor.py", line 100, in construct_object
           data = constructor(self, node)
         File "/root/envs/(***)/lib/python3.8/site-packages/jina/jaml/__init__.py", line 582, in _from_yaml
           return get_parser(cls, version=data.get('version', None)).parse(
         File "/root/envs/(***)/lib/python3.8/site-packages/jina/jaml/parsers/executor/legacy.py", line 45, in parse
           obj = cls(
         File "/root/envs/(***)/lib/python3.8/site-packages/jina/serve/executors/decorators.py", line 60, in arg_wrapper
           f = func(self, *args, **kwargs)
         File "/root/envs/(***)/lib/python3.8/site-packages/jina/serve/helper.py", line 71, in arg_wrapper
           f = func(self, *args, **kwargs)
         File "/root/autodl-nas/zrr/jina_test/test_flow.py", line 30, in __init__
           pipe.to("cuda")
         File "/root/envs/(***)/lib/python3.8/site-packages/diffusers/pipelines/pipeline_utils.py", line 272, in to
           module.to(torch_device)
         File "/root/envs/(***)/lib/python3.8/site-packages/transformers/modeling_utils.py", line 1682, in to
           return super().to(*args, **kwargs)
         File "/root/envs/(***)/lib/python3.8/site-packages/torch/nn/modules/module.py", line 987, in to
           return self._apply(convert)
         File "/root/envs/(***)/lib/python3.8/site-packages/torch/nn/modules/module.py", line 639, in _apply
           module._apply(fn)
         File "/root/envs/(***)/lib/python3.8/site-packages/torch/nn/modules/module.py", line 639, in _apply
           module._apply(fn)
         File "/root/envs/(***)/lib/python3.8/site-packages/torch/nn/modules/module.py", line 639, in _apply
           module._apply(fn)
         File "/root/envs/(***)/lib/python3.8/site-packages/torch/nn/modules/module.py", line 662, in _apply
           param_applied = fn(param)
         File "/root/envs/(***)/lib/python3.8/site-packages/torch/nn/modules/module.py", line 985, in convert
           return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
         File "/root/envs/(***)/lib/python3.8/site-packages/torch/cuda/__init__.py", line 229, in _lazy_init
           torch._C._cuda_init()
       RuntimeError: No CUDA GPUs are available

If I remove CUDA_VISIBLE_DEVICES=RR and run the code, it runs successfully. However, checking GPU usage shows that all models are running on GPU 0, and the script printed torch.cuda.current_device() 0 three times.

nvidia-smi
Fri Feb  3 17:47:23 2023
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.60.02    Driver Version: 510.60.02    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA RTX A5000    On   | 00000000:01:00.0 Off |                  Off |
| 30%   29C    P2    58W / 230W |   8491MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A5000    On   | 00000000:25:00.0 Off |                  Off |
| 30%   23C    P8    14W / 230W |      2MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   2  NVIDIA RTX A5000    On   | 00000000:41:00.0 Off |                  Off |
| 30%   22C    P8    14W / 230W |      2MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A    997681      C                                    2811MiB |
|    0   N/A  N/A    997682      C                                    2839MiB |
|    0   N/A  N/A    997683      C                                    2839MiB |
+-----------------------------------------------------------------------------+

Jina version

jina --version-full
- jina 3.13.2
- docarray 0.21.0
- jcloud 0.2.1
- jina-hubble-sdk 0.32.0
- jina-proto 0.1.13
- protobuf 4.21.12
- proto-backend upb
- grpcio 1.47.2
- pyyaml 6.0
- python 3.8.10
- platform Linux
- platform-release 5.4.0-91-generic
- platform-version #102-Ubuntu SMP Fri Nov 5 16:31:28 UTC 2021
- architecture x86_64
- processor x86_64
- uid 2485377892355
- session-id 87650b9c-a3b0-11ed-8d5d-0242ac110003
- uptime 2023-02-03T18:50:09.712441
- ci-vendor (unset)
- internal False
* JINA_DEFAULT_HOST (unset)
* JINA_DEFAULT_TIMEOUT_CTRL (unset)
* JINA_DEPLOYMENT_NAME (unset)
* JINA_DISABLE_UVLOOP (unset)
* JINA_EARLY_STOP (unset)
* JINA_FULL_CLI (unset)
* JINA_GATEWAY_IMAGE (unset)
* JINA_GRPC_RECV_BYTES (unset)
* JINA_GRPC_SEND_BYTES (unset)
* JINA_HUB_NO_IMAGE_REBUILD (unset)
* JINA_LOG_CONFIG (unset)
* JINA_LOG_LEVEL (unset)
* JINA_LOG_NO_COLOR (unset)
* JINA_MP_START_METHOD (unset)
* JINA_OPTOUT_TELEMETRY (unset)
* JINA_RANDOM_PORT_MAX (unset)
* JINA_RANDOM_PORT_MIN (unset)
* JINA_LOCKS_ROOT (unset)
* JINA_K8S_ACCESS_MODES (unset)
* JINA_K8S_STORAGE_CLASS_NAME (unset)
* JINA_K8S_STORAGE_CAPACITY (unset)
* JINA_STREAMER_ARGS (unset)
JoanFM commented 1 year ago

Hey @ruanrz,

Can you give us the output of this command?

nvidia-smi -L

Also, what is the value of the CUDA_TOTAL_DEVICES environment variable?

JoanFM commented 1 year ago

Can you do this in your Executor and tell us what is printed?

class ZRExecutor(Executor):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        import os
        print(f' ENVIRONMENT VARIABLE CUDA_VISIBLE_DEVICES: {os.environ["CUDA_VISIBLE_DEVICES"]}')
ruanrz commented 1 year ago

> Can you do this in your Executor and tell us what is printed?

@JoanFM

nvidia-smi -L

GPU 0: NVIDIA RTX A5000 (UUID: GPU-9fcbfc3e-c4c8-ecfc-9a2c-ec9dc1d2bdc1)
GPU 1: NVIDIA RTX A5000 (UUID: GPU-076c3ca9-0750-1f41-e8ea-5f0433ae2cc2)
GPU 2: NVIDIA RTX A5000 (UUID: GPU-570317ab-d3de-f466-da1e-cfccb6a5b75f)

The CUDA_TOTAL_DEVICES environment variable is not defined.

ENVIRONMENT VARIABLE CUDA_VISIBLE_DEVICES: 1
torch.cuda.device_count() 3
torch.cuda.current_device() 0
ENVIRONMENT VARIABLE CUDA_VISIBLE_DEVICES: 2
torch.cuda.device_count() 3
torch.cuda.current_device() 0
ENVIRONMENT VARIABLE CUDA_VISIBLE_DEVICES: 0
torch.cuda.device_count() 3
torch.cuda.current_device() 0

All models are still running on GPU 0, judging by the VRAM usage in nvidia-smi.

JoanFM commented 1 year ago

It is weird that current_device does not respect the environment variable in that specific Executor; there seems to be something off with the torch usage. We will check on our end as well.

fqzhao-win commented 1 year ago

I also encountered the same problem with f = Flow(protocol='http').add(uses=MyExecutor, replicas=3, env={"CUDA_VISIBLE_DEVICES": "RR"}), but all replicas are still running on GPU 0.

fqzhao-win commented 1 year ago

(screenshot attached)

JoanFM commented 1 year ago

Hello @ruanrz, @fqzhao-win ,

I believe what happens is that torch is imported before the Executor starts, and this is why CUDA_VISIBLE_DEVICES does not take effect.
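
This matches how CUDA initialization behaves: CUDA_VISIBLE_DEVICES is only read once, when the driver is first initialized in a process. A minimal standalone sketch (not from this thread; it assumes a machine with more than one GPU) that shows the effect:

import os
import torch

# First CUDA query: the driver is initialized with whatever
# CUDA_VISIBLE_DEVICES holds right now (here: unset, so all GPUs).
print(torch.cuda.device_count())  # e.g. 3

# Changing the variable after initialization has no effect in this
# process; a forked child of a process that already touched CUDA
# inherits the initialized state in the same way.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
print(torch.cuda.device_count())  # still 3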

Would you try one of these two things (one or the other)?

1. Refactor your code to separate the Executor and the Flow, and include the Executor from a module or a file (https://docs.jina.ai/concepts/flow/add-executors/#define-executor-with-uses); a rough sketch of this follows below.
2. Keep it as it is, but hide the import torch inside the method where it is needed, so that the module is not imported from the start and does not affect the new Executor processes.
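
For option 1, a rough sketch (the file names executor.py and config.yml are illustrative, not from this thread). The Executor lives in its own module that the Flow script never imports, so torch is first imported inside each replica process, after its CUDA_VISIBLE_DEVICES has been set:

# executor.py -- only the replica processes import this module
from jina import Executor

class ZRExecutor(Executor):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        import torch  # first torch import happens inside the replica
        print('visible devices:', torch.cuda.device_count())

# config.yml pointing at the module would look roughly like:
#
#   jtype: ZRExecutor
#   metas:
#     py_modules:
#       - executor.py

# flow.py -- never imports torch itself
# from jina import Flow
#
# f = Flow().add(uses='config.yml', name='testens', replicas=3,
#                env={'CUDA_VISIBLE_DEVICES': 'RR'})
# with f:
#     f.block()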

I believe this should solve your issues.

JoanFM commented 1 year ago

Hello @ruanrz, @fqzhao-win, have you tried the suggested alternatives?

ruanrz commented 1 year ago

Hi @JoanFM, sorry for the late reply. I tried the second suggestion and installed the latest version of Jina. It runs successfully. Here is my code:


from jina import Executor, requests, Flow
import time

class ZRExecutor(Executor):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        import os
        print(f'ENVIRONMENT VARIABLE CUDA_VISIBLE_DEVICES: {os.environ["CUDA_VISIBLE_DEVICES"]}')

        # deferred imports: torch and diffusers are first imported inside the
        # replica process, after its CUDA_VISIBLE_DEVICES has been set
        import torch
        print('torch.cuda.device_count()', torch.cuda.device_count())
        print('torch.cuda.current_device()', torch.cuda.current_device())
        from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler
        from diffusers import DPMSolverMultistepScheduler

        model_path = "#####"
        lms = EulerAncestralDiscreteScheduler(
            beta_start=0.00085, 
            beta_end=0.012, 
            beta_schedule="scaled_linear"
        )

        pipe = DiffusionPipeline.from_pretrained(
            model_path,
            cache_dir="./huggingface",
            resume_download=True,
            custom_pipeline="lpw_stable_diffusion",
            torch_dtype=torch.float16,
            scheduler=lms,
            use_auth_token="#####",
            safety_checker=None,
        )

        pipe.to("cuda")

def main():
    f = Flow().add(uses=ZRExecutor, name='testens', replicas=2, env={"CUDA_VISIBLE_DEVICES": "RR"})
    with f:
        f.block()

if __name__ == '__main__':
    main()

nvidia-smi

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA RTX A5000    On   | 00000000:01:00.0 Off |                  Off |
| 30%   26C    P8    16W / 230W |   2758MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A5000    On   | 00000000:25:00.0 Off |                  Off |
| 30%   26C    P8    15W / 230W |   2760MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
+-----------------------------------------------------------------------------+

Thank you for your patient advice and guidance.