tkestack / vcuda-controller

Other
488 stars 156 forks source link

can't find function libcuda.so.440.100 in cuEGLInit #10

Closed nlnjnj closed 4 years ago

nlnjnj commented 4 years ago

So what specific driver and cuda version should I install?

Also this log may be incorrect:

original

  if (unlikely(!cuda_library_entry[idx].fn_ptr)) {
    LOGGER(4, "can't find function %s in %s", cuda_filename,
           cuda_library_entry[idx].name);
  }

can't find function libcuda.so.440.100 in cuEGLInit

fix

  if (unlikely(!cuda_library_entry[idx].fn_ptr)) {
    LOGGER(4, "can't find function '%s' in %s", cuda_library_entry[idx].name, 
           cuda_filename);
  }

can't find function 'cuEGLInit' in libcuda.so.440.100

(pid=343086) WARNING:root:remote calling tf.config.list_physical_devices('GPU')
(pid=343086) 2020-09-11 07:52:40.449785: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
(pid=343086) /tmp/cuda-control/src/loader.c:941 config file: /etc/vcuda/19e8c670ddea331f7e12e46fa90a0fdb8510192d98ac939c35bbc25df2e17b07/vcuda.config
(pid=343086) /tmp/cuda-control/src/loader.c:942 pid file: /etc/vcuda/19e8c670ddea331f7e12e46fa90a0fdb8510192d98ac939c35bbc25df2e17b07/pids.config
(pid=343086) /tmp/cuda-control/src/loader.c:946 register to remote: pod uid: 98cd4783-221f-4080-9aec-f6448dc9bbc2, cont id: 19e8c670ddea331f7e12e46fa90a0fdb8510192d98ac939c35bbc25df2e17b07
(pid=343086) /tmp/cuda-control/src/loader.c:1044 pod uid          : 98cd4783-221f-4080-9aec-f6448dc9bbc2
(pid=343086) /tmp/cuda-control/src/loader.c:1045 limit            : 50
(pid=343086) /tmp/cuda-control/src/loader.c:1046 container name   : nvidia2
(pid=343086) /tmp/cuda-control/src/loader.c:1047 total utilization: 50
(pid=343086) /tmp/cuda-control/src/loader.c:1048 total gpu memory : 2684354560
(pid=343086) /tmp/cuda-control/src/loader.c:1049 driver version   : 440.100
(pid=343086) /tmp/cuda-control/src/loader.c:1050 hard limit mode  : 0
(pid=343086) /tmp/cuda-control/src/loader.c:1051 enable mode      : 1
(pid=343086) /tmp/cuda-control/src/loader.c:767 Start hijacking
(pid=343086) /tmp/cuda-control/src/loader.c:783 can't find function libcuda.so.440.100 in cuEGLInit
(pid=343086) /tmp/cuda-control/src/hijack_call.c:481 cuInit error no CUDA-capable device is detected
(pid=343086) *** Aborted at 1599810760 (unix time) try "date -d @1599810760" if you are using GNU date ***
(pid=343086) PC: @                0x0 (unknown)
(pid=343086) *** SIGABRT (@0x53c2e) received by PID 343086 (TID 0x7f419a0f4740) from PID 343086; stack trace: ***
(pid=343086)     @     0x7f4199cd18a0 (unknown)
(pid=343086)     @     0x7f419990cf47 gsignal
(pid=343086)     @     0x7f419990e8b1 abort
(pid=343086)     @     0x7f418a733d01 google::LogMessage::Flush()
(pid=343086)     @     0x7f418a733dd1 google::LogMessage::~LogMessage()
(pid=343086)     @     0x7f418a71ed29 ray::RayLog::~RayLog()
(pid=343086)     @     0x7f418a3bc32d ray::CoreWorkerProcess::~CoreWorkerProcess()
(pid=343086)     @     0x7f418a3bc3da std::unique_ptr<>::~unique_ptr()
(pid=343086)     @     0x7f41999110f1 (unknown)
(pid=343086)     @     0x7f41999111ea exit
(pid=343086)     @     0x7f410af2b497 initialization
(pid=343086)     @     0x7f4199cce827 __pthread_once_slow
(pid=343086)     @     0x7f410af2ce3b cuInit
(pid=343086)     @     0x7f412fd55da0 cuInit
(pid=343086)     @     0x7f412fc8f19f stream_executor::gpu::(anonymous namespace)::InternalInit()
(pid=343086)     @     0x7f412fc8f42d stream_executor::gpu::GpuDriver::Init()
(pid=343086)     @     0x7f4146f77162 stream_executor::gpu::CudaPlatform::VisibleDeviceCount()
(pid=343086)     @     0x7f414694852f tensorflow::BaseGPUDeviceFactory::CacheDeviceIds()
(pid=343086)     @     0x7f414694862f tensorflow::BaseGPUDeviceFactory::ListPhysicalDevices()
(pid=343086)     @     0x7f4146a74e9d tensorflow::DeviceFactory::ListAllPhysicalDevices()
(pid=343086)     @     0x7f415c47a35d tensorflow::TF_ListPhysicalDevices()
(pid=343086)     @     0x7f415c474646 _ZZN8pybind1112cpp_function10initializeIRPFNS_6objectEvES2_JEJNS_4nameENS_5scopeENS_7siblingEEEEvOT_PFT0_DpT1_EDpRKT2_ENUlRNS_6detail13function_callEE1_4_FUNESM_
(pid=343086)     @     0x7f415c49e239 pybind11::cpp_function::dispatcher()
(pid=343086)     @     0x556b0ee23c94 _PyMethodDef_RawFastCallKeywords
(pid=343086)     @     0x556b0ee23db1 _PyCFunction_FastCallKeywords
(pid=343086)     @     0x556b0ee8f5be _PyEval_EvalFrameDefault
(pid=343086)     @     0x556b0ee2320b _PyFunction_FastCallKeywords
(pid=343086)     @     0x556b0ee8ae70 _PyEval_EvalFrameDefault
(pid=343086)     @     0x556b0edd3b00 _PyEval_EvalCodeWithName
(pid=343086)     @     0x556b0ee23497 _PyFunction_FastCallKeywords
(pid=343086)     @     0x556b0ee8ae70 _PyEval_EvalFrameDefault
(pid=343086)     @     0x556b0edd32b9 _PyEval_EvalCodeWithName
mYmNeo commented 4 years ago

So what specific driver and cuda version should I install?

Also this log may be incorrect:

original

  if (unlikely(!cuda_library_entry[idx].fn_ptr)) {
    LOGGER(4, "can't find function %s in %s", cuda_filename,
           cuda_library_entry[idx].name);
  }

can't find function libcuda.so.440.100 in cuEGLInit

fix

  if (unlikely(!cuda_library_entry[idx].fn_ptr)) {
    LOGGER(4, "can't find function '%s' in %s", cuda_library_entry[idx].name, 
           cuda_filename);
  }

can't find function 'cuEGLInit' in libcuda.so.440.100

(pid=343086) WARNING:root:remote calling tf.config.list_physical_devices('GPU')
(pid=343086) 2020-09-11 07:52:40.449785: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
(pid=343086) /tmp/cuda-control/src/loader.c:941 config file: /etc/vcuda/19e8c670ddea331f7e12e46fa90a0fdb8510192d98ac939c35bbc25df2e17b07/vcuda.config
(pid=343086) /tmp/cuda-control/src/loader.c:942 pid file: /etc/vcuda/19e8c670ddea331f7e12e46fa90a0fdb8510192d98ac939c35bbc25df2e17b07/pids.config
(pid=343086) /tmp/cuda-control/src/loader.c:946 register to remote: pod uid: 98cd4783-221f-4080-9aec-f6448dc9bbc2, cont id: 19e8c670ddea331f7e12e46fa90a0fdb8510192d98ac939c35bbc25df2e17b07
(pid=343086) /tmp/cuda-control/src/loader.c:1044 pod uid          : 98cd4783-221f-4080-9aec-f6448dc9bbc2
(pid=343086) /tmp/cuda-control/src/loader.c:1045 limit            : 50
(pid=343086) /tmp/cuda-control/src/loader.c:1046 container name   : nvidia2
(pid=343086) /tmp/cuda-control/src/loader.c:1047 total utilization: 50
(pid=343086) /tmp/cuda-control/src/loader.c:1048 total gpu memory : 2684354560
(pid=343086) /tmp/cuda-control/src/loader.c:1049 driver version   : 440.100
(pid=343086) /tmp/cuda-control/src/loader.c:1050 hard limit mode  : 0
(pid=343086) /tmp/cuda-control/src/loader.c:1051 enable mode      : 1
(pid=343086) /tmp/cuda-control/src/loader.c:767 Start hijacking
(pid=343086) /tmp/cuda-control/src/loader.c:783 can't find function libcuda.so.440.100 in cuEGLInit
(pid=343086) /tmp/cuda-control/src/hijack_call.c:481 cuInit error no CUDA-capable device is detected
(pid=343086) *** Aborted at 1599810760 (unix time) try "date -d @1599810760" if you are using GNU date ***
(pid=343086) PC: @                0x0 (unknown)
(pid=343086) *** SIGABRT (@0x53c2e) received by PID 343086 (TID 0x7f419a0f4740) from PID 343086; stack trace: ***
(pid=343086)     @     0x7f4199cd18a0 (unknown)
(pid=343086)     @     0x7f419990cf47 gsignal
(pid=343086)     @     0x7f419990e8b1 abort
(pid=343086)     @     0x7f418a733d01 google::LogMessage::Flush()
(pid=343086)     @     0x7f418a733dd1 google::LogMessage::~LogMessage()
(pid=343086)     @     0x7f418a71ed29 ray::RayLog::~RayLog()
(pid=343086)     @     0x7f418a3bc32d ray::CoreWorkerProcess::~CoreWorkerProcess()
(pid=343086)     @     0x7f418a3bc3da std::unique_ptr<>::~unique_ptr()
(pid=343086)     @     0x7f41999110f1 (unknown)
(pid=343086)     @     0x7f41999111ea exit
(pid=343086)     @     0x7f410af2b497 initialization
(pid=343086)     @     0x7f4199cce827 __pthread_once_slow
(pid=343086)     @     0x7f410af2ce3b cuInit
(pid=343086)     @     0x7f412fd55da0 cuInit
(pid=343086)     @     0x7f412fc8f19f stream_executor::gpu::(anonymous namespace)::InternalInit()
(pid=343086)     @     0x7f412fc8f42d stream_executor::gpu::GpuDriver::Init()
(pid=343086)     @     0x7f4146f77162 stream_executor::gpu::CudaPlatform::VisibleDeviceCount()
(pid=343086)     @     0x7f414694852f tensorflow::BaseGPUDeviceFactory::CacheDeviceIds()
(pid=343086)     @     0x7f414694862f tensorflow::BaseGPUDeviceFactory::ListPhysicalDevices()
(pid=343086)     @     0x7f4146a74e9d tensorflow::DeviceFactory::ListAllPhysicalDevices()
(pid=343086)     @     0x7f415c47a35d tensorflow::TF_ListPhysicalDevices()
(pid=343086)     @     0x7f415c474646 _ZZN8pybind1112cpp_function10initializeIRPFNS_6objectEvES2_JEJNS_4nameENS_5scopeENS_7siblingEEEEvOT_PFT0_DpT1_EDpRKT2_ENUlRNS_6detail13function_callEE1_4_FUNESM_
(pid=343086)     @     0x7f415c49e239 pybind11::cpp_function::dispatcher()
(pid=343086)     @     0x556b0ee23c94 _PyMethodDef_RawFastCallKeywords
(pid=343086)     @     0x556b0ee23db1 _PyCFunction_FastCallKeywords
(pid=343086)     @     0x556b0ee8f5be _PyEval_EvalFrameDefault
(pid=343086)     @     0x556b0ee2320b _PyFunction_FastCallKeywords
(pid=343086)     @     0x556b0ee8ae70 _PyEval_EvalFrameDefault
(pid=343086)     @     0x556b0edd3b00 _PyEval_EvalCodeWithName
(pid=343086)     @     0x556b0ee23497 _PyFunction_FastCallKeywords
(pid=343086)     @     0x556b0ee8ae70 _PyEval_EvalFrameDefault
(pid=343086)     @     0x556b0edd32b9 _PyEval_EvalCodeWithName

"can't find function libcuda.so.440.100 in cuEGLInit" is only a warning; it does not cause the exit. Your actual problem is "cuInit error no CUDA-capable device is detected". Maybe you specified the wrong index (idx) for the GPU card.