Closed LidarADAS closed 2 years ago
Maybe it was caused by user privileges, try to execute deviceQuery under /usr/local/cuda/samples/1_Utilities/deviceQuery and check the result, or else execute the following commands and try again.
$ sudo usermod -aG video $USER
$ newgrp
I try to find /usr/local/cuda/samples/1_Utilities/deviceQuery in docker, but there is no smples folder under cuda.
Then I executed the commands you provided, and no change, the error still exists.
Is there any other suggestion ?
Could you please post the folder(/usr/local/cuda/)'s content? Maybe the 'samples' folder has been deleted, but it's not a problem, I will post it in the following reply!
$ ls /usr/local/cuda/samples/1_Utilities/deviceQuery
deviceQuery.cpp Makefile NsightEclipse.xml readme.txt
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda-10.0
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
endif
ifeq ($(TARGET_OS),qnx)
CCFLAGS += -DWIN_INTERFACE_CUSTOM
LDFLAGS += -lsocket
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../common/inc
LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
SMS ?= 30 32 53 61 62 70 72 75
else
SMS ?= 30 61 70 75
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: deviceQuery
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
deviceQuery.o:deviceQuery.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
deviceQuery: deviceQuery.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./deviceQuery
clean:
rm -f deviceQuery deviceQuery.o
rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/deviceQuery
clobber: clean
// std::system includes
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <iostream>
#include <memory>
#include <string>
int *pArgc = NULL;
char **pArgv = NULL;
#if CUDART_VERSION < 5000
// CUDA-C includes
#include <cuda.h>
// This function wraps the CUDA Driver API into a template function
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
int device) {
CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
if (CUDA_SUCCESS != error) {
fprintf(
stderr,
"cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
error, __FILE__, __LINE__);
exit(EXIT_FAILURE);
}
}
#endif /* CUDART_VERSION < 5000 */
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
pArgc = &argc;
pArgv = argv;
printf("%s Starting...\n\n", argv[0]);
printf(
" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
int deviceCount = 0;
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
if (error_id != cudaSuccess) {
printf("cudaGetDeviceCount returned %d\n-> %s\n",
static_cast<int>(error_id), cudaGetErrorString(error_id));
printf("Result = FAIL\n");
exit(EXIT_FAILURE);
}
// This function call returns 0 if there are no CUDA capable devices.
if (deviceCount == 0) {
printf("There are no available device(s) that support CUDA\n");
} else {
printf("Detected %d CUDA Capable device(s)\n", deviceCount);
}
int dev, driverVersion = 0, runtimeVersion = 0;
for (dev = 0; dev < deviceCount; ++dev) {
cudaSetDevice(dev);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
// Console log
cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
driverVersion / 1000, (driverVersion % 100) / 10,
runtimeVersion / 1000, (runtimeVersion % 100) / 10);
printf(" CUDA Capability Major/Minor version number: %d.%d\n",
deviceProp.major, deviceProp.minor);
char msg[256];
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(msg, sizeof(msg),
" Total amount of global memory: %.0f MBytes "
"(%llu bytes)\n",
static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
(unsigned long long)deviceProp.totalGlobalMem);
#else
snprintf(msg, sizeof(msg),
" Total amount of global memory: %.0f MBytes "
"(%llu bytes)\n",
static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
(unsigned long long)deviceProp.totalGlobalMem);
#endif
printf("%s", msg);
printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
deviceProp.multiProcessorCount,
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
deviceProp.multiProcessorCount);
printf(
" GPU Max Clock rate: %.0f MHz (%0.2f "
"GHz)\n",
deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 5000
// This is supported in CUDA 5.0 (runtime API device properties)
printf(" Memory Clock rate: %.0f Mhz\n",
deviceProp.memoryClockRate * 1e-3f);
printf(" Memory Bus Width: %d-bit\n",
deviceProp.memoryBusWidth);
if (deviceProp.l2CacheSize) {
printf(" L2 Cache Size: %d bytes\n",
deviceProp.l2CacheSize);
}
#else
// This only available in CUDA 4.0-4.2 (but these were only exposed in the
// CUDA Driver API)
int memoryClock;
getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
dev);
printf(" Memory Clock rate: %.0f Mhz\n",
memoryClock * 1e-3f);
int memBusWidth;
getCudaAttribute<int>(&memBusWidth,
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
printf(" Memory Bus Width: %d-bit\n",
memBusWidth);
int L2CacheSize;
getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
if (L2CacheSize) {
printf(" L2 Cache Size: %d bytes\n",
L2CacheSize);
}
#endif
printf(
" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, "
"%d), 3D=(%d, %d, %d)\n",
deviceProp.maxTexture1D, deviceProp.maxTexture2D[0],
deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0],
deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
printf(
" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
printf(
" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d "
"layers\n",
deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1],
deviceProp.maxTexture2DLayered[2]);
printf(" Total amount of constant memory: %lu bytes\n",
deviceProp.totalConstMem);
printf(" Total amount of shared memory per block: %lu bytes\n",
deviceProp.sharedMemPerBlock);
printf(" Total number of registers available per block: %d\n",
deviceProp.regsPerBlock);
printf(" Warp size: %d\n",
deviceProp.warpSize);
printf(" Maximum number of threads per multiprocessor: %d\n",
deviceProp.maxThreadsPerMultiProcessor);
printf(" Maximum number of threads per block: %d\n",
deviceProp.maxThreadsPerBlock);
printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
deviceProp.maxThreadsDim[2]);
printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
deviceProp.maxGridSize[2]);
printf(" Maximum memory pitch: %lu bytes\n",
deviceProp.memPitch);
printf(" Texture alignment: %lu bytes\n",
deviceProp.textureAlignment);
printf(
" Concurrent copy and kernel execution: %s with %d copy "
"engine(s)\n",
(deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
printf(" Run time limit on kernels: %s\n",
deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
printf(" Integrated GPU sharing Host Memory: %s\n",
deviceProp.integrated ? "Yes" : "No");
printf(" Support host page-locked memory mapping: %s\n",
deviceProp.canMapHostMemory ? "Yes" : "No");
printf(" Alignment requirement for Surfaces: %s\n",
deviceProp.surfaceAlignment ? "Yes" : "No");
printf(" Device has ECC support: %s\n",
deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n",
deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)"
: "WDDM (Windows Display Driver Model)");
#endif
printf(" Device supports Unified Addressing (UVA): %s\n",
deviceProp.unifiedAddressing ? "Yes" : "No");
printf(" Device supports Compute Preemption: %s\n",
deviceProp.computePreemptionSupported ? "Yes" : "No");
printf(" Supports Cooperative Kernel Launch: %s\n",
deviceProp.cooperativeLaunch ? "Yes" : "No");
printf(" Supports MultiDevice Co-op Kernel Launch: %s\n",
deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n",
deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
const char *sComputeMode[] = {
"Default (multiple host threads can use ::cudaSetDevice() with device "
"simultaneously)",
"Exclusive (only one host thread in one process is able to use "
"::cudaSetDevice() with this device)",
"Prohibited (no host thread can use ::cudaSetDevice() with this "
"device)",
"Exclusive Process (many threads in one process is able to use "
"::cudaSetDevice() with this device)",
"Unknown",
NULL};
printf(" Compute Mode:\n");
printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
}
// If there are 2 or more GPUs, query to determine whether RDMA is supported
if (deviceCount >= 2) {
cudaDeviceProp prop[64];
int gpuid[64]; // we want to find the first two GPUs that can support P2P
int gpu_p2p_count = 0;
for (int i = 0; i < deviceCount; i++) {
checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
// Only boards based on Fermi or later can support P2P
if ((prop[i].major >= 2)
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// on Windows (64-bit), the Tesla Compute Cluster driver for windows
// must be enabled to support this
&& prop[i].tccDriver
#endif
) {
// This is an array of P2P capable GPUs
gpuid[gpu_p2p_count++] = i;
}
}
// Show all the combinations of support P2P GPUs
int can_access_peer;
if (gpu_p2p_count >= 2) {
for (int i = 0; i < gpu_p2p_count; i++) {
for (int j = 0; j < gpu_p2p_count; j++) {
if (gpuid[i] == gpuid[j]) {
continue;
}
checkCudaErrors(
cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j],
can_access_peer ? "Yes" : "No");
}
}
}
}
// csv masterlog info
// *****************************
// exe and CUDA driver name
printf("\n");
std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
char cTemp[16];
// driver version
sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#else
snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000,
(driverVersion % 100) / 10);
#endif
sProfileString += cTemp;
// Runtime version
sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#else
snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000,
(runtimeVersion % 100) / 10);
#endif
sProfileString += cTemp;
// Device count
sProfileString += ", NumDevs = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(cTemp, 10, "%d", deviceCount);
#else
snprintf(cTemp, sizeof(cTemp), "%d", deviceCount);
#endif
sProfileString += cTemp;
sProfileString += "\n";
printf("%s", sProfileString.c_str());
printf("Result = PASS\n");
// finish
exit(EXIT_SUCCESS);
}
Put deviceQuery.cpp and Makefile in the same folder, and then execute 'make' to build deviceQuery.
I clone the cuda from github(https://github.com/NVIDIA/cuda-samples), then compiled the deviceQuery. When I run the binary, I get the result:
./deviceQuery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "GeForce GTX 1080 Ti" CUDA Driver Version / Runtime Version 10.1 / 10.0 CUDA Capability Major/Minor version number: 6.1 Total amount of global memory: 11176 MBytes (11719409664 bytes) (28) Multiprocessors, (128) CUDA Cores/MP: 3584 CUDA Cores GPU Max Clock rate: 1645 MHz (1.64 GHz) Memory Clock rate: 5505 Mhz Memory Bus Width: 352-bit L2 Cache Size: 2883584 bytes Maximum Texture Dimension Size (x,y,z) 1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384) Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers Total amount of constant memory: 65536 bytes Total amount of shared memory per block: 49152 bytes Total number of registers available per block: 65536 Warp size: 32 Maximum number of threads per multiprocessor: 2048 Maximum number of threads per block: 1024 Max dimension size of a thread block (x,y,z): (1024, 1024, 64) Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535) Maximum memory pitch: 2147483647 bytes Texture alignment: 512 bytes Concurrent copy and kernel execution: Yes with 2 copy engine(s) Run time limit on kernels: Yes Integrated GPU sharing Host Memory: No Support host page-locked memory mapping: Yes Alignment requirement for Surfaces: Yes Device has ECC support: Disabled Device supports Unified Addressing (UVA): Yes Device supports Compute Preemption: Yes Supports Cooperative Kernel Launch: Yes Supports MultiDevice Co-op Kernel Launch: Yes Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0 Compute Mode: < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 10.1, CUDA Runtime Version = 10.0, NumDevs = 1 Result = PASS
I debug apollo's lane perception code, and found the error was occured when cudaGetDevice . But when I do the same option in deviceQuery, the result is right.
I debug apollo's lane perception code, and found the error was occured when cudaGetDevice . But when I do the same option in deviceQuery, the result is right.
Can you please share the steps you followed to debug the Lane Perception code?Did you use VSCode ?
I debug apollo's lane perception code, and found the error was occured when cudaGetDevice . But when I do the same option in deviceQuery, the result is right.
Hello. Did you solve the problem? My error is the same with yours.
I solved the problem following https://blog.csdn.net/qq_41481731/article/details/86658523
Run
cd /usr/local/cuda-10.0/samples/1_Utilities/deviceQuery sudo make sudo ./deviceQuery
I clone the cuda from github(https://github.com/NVIDIA/cuda-samples), then compiled the deviceQuery. When I run the binary, I get the result:
./deviceQuery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "GeForce GTX 1080 Ti" CUDA Driver Version / Runtime Version 10.1 / 10.0 CUDA Capability Major/Minor version number: 6.1 Total amount of global memory: 11176 MBytes (11719409664 bytes) (28) Multiprocessors, (128) CUDA Cores/MP: 3584 CUDA Cores GPU Max Clock rate: 1645 MHz (1.64 GHz) Memory Clock rate: 5505 Mhz Memory Bus Width: 352-bit L2 Cache Size: 2883584 bytes Maximum Texture Dimension Size (x,y,z) 1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384) Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers Total amount of constant memory: 65536 bytes Total amount of shared memory per block: 49152 bytes Total number of registers available per block: 65536 Warp size: 32 Maximum number of threads per multiprocessor: 2048 Maximum number of threads per block: 1024 Max dimension size of a thread block (x,y,z): (1024, 1024, 64) Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535) Maximum memory pitch: 2147483647 bytes Texture alignment: 512 bytes Concurrent copy and kernel execution: Yes with 2 copy engine(s) Run time limit on kernels: Yes Integrated GPU sharing Host Memory: No Support host page-locked memory mapping: Yes Alignment requirement for Surfaces: Yes Device has ECC support: Disabled Device supports Unified Addressing (UVA): Yes Device supports Compute Preemption: Yes Supports Cooperative Kernel Launch: Yes Supports MultiDevice Co-op Kernel Launch: Yes Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0 Compute Mode: < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 10.1, CUDA Runtime Version = 10.0, NumDevs = 1 Result = PASS
I have same question that cyber_launch start GPUassert and i /usr/local/cuda can't find samples. The apollo guide doesn't mention install cuda.I follow apollo Installing NVIDIA Container Toolkit and finally install nvidia-docker2. why i lack CUDA? Do you mean you lack CUDA in host machine or you lack CUDA in docker?
Closed due to inactivity. If the problem persists, pls feel free to reopen it or create a new one and refer to it.
System information
ubuntu 18.04
source
5.0
GPU: 1080ti
Driver Version: 430.50
CUDA Version: 10.0
Steps to reproduce the issue:
[NVBLAS] No Gpu available [NVBLAS] NVBLAS_CONFIG_FILE environment variable is NOT set : relying on default config filename 'nvblas.conf' [NVBLAS] Cannot open default config file 'nvblas.conf' [NVBLAS] Config parsed [NVBLAS] CPU Blas library need to be provided F1202 16:42:09.057770 11360 cuda_util.cu:50] Check failed: cuda_error == cudaSuccess (30 vs. 0) unknown error
Then I tried many modules which need GPU, the same error was got. But modules do not need GPU could be launch successfully.
Supporting materials (screenshots, command lines, code/script snippets):