QuarkContainer / Quark

A secure container runtime with CRI/OCI interface
Apache License 2.0

CUDA Context Issue #1300

Open · chengchen666 opened this issue 1 month ago

chengchen666 commented 1 month ago

On the host, different processes use different CUDA contexts for resource isolation, and ideally the same would hold for Quark. However, every process inside Quark goes through qvisor, which is a single process on the host, so we would have to create a separate CUDA context for each Quark process ourselves. For now we have decided not to distinguish CUDA contexts; in other words, all processes in Quark will share a single CUDA context. This is only a temporary workaround. One reason is simplicity; the other, and more important, reason is that we are hitting problems when creating a CUDA context from Rust. Creating a CUcontext from a Rust program fails, while the same logic in a C program works fine. The error code returned by CUDA is OUT_OF_MEMORY, even though there is plenty of free GPU memory and RAM. The logic is identical in the Rust and C programs, and the usage of the context-related APIs looks correct, so this does not appear to be related to the CUDA version or to Quark itself; it must be some issue in the Rust runtime (if there is such a thing). Work will stop on this branch: https://github.com/QuarkContainer/Quark/tree/GPU-cuLibAndCtx
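
To make the intended workaround concrete, here is a minimal sketch (not Quark's actual code) of the single-shared-context approach: qvisor retains one context once and makes it current before forwarding any guest CUDA call, so every process inside Quark ends up on the same context. The bindings are hand-declared against libcuda, the helper names (shared_context, bind_shared_context) are made up for illustration, and the primary context of device 0 is used here although any single context would do:

use std::os::raw::{c_int, c_uint, c_void};
use std::sync::OnceLock;

// Hand-declared CUDA driver API types; CUresult is ABI-compatible with int.
type CUresult = c_int; // 0 == CUDA_SUCCESS
type CUdevice = c_int;
type CUcontext = *mut c_void; // opaque handle

#[link(name = "cuda")]
extern "C" {
    fn cuInit(flags: c_uint) -> CUresult;
    fn cuDeviceGet(device: *mut CUdevice, ordinal: c_int) -> CUresult;
    fn cuDevicePrimaryCtxRetain(pctx: *mut CUcontext, dev: CUdevice) -> CUresult;
    fn cuCtxSetCurrent(ctx: CUcontext) -> CUresult;
}

// One context for the whole qvisor process; stored as usize so the static is Send + Sync.
static SHARED_CTX: OnceLock<usize> = OnceLock::new();

// Retain the primary context exactly once and hand the same handle to every caller.
fn shared_context() -> CUcontext {
    *SHARED_CTX.get_or_init(|| unsafe {
        assert_eq!(cuInit(0), 0);
        let mut dev: CUdevice = 0;
        assert_eq!(cuDeviceGet(&mut dev, 0), 0);
        let mut ctx: CUcontext = std::ptr::null_mut();
        assert_eq!(cuDevicePrimaryCtxRetain(&mut ctx, dev), 0);
        ctx as usize
    }) as CUcontext
}

// Hypothetical hook: called before a guest CUDA request is forwarded to the driver.
fn bind_shared_context() {
    unsafe { assert_eq!(cuCtxSetCurrent(shared_context()), 0) };
}

fn main() {
    bind_shared_context();
    println!("shared ctx: {:p}", shared_context());
}

Once the Rust-side context-creation failure described below is understood, the same hook could hand out a per-process context instead.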

Here is the code to reproduce this issue. C code:

//nvcc -std=c++11 -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcuda -lcudart driver_api.cu -o test
#include <iostream>
#include <iomanip>
#include <cmath>
#include <string>
#include <cuda.h>
#include <pthread.h>
#include <cuda_runtime_api.h>
#include <unistd.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <linux/version.h>

using namespace std;

#define gpuDriverErrorCheck(ans) {gpuAssert((ans),__FILE__,__LINE__); }
inline void gpuAssert(CUresult code,  const char *file, int line)
{
    if (code != 0 )
    {
        fprintf(stderr, "GPU assert: %d %s %d\n",
            code, file, line);
    }
}

#define CUDACHECK(cmd) do {                         \
  cudaError_t err = cmd;                            \
  if (err != 0) {                         \
    printf("Failed: Cuda error %s:%d '%s'\n",       \
        __FILE__,__LINE__,cudaGetErrorString(err)); \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

int main()
{   
    int result = 0;

    //using cuda driver api
    //initialize cuda driver
    gpuDriverErrorCheck(cuInit(0)); 
//    CUDACHECK(cudaInitDevice(0,0,0));
  //  CUDACHECK(cudaSetDevice(0));
    // int count = 0;
    // gpuDriverErrorCheck(cuDeviceGetCount(&count));
    CUdevice device;
    gpuDriverErrorCheck(cuDeviceGet(&device, 0));

    CUcontext ctx_prim;
    gpuDriverErrorCheck(cuDevicePrimaryCtxRetain(&ctx_prim, device));
    gpuDriverErrorCheck(cuCtxSetCurrent(ctx_prim));
    printf("ctx_prim is %x\n", ctx_prim);
    CUcontext ctx_curr;
    gpuDriverErrorCheck(cuCtxGetCurrent(&ctx_curr));
    printf("current ctx is %x\n", ctx_curr);

    CUcontext ctx;
    gpuDriverErrorCheck(cuCtxCreate(&ctx, 0, device));
    gpuDriverErrorCheck(cuCtxSetCurrent(ctx));
    printf("ctx is %x\n", ctx);
    gpuDriverErrorCheck(cuCtxSetCurrent(ctx));
    gpuDriverErrorCheck(cuCtxGetCurrent(&ctx_curr));
    printf("current ctx is %x\n", ctx_curr);
    gpuDriverErrorCheck(cuCtxPopCurrent(&ctx_curr));
    printf("current ctx is %x\n", ctx_curr);
    gpuDriverErrorCheck(cuCtxPopCurrent(&ctx_curr));
    printf("current ctx is %x\n", ctx_curr);
    return 0;
}

Rust code:


use std::os::raw::*;
use cuda_driver_sys::{
    CUcontext, CUdevice, CUdeviceptr, CUfunction, CUfunction_attribute, CUmodule, CUresult,
    CUstream
};
use std::thread;
use cuda_runtime_sys::{
    cudaError_t
};

#[link(name = "cuda")]
extern "C" {
    pub fn cuInit(Flags: c_uint) -> CUresult;
    pub fn cuCtxGetCurrent(pctx: u64) -> CUresult;
    pub fn cuCtxSetCurrent(ctx: u64) -> CUresult;
    pub fn cuDevicePrimaryCtxRetain(pctx: u64, dev: i32) -> CUresult;
    pub fn cuCtxCreate(pctx: u64, flags: c_uint, dev: CUdevice) -> CUresult;
    pub fn cuCtxPushCurrent(pctx: CUcontext) -> CUresult;
    pub fn cuDevicePrimaryCtxGetState(
        dev: CUdevice,
        flags: *mut c_uint,
        active: *mut c_int,
    ) -> CUresult;
    pub fn cuDeviceGet(device: u64, ordinal: i32) -> CUresult;
    pub fn cuCtxCreate_v4(pctx: u64, ctxCreateParams: u64, flags: c_uint, dev: CUdevice) -> CUresult;
    pub fn cuCtxGetExecAffinity(aff: u64, _type: u32) -> CUresult;
}

#[link(name = "cudart")]
extern "C" {
    pub fn cudaInitDevice(device: i32, deviceFlags: u32, flags: u32) -> cudaError_t;
    pub fn cudaSetDevice(device: c_int) -> cudaError_t;
}
fn main() {
    println!("Hello, world!");
    let ret = unsafe { cuInit(0) as u32 };
    if ret != 0 {
        println!("cuda init error");
    }
    let ptr = unsafe { libc::malloc(100) } as u64;
    println!("ptr is {:x}", ptr);
    // let ret = unsafe {cudaInitDevice(0,0,0) as u32};
    // if ret != 0 {
    //     println!("cudaInitDevice error");
    // }
    // let ret = unsafe { cudaSetDevice(0) as u32 };
    // if ret != 0 {
    //     println!("cudaSetDevice error");
    // }
    let cudaCtx = InitCUDACtx();
}

// pub struct cuctx {
//     a: [u8;0]
// }

fn InitCUDACtx() -> u64 {
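    // Mirrors the C reproduction above: retain and bind the primary context, then create a second context with cuCtxCreate.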
    let mut device: i32 = 0;
    let ret = unsafe { cuDeviceGet(&mut device as *mut _ as u64, 0) };
    if ret as u32 != 0 {
        println!("nvidia.rs: error caused by cuDeviceGet: {}", ret as u32);
    }
    let mut ctx: u64 = 0;

    // let mut flags: c_uint = Default::default();
    // let mut active: c_int = Default::default();

    // let ret11 = unsafe {
    //     cuDevicePrimaryCtxGetState(
    //         device as CUdevice,
    //         &mut flags as *mut c_uint,
    //         &mut active as *mut c_int,
    //     )
    // };
    // if ret11 as u32 != 0 {
    //     println!("nvidia.rs: error caused by cuDevicePrimaryCtxGetState: {}", ret11 as u32);
    // } else {
    //     println!("primary ctx is activate {:?}", active as u32);
    // }

    let ret4 = unsafe { cuDevicePrimaryCtxRetain(&mut ctx as *mut _ as u64, device) };
    if ret4 as u32 != 0 {
        println!("nvidia.rs: error caused by cuDevicePrimaryCtxRetain: {}", ret4 as u32);
    } else {
        println!("primary ctx is {:x},", ctx);
    }
    let ret3 = unsafe {cuCtxSetCurrent(ctx)};
    if ret3 as u32 != 0 {
        println!("nvidia.rs: error caused by cuCtxSetCurrent1: {}", ret3 as u32);
    } else {
        println!("curr ctx is {:x}", ctx);
    }
    let mut pctx: u64 = 0;
    let ret3 = unsafe {cuCtxGetCurrent(&mut pctx as *mut _ as u64)};
    if ret3 as u32 != 0 {
        println!("nvidia.rs: error caused by cuCtxGetCurrent: {}", ret3 as u32);
    } else {
        println!("get curr ctx is {:x}", pctx);
    }

    let mut new_ctx: u64 = 0;
    let ret2 = unsafe {cuCtxCreate(&mut new_ctx as *mut _ as u64, 0, device)};
    if ret2 as u32 != 0 {
        println!("nvidia.rs: error caused by cuCtxCreate: {}", ret2 as u32);
    }
    //thread::sleep_ms(4000);
    let ret3 = unsafe {cuCtxSetCurrent(new_ctx)};
    if ret3 as u32 != 0 {
        println!("nvidia.rs: error caused by cuCtxSetCurrent2: {}", ret3 as u32);
    } else {
        println!("curr ctx is {:x}", new_ctx);
    }

    let mut new_pctx: u64 = 0;
    let ret3 = unsafe {cuCtxGetCurrent(&mut new_pctx as *mut _ as u64)};
    if ret3 as u32 != 0 {
        println!("nvidia.rs: error caused by cuCtxGetCurrent: {}", ret3 as u32);
    } else {
        println!("curr ctx is {:x}", new_pctx);
    }

    return ctx;
}

Cargo.toml:

[package]
name = "rust"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]

libc = "=0.2.11"
cuda-driver-sys = { version = "0.3.0"}
cuda-runtime-sys = { version = "0.3.0-alpha.1"}
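
With this Cargo.toml, the Rust reproduction builds and runs with a plain cargo run (the C version is built with the nvcc command in its first comment line).
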
chengchen666 commented 1 month ago

On my Windows WSL Ubuntu system, the Rust program has no issue. Really weird behaviour.

rustup 1.27.1 (54dd3d00f 2024-04-24)
info: This is the version for the rustup toolchain manager, not the rustc compiler.
info: The currently active `rustc` version is `rustc 1.76.0-nightly (d86d65bbc 2023-12-10)`