TommyUW commented 1 year ago

Hello. In this program, I am trying to perform vector addition with StarPU and CUDA. I tried to register the data with STARPU_CUDA_RAM, but the program crashes with a segmentation fault. I am running Ubuntu 20.04 with StarPU 1.4 and CUDA 12.2. The same program works on other machines. What should I do to make STARPU_CUDA_RAM work? Do I need to switch to a different version of CUDA? Thank you very much.

include

#define N 16

/* CUDA kernel: each thread adds one pair of elements, d_w[i] = d_u[i] + d_v[i].
 * d_u and d_v are the inputs, d_w is the output.
 * There is no bounds check on i, so the launch configuration has to supply
 * exactly one thread per vector element. */
__global__ void vector_addition(int *d_u, int *d_v, int *d_w)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    d_w[i] = d_u[i] + d_v[i];
}

/* CPU version of the kernel: computes w[i] = u[i] + v[i] for the first N elements.
 * buffers[0] and buffers[1] are read as the input vectors and buffers[2] is
 * written as the output vector; cl_arg is not used. */
void cpu_vector_addition(void *buffers[], void *cl_arg)
{
    int *u = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
    int *v = (int *)STARPU_VECTOR_GET_PTR(buffers[1]);
    int *w = (int *)STARPU_VECTOR_GET_PTR(buffers[2]);

    for (int i = 0; i < N; i++)
    {
        w[i] = u[i] + v[i];
    }
}

/* CUDA version of the kernel: launches vector_addition on the three vector buffers.
 * buffers[0] and buffers[1] are the inputs and buffers[2] is the output;
 * cl_arg is not used. */
void cuda_vector_addition(void *buffers[], void *cl_arg)
{
    int *u = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
    int *v = (int *)STARPU_VECTOR_GET_PTR(buffers[1]);
    int *w = (int *)STARPU_VECTOR_GET_PTR(buffers[2]);

    printf("cuda called?");

    /* N / 16 blocks of 16 threads gives one thread per element when N is a multiple of 16.
     * NOTE(review): the kernel is launched on the default stream and is not synchronized
     * before returning; StarPU's CUDA examples use starpu_cuda_get_local_stream() and
     * cudaStreamSynchronize() -- confirm whether that is required here. */
    vector_addition<<<N / 16, 16>>>(u, v, w);
}

/* Entry point: initializes StarPU, registers three N-element int vectors,
 * submits a single task with a and b as inputs and c as output, waits for it,
 * unregisters the data, shuts StarPU down and prints a[i], b[i] and c[i].
 * NOTE(review): the return values of starpu_init() and starpu_task_submit()
 * are not checked. */
int main() { starpu_init(NULL);

/* Arrays with automatic storage; only a and b are initialized here,
 * c is left for the task to fill in. */
int a[N], b[N], c[N];
for(int i = 0; i < N; i++)
{
    a[i] = i;
    b[i] = N - i;
}

/* Register with StarPU */
/* The second argument names the memory node that holds each buffer.
 * NOTE(review): a, b and c are allocated in main memory, while STARPU_CUDA_RAM
 * is an enum starpu_node_kind value rather than a memory node number; the
 * replies in this thread point to STARPU_MAIN_RAM as the node for main memory. */
starpu_data_handle_t handle_a, handle_b, handle_c;
starpu_vector_data_register(&handle_a, STARPU_CUDA_RAM, (uintptr_t)a, N, sizeof(int));
starpu_vector_data_register(&handle_b, STARPU_CUDA_RAM, (uintptr_t)b, N, sizeof(int));
starpu_vector_data_register(&handle_c, STARPU_CUDA_RAM, (uintptr_t)c, N, sizeof(int));

/* Codelet Definition */
/* Only the CUDA implementation is enabled; the CPU implementation is commented out.
 * The three buffers are accessed as read, read and write respectively. */
struct starpu_codelet cl;
starpu_codelet_init(&cl);
//cl.cpu_funcs[0] = cpu_vector_addition;
cl.cuda_funcs[0] = cuda_vector_addition;
cl.nbuffers = 3;
cl.modes[0] = STARPU_R;
cl.modes[1] = STARPU_R;
cl.modes[2] = STARPU_W;

/* Task Submission */
struct starpu_task *task = starpu_task_create();
task->cl = &cl;
task->handles[0] = handle_a;
task->handles[1] = handle_b;
task->handles[2] = handle_c;
task->detach = 0;  // Not detached: the task is waited on explicitly with starpu_task_wait() below
starpu_task_submit(task);

/* Wait for the submitted task to complete */
starpu_task_wait(task);

/* Unregister Data */
starpu_data_unregister(handle_a);
starpu_data_unregister(handle_b);
starpu_data_unregister(handle_c);

/* Shutdown StarPU */
starpu_shutdown();

/* Print results */
/* The arrays are read directly here, after the handles have been unregistered. */
for (int i = 0; i < N; i++)
{
    printf("%i + %i = %i\n", a[i], b[i], c[i]);
}

return 0;

}

Makefile:

CFLAGS := $(shell pkg-config --cflags starpu-1.4)
LDFLAGS := $(shell pkg-config --libs starpu-1.4)

# Specify the CUDA library directory and libraries
LDFLAGS += -L/usr/local/cuda-12.0/lib64
LDLIBS := -lstarpu-1.4 -lcudart

NVCC := nvcc # Assuming you have defined NVCC
NVCCFLAGS := # Your NVCC flags here, if any

all: vector_addition_starpu

vector_addition_starpu: vector_addition_starpu.o
	$(NVCC) $(CFLAGS) $(LDFLAGS) vector_addition_starpu.o $(LDLIBS) -o vector_addition_starpu

vector_addition_starpu.o: vector_addition_starpu.cu
	$(NVCC) $(NVCCFLAGS) $(CFLAGS) -c vector_addition_starpu.cu -o vector_addition_starpu.o

clean:
	rm -f vector_addition_starpu *.o

sthibaul commented 1 year ago

STARPU_CUDA_RAM is an enum starpu_node_kind value, not a memory node number.

The pointers that you passed to starpu_vector_data_register point to buffers allocated in main memory, so use STARPU_MAIN_RAM, which is the memory node number for main memory.

sthibaul commented 1 year ago

StarPU will automatically transfer the data to the GPU before executing the task.

starpu-runtime / starpu

STARPU_CUDA_RAM cannot be called #27

include

include

include

define N 16

Specify the CUDA library directory and libraries