Closed TommyUW closed 11 months ago
Hello,
int *d_a, *d_b, *d_c;
size_t N_B = N * sizeof(int);
cudaMalloc(&d_a, N_B);
// Copy to device
cudaMemcpy(d_a, a, N_B, cudaMemcpyHostToDevice);
starpu_data_handle_t handle_a, handle_b, handle_c;
starpu_vector_data_register(&handle_a, STARPU_MAIN_RAM, (uintptr_t)d_a, N, sizeof(int));
This is incoherent: your starpu_vector_data_register call passes a CUDA device pointer (d_a) while telling StarPU that the data lives in CPU memory (STARPU_MAIN_RAM).
Either:
unsigned node;
starpu_memory_node_get_ids_by_type(STARPU_CUDA_RAM, &node, 1);
starpu_vector_data_register(&handle_a, node, (uintptr_t)d_a, N, sizeof(int));
Or:
(really preferred) no cudaMalloc, no cudaMemcpy, and just:
starpu_data_handle_t handle_a, handle_b, handle_c;
starpu_vector_data_register(&handle_a, STARPU_MAIN_RAM, (uintptr_t)a, N, sizeof(int));
StarPU will handle everything. And replace
cudaMemcpy(c, d_c, N_B, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
by either starpu_data_unregister(handle_c);
or
starpu_data_acquire(handle_c, STARPU_R);
Samuel
Hello, I wrote a simple vector addition program with StarPU and CUDA. After printing d_w, d_u, and d_v, I can confirm that the GPU computed the data correctly. However, after I transferred the result from the GPU to the CPU and printed the array c, I found that all the elements of c were still zero. I am using StarPU master with CUDA 12.2. How should I change my StarPU code? Or are there any steps that I missed during the installation of StarPU? Thank you very much.
/* NOTE(review): the header names were lost when this snippet was extracted
 * (only the bare word "include" survived, three times). The three headers
 * below are reconstructed from what the code uses (printf/fprintf,
 * malloc/free, the StarPU API) -- confirm against the original post. */
#include <stdio.h>
#include <stdlib.h>
#include <starpu.h>
/* Number of elements in each vector. */
#define N 64
/* CUDA Kernel */
/*
 * Element-wise vector addition: d_w[i] = d_u[i] + d_v[i] for i in [0, num).
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. Threads whose
 * global index is >= num do nothing, so the grid may overshoot the data.
 *
 * d_u, d_v : input vectors (read only)
 * d_w      : output vector
 * num      : number of elements to process; the caller must pass the actual
 *            vector length, since it is the only bound checked here
 *
 * The device-side printf is kept for debugging.
 */
static __global__ void vector_addition(const int *d_u, const int *d_v, int *d_w, int num)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    /* Bound on the length that was passed in, not on a compile-time constant. */
    if (i < num) {
        d_w[i] = d_u[i] + d_v[i];
        printf("Thread %d: d_u[%d] = %d, d_v[%d] = %d, d_w[%d] = %d\n", i, i, d_u[i], i, d_v[i], i, d_w[i]);
    }
}
/*
 * StarPU CUDA implementation of the vector-addition codelet.
 *
 * buffers[0], buffers[1] : input vectors (read through STARPU_VECTOR_GET_PTR)
 * buffers[2]             : output vector
 * cl_arg                 : unused; the element count is taken from the output
 *                          vector's StarPU interface instead of dereferencing
 *                          a pointer that may not have been set by the task
 *
 * The kernel is launched on StarPU's per-worker stream and the stream is
 * synchronized before returning, so the task is not considered finished
 * while the kernel is still running.
 */
void cuda_vector_addition(void *buffers[], void *cl_arg)
{
    (void)cl_arg;

    printf("recdeived?\n");

    int *u = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
    int *v = (int *)STARPU_VECTOR_GET_PTR(buffers[1]);
    int *w = (int *)STARPU_VECTOR_GET_PTR(buffers[2]);
    int n = (int)STARPU_VECTOR_GET_NX(buffers[2]);

    /* A grid of zero blocks is an invalid launch configuration. */
    if (n <= 0)
        return;

    int threads_per_block = 64;
    int nblocks = (n + threads_per_block - 1) / threads_per_block; /* ceil-div */

    cudaStream_t stream = starpu_cuda_get_local_stream();
    vector_addition<<<nblocks, threads_per_block, 0, stream>>>(u, v, w, n);

    /* Launch-configuration errors are only visible through cudaGetLastError(). */
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess)
        STARPU_CUDA_REPORT_ERROR(status);

    /* Wait for the kernel; execution errors surface here. */
    status = cudaStreamSynchronize(stream);
    if (status != cudaSuccess)
        STARPU_CUDA_REPORT_ERROR(status);
}
/*
 * Codelet describing the vector addition task. It runs on CUDA workers only.
 *
 * NOTE(review): in this excerpt the body between #ifdef/#endif and the buffer
 * description were missing. They are filled in from the codelet function,
 * which reads buffers[0] and buffers[1] and writes buffers[2] -- confirm
 * against the original post.
 */
struct starpu_codelet cl = {
    .where = STARPU_CUDA,
#ifdef STARPU_USE_CUDA
    .cuda_funcs = {cuda_vector_addition},
#endif
    .nbuffers = 3,
    .modes = {STARPU_R, STARPU_R, STARPU_W},
};
/*
 * Program entry point: allocates the host vectors a, b (inputs) and c (result),
 * each holding N ints.
 *
 * NOTE(review): the rest of main (initialisation of a and b, StarPU start-up,
 * data registration, task submission and reading back c) is not present in
 * this excerpt and has not been reconstructed here.
 */
int main()
{
    int *a = (int *)malloc(N * sizeof(int));
    int *b = (int *)malloc(N * sizeof(int));
    int *c = (int *)malloc(N * sizeof(int));

    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "failed to allocate host vectors\n");
        free(a);
        free(b);
        free(c);
        return 1;
    }

    free(a);
    free(b);
    free(c);
    return 0;
}