Hello. Thanks for this very useful tool.
Here is the issue I have when running CUDA code with managed memory: it seems that the per-process list does not take managed memory into account.
Here is a screenshot of what I get running the minimal example below:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#define NUM_ALLOCS 1024*1024
// Touch the first element of every allocation from the device so the
// managed pages become resident on the GPU. Launch layout: one thread
// per allocation, 1-D grid; threads beyond the table simply do nothing.
__global__ void access_memory(int** buf) {
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < NUM_ALLOCS) {
        buf[idx][0] = 0;  //* first device write pulls the allocation into GPU memory
    }
}
// Abort with a readable message if a CUDA runtime call fails; without this,
// an early allocation failure makes every later call fail mysteriously.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Minimal reproducer: allocate NUM_ALLOCS managed buffers, touch each one
// from the GPU so it migrates to device memory, then hold them for 10 s so
// a monitoring tool can sample the per-process memory usage.
int main() {
    int** buffer;
    //* Allocate the pointer table: NUM_ALLOCS pointers, not NUM_ALLOCS bytes.
    //* (The original passed NUM_ALLOCS as the byte count, so buffer[i] was
    //* out of bounds for i >= NUM_ALLOCS / sizeof(int*).)
    CUDA_CHECK(cudaMallocManaged(&buffer, NUM_ALLOCS * sizeof(int*),
                                 cudaMemAttachHost));
    for (int i = 0; i < NUM_ALLOCS; i++) {
        CUDA_CHECK(cudaMallocManaged(&buffer[i], 49152, cudaMemAttachHost));
    }
    //* Access memory from the GPU.
    //* NOTE(review): memory is attached with cudaMemAttachHost; on pre-Pascal
    //* hardware a device access may fault without cudaStreamAttachMemAsync —
    //* confirm on the target GPU, or use cudaMemAttachGlobal.
    dim3 gridSize((NUM_ALLOCS + 1023) / 1024);  // ceil-div: no always-empty extra block
    dim3 blockSize(1024);
    access_memory<<<gridSize, blockSize>>>(buffer);
    CUDA_CHECK(cudaGetLastError());       // catch launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // surface any in-kernel faults here
    sleep(10);  // keep the allocations alive so the tool can observe them
    //* Free memory
    for (int i = 0; i < NUM_ALLOCS; i++) {
        CUDA_CHECK(cudaFree(buffer[i]));
    }
    CUDA_CHECK(cudaFree(buffer));
    return 0;
}
Does the NVIDIA driver expose such information? Would it be possible to use it?
Hello. Thanks for this very useful tool. Here is the issue I have when running CUDA code with managed memory: it seems that the per-process list does not take managed memory into account. Here is a screenshot of what I get running the minimal example below:
Does the NVIDIA driver expose such information? Would it be possible to use it?