GPGPU-Sim provides a detailed simulation model of contemporary NVIDIA GPUs running CUDA and/or OpenCL workloads. It includes support for features such as TensorCores and CUDA Dynamic Parallelism as well as a performance visualization tool, AerialVision, and an integrated energy model, GPUWattch.
I found 2 issues here:
---- shader.cc
I think pI1 should be pI2 here.
if ((pI1->oprnd_type == INT_OP) || (pI1->oprnd_type == UN_OP)) { //these counters get added up in mcPat to compute scheduler power
m_stats->m_num_INTdecoded_insn[m_sid]++;
---- I tried to enable cooperative_groups in the CUDA code below, but it doesn't seem to work; there seems to be an issue with the PTX. Do you know the reason?
device int atomicAggInc(int *ptr) {
auto g = cg::coalesced_threads();
int prev;
if (g.thread_rank() == 0)
prev = atomicAdd(ptr,g.size());
I found 2 issues here: ---- shader.cc I think pI1 should be pI2 here. if ((pI1->oprnd_type == INT_OP) || (pI1->oprnd_type == UN_OP)) { //these counters get added up in mcPat to compute scheduler power m_stats->m_num_INTdecoded_insn[m_sid]++; ---- I tried to enable cooperative_groups in the CUDA code below, but it doesn't seem to work; there seems to be an issue with the PTX. Do you know the reason?
device int atomicAggInc(int *ptr) { auto g = cg::coalesced_threads(); int prev;
if (g.thread_rank() == 0) prev = atomicAdd(ptr,g.size());
}
global void vectorAdd(float A, const float B, float *C , int numElements) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
//if (i < numElements) { // C[i] = A[i] + B[i]; //} if ( i%10 == 0){ int rankIdx = atomicAggInc(&count); printf ("blockIdx = %d, threadIdx = %d, rank = %d \n",blockIdx.x ,threadIdx.x,rankIdx); } }