Bruce-Lee-LY / cuda_hgemm

Several optimization methods of half-precision general matrix multiplication (HGEMM) using tensor core with WMMA API and MMA PTX instruction.
MIT License
290 stars 66 forks source link

请教一个 `wmma_async_stage2.cu` 中的代码细节 #9

Closed luliyucoordinate closed 5 months ago

luliyucoordinate commented 6 months ago

为什么这一行的for循环会按照chunk_k分成两组去做? https://github.com/Bruce-Lee-LY/cuda_hgemm/blob/10a8a8451f0dcd162b3790045cd7597cb48b8beb/src/wmma/wmma_async_stage2.cu#L208

#pragma unroll
        for (size_t i = 0; i < A_smem_iters / CHUNK_K; ++i) { <- 这个位置
            uint32_t A_smem_lane_addr =
                __cvta_generic_to_shared(&smem[A_smem_idx][0]) + (lane_id % CHUNK_COPY_LINE_LANES) * THREAD_COPY_BYTES;

            CP_ASYNC_CG(A_smem_lane_addr, A_lane_ptr, THREAD_COPY_BYTES);

            A_lane_ptr = (int4 *)((half *)A_lane_ptr + CHUNK_COPY_LINES_PER_WARP * K);
            A_smem_idx += CHUNK_COPY_LINES_PER_WARP;
        }

        ...

        smem_load_idx = (smem_load_idx + 1) % K_STAGE;
        smem_load_off = smem_load_idx * smem_stage_off;

#pragma unroll
        for (size_t i = (CHUNK_K - 1) * A_smem_iters / CHUNK_K; i < A_smem_iters; ++i) { <- 这个位置
            uint32_t A_smem_lane_addr =
                __cvta_generic_to_shared(&smem[A_smem_idx][0]) + (lane_id % CHUNK_COPY_LINE_LANES) * THREAD_COPY_BYTES;

            CP_ASYNC_CG(A_smem_lane_addr, A_lane_ptr, THREAD_COPY_BYTES);

            A_lane_ptr = (int4 *)((half *)A_lane_ptr + CHUNK_COPY_LINES_PER_WARP * K);
            A_smem_idx += CHUNK_COPY_LINES_PER_WARP;
        }

这二者之间好像没有做什么特殊的逻辑?谢谢🤣