brucefan1983 / CUDA-Programming

Sample codes for my CUDA programming book
GNU General Public License v3.0
1.51k stars 316 forks source link

block、thread、warp间的执行顺序 #21

Open zhaiyi000 opened 1 year ago

zhaiyi000 commented 1 year ago

您好,请教几个问题。 我的显卡是2080Ti

当我读到,109页10.3 更多线程束内的基本函数中的Listing 10.2(如下)

#include "error.cuh"
#include <stdio.h>

// Logical group width: the kernel derives lane_id as tid % WIDTH and passes
// WIDTH as the width argument of the __shfl*_sync calls.
const unsigned WIDTH = 8;
// Number of threads in the single block launched by main.
const unsigned BLOCK_SIZE = 16;
// Participation mask with all 32 bits set, passed to the *_sync intrinsics.
const unsigned FULL_MASK = 0xffffffff;

// Forward declaration so main can launch the kernel before its definition.
void __global__ test_warp_primitives(void);

/*
 * Host entry point: launches test_warp_primitives as one block of BLOCK_SIZE
 * threads and waits for it to finish so the device-side printf output is
 * flushed before the process exits.
 *
 * argc/argv are accepted but not used.
 * Returns 0 when the launch and the synchronization both succeed; failures
 * are handled by the CHECK macro from error.cuh.
 */
int main(int argc, char **argv)
{
    test_warp_primitives<<<1, BLOCK_SIZE>>>();
    // A kernel launch does not return a status itself; a bad launch
    // configuration is only reported through cudaGetLastError().
    CHECK(cudaGetLastError());
    // Errors raised while the kernel runs surface at the next synchronizing call.
    CHECK(cudaDeviceSynchronize());
    return 0;
}

/*
 * Prints the results of the warp vote (__ballot_sync, __all_sync, __any_sync)
 * and warp shuffle (__shfl_sync, __shfl_up_sync, __shfl_down_sync,
 * __shfl_xor_sync) intrinsics, one row of values per intrinsic.
 *
 * Launch layout: only threadIdx.x is read; main launches it as <<<1, BLOCK_SIZE>>>.
 * Shared memory: none. Output: device-side printf only (debugging aid).
 *
 * The *_sync intrinsics require that every calling thread has its own bit set
 * in the mask it passes. The calls that use the partial masks mask1 / mask2
 * are therefore guarded so that only the lanes named in the mask execute them.
 *
 * NOTE(review): FULL_MASK names 32 lanes while the launch in main provides
 * only BLOCK_SIZE threads; confirm this is acceptable for the target toolkit.
 */
void __global__ test_warp_primitives(void)
{
    int tid = threadIdx.x;
    // Position of the thread inside its WIDTH-wide group.
    int lane_id = tid % WIDTH;

    if (tid == 0) printf("threadIdx.x: ");
    printf("%2d ", tid);
    if (tid == 0) printf("\n");

    if (tid == 0) printf("lane_id:     ");
    printf("%2d ", lane_id);
    if (tid == 0) printf("\n");

    // mask1 has a bit set for every participating lane with tid > 0,
    // mask2 only for the lane with tid == 0.
    unsigned mask1 = __ballot_sync(FULL_MASK, tid > 0);
    unsigned mask2 = __ballot_sync(FULL_MASK, tid == 0);
    if (tid == 0) printf("FULL_MASK = %x\n", FULL_MASK);
    if (tid == 1) printf("mask1     = %x\n", mask1);
    if (tid == 0) printf("mask2     = %x\n", mask2);

    // tid is used as the predicate: it is zero (false) only for thread 0.
    int result = __all_sync(FULL_MASK, tid);
    if (tid == 0) printf("all_sync (FULL_MASK): %d\n", result);

    // Thread 0 is not named in mask1, so it must not take part in this vote.
    if (tid > 0) result = __all_sync(mask1, tid);
    if (tid == 1) printf("all_sync     (mask1): %d\n", result);

    result = __any_sync(FULL_MASK, tid);
    if (tid == 0) printf("any_sync (FULL_MASK): %d\n", result);

    // Only thread 0 is named in mask2, so only thread 0 may issue this vote.
    if (tid == 0) result = __any_sync(mask2, tid);
    if (tid == 0) printf("any_sync     (mask2): %d\n", result);

    // Broadcast: every thread reads tid from lane 2 of its WIDTH-wide group.
    int value = __shfl_sync(FULL_MASK, tid, 2, WIDTH);
    if (tid == 0) printf("shfl:      ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");

    // Read tid from the lane one position lower within the group.
    value = __shfl_up_sync(FULL_MASK, tid, 1, WIDTH);
    if (tid == 0) printf("shfl_up:   ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");

    // Read tid from the lane one position higher within the group.
    value = __shfl_down_sync(FULL_MASK, tid, 1, WIDTH);
    if (tid == 0) printf("shfl_down: ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");

    // Exchange tid with the lane whose index differs in the lowest bit.
    value = __shfl_xor_sync(FULL_MASK, tid, 1, WIDTH);
    if (tid == 0) printf("shfl_xor:  ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");
}

输出是

threadIdx.x:  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 
lane_id:      0  1  2  3  4  5  6  7  0  1  2  3  4  5  6  7 
FULL_MASK = ffffffff
mask1     = fffe
mask2     = 1
all_sync (FULL_MASK): 0
all_sync     (mask1): 1
any_sync (FULL_MASK): 1
any_sync     (mask2): 0
shfl:       2  2  2  2  2  2  2  2 10 10 10 10 10 10 10 10 
shfl_up:    0  0  1  2  3  4  5  6  8  8  9 10 11 12 13 14 
shfl_down:  1  2  3  4  5  6  7  7  9 10 11 12 13 14 15 15 
shfl_xor:   1  0  3  2  5  4  7  6  9  8 11 10 13 12 15 14 

其中的

if (tid == 0) printf("threadIdx.x: ");
printf("%2d ", tid);

想请问:

  1. 对于线程束中的不同线程,为什么第二行一定在第一行之后执行?
  2. 对于线程束中的不同线程,为什么会顺序执行第二行? 我尝试加入如下代码
    // Experiment: make thread 0 run a long counting loop before the prints.
    // NOTE(review): within this snippet k is never read after the loop, so the
    // compiler may be free to remove the loop entirely; confirm in the
    // generated PTX/SASS before drawing conclusions about execution order.
    int k = 0;
    if (tid == 0) {
        for ( int i = 0; i < 100000000; i++ ) {
            k = k + 1;
        }
    }
    if (tid == 0) printf("threadIdx.x: ");
    printf("%2d ", tid);
    if (tid == 0) printf("\n");

    但是仍然如下输出,我期待线程0应该是最晚执行完成的。

    threadIdx.x:  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 

书中106页讲从伏特架构开始,引入了独立线程调度(independent thread scheduling)机制。 每个线程有自己的程序计数器。我的理解是线程之间的执行顺序应该是随机的。这也正是线程束内同步函数 __syncwarp() 存在的意义。