[tsuki] 読み進めメモ

enp1s0 commented 5 years ago

@ reduce_kernel.h

__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
      int32_t rv, z=0;
      asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
      return rv;
#else
      return addChar4(x, y);
#endif
}

なぜvadd4を使うのは__CUDA_ARCH__ < 500の場合のみなんだろう?

enp1s0 commented 5 years ago

@ ruduce_kernel.h

__device__ half2 operator()(const half2 x, const half2 y) const {
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
      return __hadd2(x, y);
#else
      float2 fx, fy, fr;
      fx = __half22float2(x);
      fy = __half22float2(y);
      fr.x = fx.x + fy.x;
      fr.y = fx.y + fy.y;
      return __float22half2_rn(fr);
#endif
}

PascalではP100は半精度演算が速いけど，GeForce系は遅いのでfloatにキャストして演算している?

enp1s0 commented 5 years ago

ncclChannelにはring,treeが入っていて，channel = ringとは限らない?

  struct ncclChannel {
    union {
      struct {
        struct ncclRing ring;
        struct ncclTree tree;

        int id;
        int nthreads;
        int buffSize;

        // Communication structures
        struct ncclPeer* peers;
        struct ncclPeer* devPeers;

        // Operation list for aggregation
        struct ncclColl* collectives;
        struct ncclColl* devCollectives;
        int collStart;
        int collCount;
        int collFifoHead; // Only used by GPU
        int collFifoTail; // Only used by CPU
      };
      int data[0x80];
    };
  };

enp1s0 commented 5 years ago

@ src/channel.cc

ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
  struct ncclChannel* channel = comm->channels+channelid;
  channel->id = channelid;

  // Setup intermediate buffering
  channel->buffSize = ncclParamBuffsize();

  // Ring index to user rank table.
  NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
  NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));

  // Communication structures with peers.
  NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks));
  NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks));
  for (size_t i=0; i<comm->nRanks; ++i) {
    channel->peers[i].send.comm = comm;
    channel->peers[i].recv.comm = comm;
  }

  // Per-channel operation list.
  NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
  return ncclSuccess;
}

を見ている感じだとCommの下で更にグループ分けしたもの? HW構成を意識して．

enp1s0 commented 5 years ago

vadd4の件

SASSで見てみたらsm_75だとPRMTが大量に発生してVADD4はいなくなる． sm_30だとVADD4命令になる． PTXの時点ではsm_75でもvadd4はエラーとならないので，ptxasに消される?
もしかしてCC 5系から消された???

enp1s0 commented 5 years ago

CC 3.5のTITANで実験したところ，確かにvadd4を使うことで高速化される．

enp1s0 commented 5 years ago

CUDA 8用のオプションもあるということはCooperative groupsを使う実装と使わない実装があるのか．速度差とかあるのかな．

enp1s0 commented 5 years ago

Grid/Blockサイズの決め方

@ s/enqueue.cc getKernelInfo

Blockサイズ（まだ読み途中）

int nt = NCCL_LL_MIN_NTHREADS; // #define NCCL_LL_MIN_NTHREADS 64
while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2;

で，

info->nBytes = info->count * ncclTypeSize(info->datatype);

で，

*count = comm->nRanks;

（これか???）となっている．ただしこれは無視．

if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank

enp1s0 commented 5 years ago

cudaLaunchCooperativeKernelMultiDevice

でLauchしているのは単にmulti GPU環境でのLauchを1発で終わらせたかったからで，cooperative groupを使って大規模な同期を取りたいとかではなさそう．

cudaLaunchCooperativeKernelMultiDevice

はcooperative_group.hではなくcuda_runtime_api.hで宣言されている．

CUDA 9.0より前ではcudaLaunchCooperativeKernelMultiDeviceは使えないのでデバイスの個数分だけループしてLaunchしている．

ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
#if CUDART_VERSION >= 9000
  if (cgMode & 0x01) {
    CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices,
            // These flags are to reduce the latency of using this API
            cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync));
    return ncclSuccess;
  }
#endif
  int savedDev;
  CUDACHECK(cudaGetDevice(&savedDev));
  for (int i = 0; i < numDevices; i++) {
    struct cudaLaunchParams* params = paramsList+i;
    CUDACHECK(cudaSetDevice(cudaDevs[i]));
    CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
  }
  CUDACHECK(cudaSetDevice(savedDev));
  return ncclSuccess;
}

やっぱり

CUDACHECK(cudaGetDevice(&savedDev));
...
CUDACHECK(cudaSetDevice(savedDev));

ではさみますよね...

enp1s0 commented 5 years ago

PTXって

asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory");

みたいに2つのレジスタに対してldできたんですね...(s/c/d/common_kernel.h:242)
いつも1レジスタにldしてmvで2つにしていました

enp1s0 commented 5 years ago

intraのGPU構成ごとにCooperativeGroupsでカーネル関数をLaunchするか変わる?? https://github.com/nccl-reader/nccl/blob/master/src/init.cc#L632

enp1s0 commented 5 years ago

GPUを制御するCPUのバランスを整える機構が入っているっぽい 4GPU-2CPU(10x2=20コア)のノードで制御CPUを取得すると

Device 0 : Path = /sys/class/pci_bus/0000:18/../../0000:18:00.0, CPUs = 0-9
Device 1 : Path = /sys/class/pci_bus/0000:18/../../0000:18:00.0, CPUs = 0-9
Device 2 : Path = /sys/class/pci_bus/0000:18/../../0000:18:00.0, CPUs = 0-9
Device 3 : Path = /sys/class/pci_bus/0000:18/../../0000:18:00.0, CPUs = 0-9

のようになるが，これはすべて1すのCPUで制御してしまうという認識でいいのかな? これを

Device 0 : Path = /sys/class/pci_bus/0000:18/../../0000:18:00.0, CPUs = 0-9
Device 1 : Path = /sys/class/pci_bus/0000:18/../../0000:18:00.0, CPUs = 0-9
Device 2 : Path = /sys/class/pci_bus/0000:18/../../0000:18:00.0, CPUs = 10-19
Device 3 : Path = /sys/class/pci_bus/0000:18/../../0000:18:00.0, CPUs = 10-19

のようにしようとしている?

enp1s0 commented 5 years ago

sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);

なのでpeerinfo等のallgatherなどが終わったあとでCPU-GPU割当はもとに戻されるのか

enp1s0 commented 5 years ago

メモのメモ https://docs.google.com/spreadsheets/d/1v7r6g95WzLy-aqJckq0eVwtmpBDLOKLods4dITk8AJY/edit?usp=sharing

nccl-reader / nccl