bespoke-silicon-group / bsg_manycore

Tile based architecture designed for computing efficiency, scalability and generality
Other
221 stars 58 forks source link

icache vs DMEM memory access #663

Closed natewise closed 1 year ago

natewise commented 1 year ago

Hello, so I have a question regarding instruction cache vs scratchpad/DMEM memory access.

I have a program that I plan to run across 2 tiles, that uses ~8.3KB of memory in lookup tables and local data structures. I want to run the program on only one tile (say tile 0), and use the other tile (say tile 1) as a sort of memory bank, where I'd like to utilize both tile 1's 4KB of icache and 4KB of scratchpad/DMEM for memory storage. So say I have 3 lookup tables like so: uint32_t LUT1[256] = {...}; uint32_t LUT2[256] = {...}; uint32_t LUT3[256] = {...};

How could I ensure LUT1 is stored in DMEM of tile 0, LUT2 on icache of tile 1 and LUT3 on DMEM of tile 1?

dpetrisko commented 1 year ago

Hi, for something like this you will probably want to use CUDA lite host calls. In the SPMD loader included in this repo, the same exact program is run on each tile allocated. Thus there is no way to statically differentiate between tiles.

Allocations can be made statically in DMEM by annotating the correct linker section (.dmem I believe). I'm not sure if the I$ can be statically allocated as scratchpad. In either case, you'll need to fill the data from the host core or "main.c" in the cuda code. This will allow you to copy the correct lookup tables to each individual memory. There may or may not be helper functions for the copy, but the coordinates/addresses can always be calculated based on the technical reference manual tables

taylor-bsg commented 1 year ago

@tommydcjung can you take a stab at a reply and offer any pointers to example code? My guess is the answer is that things should be stored in DRAM first, and then each tile, based on its ID, will load the data it needs. Although this is something that has been considered, the I-cache is not currently usable as a scratchpad.

tommydcjung commented 1 year ago

SPMD version: https://github.com/bespoke-silicon-group/bsg_manycore/tree/example_code1/software/spmd/example1

CUDA version: little more involved...

main.c (host)

#include <bsg_manycore_tile.h>
#include <bsg_manycore_errno.h>
#include <bsg_manycore_tile.h>
#include <bsg_manycore_loader.h>
#include <bsg_manycore_cuda.h>
#include <math.h>
#include <complex.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <stdio.h>
#include <bsg_manycore_regression.h>

#define ALLOC_NAME "default_allocator"

#define SIZE (bsg_tiles_X*N)

/**
 * Host-side driver for the CUDA-lite DMEM example.
 *
 * For each pod: loads the device binary, DMAs an initialized array A into
 * device DRAM, launches the "kernel_example" kernel across a
 * bsg_tiles_X x bsg_tiles_Y tile group, DMAs the single-word result B back,
 * and validates it against a host-computed sum.
 *
 * @param argc / argv  standard regression arguments; parsed by argp into the
 *                     device binary path and the test name.
 * @return HB_MC_SUCCESS on match, HB_MC_FAIL on mismatch (device is
 *         finished/cleaned up on both paths).
 */
int kernel_example(int argc, char **argv) {

  char *bin_path, *test_name;
  struct arguments_path args = {NULL, NULL};

  argp_parse(&argp_path, argc, argv, 0, 0, &args);
  bin_path = args.path;
  test_name = args.name;

  // Seed the PRNG with the current time.  The original code passed the
  // function pointer itself (`srand(time);`), which yields a meaningless
  // (and compiler-warned) seed.
  srand(time(NULL));

  // Initialize Device.
  hb_mc_device_t device;
  BSG_CUDA_CALL(hb_mc_device_init(&device, test_name, 0));

  hb_mc_pod_id_t pod;
  hb_mc_device_foreach_pod_id(&device, pod)
  {
    // What this example does:
    // the host initializes some values in A.
    // each tile copies its slice of A into its own DMEM.
    // tile0 reads every tile's DMEM data, computes the sum, and stores it in B.
    // the host reads B back and validates.

    bsg_pr_info("Loading program for pod %d.\n", pod);
    BSG_CUDA_CALL(hb_mc_device_set_default_pod(&device, pod));
    BSG_CUDA_CALL(hb_mc_device_program_init(&device, bin_path, ALLOC_NAME, 0));

    // Allocate a block of memory in host.
    int A_host[SIZE];
    int B_host[1] = {0};
    for (int i = 0; i < SIZE; i++) {
      A_host[i] = i;
    }

    // Allocate a block of memory in device (DRAM).
    eva_t A_device, B_device;
    BSG_CUDA_CALL(hb_mc_device_malloc(&device, SIZE * sizeof(int), &A_device));
    BSG_CUDA_CALL(hb_mc_device_malloc(&device, sizeof(int), &B_device));

    // DMA Transfer to device.
    hb_mc_dma_htod_t htod_job [] = {
      {
        .d_addr = A_device,
        .h_addr = (void *) &A_host[0],
        .size = SIZE * sizeof(int)
      }
    };

    BSG_CUDA_CALL(hb_mc_device_dma_to_device(&device, htod_job, 1));

    // CUDA arguments
    hb_mc_dimension_t tg_dim = { .x = bsg_tiles_X, .y = bsg_tiles_Y};
    hb_mc_dimension_t grid_dim = { .x = 1, .y = 1};
    #define CUDA_ARGC 3
    uint32_t cuda_argv[CUDA_ARGC] = {A_device, B_device, N};

    // Enqueue Kernel.
    BSG_CUDA_CALL(hb_mc_kernel_enqueue(&device, grid_dim, tg_dim, "kernel_example", CUDA_ARGC, cuda_argv));

    // Launch kernel (with tracing around the execution window).
    hb_mc_manycore_trace_enable((&device)->mc);
    BSG_CUDA_CALL(hb_mc_device_tile_groups_execute(&device));
    hb_mc_manycore_trace_disable((&device)->mc);

    // copy result and validate.
    hb_mc_dma_dtoh_t dtoh_job [] = {
      {
        .d_addr = B_device,
        .h_addr = (void *) &B_host[0],
        .size = sizeof(int)
      }
    };

    // Pass the array (which decays to hb_mc_dma_dtoh_t *), not &dtoh_job,
    // which has the mismatched type hb_mc_dma_dtoh_t (*)[1].  This now
    // mirrors the hb_mc_device_dma_to_device() call above.
    BSG_CUDA_CALL(hb_mc_device_dma_to_host(&device, dtoh_job, 1));

    // calculate sum on the host side
    int host_sum = 0;
    for (int i = 0; i < SIZE; i++) {
      host_sum += A_host[i];
    }

    // compare host and device sum.
    if (host_sum != B_host[0]) {
        printf("FAIL: expected = %d,  actual = %d \n",  host_sum, B_host[0]);
        BSG_CUDA_CALL(hb_mc_device_finish(&device));
        return HB_MC_FAIL;
    }

    // Freeze tiles.
    BSG_CUDA_CALL(hb_mc_device_program_finish(&device));
  }

  BSG_CUDA_CALL(hb_mc_device_finish(&device));
  return HB_MC_SUCCESS; 
}

// Register kernel_example as the entry point for the regression test named
// "example" (expands to the program's main via bsg_manycore_regression.h).
declare_program_main("example", kernel_example);

kernel.cpp (device)

#include <bsg_manycore.h>
#include <bsg_cuda_lite_barrier.h>

// Per-tile staging buffer for this tile's slice of A; as a global it is
// placed in the tile's local DMEM by the linker.
int dmem_data[256]; // allocated in DMEM.

// A = ptr to DRAM allocated by the host.
// B = ptr to DRAM where to store the result.
// n = How many words to copy
// Device kernel: each tile copies its n-word slice of DRAM array A into its
// local dmem_data; after a tile-group barrier, tile 0 reads every tile's
// dmem_data over the network, sums the words, and writes the total to B[0].
//
// A = ptr to DRAM allocated by the host.
// B = ptr to DRAM where to store the result.
// n = How many words to copy
//
// NOTE(review): fence/barrier ordering is load-bearing here — each bsg_fence
// ensures the preceding remote/local writes are visible before the barrier
// releases the other tiles; do not reorder.
extern "C" __attribute__ ((noinline))
int kernel_example(int * A, int * B, int n) {

  bsg_barrier_hw_tile_group_init(); // initialize HW barrier.
  bsg_fence();

  // copy your own data: tile __bsg_id owns A[__bsg_id*n .. __bsg_id*n + n-1].
  int * dram_ptr = &A[__bsg_id*n];
  bsg_unroll(4)
  for (int i = 0; i < n; i++) {
    dmem_data[i] = dram_ptr[i];
  }

  // Fence so the DMEM writes above are complete before signaling the barrier,
  // then wait for every tile to finish its copy.
  bsg_fence();
  bsg_barrier_hw_tile_group_sync();

  // tile 0 checks: read each tile's dmem_data remotely and accumulate.
  if (__bsg_id == 0) {
    int sum = 0;
    for (int x = 0; x < bsg_tiles_X; x++) {
      // Remote pointer into tile (x, 0)'s dmem_data — assumes a single-row
      // (y == 0) tile group, matching the bsg_tiles_X-only loop.
      bsg_remote_int_ptr ptr = bsg_remote_ptr(x,0,&dmem_data);
      bsg_unroll(4)
      for (int i = 0; i < n; i ++ ) {
        sum += ptr[i];
      }
    }

    // store the answer in B.
    B[0] = sum;
  }  

  // Final fence + barrier so B[0] is globally visible before any tile
  // returns and the host reads the result.
  bsg_fence();
  bsg_barrier_hw_tile_group_sync();

  return 0;
}