CMU-SAFARI / ramulator-pim

A fast and flexible simulation infrastructure for exploring general-purpose processing-in-memory (PIM) architectures. Ramulator-PIM combines a widely-used simulator for out-of-order and in-order processors (ZSim) with Ramulator, a DRAM simulator with memory models for DDRx, LPDDRx, GDDRx, WIOx, HBMx, and HMCx. Ramulator is described in the IEEE CAL 2015 paper by Kim et al. at https://people.inf.ethz.ch/omutlu/pub/ramulator_dram_simulator-ieee-cal15.pdf Ramulator-PIM is used in the DAC 2019 paper by Singh et al. at https://people.inf.ethz.ch/omutlu/pub/NAPEL-near-memory-computing-performance-prediction-via-ML_dac19.pdf
144 stars 60 forks source link

Zsim running in PIM.cfg does not terminate #10

Open avacoder42 opened 4 years ago

avacoder42 commented 4 years ago

Hi,

I'm trying to replicate the NAPEL paper's test cases for PolyBench and am facing the following issues:

1) ZSim with hooks for PIM offloading does not terminate (or is force-terminated once max_instructions is reached). I am adding one sample case for cholesky, but the issue occurs for all benchmarks. Could you share an example with a PolyBench test case? It would help me replicate the results. 2) The shared Rodinia BFS example also does not terminate for the suggested test case of 1.0M nodes. Error:

[S 0] Thread 15 starting
[S 0] WARN: Futex wake matching failed (0/31) (external/ff waiters?)
[S 0] WARN: Stalled for 20s 

I have made the changes suggested in the paper, as follows. Configuration file for the 32-core PIM system (PIM.cfg):

// Simulated system: 32 out-of-order ("OOO") cores with a three-level cache
// hierarchy (l1i/l1d -> l2 -> l3) and a "Traces" memory type.
// NOTE(review): the previous header comment described a 6-core, 2.4GHz
// Westmere with 10 Niagara-like cores attached to the L3, which does not
// match the 32 OOO cores configured below.
sys = {
    lineSize = 64;
    frequency = 2400;

    // One core group: 32 cores of type "OOO", each wired to the "l1i" and
    // "l1d" cache groups defined under `caches`.
    cores = {
        core = {
            type = "OOO";
            cores = 32;
            icache = "l1i";
            dcache = "l1d";
        };
    };

    caches = {
        // L1 data: 32 instances (matching the 32 cores), 8-way set-associative,
        // size 32768 = 32 * 1024 (presumably bytes).
        l1d = {
            array = {
                type = "SetAssoc";
                ways = 8;
            };
            caches = 32;
            latency = 4;
            size = 32768;
        };
        // L1 instruction: 32 instances, 4-way set-associative, same size as l1d.
        l1i = {
            array = {
                type = "SetAssoc";
                ways = 4;
            };
            caches = 32;
            latency = 3;
            size = 32768;
        };
        // L2: 32 instances, 8-way set-associative, size 262144 = 256 * 1024;
        // its children are the l1i and l1d caches.
        l2 = {
            array = {
                type = "SetAssoc";
                ways = 8;
            };
        //type = "Timing";
        //mshrs = 10;
            caches = 32;
            latency = 7;
            children = "l1i|l1d";
            size = 262144;
        };
        // L3: a single cache split into 32 banks, 16-way set-associative with
        // "H3" hashing, size 67108864 = 64 * 1024 * 1024; its children are the
        // l2 caches.
        l3 = {
            array = {
                hash = "H3";
                type = "SetAssoc";
                ways = 16;
            };
        //type = "Timing";
        //mshrs = 16;
            banks = 32;
            caches = 1;
            latency = 27;
            children = "l2";
        size = 67108864;
        };

    };

    // Memory type "Traces" with instruction traces, offload-only mode and PIM
    // traces all enabled.
    // NOTE(review): presumably outFile is where the generated trace is
    // written — confirm against the simulator documentation.
    // NOTE(review): outFile has no trailing ';', unlike every other setting in
    // this file — confirm the config parser accepts that.
    mem = {
        type = "Traces";
        instr_traces = true;
          only_offload = true;
          pim_traces = true;

        outFile = "pim-poly_cholesky_32.out"
    };

};

// Simulation control settings.
// maxTotalInstrs is 10^10 (10,000,000,000).
// NOTE(review): the issue report says runs only ended when the instruction
// limit was reached; presumably this is that limit — confirm.
sim = {
    phaseLength = 10000;
    maxTotalInstrs = 10000000000L;
    statsPhaseInterval = 1000;
    printHierarchy = true;
    // attachDebugger = True;
};

// Simulated process: the OpenMP PolyBench cholesky binary, started
// fast-forwarded.
// NOTE(review): a later comment in this thread reports that
// startFastForwarded = true did not work for some applications and that
// setting it to false was necessary — confirm for this benchmark.
process0 = {
    command = "benchmarks/PolyBench-ACC-master/OpenMP/linear-algebra/kernels/cholesky/cholesky" ;
    startFastForwarded = True;
//    command = "ls -la";
//    command = "unzip tracesLois.out.gz";
};

PolyBench example: in cholesky.h, I added a dataset for a test case of dimension 2000. The cholesky.c file is modified as follows:

/* POLYBENCH/GPU-OPENMP
 *
 * This file is a part of the Polybench/GPU-OpenMP suite
 *
 * Contact:
 * William Killian <killian@udel.edu>
 * 
 * Copyright 2013, The University of Delaware
 */
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <math.h>

/* Include polybench common header. */
#include <polybench.h>

/* Include benchmark-specific header. */
/* Default data type is double, default size is 4000. */
#include "cholesky.h"
#include "../../../../../../misc/hooks/zsim_hooks.h"

/* Fill the inputs with a uniform value: every element of the vector p and
   of the n-by-n matrix A is set to 1.0 / n. */
static
void init_array(int n,
        DATA_TYPE POLYBENCH_1D(p,N,n),
        DATA_TYPE POLYBENCH_2D(A,N,N,n,n))
{
  for (int row = 0; row < n; row++)
    {
      /* Computed inside the loop so nothing is evaluated when n <= 0. */
      const double fill = 1.0 / n;

      p[row] = fill;
      for (int col = 0; col < n; col++)
        {
          A[row][col] = fill;
        }
    }
}

/* Dump the live-out matrix to stderr so the kernel cannot be removed as
   dead code; the output can also be used to check correctness.
   Every element of the n-by-n matrix A is written with
   DATA_PRINTF_MODIFIER, and a newline follows each element whose flat
   index (row * N + col) is a multiple of 20. Note that the flat index is
   computed with the compile-time N, not the runtime n. */
static
void print_array(int n,
         DATA_TYPE POLYBENCH_2D(A,N,N,n,n))
{
  for (int row = 0; row < n; row++)
    {
      for (int col = 0; col < n; col++)
        {
          fprintf(stderr, DATA_PRINTF_MODIFIER, A[row][col]);

          /* Only break the line after every 20th flat index. */
          if ((row * N + col) % 20 != 0)
            continue;
          fprintf(stderr, "\n");
        }
    }
}

/* Main computational kernel. The whole function will be timed,
   including the call and return.

   For each row i the kernel computes p[i] = 1 / sqrt(A[i][i] - sum of
   A[i][j]^2 for j < i) and then updates column i below the diagonal:
   A[j][i] = (A[i][j] - sum of A[j][k] * A[i][k] for k < i) * p[i].
   Each iteration of the i loop is bracketed by zsim_PIM_function_begin()
   and zsim_PIM_function_end().

   Parameters:
     n  problem size (the loop bound used is _PB_N)
     p  output vector of reciprocal square roots, one per row
     A  n-by-n matrix, read and updated in place

   NOTE(review): iteration i reads A[i][j] (j < i) and A[j][k] (k < i),
   which are written by earlier iterations of the same i loop, so the
   `omp for` over i still carries cross-iteration dependences. Confirm
   whether the parallel result is expected to match the sequential one. */
static
void kernel_cholesky(int n,
             DATA_TYPE POLYBENCH_1D(p,N,n),
             DATA_TYPE POLYBENCH_2D(A,N,N,n,n))
{
  int i, j, k;

  #pragma scop
  #pragma omp parallel
  {
    #pragma omp for private (j,k)
    for (i = 0; i < _PB_N; ++i)
      {
        /* Per-iteration accumulator. It is declared inside the loop body
           so that every thread works on its own copy; declaring it
           outside the parallel region made it shared between threads,
           which is a data race. */
        DATA_TYPE x;

        zsim_PIM_function_begin();

        /* Diagonal element: subtract the squares of the row's entries
           left of the diagonal, then store the reciprocal square root. */
        x = A[i][i];
        for (j = 0; j <= i - 1; ++j)
          x = x - A[i][j] * A[i][j];
        p[i] = 1.0 / sqrt(x);

        /* Column i below the diagonal. */
        for (j = i + 1; j < _PB_N; ++j)
          {
            x = A[i][j];
            for (k = 0; k <= i - 1; ++k)
              x = x - A[j][k] * A[i][k];
            A[j][i] = x * p[i];
          }

        zsim_PIM_function_end();
      }
  }
  #pragma endscop
}

/* Program entry point: declares the benchmark arrays, initializes them,
   runs the Cholesky kernel between the zsim ROI hooks inside the
   polybench timer window, hands the result to the DCE-prevention macro,
   and frees the arrays. argc and argv are not used. Always returns 0. */
int main(int argc, char** argv)
{
  /* Retrieve problem size. */
  int n = N;

  /* Variable declaration/allocation. */
  POLYBENCH_2D_ARRAY_DECL(A, DATA_TYPE, N, N, n, n);
  POLYBENCH_1D_ARRAY_DECL(p, DATA_TYPE, N, n);

  /* Initialize array(s). */
  init_array (n, POLYBENCH_ARRAY(p), POLYBENCH_ARRAY(A));

  /* Start timer. */
  polybench_start_instruments;

  /* Run kernel. The call is bracketed by zsim_roi_begin()/zsim_roi_end();
     both hooks sit inside the polybench timer window.
     NOTE(review): presumably these mark the simulator's region of
     interest — confirm against zsim_hooks.h. */
  zsim_roi_begin();
  kernel_cholesky (n, POLYBENCH_ARRAY(p), POLYBENCH_ARRAY(A));
  zsim_roi_end();
  /* Stop and print timer. */
  polybench_stop_instruments;
  polybench_print_instruments;

  /* Prevent dead-code elimination. All live-out data must be printed
     by the function call in argument. */
  polybench_prevent_dce(print_array(n, POLYBENCH_ARRAY(A)));

  /* Be clean. */
  POLYBENCH_FREE_ARRAY(A);
  POLYBENCH_FREE_ARRAY(p);

  return 0;
}
eehaitaodu commented 3 years ago

Sorry to disturb you. I've had the same problem recently; would you mind telling me how you solved it in the end? Thank you!

maryam1364 commented 3 years ago

@avacoder42 @eehaitaodu: I am not sure whether this answers your question, but in my case I had to set startFastForwarded = false; the option "true" doesn't work for some of the applications I have experience running.

avacoder42 commented 3 years ago

Ah, I just saw this message. These helped me solve it:

Lastly, I have some stats from running this for months, happy to share :)

Best Regards! MK