stanford-ppl / spatial-lang

Spatial: "Specify Parameterized Accelerators Through Inordinately Abstract Language"
MIT License
100 stars 12 forks source link

AccessDispatch has more than one dispatch #220

Open mattfel1 opened 7 years ago

mattfel1 commented 7 years ago

Stefan found this in vgg_1d, pasted below. I think this is a bug — unless there is something logical I should do when an access is dispatched to multiple duplicates? The issue is on node x14503 accessing lb x14374_lb0.

import spatial.dsl._
import org.virtualized._

// Reproduction case for spatial-lang issue #220 ("AccessDispatch has more than
// one dispatch"): a read of the first conv layer's LineBuffer (x14374_lb0 in
// the report) gets dispatched to multiple memory duplicates during compilation.
// The app itself is a small VGG-style inference pipeline on one 3x16x16 image:
// two 3x3 same-padded convolutions with fused bias-add/ReLU (the second also
// fuses a 2x2 max-pool), a reshape, then two fully-connected layers.
object vgg_1d extends SpatialApp {

  // override val target = targets.AWS_F1
  type T = FixPt[TRUE,_128,_128] // Signed    // TODO: If quantized, change this later
  // type T = FixPt[TRUE,_16,_16] // Signed    // TODO: If quantized, change this later

  // Per-layer parallelization factors. NOTE(review): par_L0/par_L1 are only
  // referenced from commented-out shift-register code below, and
  // par_L3_i/par_L5_i are currently unused (the inner reductions are
  // hard-coded to par 16).
  val par_L0 = 3
  val par_L1 = 3
  val par_L3_o = 1
  val par_L3_i = 1
  val par_L5_o = 1
  val par_L5_i = 1

  /** Runs the VGG-style pipeline on one input image.
    *
    * Parameter shapes below are taken from the reshape calls and the CSV
    * comments in main():
    *
    *  @param i0 input image, flattened from (3,16,16)
    *  @param c0 conv1_1 filters, flattened from (16,3,3,3)
    *  @param c1 conv1_1 biases, length 16
    *  @param c2 conv1_2 filters, flattened from (16,16,3,3)
    *  @param c3 conv1_2 biases, length 16
    *  @param c4 fc6 weights, flattened from (1024,1024)
    *  @param c5 fc6 biases, length 1024
    *  @param c6 fc8 weights, flattened from (1000,1024)
    *  @param c7 fc8 biases, length 1000
    *  @return 1008-element result (1000 logits; the extra 8 entries are
    *          padding from tmp5_DRAM and are dropped by the caller)
    */
  @virtualize
  def vgg_1d[T:Type:Num](
    i0: Array[T],
    c0: Array[T],
    c1: Array[T],
    c2: Array[T],
    c3: Array[T],
    c4: Array[T],
    c5: Array[T],
    c6: Array[T],
    c7: Array[T]
  ) : Array[T] = {

    // Off-chip buffers: weights/biases for each layer plus inter-layer
    // temporaries (tmp0 after conv1, tmp1 after conv2+pool, tmp5 final output).
    val c0_DRAM = DRAM[T](16,3,3,3)
    val i0_DRAM = DRAM[T](3,16,16)
    val c1_DRAM = DRAM[T](16)
    val tmp0_DRAM = DRAM[T](16,16,16)

    val c2_DRAM = DRAM[T](16,16,3,3)
    val c3_DRAM = DRAM[T](16)
    val tmp1_DRAM = DRAM[T](16,8,16)

    val c4_DRAM = DRAM[T](1024,1024)

    val c5_DRAM = DRAM[T](1024)
    val c6_DRAM = DRAM[T](1000,1024)

    val c7_DRAM = DRAM[T](1000)
    val tmp5_DRAM = DRAM[T](1008) // 1000 outputs rounded up to 1008 — presumably for burst alignment; TODO confirm

    setMem(c0_DRAM, c0.reshape(16,3,3,3))
    setMem(i0_DRAM, i0.reshape(3,16,16))
    setMem(c1_DRAM, c1)
    setMem(c2_DRAM, c2.reshape(16,16,3,3))
    setMem(c3_DRAM, c3)
    setMem(c4_DRAM, c4.reshape(1024,1024))
    setMem(c5_DRAM, c5)
    setMem(c6_DRAM, c6.reshape(1000,1024))
    setMem(c7_DRAM, c7)

    Accel {
//     Sequential.Foreach { // TODO: Make Pipeline/MetaPipeline to pipeline input images for inference, for now since 1 image can ignore this line

        // Conv2D
        // Layer 1: 3x3 conv, 3 in-channels -> 16 out-channels, same-padding,
        // with bias-add + ReLU fused into the epilogue below.
        val c1_SRAM = SRAM[T](16)
        c1_SRAM load c1_DRAM(0::16)
        Sequential.Foreach(16 by 1) { outD_i => // out channels
          val nr = 16
          val nc = 16
          val kr = 3
          val kc = 3
          val kr_ignore = 1
          val kc_ignore = 1 // This should be called half_pad_minus_1
          val d = 3
          val tmp0_SRAM_conv = SRAM[T](nr, nc)
          MemReduce(tmp0_SRAM_conv)(d by 1) { inD_i => // in channels
            // NOTE(review): this LineBuffer is the memory from the issue
            // report; its reads in the mux tree below trigger the
            // multi-dispatch bug (node x14503 on x14374_lb0).
            val lb0 = LineBuffer[T](kr, nc)
            val c0_RF = RegFile[T](kr, kc)
            c0_RF load c0_DRAM(outD_i, inD_i, 0::kr, 0::kc) // TODO: Can load a burst to SRAM first, since this is probably going to be many small loads
            // val sr0 = RegFile[T](kr, kc)
            val result = SRAM[T](nr, nc)
            // Iterate one extra row (kr_ignore) so the last output row sees a
            // full window; out-of-range rows re-load the last valid row.
            Foreach(0 until nr + kr_ignore) { r =>
              val row_to_load_from = min(r.to[Int], nr.to[Int]-1)
              lb0 load i0_DRAM(inD_i, row_to_load_from, 0::nc)
              Foreach(0 until nc) { c =>
                // val col_to_load_from = min(c.to[Int], nc.to[Int]-1)
                // Foreach(0 until kr par par_L0){i => sr0(i, *) <<= lb0(i, col_to_load_from) }
                // row/col start/end delimit the part of the 3x3 window that
                // falls inside the image; taps outside it are zeroed below.
                val row_start = min((kr-1).to[Index], max(0.to[Index], (kr-1-r.to[Index]   ).to[Index]) )
                val row_end   = min((kr  ).to[Index], max(1.to[Index], (kr+nr-1-r.to[Index]).to[Index]) )
                val col_start = max( 0.to[Index], kc_ignore   -c.to[Index]).to[Index]
                val col_end   = min(kc.to[Index], kc_ignore+nc-c.to[Index]).to[Index]
                // Note: Can make hardware above more efficient by calculating statically, e.g. below  is for k=5
                // val row_start = if (r == 2) (2) else if (r == 3) (1) else (0)
                // val row_end   = if (r == n + k_ignore - 1) (3) else if (r == n + k_ignore - 2) (4) else (k)
                // val col_start = if (c == 2) (2) else if (c == 3) (1) else (0)
                // val col_end   = if (c == n + k_ignore - 1) (3) else if (c == n + k_ignore - 2) (4) else (k)

                /*
                val window = Reduce(Reg[T](0.to[T]))(row_start until row_end, col_start until col_end){ (i,j) =>
                  sr0(i,kc-1-j) * c0_RF(i,j)
                }{_+_}
                if (r >= kr_ignore && c >= kc_ignore) {
                  result(r.to[Index]-kr_ignore, c.to[Index]-kc_ignore) = window.value
                }
                */

                // Could 2x unroll above, but need muxes (since bounds are not const)

                // TODO: Inline row_start and row_end etc. later and see if it improves
                // /*
                // Fully-unrolled 3x3 window: each tap is zeroed via mux when it
                // lies outside the valid (zero-padded) region; the column index
                // is clamped to [0,15] so the LineBuffer read stays in bounds.
                val prod00 = mux( (0 < row_start || 0 < col_start), 0.to[T], lb0(0, max(0.to[Index], min(15.to[Index], c.to[Index] - 1))) * c0_RF(0,0) )
                val prod01 = mux( (0 < row_start                 ), 0.to[T], lb0(0, max(0.to[Index], min(15.to[Index], c.to[Index]    ))) * c0_RF(0,1) )
                val prod02 = mux( (0 < row_start || 3 > col_end  ), 0.to[T], lb0(0, max(0.to[Index], min(15.to[Index], c.to[Index] + 1))) * c0_RF(0,2) )
                val prod03 = mux( (                 0 < col_start), 0.to[T], lb0(1, max(0.to[Index], min(15.to[Index], c.to[Index] - 1))) * c0_RF(1,0) )
                val prod04 =                                                 lb0(1, max(0.to[Index], min(15.to[Index], c.to[Index]    ))) * c0_RF(1,1)
                val prod05 = mux( (                 3 > col_end  ), 0.to[T], lb0(1, max(0.to[Index], min(15.to[Index], c.to[Index] + 1))) * c0_RF(1,2) )
                val prod06 = mux( (3 > row_end   || 0 < col_start), 0.to[T], lb0(2, max(0.to[Index], min(15.to[Index], c.to[Index] - 1))) * c0_RF(2,0) )
                val prod07 = mux( (3 > row_end                   ), 0.to[T], lb0(2, max(0.to[Index], min(15.to[Index], c.to[Index]    ))) * c0_RF(2,1) )
                val prod08 = mux( (3 > row_end   || 3 > col_end  ), 0.to[T], lb0(2, max(0.to[Index], min(15.to[Index], c.to[Index] + 1))) * c0_RF(2,2) )

                // Balanced adder tree summing the 9 window products.
                val tree_level_0_00 = prod00 + prod01
                val tree_level_0_01 = prod02 + prod03
                val tree_level_0_02 = prod04 + prod05
                val tree_level_0_03 = prod06 + prod07
                val tree_level_0_04 = prod08

                val tree_level_1_00 = tree_level_0_00 + tree_level_0_01
                val tree_level_1_01 = tree_level_0_02 + tree_level_0_03
                val tree_level_1_02 = tree_level_0_04

                val tree_level_2_00 = tree_level_1_00 + tree_level_1_01
                val tree_level_2_01 = tree_level_1_02

                val window = tree_level_2_00 + tree_level_2_01

                // Output is valid only after the line buffer has kr_ignore
                // rows of history, hence the shifted row index.
                if (r >= kr_ignore) {
                  result(r.to[Index]-kr_ignore, c) = window
                }
                // */
              }
            }
            result
          }{_+_} // Reduce across in channels

          // Fused BiasAdd
          // ...and fused ReLU (the max with 0).
          val tmp0_SRAM_bias = SRAM[T](16,16)
          Foreach(16 by 1, 16 by 1) { (i,j) =>
            tmp0_SRAM_bias(i, j) = max(0.to[T], tmp0_SRAM_conv(i,j) + c1_SRAM(outD_i))
          }
          tmp0_DRAM(outD_i, 0::16, 0::16) store tmp0_SRAM_bias
        }
        // Optimization: BiasAdd was merged into Conv2D above
        // Optimization: ReLU was merged into Conv2D above

        // Conv2D
        // Layer 2: same structure as layer 1 but with 16 in-channels, reading
        // tmp0_DRAM, and with a 2x2 max-pool fused into the epilogue.
        val c3_SRAM = SRAM[T](16)
        c3_SRAM load c3_DRAM(0::16)
        Sequential.Foreach(16 by 1) { outD_i => // out channels
          val nr = 16
          val nc = 16
          val kr = 3
          val kc = 3
          val kr_ignore = 1
          val kc_ignore = 1
          val d = 16
          val tmp1_SRAM_conv = SRAM[T](nr, nc)
          MemReduce(tmp1_SRAM_conv)(d by 1) { inD_i => // in channels
            val lb1 = LineBuffer[T](kr, nc)
            val c2_RF = RegFile[T](kr, kc)
            c2_RF load c2_DRAM(outD_i, inD_i, 0::kr, 0::kc) // TODO: Can load a burst to SRAM first, since this is probably going to be many small loads
            // val sr1 = RegFile[T](kr, kc)
            val result = SRAM[T](nr, nc)
            Foreach(0 until nr + kr_ignore) { r =>
              val row_to_load_from = min(r.to[Int], nr.to[Int]-1)
              lb1 load tmp0_DRAM(inD_i, row_to_load_from, 0::nc)
              Foreach(0 until nc) { c =>
                // val col_to_load_from = min(c.to[Int], nc.to[Int]-1)
                // Foreach(0 until kr par par_L1){i => sr1(i, *) <<= lb1(i, col_to_load_from) }
                val row_start = min((kr-1).to[Index], max(0.to[Index], (kr-1-r.to[Index]   ).to[Index]) )
                val row_end   = min((kr  ).to[Index], max(1.to[Index], (kr+nr-1-r.to[Index]).to[Index]) )
                val col_start = max( 0.to[Index], kc_ignore   -c.to[Index]).to[Index]
                val col_end   = min(kc.to[Index], kc_ignore+nc-c.to[Index]).to[Index]
                // Note: Can make hardware above more efficient by calculating statically, e.g. below  is for k=5
                // val row_start = if (r == 2) (2) else if (r == 3) (1) else (0)
                // val row_end   = if (r == n + k_ignore - 1) (3) else if (r == n + k_ignore - 2) (4) else (k)
                // val col_start = if (c == 2) (2) else if (c == 3) (1) else (0)
                // val col_end   = if (c == n + k_ignore - 1) (3) else if (c == n + k_ignore - 2) (4) else (k)

                /*
                val window = Reduce(Reg[T](0.to[T]))(row_start until row_end, col_start until col_end){ (i,j) =>
                  sr1(i,kc-1-j) * c2_RF(i,j)
                }{_+_}
                if (r >= kr_ignore && c >= kc_ignore) {
                  result(r.to[Index]-kr_ignore, c.to[Index]-kc_ignore) = window.value
                }
                */

                // /*
                // Same zero-padded unrolled 3x3 window as layer 1 (see comments
                // there).
                val prod00 = mux( (0 < row_start || 0 < col_start), 0.to[T], lb1(0, max(0.to[Index], min(15.to[Index], c.to[Index] - 1))) * c2_RF(0,0) )
                val prod01 = mux( (0 < row_start                 ), 0.to[T], lb1(0, max(0.to[Index], min(15.to[Index], c.to[Index]    ))) * c2_RF(0,1) )
                val prod02 = mux( (0 < row_start || 3 > col_end  ), 0.to[T], lb1(0, max(0.to[Index], min(15.to[Index], c.to[Index] + 1))) * c2_RF(0,2) )
                val prod03 = mux( (                 0 < col_start), 0.to[T], lb1(1, max(0.to[Index], min(15.to[Index], c.to[Index] - 1))) * c2_RF(1,0) )
                val prod04 =                                                 lb1(1, max(0.to[Index], min(15.to[Index], c.to[Index]    ))) * c2_RF(1,1)
                val prod05 = mux( (                 3 > col_end  ), 0.to[T], lb1(1, max(0.to[Index], min(15.to[Index], c.to[Index] + 1))) * c2_RF(1,2) )
                val prod06 = mux( (3 > row_end   || 0 < col_start), 0.to[T], lb1(2, max(0.to[Index], min(15.to[Index], c.to[Index] - 1))) * c2_RF(2,0) )
                val prod07 = mux( (3 > row_end                   ), 0.to[T], lb1(2, max(0.to[Index], min(15.to[Index], c.to[Index]    ))) * c2_RF(2,1) )
                val prod08 = mux( (3 > row_end   || 3 > col_end  ), 0.to[T], lb1(2, max(0.to[Index], min(15.to[Index], c.to[Index] + 1))) * c2_RF(2,2) )

                val tree_level_0_00 = prod00 + prod01
                val tree_level_0_01 = prod02 + prod03
                val tree_level_0_02 = prod04 + prod05
                val tree_level_0_03 = prod06 + prod07
                val tree_level_0_04 = prod08

                val tree_level_1_00 = tree_level_0_00 + tree_level_0_01
                val tree_level_1_01 = tree_level_0_02 + tree_level_0_03
                val tree_level_1_02 = tree_level_0_04

                val tree_level_2_00 = tree_level_1_00 + tree_level_1_01
                val tree_level_2_01 = tree_level_1_02

                val window = tree_level_2_00 + tree_level_2_01

                if (r >= kr_ignore) {
                  result(r.to[Index]-kr_ignore, c) = window
                }
                // */
              }
            }
            result
          }{_+_} // Reduce across in channels

          // Fused BiasAdd
          // ...plus fused ReLU and 2x2 max-pool: each output element is the
          // max over a 2x2 tile of ReLU(conv + bias). Only 8 of the 16 pooled
          // columns are written/stored (tmp1_DRAM row width is 16 but the
          // store covers 0::8 per row via the j loop bound).
          val tmp1_SRAM_pool = SRAM[T](8,16)
          Foreach(8 by 1, 8 by 1) { (i,j) =>
            val out = Reduce(Reg[T](0.to[T]))(2 by 1, 2 by 1) { (ii, jj) =>
              max(0.to[T], tmp1_SRAM_conv(i*2 + ii, j*2 + jj) + c3_SRAM(outD_i))
            } { (x,y) => max(x,y) }
            tmp1_SRAM_pool(i, j) = out.value
          }
          tmp1_DRAM(outD_i, 0::8, 0::16) store tmp1_SRAM_pool
        }
        // Optimization: BiasAdd was merged into Conv2D above
        // Optimization: ReLU was merged into Conv2D above
        // Optimization: MaxPool was merged into Conv2D above

        // Reshape
        // TODO: Should fuse this with next op
        // Re-lays out the pooled (channel j, row i, col k) data into the flat
        // index k*16 + i*128 + j, i.e. (row, col, channel) order — presumably
        // to match the fc6 weight ordering; TODO confirm against c4 layout.
        val tmp2_SRAM = SRAM[T](8*8*16)
        Foreach(16 by 1) { j =>
          Foreach(8 by 1) { i =>
            val row = SRAM[T](8)
            row load tmp1_DRAM(j, i, 0::8)
            Foreach(8 by 1) { k =>
              tmp2_SRAM(k*16 + i*8*16 + j) = row(k)
            }
          }
        }

        // MatMul
        // fc6: 1024x1024 dense layer, one weight row streamed per output, with
        // fused bias-add and ReLU.
        val c5_SRAM = SRAM[T](1024)
        c5_SRAM load c5_DRAM(0::1024)
        val tmp3_SRAM = SRAM[T](1024)
        Foreach(1024 by 1 par par_L3_o){out_i =>
          val c4_row_SRAM = SRAM[T](1024)
          c4_row_SRAM load c4_DRAM(out_i, 0::1024 par 16)
          val prod = Reduce(Reg[T](0.to[T]))(1024 by 1 par 16){ in_i => tmp2_SRAM(in_i) * c4_row_SRAM(in_i) }{_+_}
          tmp3_SRAM(out_i) = max(0.to[T], prod.value + c5_SRAM(out_i))
        }
        // Optimization: BiasAdd was merged into MatMul above
        // Optimization: ReLU was merged into MatMul above

        // Reshape
        // Skipping reshape since tmp4 and tmp3 already 1d

        // MatMul
        // fc8: 1000x1024 dense layer producing the final logits (no ReLU).
        val c7_SRAM = SRAM[T](1000)
        c7_SRAM load c7_DRAM(0::1000)
        val tmp5_SRAM = SRAM[T](1008)
        Foreach(1000 by 1 par par_L5_o){out_i =>
          val c6_row_SRAM = SRAM[T](1024)
          c6_row_SRAM load c6_DRAM(out_i, 0::1024 par 16)
          val prod = Reduce(Reg[T](0.to[T]))(1024 by 1 par 16){ in_i => tmp3_SRAM(in_i) * c6_row_SRAM(in_i) }{_+_}
          tmp5_SRAM(out_i) = prod.value + c7_SRAM(out_i)
        }
        // Optimization: BiasAdd was merged into MatMul above
        // Optimization: ReLU was merged into MatMul above

        // NOTE(review): entries 1000..1007 of tmp5_SRAM are never written, so
        // the tail of the stored/returned array is uninitialized padding.
        tmp5_DRAM(0::1008) store tmp5_SRAM
//    } Sequential over all images
    }

    getMem(tmp5_DRAM)
  }

  /** Host entry point: loads input image and layer weights from CSV, applies
    * input preprocessing, runs the accelerator, and compares the first 1000
    * outputs against a golden CSV result.
    */
  @virtualize
  def main() {
    val i0 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/vgg_1_in_0.csv", "\n")
    val c0 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/c0.csv", "\n") // conv1_1/conv1_1_filters
    val c1 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/c1.csv", "\n") // conv1_1/conv1_1_biases
    val c2 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/c2.csv", "\n") // conv1_2/conv1_2_filters
    val c3 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/c3.csv", "\n") // conv1_2/conv1_2_biases
    val c4 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/c4.csv", "\n") // fc6/fc6_weights
    val c5 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/c5.csv", "\n") // fc6/fc6_biases
    val c6 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/c6.csv", "\n") // fc8/fc8_weights
    val c7 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/c7.csv", "\n") // fc8/fc8_biases

    // Preprocessing: reverse the channel order and scale to [0,255], then
    // subtract per-channel offsets. The constants match the standard
    // VGG/ImageNet BGR channel means — NOTE(review): confirm the intended
    // channel convention against how the input CSV was produced.
    val i0_reshaped = i0.reshape(3,16,16)
    val i0_reconstructed = (0::3, 0::16, 0::16){(i,j,k) =>
      var x = 0.to[T]
      if (i == 0) {
        x = i0_reshaped(2,j,k)*255.0.to[T] - 103.939.to[T]
      }
      if (i == 1) {
        x = i0_reshaped(1,j,k)*255.0.to[T] - 116.779.to[T]
      }
      if (i == 2) {
        x = i0_reshaped(0,j,k)*255.0.to[T] - 123.68.to[T]
      }
      x
    };
    // Flatten (3,16,16) back to a 1-D array in (channel, row, col) order.
    val i0_reconstructed_linear = Array.tabulate(768){i => i0_reconstructed(i/(16*16),(i%(16*16))/(16),i%16)};
    val output = vgg_1d(i0_reconstructed_linear, c0, c1, c2, c3, c4, c5, c6, c7)
    // Drop the 8 padding entries appended by the accelerator (see tmp5_DRAM).
    val output_no_extra = Array.tabulate(1000){i => output(i)}
    printArray(output_no_extra, "output")
    val gold = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/vgg_1_check_out_0.csv", "\n")
    printArray(gold, "gold")
    // val margin = 0.0001.to[T] // Within 0.01% confidence
    val margin = 0.000000001.to[T]
    val cksum = gold.zip(output_no_extra){(a,b) => abs(a-b) < margin}.reduce{_&&_}
    println("PASS: " + cksum)
  }
}
dkoeplin commented 7 years ago

Partially fixed on develop — the issue was that the compiler didn't know a LineBuffer is always banked by rows, so it was trying to duplicate it.

However, this is an obvious case where we should be coalescing reads onto a single banked memory, but instead are creating duplicates (in this case, of the LineBuffer).

shadjis commented 7 years ago

FYI on latest develop this gives the following error still in --synth (not --sim):

[bug] An exception was encountered while compiling:
[bug]   This is an example where lb dispatch > 1. Please use as test case! (node x15945 on lb x15813)
java.lang.Exception: This is an example where lb dispatch > 1. Please use as test case! (node x15945 on lb x15813)
    at spatial.codegen.chiselgen.ChiselGenUnrolled.emitNode(ChiselGenUnrolled.scala:404)
    at spatial.codegen.chiselgen.ChiselGenUnrolled.emitNode$(ChiselGenUnrolled.scala:51)
    at spatial.SpatialCompiler$$anon$3.spatial$codegen$chiselgen$ChiselGenVector$$super$emitNode(Spatial.scala:107)
    at spatial.codegen.chiselgen.ChiselGenVector.emitNode(ChiselGenVector.scala:56)
    at spatial.codegen.chiselgen.ChiselGenVector.emitNode$(ChiselGenVector.scala:33)
    at spatial.SpatialCompiler$$anon$3.argon$codegen$chiselgen$ChiselGenArray$$super$emitNode(Spatial.scala:107)
    at argon.codegen.chiselgen.ChiselGenArray.emitNode(ChiselGenArray.scala:26)
    at argon.codegen.chiselgen.ChiselGenArray.emitNode$(ChiselGenArray.scala:18)
    at spatial.SpatialCompiler$$anon$3.spatial$codegen$chiselgen$ChiselGenAlteraVideo$$super$emitNode(Spatial.scala:107)