ledatelescope / bifrost

A stream processing framework for high-throughput applications.
BSD 3-Clause "New" or "Revised" License
64 stars 29 forks source link

transpose edge case? #143

Closed jaycedowell closed 2 years ago

jaycedowell commented 4 years ago

While working on Python3 support, @realtimeradio ran into a problem with a test script:

import bifrost as bf
import bifrost.blocks as blocks
import bifrost.views as views

raw_data = blocks.read_wav([FILE], gulp_nframe=4096)
gpu_raw_data = blocks.copy(raw_data, space='cuda')
chunked_data = views.split_axis(gpu_raw_data, 'time', 256, label='fine_time')
fft_output = blocks.fft(chunked_data, axes='fine_time', axis_labels='freq')
squared = blocks.detect(fft_output, mode='scalar')
transposed = blocks.transpose(squared, ['time', 'pol', 'freq'])
host_transposed = blocks.copy(transposed, space='cuda_host')
quantized = bf.blocks.quantize(host_transposed, 'i8')
blocks.write_sigproc(quantized)

pipeline = bf.get_default_pipeline()
pipeline.shutdown_on_signals()
pipeline.run()

which was throwing a map.cpp:486 error 10: BF_STATUS_INVALID_ARGUMENT with:

  1 #include "Complex.hpp"
  2 #include "Vector.hpp"
  3 #include "ArrayIndexer.cuh"
  4 #include "ShapeIndexer.cuh"
  5 extern "C"
  6 __global__
  7 void transpose_vector_read_map_kernel(typename StorageType<4>::type const* in_ptr,
  8 typename StorageType<4>::type* out_ptr) {
  9   enum { NDIM = 3 };
 10   typedef StaticIndexArray<int,16,1,1> _Shape;
 11   typedef StaticShapeIndexer<_Shape> _ShapeIndexer;
 12   typedef StaticIndexArray<int,16,129> _Shape_in;
 13   typedef StaticIndexArray<int,516,4> _Strides_in;
 14   typedef StaticArrayIndexer<typename StorageType<4>::type const,_Shape_in,_Strides_in> _ArrayIndexer_in;
 15   typedef StaticIndexArray<int,16,1,129> _Shape_out;
 16   typedef StaticIndexArray<int,516,516,4> _Strides_out;
 17   typedef StaticArrayIndexer<typename StorageType<4>::type,_Shape_out,_Strides_out> _ArrayIndexer_out;
 18   const int _shape[NDIM] = {16, 1, 129}; (void)_shape[0];
 19   int _x0 = threadIdx.x + blockIdx.x*blockDim.x;
 20   int _z0 = blockIdx.z;
 21   for( int _z=_z0; _z<_ShapeIndexer::SIZE; _z+=gridDim.z ) {
 22   for( int _x=_x0; _x<129; _x+=blockDim.x*gridDim.x ) {
 23     auto _composite_index  = _ShapeIndexer::lift(_z);
 24     _composite_index[2] = _x;
 25     auto const& _  = _composite_index;
 26     _ArrayIndexer_in in(in_ptr, _);
 27     typedef typename StorageType<4>::type in_type;
 28     _ArrayIndexer_out out(out_ptr, _);
 29     typedef typename StorageType<4>::type out_type;
 30     auto i0 = _[0];
 31     auto i1 = _[1];
 32     auto i2 = _[2];
 33     enum { K = 1 };
 34 in_type ivals = in(i0, i2);
 35 #pragma unroll
 36 for( int k=0; k<K; ++k ) {
 37     out(i0, k, i2) = ivals[k];
 38 }
 39 ;
 40   }
 41   }
 42 }
---------------------------------------------------
--- JIT compile log for program transpose_vector_read ---
---------------------------------------------------
transpose_vector_read(37): error: expression must have pointer-to-object type

1 error detected in the compilation of "transpose_vector_read".

---------------------------------------------------

I think this is a bug in the transpose operation, but I wanted to check whether it could be a usage problem. Assuming it is a bug, I think the easiest thing to do would be to modify this if...else..else block in transpose.cu to explicitly exclude situations where the fastest axis size is one.

jaycedowell commented 2 years ago

I tried this today with autoconf and everything looks fine: I didn't end up with a BF_STATUS_INVALID_ARGUMENT error. I'll go ahead and close this.