halide / Halide

a language for fast, portable data-parallel computation
https://halide-lang.org
Other
5.9k stars 1.07k forks source link

[Hexagon] Issue on gather generation: SDK 4.3.0 #6273

Closed AbyShk95 closed 3 years ago

AbyShk95 commented 3 years ago

I am trying a Nearest neighbor image resize algorithm using Halide. Following is the setup details:

Issue:

I would really appreciate it if someone could give some suggestions on this. @aankit-ca @pranavb-ca

aankit-ca commented 3 years ago

If you see a gather in the lowered IR with HL_DEBUG_CODEGEN=1, then you should be able to see vgather instructions in the assembly.

The reason you are not able to see gathers with sX, sY passed as parameters is that your expressions (x * sX) >> 10 and (y * sY) >> 10 could be bigger than the range of i16. You can simply use clamp (or min) to bound the maximum value of these expressions to get gathers here:

    Expr x_coord = min((x * sX) >> 10, 4000);
    Expr y_coord = min((y * sY) >> 10, 4000);

Replace 4000 with a suitable value for your requirement.

AbyShk95 commented 3 years ago

Thank you for your response @aankit-ca.

So for the case of

out(x,y) = in(x_coord, y_coord)

to use gather, x_coord and y_coord cannot exceed the range of int16_t?

Further, as per your suggestion, I tried the following:

Expr x_coord = clamp(cast<int16_t>((x * sX) >> 10), 0, 4000); // even bigger than 4000 to make sure that is not an issue.
Expr y_coord = clamp(cast<int16_t>((y * sY) >> 10), 0, 4000);

I did see gather being used. But, I get an incorrect result. Does the term (x * sX) also have to be within the int16_t range based on the registers being used?

What worked well for me however is:

Expr x_coord = clamp(cast<int32_t>((x * sX) >> 10), 0, 4000); 
Expr y_coord = clamp(cast<int32_t>((y * sY) >> 10), 0, 4000);

Can you please explain this a bit?

aankit-ca commented 3 years ago

The x_coord and y_coord can exceed the i16 range, but the range of indices to be gathered needs to be within the i16 range. (https://github.com/halide/Halide/blob/430764571012d6a06de4433d2b30ed7e46073a71/src/HexagonOptimize.cpp#L2266). So you should be able to compute in_vtcm and out_vtcm in tiles of fixed sizes even if x_coord and y_coord are very big.

You need not cast your coordinates to i16. But I think the first one should have worked as well. Can you post your output of HL_DEBUG_CODEGEN=1?

AbyShk95 commented 3 years ago

Ok. I understood your point. Thank you for the explanation.

Got it. I was able to work without cast too. For first one:

Expr x_coord = clamp(cast<int16_t>((x * sX) >> 10), 0, 4000);
Expr y_coord = clamp(cast<int16_t>((y * sY) >> 10), 0, 4000);

My output for HL_DEBUG_CODEGEN=1 had gather, but got the incorrect result. Following is the output dump:

Creating initial loop nests...
Injecting realization of { out }
Injecting realization of { f1 }
Injecting realization of { f0 }
Inlining in_im
Skipping injecting memoization...
Injecting tracing...
Adding checks for parameters
Computing bounds of each function's value
Adding checks for images
Performing computation bounds inference...
Removing extern loops...
Performing sliding window optimization...
Simplifying correlated differences...
Performing allocation bounds inference...
Removing code that depends on undef values...
Uniquifying variable names...
Simplifying...
Performing storage folding optimization...
Injecting debug_to_file calls...
Injecting prefetches...
Discarding safe promises...
Dynamically skipping stages...
Forking asynchronous producers...
Destructuring tuple-valued realizations...
Performing storage flattening...
Adding atomic mutex allocation...
Unpacking buffer arguments...
Skipping rewriting memoized allocations...
Simplifying...
Reduce prefetch dimension...
Simplifying correlated differences...
Unrolling...
Vectorizing...
Detecting vector interleavings...
Partitioning loops to simplify boundary conditions...
Trimming loops to the region over which they do something...
Injecting early frees...
Simplifying correlated differences...
Bounding small allocations...
Simplifying...
Lowering unsafe promises...
Lowering after final simplification:
assert(((uint64)reinterpret(((halide_buffer_t *))out.buffer) != (uint64)0), halide_error_buffer_argument_is_null("out"))
assert(((uint64)reinterpret(((halide_buffer_t *))in.buffer) != (uint64)0), halide_error_buffer_argument_is_null("in"))
let in = ((void *))_halide_buffer_get_host(((halide_buffer_t *))in.buffer)
let in.type = (uint32)_halide_buffer_get_type(((halide_buffer_t *))in.buffer)
let in.device_dirty = (uint1)_halide_buffer_get_device_dirty(((halide_buffer_t *))in.buffer)
let in.dimensions = _halide_buffer_get_dimensions(((halide_buffer_t *))in.buffer)
let in.min.0 = _halide_buffer_get_min(((halide_buffer_t *))in.buffer, 0)
let in.extent.0 = _halide_buffer_get_extent(((halide_buffer_t *))in.buffer, 0)
let in.stride.0 = _halide_buffer_get_stride(((halide_buffer_t *))in.buffer, 0)
let in.min.1 = _halide_buffer_get_min(((halide_buffer_t *))in.buffer, 1)
let in.extent.1 = _halide_buffer_get_extent(((halide_buffer_t *))in.buffer, 1)
let in.stride.1 = _halide_buffer_get_stride(((halide_buffer_t *))in.buffer, 1)
let out = ((void *))_halide_buffer_get_host(((halide_buffer_t *))out.buffer)
let out.type = (uint32)_halide_buffer_get_type(((halide_buffer_t *))out.buffer)
let out.device_dirty = (uint1)_halide_buffer_get_device_dirty(((halide_buffer_t *))out.buffer)
let out.dimensions = _halide_buffer_get_dimensions(((halide_buffer_t *))out.buffer)
let out.min.0 = _halide_buffer_get_min(((halide_buffer_t *))out.buffer, 0)
let out.extent.0 = _halide_buffer_get_extent(((halide_buffer_t *))out.buffer, 0)
let out.stride.0 = _halide_buffer_get_stride(((halide_buffer_t *))out.buffer, 0)
let out.min.1 = _halide_buffer_get_min(((halide_buffer_t *))out.buffer, 1)
let out.extent.1 = _halide_buffer_get_extent(((halide_buffer_t *))out.buffer, 1)
let out.stride.1 = _halide_buffer_get_stride(((halide_buffer_t *))out.buffer, 1)
assert((!(uint1)_halide_buffer_is_bounds_query(((halide_buffer_t *))out.buffer) || (128 <= out.extent.0)), halide_error_constraints_make_required_region_smaller("Output buffer out", 0, 0, 127, (out.extent.0 + -128), (out.extent.0 + -1)))
assert((!(uint1)_halide_buffer_is_bounds_query(((halide_buffer_t *))out.buffer) || (32 <= out.extent.1)), halide_error_constraints_make_required_region_smaller("Output buffer out", 1, 0, 31, (out.extent.1 + -32), (out.extent.1 + -1)))
if ((uint1)_halide_buffer_is_bounds_query(((halide_buffer_t *))in.buffer)) {
 ((halide_buffer_t *))_halide_buffer_init(((halide_buffer_t *))in.buffer, ((halide_dimension_t *))_halide_buffer_get_shape(((halide_buffer_t *))in.buffer), ((void *))reinterpret((uint64)0), (uint64)0, ((halide_device_interface_t *))reinterpret((uint64)0), 1, 8, 2, ((halide_dimension_t *))make_struct(0, 4096, 1, 0, 0, 4001, 4096, 0), (uint64)0)
}
if ((uint1)_halide_buffer_is_bounds_query(((halide_buffer_t *))out.buffer)) {
 ((halide_buffer_t *))_halide_buffer_init(((halide_buffer_t *))out.buffer, ((halide_dimension_t *))_halide_buffer_get_shape(((halide_buffer_t *))out.buffer), ((void *))reinterpret((uint64)0), (uint64)0, ((halide_device_interface_t *))reinterpret((uint64)0), 1, 8, 2, ((halide_dimension_t *))make_struct(0, max(out.extent.0, 128), 1, 0, 0, max(out.extent.1, 32), max(out.extent.0, 128), 0), (uint64)0)
}
if (!((uint1)_halide_buffer_is_bounds_query(((halide_buffer_t *))in.buffer) || (uint1)_halide_buffer_is_bounds_query(((halide_buffer_t *))out.buffer))) {
 assert((in.type == (uint32)67585), halide_error_bad_type("Input buffer in", in.type, (uint32)67585))
 assert((in.dimensions == 2), halide_error_bad_dimensions("Input buffer in", in.dimensions, 2))
 assert((out.type == (uint32)67585), halide_error_bad_type("Output buffer out", out.type, (uint32)67585))
 assert((out.dimensions == 2), halide_error_bad_dimensions("Output buffer out", out.dimensions, 2))
 assert(((in.min.0 <= 0) && (4096 <= (in.extent.0 + in.min.0))), halide_error_access_out_of_bounds("Input buffer in", 0, 0, 4095, in.min.0, ((in.extent.0 + in.min.0) + -1)))
 assert((0 <= in.extent.0), halide_error_buffer_extents_negative("Input buffer in", 0, in.extent.0))
 assert(((in.min.1 <= 0) && (4001 <= (in.extent.1 + in.min.1))), halide_error_access_out_of_bounds("Input buffer in", 1, 0, 4000, in.min.1, ((in.extent.1 + in.min.1) + -1)))
 assert((0 <= in.extent.1), halide_error_buffer_extents_negative("Input buffer in", 1, in.extent.1))
 assert((((out.min.0 + 128) <= min(out.extent.0, 128)) && (0 <= out.min.0)), halide_error_access_out_of_bounds("Output buffer out", 0, (min(out.extent.0, 128) + -128), (out.extent.0 + -1), out.min.0, ((out.extent.0 + out.min.0) + -1)))
 assert((0 <= out.extent.0), halide_error_buffer_extents_negative("Output buffer out", 0, out.extent.0))
 assert((((out.min.1 + 32) <= min(out.extent.1, 32)) && (0 <= out.min.1)), halide_error_access_out_of_bounds("Output buffer out", 1, (min(out.extent.1, 32) + -32), (out.extent.1 + -1), out.min.1, ((out.extent.1 + out.min.1) + -1)))
 assert((0 <= out.extent.1), halide_error_buffer_extents_negative("Output buffer out", 1, out.extent.1))
 assert((in.stride.0 == 1), halide_error_constraint_violated("in.stride.0", in.stride.0, "1", 1))
 assert((in.min.0 == 0), halide_error_constraint_violated("in.min.0", in.min.0, "0", 0))
 assert((in.min.1 == 0), halide_error_constraint_violated("in.min.1", in.min.1, "0", 0))
 assert((out.stride.0 == 1), halide_error_constraint_violated("out.stride.0", out.stride.0, "1", 1))
 assert((out.min.0 == 0), halide_error_constraint_violated("out.min.0", out.min.0, "0", 0))
 assert((out.min.1 == 0), halide_error_constraint_violated("out.min.1", out.min.1, "0", 0))
 let in.total_extent.1 = (int64(in.extent.1)*int64(in.extent.0))
 let out.total_extent.1 = (int64(out.extent.1)*int64(out.extent.0))
 assert(((uint64)abs(int64(in.extent.0)) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("in", (uint64)abs(int64(in.extent.0)), (uint64)2147483647))
 assert(((uint64)abs((int64(in.extent.1)*int64(in.stride.1))) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("in", (uint64)abs((int64(in.extent.1)*int64(in.stride.1))), (uint64)2147483647))
 assert((in.total_extent.1 <= (int64)2147483647), halide_error_buffer_extents_too_large("in", in.total_extent.1, (int64)2147483647))
 assert(((uint64)abs(int64(out.extent.0)) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("out", (uint64)abs(int64(out.extent.0)), (uint64)2147483647))
 assert(((uint64)abs((int64(out.extent.1)*int64(out.stride.1))) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("out", (uint64)abs((int64(out.extent.1)*int64(out.stride.1))), (uint64)2147483647))
 assert((out.total_extent.1 <= (int64)2147483647), halide_error_buffer_extents_too_large("out", out.total_extent.1, (int64)2147483647))
 assert(!in.device_dirty, halide_error_device_dirty_with_no_device_support("Input buffer in"))
 assert(!out.device_dirty, halide_error_device_dirty_with_no_device_support("Output buffer out"))
 assert((((uint64)reinterpret(in) % (uint64)128) == (uint64)0), halide_error_unaligned_host_ptr("in", 128))
 assert((((uint64)reinterpret(out) % (uint64)128) == (uint64)0), halide_error_unaligned_host_ptr("out", 128))
 assert((in != ((void *))reinterpret((uint64)0)), halide_error_host_is_null("Input buffer in"))
 assert((out != ((void *))reinterpret((uint64)0)), halide_error_host_is_null("Output buffer out"))
 produce out {
  let t18 = max(out.extent.0, 128)
  let t16 = (0 < (out.extent.0 % 128))
  let t14 = min(out.extent.0, 128)
  let t13 = ((t18 + 127)/128)
  let t10 = ((out.extent.1 + 31)/32)
  let t15 = (out.extent.0/128)
  let t12 = ((((t18 + -1)/128)*128) + 128)
  let t11 = (out.extent.1 + -32)
  parallel (out.s0.y.y, 0, t10) {
   let out.s0.y.v1.base = min((out.s0.y.y*32), t11)
   allocate f0[uint8 * 4096 * 4001] in VTCM
   produce f0 {
    for (f0.s0.y, 0, 4001) {
     let t19 = (f0.s0.y*in.stride.1)
     let t20 = (f0.s0.y*32)
     for (f0.s0.x.x, 0, 32) {
      f0[ramp(((f0.s0.x.x + t20)*128), 1, 128) aligned(128, 0)] = in[ramp(((f0.s0.x.x*128) + t19), 1, 128)]
     }
    }
   }
   allocate f1[uint8 * t12 * 32] in VTCM
   produce f1 {
    consume f0 {
     for (f1.s0.y, out.s0.y.v1.base, 32) {
      let t22 = ((f1.s0.y - out.s0.y.v1.base)*t12)
      let t21 = (int32(max(min(int16(((f1.s0.y*sY)/1024)), (int16)4000), (int16)0))*4096)
      for (f1.s0.x.x, 0, t13) {
       let f1.s0.x.v3.base.s = ((f1.s0.x.x*128) + t14)
       f1[ramp(((f1.s0.x.v3.base.s - t14) + t22), 1, 128)] = f0[(int32x128(max(min(int16x128((ramp(((f1.s0.x.v3.base.s + -128)*sX), sX, 128)/x128(1024))), x128((int16)4000)), x128((int16)0))) + x128(t21))]
      }
     }
    }
   }
   free f0
   consume f1 {
    let t23 = ((((t18 + -1)/128)*128) + 128)
    for (out.s0.y.v1, 0, 32) {
     let t25 = ((out.s0.y.v1 + out.s0.y.v1.base)*out.stride.1)
     let t24 = (out.s0.y.v1*t12)
     for (out.s0.x.x, 0, t15) {
      out[ramp(((out.s0.x.x*128) + t25), 1, 128)] = f1[ramp(((((out.s0.x.x*128) - t14) + t24) + 128), 1, 128)]
     }
     if (t16) {
      out[ramp(((((out.s0.y.v1 + out.s0.y.v1.base)*out.stride.1) + out.extent.0) + -128), 1, 128)] = f1[ramp((((out.s0.y.v1*t23) + t18) + -128), 1, 128)]
     }
    }
   }
   free f1
  }
 }
}

Skipping Hexagon offload...
Target triple of initial module: hexagon-unknown--elf
mattrs: +hvx-length128b,+long-calls,+hvxv68,+hvx-qfloat
Generating llvm bitcode...
Generating llvm bitcode prolog for function resizeHalide...
Unpredicating loads and stores...
Looking for vscatter-vgather...
Optimizing shuffles...
Generating vtmpy/vrmpy...
Aligning loads for HVX....
Carrying values across loop iterations...
Eliminating boolean vectors from Hexagon code...
Optimizing Hexagon instructions...
Adding calls to qurt_hvx_lock, if necessary...
Hexagon function body:
assert(((uint64)reinterpret(((halide_buffer_t *))out.buffer) != (uint64)0), halide_error_buffer_argument_is_null("out"))
assert(((uint64)reinterpret(((halide_buffer_t *))in.buffer) != (uint64)0), halide_error_buffer_argument_is_null("in"))
let in = ((void *))_halide_buffer_get_host(((halide_buffer_t *))in.buffer)
let in.type = (uint32)_halide_buffer_get_type(((halide_buffer_t *))in.buffer)
let in.device_dirty = (uint1)_halide_buffer_get_device_dirty(((halide_buffer_t *))in.buffer)
let in.dimensions = _halide_buffer_get_dimensions(((halide_buffer_t *))in.buffer)
let in.min.0 = _halide_buffer_get_min(((halide_buffer_t *))in.buffer, 0)
let in.extent.0 = _halide_buffer_get_extent(((halide_buffer_t *))in.buffer, 0)
let in.stride.0 = _halide_buffer_get_stride(((halide_buffer_t *))in.buffer, 0)
let in.min.1 = _halide_buffer_get_min(((halide_buffer_t *))in.buffer, 1)
let in.extent.1 = _halide_buffer_get_extent(((halide_buffer_t *))in.buffer, 1)
let in.stride.1 = _halide_buffer_get_stride(((halide_buffer_t *))in.buffer, 1)
let out = ((void *))_halide_buffer_get_host(((halide_buffer_t *))out.buffer)
let out.type = (uint32)_halide_buffer_get_type(((halide_buffer_t *))out.buffer)
let out.device_dirty = (uint1)_halide_buffer_get_device_dirty(((halide_buffer_t *))out.buffer)
let out.dimensions = _halide_buffer_get_dimensions(((halide_buffer_t *))out.buffer)
let out.min.0 = _halide_buffer_get_min(((halide_buffer_t *))out.buffer, 0)
let out.extent.0 = _halide_buffer_get_extent(((halide_buffer_t *))out.buffer, 0)
let out.stride.0 = _halide_buffer_get_stride(((halide_buffer_t *))out.buffer, 0)
let out.min.1 = _halide_buffer_get_min(((halide_buffer_t *))out.buffer, 1)
let out.extent.1 = _halide_buffer_get_extent(((halide_buffer_t *))out.buffer, 1)
let out.stride.1 = _halide_buffer_get_stride(((halide_buffer_t *))out.buffer, 1)
assert((!(uint1)_halide_buffer_is_bounds_query(((halide_buffer_t *))out.buffer) || (128 <= out.extent.0)), halide_error_constraints_make_required_region_smaller("Output buffer out", 0, 0, 127, (out.extent.0 + -128), (out.extent.0 + -1)))
assert((!(uint1)_halide_buffer_is_bounds_query(((halide_buffer_t *))out.buffer) || (32 <= out.extent.1)), halide_error_constraints_make_required_region_smaller("Output buffer out", 1, 0, 31, (out.extent.1 + -32), (out.extent.1 + -1)))
if ((uint1)_halide_buffer_is_bounds_query(((halide_buffer_t *))in.buffer)) {
 ((halide_buffer_t *))_halide_buffer_init(((halide_buffer_t *))in.buffer, ((halide_dimension_t *))_halide_buffer_get_shape(((halide_buffer_t *))in.buffer), ((void *))reinterpret((uint64)0), (uint64)0, ((halide_device_interface_t *))reinterpret((uint64)0), 1, 8, 2, ((halide_dimension_t *))make_struct(0, 4096, 1, 0, 0, 4001, 4096, 0), (uint64)0)
}
if ((uint1)_halide_buffer_is_bounds_query(((halide_buffer_t *))out.buffer)) {
 ((halide_buffer_t *))_halide_buffer_init(((halide_buffer_t *))out.buffer, ((halide_dimension_t *))_halide_buffer_get_shape(((halide_buffer_t *))out.buffer), ((void *))reinterpret((uint64)0), (uint64)0, ((halide_device_interface_t *))reinterpret((uint64)0), 1, 8, 2, ((halide_dimension_t *))make_struct(0, max(out.extent.0, 128), 1, 0, 0, max(out.extent.1, 32), max(out.extent.0, 128), 0), (uint64)0)
}
if (!((uint1)_halide_buffer_is_bounds_query(((halide_buffer_t *))in.buffer) || (uint1)_halide_buffer_is_bounds_query(((halide_buffer_t *))out.buffer))) {
 assert((in.type == (uint32)67585), halide_error_bad_type("Input buffer in", in.type, (uint32)67585))
 assert((in.dimensions == 2), halide_error_bad_dimensions("Input buffer in", in.dimensions, 2))
 assert((out.type == (uint32)67585), halide_error_bad_type("Output buffer out", out.type, (uint32)67585))
 assert((out.dimensions == 2), halide_error_bad_dimensions("Output buffer out", out.dimensions, 2))
 assert(((in.min.0 <= 0) && (4096 <= (in.extent.0 + in.min.0))), halide_error_access_out_of_bounds("Input buffer in", 0, 0, 4095, in.min.0, ((in.extent.0 + in.min.0) + -1)))
 assert((0 <= in.extent.0), halide_error_buffer_extents_negative("Input buffer in", 0, in.extent.0))
 assert(((in.min.1 <= 0) && (4001 <= (in.extent.1 + in.min.1))), halide_error_access_out_of_bounds("Input buffer in", 1, 0, 4000, in.min.1, ((in.extent.1 + in.min.1) + -1)))
 assert((0 <= in.extent.1), halide_error_buffer_extents_negative("Input buffer in", 1, in.extent.1))
 assert((((out.min.0 + 128) <= min(out.extent.0, 128)) && (0 <= out.min.0)), halide_error_access_out_of_bounds("Output buffer out", 0, (min(out.extent.0, 128) + -128), (out.extent.0 + -1), out.min.0, ((out.extent.0 + out.min.0) + -1)))
 assert((0 <= out.extent.0), halide_error_buffer_extents_negative("Output buffer out", 0, out.extent.0))
 assert((((out.min.1 + 32) <= min(out.extent.1, 32)) && (0 <= out.min.1)), halide_error_access_out_of_bounds("Output buffer out", 1, (min(out.extent.1, 32) + -32), (out.extent.1 + -1), out.min.1, ((out.extent.1 + out.min.1) + -1)))
 assert((0 <= out.extent.1), halide_error_buffer_extents_negative("Output buffer out", 1, out.extent.1))
 assert((in.stride.0 == 1), halide_error_constraint_violated("in.stride.0", in.stride.0, "1", 1))
 assert((in.min.0 == 0), halide_error_constraint_violated("in.min.0", in.min.0, "0", 0))
 assert((in.min.1 == 0), halide_error_constraint_violated("in.min.1", in.min.1, "0", 0))
 assert((out.stride.0 == 1), halide_error_constraint_violated("out.stride.0", out.stride.0, "1", 1))
 assert((out.min.0 == 0), halide_error_constraint_violated("out.min.0", out.min.0, "0", 0))
 assert((out.min.1 == 0), halide_error_constraint_violated("out.min.1", out.min.1, "0", 0))
 let in.total_extent.1 = (int64(in.extent.1)*int64(in.extent.0))
 let out.total_extent.1 = (int64(out.extent.1)*int64(out.extent.0))
 assert(((uint64)abs(int64(in.extent.0)) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("in", (uint64)abs(int64(in.extent.0)), (uint64)2147483647))
 assert(((uint64)abs((int64(in.extent.1)*int64(in.stride.1))) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("in", (uint64)abs((int64(in.extent.1)*int64(in.stride.1))), (uint64)2147483647))
 assert((in.total_extent.1 <= (int64)2147483647), halide_error_buffer_extents_too_large("in", in.total_extent.1, (int64)2147483647))
 assert(((uint64)abs(int64(out.extent.0)) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("out", (uint64)abs(int64(out.extent.0)), (uint64)2147483647))
 assert(((uint64)abs((int64(out.extent.1)*int64(out.stride.1))) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("out", (uint64)abs((int64(out.extent.1)*int64(out.stride.1))), (uint64)2147483647))
 assert((out.total_extent.1 <= (int64)2147483647), halide_error_buffer_extents_too_large("out", out.total_extent.1, (int64)2147483647))
 assert(!in.device_dirty, halide_error_device_dirty_with_no_device_support("Input buffer in"))
 assert(!out.device_dirty, halide_error_device_dirty_with_no_device_support("Output buffer out"))
 assert((((uint64)reinterpret(in) % (uint64)128) == (uint64)0), halide_error_unaligned_host_ptr("in", 128))
 assert((((uint64)reinterpret(out) % (uint64)128) == (uint64)0), halide_error_unaligned_host_ptr("out", 128))
 assert((in != ((void *))reinterpret((uint64)0)), halide_error_host_is_null("Input buffer in"))
 assert((out != ((void *))reinterpret((uint64)0)), halide_error_host_is_null("Output buffer out"))
 produce out {
  let t16 = (0 < (out.extent.0 % 128))
  parallel (out.s0.y.y, 0, ((out.extent.1 + 31)/32)) {
   let hvx_lock_result = halide_qurt_hvx_lock(128)
   assert((hvx_lock_result == 0), hvx_lock_result)
   ((void *))register_destructor("halide_qurt_hvx_unlock_as_destructor", ((void *))reinterpret((uint64)1))
   let out.s0.y.v1.base = min((out.s0.y.y*32), (out.extent.1 + -32))
   allocate f0[uint8 * 4096 * 4001] in VTCM
   produce f0 {
    for (f0.s0.y, 0, 4001) {
     for (f0.s0.x.x, 0, 32) {
      f0[ramp((((f0.s0.y*32) + f0.s0.x.x)*128), 1, 128) aligned(128, 0)] = in[ramp(((f0.s0.x.x*128) + (f0.s0.y*in.stride.1)), 1, 128)]
     }
    }
   }
   allocate f1[uint8 * ((((max(out.extent.0, 128) + -1)/128)*128) + 128) * 32] in VTCM
   produce f1 {
    consume f0 {
     for (f1.s0.y, out.s0.y.v1.base, 32) {
      let t22 = ((f1.s0.y - out.s0.y.v1.base)*((((max(out.extent.0, 128) + -1)/128)*128) + 128))
      let t21.s = int16(((f1.s0.y*sY)/1024))
      for (f1.s0.x.x, 0, ((max(out.extent.0, 128) + 127)/128)) {
       let f1.s0.x.v3.base.s = (min(out.extent.0, 128) + (f1.s0.x.x*128))
       (uint8x128)gather(f1, ((f1.s0.x.v3.base.s - min(out.extent.0, 128)) + t22), f0, 16388097, (int16x128)halide.hexagon.deinterleave.vh((int16x128)halide.hexagon.trunc.vw(((int32x128)halide.hexagon.sxt.vh(max(min((int16x128)halide.hexagon.trunc_shr.vw.uw((int32x128)halide.hexagon.deinterleave.vw(ramp(((f1.s0.x.v3.base.s + -128)*sX), sX, 128)), 10), x128((int16)4000)), x128((int16)0))) + x128((int32(max(min(t21.s, (int16)4000), (int16)0))*4096))))))
      }
     }
    }
   }
   free f0
   consume f1 {
    scatter_release(f1)
    for (out.s0.y.v1, 0, 32) {
     for (out.s0.x.x, 0, (out.extent.0/128)) {
      out[ramp(((out.s0.x.x*128) + ((out.s0.y.v1 + out.s0.y.v1.base)*out.stride.1)), 1, 128)] = f1[ramp((((((((max(out.extent.0, 128) + -1)/128)*128) + 128)*out.s0.y.v1) + ((out.s0.x.x*128) - min(out.extent.0, 128))) + 128), 1, 128)]
     }
     if (t16) {
      out[ramp(((((out.s0.y.v1 + out.s0.y.v1.base)*out.stride.1) + out.extent.0) + -128), 1, 128)] = f1[ramp(((max(out.extent.0, 128) + (((((max(out.extent.0, 128) + -1)/128)*128) + 128)*out.s0.y.v1)) + -128), 1, 128)]
     }
    }
   }
   free f1
  }
 }
}

Using single_page_flag=1 for f0
Using single_page_flag=0 for f1
Module.compile(): object dsp/generated//resizeHalide.generator.o
emit_file.Compiling to native code...
Module.compile(): c_header dsp/generated//resizeHalide.generator.h
Module.compile(): c_source dsp/generated//resizeHalide.generator.halide_generated.cpp
Module.compile(): schedule dsp/generated//resizeHalide.generator.schedule.h