doe300 / VC4C

Compiler for the VC4CL OpenCL implementation
MIT License
118 stars 37 forks source link

Convert multiple additions to one multiplication #69

Closed nomaddo closed 5 years ago

nomaddo commented 6 years ago

In the following code, the repeated additions could be combined into a single multiplication instruction.

// Reproducer kernel: every work-item reads a[0], accumulates (10.0 + c) ten
// times into sum, and stores sum at its own global index in a.
// The parameter n is not referenced anywhere in the body.
void kernel test(float global * a, const int n){
  float sum = 0.0;
  float c = a[0];
  // Neither 10.0 nor c changes inside the loop, so each of the ten iterations
  // adds the same value to sum.
  for (int i = 0; i < 10; i++)
    sum += 10.0 + c;
  a[get_global_id(0)] = sum;
}
// Module with 1 kernels, global data with 0 words (64-bit each), starting at offset 1 words and 0 words of stack-frame
// Kernel 'test' with 47 instructions, offset 2, with following parameters: __global out float* a (4 B, 1 items), int n (4 B, 1 items) (lSize, lids, gidX, offX)
// label: %start_of_function
// Six uniform reads: five are kept in r3, ra3, r1, r2 and ra1, the sixth is discarded.
// NOTE(review): presumably these are the implicit values (lSize, lids, gidX, offX) followed by
// the kernel arguments a and n, in that order - confirm against the VC4C uniform layout.
or r3, unif, unif
or ra3, unif, unif
or r1, unif, unif
or r2, unif, unif
or ra1, unif, unif
or -, unif, unif
// label: %tmp.0
// ra1 is written to tmu0s here and is also the base of vpw_addr further down, so it is
// presumably the pointer a; this write presumably requests the load of a[0] - confirm.
or tmu0s, ra1, ra1
nop.load_tmu0.never // presumably delivers the loaded value in r4, which the next fadd reads
// r0 = 10 converted to float; rb0 = r4 + r0, i.e. the loop-invariant term (10.0 + c) if r4 holds c.
itof r0, 10 (10)
fadd rb0, r4, r0
// ra2 = copy of the third uniform (consumed by mul24 below); r0 = rb0 is the first term of the sum.
or ra2, r1, r1
or r0, rb0, rb0
// Unrolled accumulation: r0 enters as one copy of rb0, the next seven fadds bring it to the sum
// of eight copies, and the eighth fadd stores the sum of nine copies in ra0. The tenth copy is
// added by the "fadd vpm, rb0, ra0" below.
// NOTE(review): replacing this chain with one multiplication by 10.0 is not guaranteed to give
// a bit-identical float result, because every fadd rounds its intermediate sum.
fadd r0, rb0, r0   // can be improved
fadd r0, rb0, r0   //
fadd r0, rb0, r0   //
fadd r0, rb0, r0   //
fadd r0, rb0, r0   //
fadd r0, rb0, r0   //
fadd r0, rb0, r0   //
fadd ra0, rb0, r0 //
// Index computation: r0 = (r2 + ra2 * (r3 & 255) + (ra3 & 255)) << 2.
// NOTE(review): presumably get_global_id(0) = offX + gidX * local size + local id, scaled to a
// byte offset for 4-byte floats - confirm the uniform order assumed above.
or r0, r3, r3
ldi r1, 255
and r3, r0, r1
// Dual-issue line: r2 is moved onto itself; v8min of ra3 with itself acts as a move into r0.
or r2, r2, r2; v8min r0, ra3, ra3
and r1, r0, r1
mul24 r0, ra2, r3
add r0, r2, r0
add r0, r0, r1
shl r0, r0, 2 (2)
// Store sequence, bracketed by mutex_acq / mutex_rel: configure the VPM write, write the value,
// configure the DMA write, set the destination address, wait, release.
or -, mutex_acq, mutex_acq
ldi vpw_setup, vpm_setup(size: 16 words, stride: 1 rows, address: h32(0))
// Final addition of the sum: rb0 + ra0 (nine copies of rb0) is written straight to vpm.
fadd vpm, rb0, ra0
ldi vpw_setup, vdw_setup(rows: 1, elements: 1 words, address: h32(0))
ldi vpw_setup, vdw_setup(stride: 0)
// Destination address = ra1 + the byte offset computed in r0.
add vpw_addr, ra1, r0
or -, vpw_wait, vpw_wait
or mutex_rel, 1 (1), 1 (1)
// label: %end_of_function
// One more uniform is read into r0; flags are set from (elem_num | r0) with the result discarded,
// and the conditional branch returns to %start_of_function.
// NOTE(review): presumably this uniform is a repeat flag for running further work-groups - confirm.
or r0, unif, unif
or.setf -, elem_num, r0
brr.ifallzc (pc+4) + -43 // to %start_of_function
nop.never // three nops follow the branch; presumably its delay slots - confirm
nop.never 
nop.never 
// Write ~qpu_num to irq, then end the thread.
// NOTE(review): presumably the irq write signals completion to the host - confirm.
not irq, qpu_num
nop.thrend.never // thread-end signal; the two nops after it are presumably required padding
nop.never 
nop.never