In the following code, the repeated additions could be combined into a single multiplication instruction.
// Reproducer kernel: adds the loop-invariant value (10.0 + c) to `sum` ten
// times and stores the total at this work-item's global index.
//   a - global float buffer; a[0] is read as the input value c, and
//       a[get_global_id(0)] receives the result.
//   n - not referenced by the kernel body.
void kernel test(float global * a, const int n){
float sum = 0.0;
float c = a[0];
// The addend never changes inside the loop, so mathematically the result is
// 10 * (10.0 + c) -- the pattern the report wants folded into one multiply.
for (int i = 0; i < 10; i++)
sum += 10.0 + c;
a[get_global_id(0)] = sum;
}
// Module with 1 kernels, global data with 0 words (64-bit each), starting at offset 1 words and 0 words of stack-frame
// Kernel 'test' with 47 instructions, offset 2, with following parameters: __global out float* a (4 B, 1 items), int n (4 B, 1 items) (lSize, lids, gidX, offX)
//
// Compiler output for the 'test' kernel: the loop has been fully unrolled into
// a chain of fadd instructions. The chain computes 10 * (c + 10.0) with nine
// additions; the report suggests replacing them with a single multiplication.
//
// Register roles as used below.
// NOTE(review): the uniform-to-name mapping assumes the implicit values listed
// in the header (lSize, lids, gidX, offX) are read first, followed by a and n.
// That is consistent with how each register is used later -- confirm against
// the compiler's uniform layout.
//   r3  = lSize     ra3 = lids     r1 -> ra2 = gidX     r2 = offX
//   ra1 = a (buffer address)       sixth uniform (n) is read and discarded
//   rb0 = c + 10.0  r0 / ra0 = running sum
// label: %start_of_function
or r3, unif, unif // presumably lSize
or ra3, unif, unif // presumably lids
or r1, unif, unif // presumably gidX
or r2, unif, unif // presumably offX
or ra1, unif, unif // presumably a; used as the load and store base address below
or -, unif, unif // read and discard one uniform (presumably the unused n)
// label: %tmp.0
// Request a TMU0 load from address ra1, i.e. a[0].
or tmu0s, ra1, ra1
nop.load_tmu0.never // signal only: fetch the TMU0 result (read from r4 below -- TODO confirm)
itof r0, 10 (10) // r0 = 10.0
fadd rb0, r4, r0 // rb0 = c + 10.0, the loop-invariant addend
or ra2, r1, r1 // keep the third uniform (presumably gidX) before r1 is reused
or r0, rb0, rb0 // sum = 1 * rb0 (the initial 0.0 + x has been folded away)
// Unrolled accumulation: each fadd adds rb0 once more. Together with the fadd
// that writes vpm further down, these are the nine additions the report wants
// replaced by one multiplication by 10.0.
// NOTE(review): repeated float addition and a single multiply may round
// differently; whether the rewrite is permitted depends on the compiler's
// floating-point mode -- confirm.
fadd r0, rb0, r0 // sum = 2 * rb0 -- can be improved
fadd r0, rb0, r0 // sum = 3 * rb0
fadd r0, rb0, r0 // sum = 4 * rb0
fadd r0, rb0, r0 // sum = 5 * rb0
fadd r0, rb0, r0 // sum = 6 * rb0
fadd r0, rb0, r0 // sum = 7 * rb0
fadd r0, rb0, r0 // sum = 8 * rb0
fadd ra0, rb0, r0 // ra0 = 9 * rb0
// Compute the byte offset of a[get_global_id(0)]:
//   ((third uniform * low byte of first uniform) + fourth uniform
//     + low byte of second uniform) << 2
// which presumably is (gidX * local_size_x + offX + local_id_x) * sizeof(float).
or r0, r3, r3 // r0 = first uniform (presumably lSize)
ldi r1, 255 // mask selecting the low 8 bits
and r3, r0, r1 // r3 = low byte of lSize (presumably the X-dimension size)
or r2, r2, r2; v8min r0, ra3, ra3 // dual issue: r2 copied onto itself; r0 = ra3 (min of a value with itself)
and r1, r0, r1 // r1 = low byte of lids (presumably the X-dimension local id)
mul24 r0, ra2, r3 // r0 = ra2 * r3
add r0, r2, r0 // r0 += r2 (presumably the global offset)
add r0, r0, r1 // r0 += r1
shl r0, r0, 2 (2) // element index -> byte offset (4 bytes per float)
// Store the result: take the mutex, write the value into the VPM, start the
// DMA write to memory, wait for it to finish, then release the mutex.
or -, mutex_acq, mutex_acq // acquire the mutex guarding the VPM/DMA setup
ldi vpw_setup, vpm_setup(size: 16 words, stride: 1 rows, address: h32(0))
fadd vpm, rb0, ra0 // final addition: 10 * rb0 is written straight into the VPM
ldi vpw_setup, vdw_setup(rows: 1, elements: 1 words, address: h32(0))
ldi vpw_setup, vdw_setup(stride: 0)
add vpw_addr, ra1, r0 // destination address = ra1 + byte offset
or -, vpw_wait, vpw_wait // wait for the write to complete
or mutex_rel, 1 (1), 1 (1) // release the mutex
// label: %end_of_function
// Read one more uniform and branch back to %start_of_function when the zero
// flag is clear on all elements; otherwise fall through to the thread end.
// NOTE(review): presumably a "run again" flag supplied by the host -- confirm.
or r0, unif, unif
or.setf -, elem_num, r0 // set flags from (elem_num | r0)
brr.ifallzc (pc+4) + -43 // to %start_of_function
nop.never // three nops after the branch (presumably its delay slots)
nop.never
nop.never
not irq, qpu_num // write to the irq register (presumably signals completion to the host)
nop.thrend.never // thread-end signal
nop.never // two trailing nops after the thread-end signal
nop.never
In the following code, the repeated additions could be combined into a single multiplication instruction.