Open Quuxplusone opened 10 years ago
Attached isolate.c
(438 bytes, text/x-csrc): Function demonstrating regression.
This appears to be fixed for 32-bit ARM, but the following code (copied from the attachment) still shows the problem on AArch64:
/* Four uint8_t lanes packed into a 4-byte vector (GCC/Clang __vector_size__ extension). */
typedef uint8_t uchar4 __attribute__((__vector_size__(4)));
/*
 * Per-lane table lookup over an array of uchar4 vectors.
 *
 * For each input vector, lane k is used as an index into the k-th 256-byte
 * slice of `tab`, and the four looked-up bytes form the output vector.
 *
 * out   - destination; one uchar4 is written and the pointer advanced per iteration
 * in    - source; one uchar4 is read and the pointer advanced per iteration
 * count - number of vectors to process; the body runs `count` times for
 *         count > 0 and not at all for count == 0
 * tab   - lookup table; offsets up to 768 + 255 are read, so at least
 *         1024 bytes must be readable
 */
void dut0(uchar4 * restrict out, uchar4 const * restrict in, int count, uint8_t const * restrict tab) {
/* One sub-table per lane, each starting 256 bytes after the previous one. */
uint8_t const *t0 = tab, *t1 = tab + 256, *t2 = tab + 512, *t3 = tab + 768;
/* Count down; the test is made on the already-decremented value. */
while (--count >= 0) {
/* Load one 4-byte vector and step the input pointer. */
uchar4 tmp = *in++;
/* Look each lane up in its own sub-table and store the rebuilt vector. */
*out++ = (uchar4){ t0[tmp[0]], t1[tmp[1]], t2[tmp[2]], t3[tmp[3]] };
}
}
Compiled with:
clang --target=aarch64-linux-gnu -Ofast -S foo.c -o-
This gives the following loop body:
// Loop body only; register setup and the loop branch are outside this excerpt.
// NOTE(review): x1 is used as the input pointer and x0 as the output pointer
// below; x3/x8/x9/x10 are presumably tab, tab+256, tab+512, tab+768 — confirm
// against the full listing.
// Load the four input bytes one at a time.
ldrb w12, [x1]
ldrb w13, [x1, #1]
ldrb w14, [x1, #2]
ldrb w15, [x1, #3]
// Insert the four bytes into 16-bit lanes 0..3 of v0 ...
ins v0.h[0], w12
ins v0.h[1], w13
ins v0.h[2], w14
ins v0.h[3], w15
// ... then move the same lanes straight back into the same registers.
umov w12, v0.h[0]
umov w13, v0.h[1]
umov w14, v0.h[2]
umov w15, v0.h[3]
// Mask each index to its low 8 bits.
// NOTE(review): these ins/umov/and groups (4 + 4 + 4) look like the twelve
// instructions the report calls deletable — confirm.
and x12, x12, #0xff
and x13, x13, #0xff
and x14, x14, #0xff
and x15, x15, #0xff
// Table lookups: one byte load per lane, each from a different base register.
ldrb w15, [x10, x15]
ldrb w14, [x9, x14]
ldrb w13, [x8, x13]
ldrb w12, [x3, x12]
// Advance the input pointer by one 4-byte vector.
add x1, x1, #4 // =4
// Store the four result bytes; the last store post-increments x0 by 4.
strb w15, [x0, #3]
strb w14, [x0, #2]
strb w13, [x0, #1]
strb w12, [x0], #4
Twelve of those instructions could simply be deleted.
isolate.c
(438 bytes, text/x-csrc)