Closed kazuki closed 7 years ago
Oh... thanks so much for pointing this out. I'll fix it.
I tried to fix it:
$ cat test.c
#include <immintrin.h>
__attribute__((target("default"))) void test() {}
__attribute__((target("sse2"))) void test() { __m128i x; _mm_xor_si128(x,x); }
__attribute__((target("avx2"))) void test() { __m256i x; __m128i y; _mm256_xor_si256(x,x); _mm256_srl_epi32(x,y); }
int main() { test(); }
However,
$ g++ -O0 -S test.c
$ cat test.s
... snip ...
_Z4testv.avx2:
.LFB3448:
.cfi_startproc
leaq 8(%rsp), %r10
.cfi_def_cfa 10, 0
andq $-32, %rsp
pushq -8(%r10)
pushq %rbp
.cfi_escape 0x10,0x6,0x2,0x76,0
movq %rsp, %rbp
pushq %r10
.cfi_escape 0xf,0x3,0x76,0x78,0x6
subq $80, %rsp
vmovdqa -48(%rbp), %ymm0
vmovdqa %ymm0, -112(%rbp)
vmovdqa -48(%rbp), %ymm0
vmovdqa %ymm0, -208(%rbp)
vmovdqa -48(%rbp), %ymm0
vmovdqa %ymm0, -144(%rbp)
vmovdqa -64(%rbp), %xmm0
vmovaps %xmm0, -160(%rbp)
nop
addq $80, %rsp
popq %r10
.cfi_def_cfa 10, 0
popq %rbp
leaq -8(%r10), %rsp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE3448:
.size _Z4testv.avx2, .-_Z4testv.avx2
.globl main
.type main, @function
... snip ...
Hmm, _mm256_xor_si256 and _mm256_srl_epi32 are not producing AVX2 code...
$ cat test.c
#include <immintrin.h>
__attribute__((target("default"))) void test() {}
__attribute__((target("sse2"))) void test() { __m128i x; x = _mm_xor_si128(x,x); }
__attribute__((target("avx2"))) void test() { __m256i x; __m128i y; x = _mm256_srl_epi32(x,y); }
int main() { test(); }
$ g++ --version
g++ (Gentoo 7.1.0-r1 p1.1) 7.1.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
$ g++ -O0 -S test.c
$ cat test.s
... snip ...
_Z4testv.sse2:
.LFB3672:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movdqa -48(%rbp), %xmm0
movaps %xmm0, -32(%rbp)
movdqa -48(%rbp), %xmm0
movaps %xmm0, -16(%rbp)
movdqa -32(%rbp), %xmm1
movdqa -16(%rbp), %xmm0
pxor %xmm1, %xmm0
movaps %xmm0, -48(%rbp)
nop
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
:
:
_Z4testv.avx2:
.LFB3673:
.cfi_startproc
leaq 8(%rsp), %r10
.cfi_def_cfa 10, 0
andq $-32, %rsp
pushq -8(%r10)
pushq %rbp
.cfi_escape 0x10,0x6,0x2,0x76,0
movq %rsp, %rbp
pushq %r10
.cfi_escape 0xf,0x3,0x76,0x78,0x6
vmovdqa -80(%rbp), %ymm0
vmovdqa %ymm0, -48(%rbp)
vmovdqa -112(%rbp), %xmm0
vmovaps %xmm0, -96(%rbp)
vmovdqa -96(%rbp), %xmm1
vmovdqa -48(%rbp), %ymm0
vpsrld %xmm1, %ymm0, %ymm0
vmovdqa %ymm0, -80(%rbp)
If the return value of the intrinsic is ignored, gcc does not generate the SSE2/AVX opcode.
Thanks! Not ignoring the return value generates the AVX2 instruction on gcc-5.3.0 (#349) too:
vpsrld %xmm1, %ymm0, %ymm0
However, the problem is that this test snippet builds successfully on gcc-5.3.0.
I looked into the assembly code of the original error:
[183/213] Compiling jubatus/core/nearest_neighbor/lsh_function.cpp
{standard input}: Assembler messages:
{standard input}:22469: Error: suffix or operands invalid for `vpsrld'
Line 22469 is:
vpsrld $8, %ymm0, %ymm0
so the assembler fails only when immediates are given...?
We decided to fix this in the next release.
Will be fixed in https://github.com/jubatus/jubatus_core/pull/379
Fixed via #379
This was caused by #373.
config.log