I found some bugs in implementation of PSHUFB Semantics.
We can find the slightly difference between PSHUFB (with 64 bit operands) and PSHUFB (with 128 bit operands), As described in PSHUFB — Packed Shuffle Bytes:
PSHUFB (with 64 bit operands)
TEMP ← DEST
for i = 0 to 7 {
if (SRC[(i * 8)+7] = 1 ) then
DEST[(i*8)+7...(i*8)+0] ← 0;
else
index[2..0] ← SRC[(i*8)+2 .. (i*8)+0];
DEST[(i*8)+7...(i*8)+0] ← TEMP[(index*8+7)..(index*8+0)];
endif;
}
PSHUFB (with 128 bit operands)
TEMP ← DEST
for i = 0 to 15 {
if (SRC[(i * 8)+7] = 1 ) then
DEST[(i*8)+7..(i*8)+0] ← 0;
else
index[3..0] ← SRC[(i*8)+3 .. (i*8)+0];
DEST[(i*8)+7..(i*8)+0] ← TEMP[(index*8+7)..(index*8+0)];
endif
}
So it's different between index[2..0] and index[3..0].
However, we used the same template in MMX.cpp.
template <typename D, typename S1, typename S2>
DEF_SEM(PSHUFB, D dst, S1 src1, S2 src2) {
auto src1_vec = UReadV8(src1);
auto src2_vec = UReadV8(src2);
auto dst_vec = UClearV8(UReadV8(dst));
auto vec_count = NumVectorElems(src1_vec);
_Pragma("unroll")
for (size_t i = 0; i < vec_count; i++) {
uint8_t v1 = UExtractV8(src2_vec, i);
uint8_t index = UAnd(v1, 7_u8);
uint8_t v2 = UExtractV8(src1_vec, index);
uint8_t value = Select(SignFlag(v1), 0_u8, v2);
dst_vec = UInsertV8(dst_vec, i, value);
}
UWriteV8(dst, dst_vec);
return memory;
}
DEF_ISEL(PSHUFB_MMXq_MMXq) = PSHUFB<V64W, V64, V64>;
DEF_ISEL(PSHUFB_MMXq_MEMq) = PSHUFB<V64W, V64, MV64>;
DEF_ISEL(PSHUFB_XMMdq_XMMdq) = PSHUFB<V128W, V128, V128>;
DEF_ISEL(PSHUFB_XMMdq_MEMdq) = PSHUFB<V128W, V128, MV128>;
Hi team
I found some bugs in implementation of
PSHUFB
Semantics. We can find the slightly difference betweenPSHUFB (with 64 bit operands)
andPSHUFB (with 128 bit operands)
, As described in PSHUFB — Packed Shuffle Bytes:So it's different between
index[2..0]
andindex[3..0]
. However, we used the same template in MMX.cpp.It caused problems.