Closed icyfox168168 closed 2 years ago
Hello,
I don't fully understand what the problem is. Is the PUNPCKLBW supplying the wrong output? Do you have an assembly snippet I can test?
As for the AESENC fail - that instruction is not supported at all currently.
"aesenc fail" is the error message printed by my test program:
// Cross-checks the software AES round helpers against the AES-NI intrinsics.
//
// The round key and the state are both initialised to the bytes 0x00..0x0f.
// Each stage runs the software helper on `s` and the matching intrinsic on a
// copy of the same input, then compares the two 16-byte results and prints
// "<name> ok" or "<name> fail".
//
// aesenc() and aesenclast() are defined elsewhere; the comparisons below rely
// on them writing their result back into `s`.
void aaes() {
    uint8_t rk[16], s[16], s0[16];
    __m128i rk128, s128;

    for (int i = 0; i < 16; ++i) {
        rk[i] = i;
        s[i] = i;
    }

    // Stage 1: one full AES round.
    // The byte arrays carry no 16-byte alignment guarantee, so use the
    // unaligned loads (the stores below were already unaligned).
    rk128 = _mm_loadu_si128((const __m128i*)rk);
    s128 = _mm_loadu_si128((const __m128i*)s);
    aesenc(s, rk);
    s128 = _mm_aesenc_si128(s128, rk128);
    _mm_storeu_si128((__m128i*)s0, s128);
    if (!memcmp(s, s0, 16)) printf("aesenc ok\n");
    else printf("aesenc fail\n");

    // Stage 2: final AES round, starting from whatever stage 1 left in `s`.
    rk128 = _mm_loadu_si128((const __m128i*)rk);
    s128 = _mm_loadu_si128((const __m128i*)s);
    aesenclast(s, rk);
    s128 = _mm_aesenclast_si128(s128, rk128);
    _mm_storeu_si128((__m128i*)s0, s128);
    if (!memcmp(s, s0, 16)) printf("aesenclast ok\n");
    else printf("aesenclast fail\n");
}
I'll look into where the error is tonight.
aesenc ok aesenclast ok 0000000000000000 0000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ aesenc ok aesenclast ok Shemu returned: 00000002 RunCount 1 start 0000000140000000 end 0000000000000000 rax 0000000000000000 RETN id 602 0000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ time 0 .............ok1............... .............ok2...............
When I comment out this emulation case, so that the instruction is handled by my own emulation function instead, I get the correct result:
/ case ND_INS_PUNPCKLBW: GET_OP(Context, 0, &dst); GET_OP(Context, 1, &src); if (dst.Size == 8) { // Operating on MMX register. dst.Value.Bytes[7] = src.Value.Bytes[3]; dst.Value.Bytes[6] = dst.Value.Bytes[3]; dst.Value.Bytes[5] = src.Value.Bytes[2]; dst.Value.Bytes[4] = dst.Value.Bytes[2]; dst.Value.Bytes[3] = src.Value.Bytes[1]; dst.Value.Bytes[2] = dst.Value.Bytes[1]; dst.Value.Bytes[1] = src.Value.Bytes[0]; } else { // Operating on XMM register. dst.Value.Bytes[15] = src.Value.Bytes[7]; dst.Value.Bytes[14] = dst.Value.Bytes[7]; dst.Value.Bytes[13] = src.Value.Bytes[6]; dst.Value.Bytes[12] = dst.Value.Bytes[6]; dst.Value.Bytes[11] = src.Value.Bytes[5]; dst.Value.Bytes[10] = dst.Value.Bytes[5]; dst.Value.Bytes[9] = src.Value.Bytes[4]; dst.Value.Bytes[8] = src.Value.Bytes[4]; dst.Value.Bytes[7] = src.Value.Bytes[3]; dst.Value.Bytes[6] = dst.Value.Bytes[3]; dst.Value.Bytes[5] = src.Value.Bytes[2]; dst.Value.Bytes[4] = dst.Value.Bytes[2]; dst.Value.Bytes[3] = src.Value.Bytes[1]; dst.Value.Bytes[2] = dst.Value.Bytes[1]; dst.Value.Bytes[1] = src.Value.Bytes[0]; } SET_OP(Context, 0, &dst); break; /
If you don't comment, you will get wrong results aesenc ok aesenclast ok 0000000000000000 0000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 000000014001A406 PUNPCKLBW xmm4, xmm2 start XMM4 (HI_32) = 0x000000000000000000000004000000007261506c000000000000000000000000 start XMM4 (LO_32) = 0x000000000000000000000004000000007261506c0000007a0000000000000045 start XMM2 (HI_32) = 0x000000000000000000000004000000007261506c000000000000000000000000 start XMM2 (LO_32) = 0x000000000000000000000004000000007261506c0000007a0000000000000045 ture XMM4 (HI_32) = 0x000000000000000000000004000000007261506c000000000000000000000000 ture XMM4 (LO_32) = 0x000000000000000000000004000000007261506c0000007a0000000000000045 bemu XMM4 (HI_32) = 0x000000000014fc80000000000014fa80000000000014f880000000000014f680 bemu XMM4 (LO_32) = 0x000000000014f480000000000014f280000000000014f080000000000014ee80
aesenc fail aesenclast ok Shemu returned: 00000002 RunCount 1 start 0000000140000000 end 0000000000000000 rax 0000000000000000 RETN id 602 0000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ time 1 .............ok1............... .............ok2...............
"\x48\x83\xEC\x28\x48\xB8\x0F\x0E\x0D\x0C\x0B\x0A\x09\x08\x48\x89\x44\x24\x10\x48\xB8\x07\x06\x05\x04\x03\x02\x01\x00\x48\x89\x44\x24\x18\x48\xC7\x04\x24\x00\x00\x00\x00\x48\xC7\x44\x24\x08\x00\x00\x00\x00\x66\x0F\x6F\x24\x24\x66\x0F\x6F\x54\x24\x10\x66\x0F\x60\xE2\x66\x0F\x7F\x24\x24\x66\x0F\x7F\x54\x24\x10\x48\x8B\x04\x24\x48\x03\x44\x24\x08\x48\x03\x44\x24\x10\x48\x03\x44\x24\x18\x48\x83\xC4\x28\xC3"
1C0A220E28122E16 0000000140049BD0 Shemu returned: 00000002 RunCount 1 start 0000000140000000 end 0000000000000000 rax 1C0A220E28122E21 RETN id 602 0000000140049C12 PUNPCKLBW xmm4, xmm2 start XMM4 (HI_32) = 0x0000000000000000000000000000000000007ffe000000000000000000000000 start XMM4 (LO_32) = 0x0000000000000000000000000000000000007ffe000000000000000000000000 start XMM2 (HI_32) = 0x0000000000000000000000000000000000007ffe000000000000000000000000 start XMM2 (LO_32) = 0x0000000000000000000000000000000000007ffe000000000000000000000000 ture XMM4 (HI_32) = 0x0000000000000000000000000000000000007ffe000000000000000000000000 ture XMM4 (LO_32) = 0x0000000000000000000000000000000000007ffe000000000000000000000000 bemu XMM4 (HI_32) = 0x000000000014fc80000000000014fa80000000000014f880000000000014f680 bemu XMM4 (LO_32) = 0x000000000014f480000000000014f280000000000014f080000000000014ee80
Shemu returned: 00000002 RunCount 1 start 0000000140000000 end 0000000000000000 rax 1C0A220E28122E21 RETN id 602
// Minimal reproducer for PUNPCKLBW xmm4, xmm2.
//
// xmm4 is loaded with all zeroes and xmm2 with the 16 bytes held in r_xmm2.
// After the interleave both registers are stored back, and the function
// returns the sum of the four 64-bit halves so the result can be read from
// a single return value.
// With the inputs below, a correct PUNPCKLBW yields 0x1C0A220E28122E16
// (little-endian byte order).
// Clang optimisation is switched off around the function by the two pragmas.
_Pragma("clang optimize off") uint64_t asmadc() { uint64_t r_xmm2[2]; r_xmm2[0] = 0x08090a0b0c0d0e0f; r_xmm2[1] = 0x0001020304050607; uint64_t r_xmm4[2]; r_xmm4[0] = 0; r_xmm4[1] = 0;
// MSVC-style inline assembly block.
// NOTE(review): MOVDQA faults on memory operands that are not 16-byte
// aligned, and nothing here forces that alignment on r_xmm2 / r_xmm4 -
// confirm the compiler places both arrays on a 16-byte boundary.
__asm
{
MOVDQA xmm4, r_xmm4
MOVDQA xmm2, r_xmm2
PUNPCKLBW xmm4, xmm2
MOVDQA r_xmm4, xmm4
MOVDQA r_xmm2, xmm2
}
// Fold destination and source into one value; xmm2 must come back unchanged.
return r_xmm4[0] + r_xmm4[1] + r_xmm2[0] + r_xmm2[1];
} _Pragma("clang optimize on")
Ah, yes, indeed, it seems that there was a typo in PUNPCKLBW emulation. Fixed in https://github.com/bitdefender/bddisasm/commit/47da322ea5c01c625c5c833e20f2716cd0a392c5, together with some other small tweaks & fixes.
I'm not sure whether there is still a problem: from my monitoring, adding this assembly instruction causes the emulated function to return the wrong result, whereas on the real machine it returns successfully.
ND_INS_PUNPCKLBW
000000014001EC66 PUNPCKLBW xmm4, xmm2 start XMM4 (HI_32) = 0x0000000000000000000000040000000000000000000000000000000000000000 start XMM4 (LO_32) = 0x00000000000000000000000400000000000000000000007a0000000000000045 start XMM2 (HI_32) = 0x0000000000000000000000040000000000000000000000000000000000000000 start XMM2 (LO_32) = 0x00000000000000000000000400000000000000000000007a0000000000000045 ture XMM4 (HI_32) = 0x0000000000000000000000040000000000000000000000000000000000000000 ture XMM4 (LO_32) = 0x00000000000000000000000400000000000000000000007a0000000000000045 bemu XMM4 (HI_32) = 0x000000000014fc60000000000014fa60000000000014f860000000000014f660 bemu XMM4 (LO_32) = 0x000000000014f460000000000014f260000000000014f060000000000014ee60
aesenc fail