JustasMasiulis / xorstr

heavily vectorized c++17 compile time string encryption.
Apache License 2.0
1.19k stars 193 forks source link

update #47

Closed llxiaoyuan closed 3 years ago

llxiaoyuan commented 3 years ago

Sir, here are a few things that I think could be improved:

remove (x) (x) at JM_XORSTR_LOAD_FROM_REG; compilation should speed up

remove uint64_v

use std::integral_constant and ( ) instead of uint64_v

I presume the reason the project uses uint64_v is to avoid the "Most Vexing Parse".


The test passed with this program:

#include <stdio.h>
#include "xorstr.hpp"
int main() {
    printf(xorstr_("123456789qwertyuiopasdfghjklzxcvbnm,.;'[]{}-=_+"));
    return 0;
}



gcc command line

i686-w64-mingw32-g++ -msse4 -std=c++17 -O2 -s -DNDEBUG -o xorstr.exe xorstr.hpp xorstr_ex.cpp

g++ -std=c++17 -O2 -s -DNDEBUG -o xorstr.exe xorstr.hpp xorstr_ex.cpp


msvc x86

push    ebp
mov     ebp, esp
and     esp, 0FFFFFFE0h
sub     esp, 0A0h
mov     eax, ___security_cookie
xor     eax, esp
mov     [esp+0A0h+var_4], eax
mov     dword ptr [esp+0A0h+var_60], 0B32386BBh
lea     eax, [esp+0A0h+_Format]
mov     dword ptr [esp+0A0h+var_60+4], 4B8E8A2Ch
mov     dword ptr [esp+0A0h+var_60+8], 530A3324h
mov     dword ptr [esp+0A0h+var_60+0Ch], 7380282Dh
mov     dword ptr [esp+0A0h+var_60+10h], 78C92B4Dh
mov     dword ptr [esp+0A0h+var_60+14h], 2AE6E326h
mov     dword ptr [esp+0A0h+var_60+18h], 2110C96h
mov     dword ptr [esp+0A0h+var_60+1Ch], 916C3817h
vmovdqu ymm0, [esp+0A0h+var_60]
mov     dword ptr [esp+0A0h+_Format], 8710B48Ah
mov     dword ptr [esp+0A0h+_Format+4], 73B9BC19h
mov     dword ptr [esp+0A0h+_Format+8], 367D421Dh
mov     dword ptr [esp+0A0h+_Format+0Ch], 6F95C5Fh
mov     dword ptr [esp+0A0h+_Format+10h], 19B94424h
mov     dword ptr [esp+0A0h+_Format+14h], 4D808755h
mov     dword ptr [esp+0A0h+_Format+18h], 6E7A66FEh
mov     dword ptr [esp+0A0h+_Format+1Ch], 0E70F406Dh
vpxor   ymm0, ymm0, ymmword ptr [esp+0A0h+_Format]
mov     dword ptr [esp+0A0h+var_80], 0C87E8EEDh
mov     dword ptr [esp+0A0h+var_80+4], 9C705A5Eh
mov     dword ptr [esp+0A0h+var_80+8], 71F2E7A5h
mov     dword ptr [esp+0A0h+var_80+0Ch], 4602013Ch
mov     dword ptr [esp+0A0h+var_40], 0E413E08Fh
mov     dword ptr [esp+0A0h+var_40+4], 0C7576170h
mov     dword ptr [esp+0A0h+var_40+8], 5C8F9CF8h
mov     dword ptr [esp+0A0h+var_40+0Ch], 46295E01h
vmovdqa ymmword ptr [esp+0A0h+_Format], ymm0
vmovaps xmm1, [esp+0A0h+var_80]
vpxor   xmm1, xmm1, [esp+0A0h+var_40]
vmovaps [esp+0A0h+var_80], xmm1
push    eax             ; _Format
vzeroupper
call    _printf
mov     ecx, [esp+0A4h+var_4]
add     esp, 4
xor     ecx, esp        ; cookie
xor     eax, eax
call    @__security_check_cookie@4 ; __security_check_cookie(x)
mov     esp, ebp
pop     ebp
retn


msvc x86 with #define JM_XORSTR_DISABLE_AVX_INTRINSICS

push    ebp
mov     ebp, esp
and     esp, 0FFFFFFF0h
sub     esp, 60h
mov     dword ptr [esp+60h+_Format], 0F42739CEh
lea     eax, [esp+60h+_Format]
mov     dword ptr [esp+60h+_Format+4], 0FDFE1593h
mov     dword ptr [esp+60h+_Format+8], 0B1256415h
mov     dword ptr [esp+60h+_Format+0Ch], 0DDB137ADh
mov     dword ptr [esp+60h+var_50], 39DFCD10h
mov     dword ptr [esp+60h+var_50+4], 0A9F9CF77h
mov     dword ptr [esp+60h+var_50+8], 47D94136h
mov     dword ptr [esp+60h+var_50+0Ch], 8954F97h
mov     dword ptr [esp+60h+var_40], 22C35B1h
mov     dword ptr [esp+60h+var_40+4], 428DB804h
mov     dword ptr [esp+60h+var_40+8], 0F9FC84CDh
mov     dword ptr [esp+60h+var_40+0Ch], 0E332DD0Eh
movaps  xmm1, xmmword ptr [esp+60h+_Format]
mov     dword ptr [esp+60h+var_30], 0C0140BFFh
mov     dword ptr [esp+60h+var_30+4], 0C5C923A6h
mov     dword ptr [esp+60h+var_30+8], 0D452152Ch
mov     dword ptr [esp+60h+var_30+0Ch], 0A8C843DFh
pxor    xmm1, [esp+60h+var_30]
movaps  xmmword ptr [esp+60h+_Format], xmm1
movaps  xmm1, [esp+60h+var_50]
mov     dword ptr [esp+60h+var_20], 58AFA279h
mov     dword ptr [esp+60h+var_20+4], 0CE9FAB04h
mov     dword ptr [esp+60h+var_20+8], 2BB22B5Eh
mov     dword ptr [esp+60h+var_20+0Ch], 7EF637EDh
pxor    xmm1, [esp+60h+var_20]
movaps  [esp+60h+var_50], xmm1
movaps  xmm1, [esp+60h+var_40]
mov     dword ptr [esp+60h+var_10], 2E415BD3h
mov     dword ptr [esp+60h+var_10+4], 19AA832Ah
mov     dword ptr [esp+60h+var_10+8], 0D481FF90h
mov     dword ptr [esp+60h+var_10+0Ch], 0E3198233h
pxor    xmm1, [esp+60h+var_10]
push    eax             ; _Format
movaps  [esp+64h+var_40], xmm1
call    _printf
add     esp, 4
xor     eax, eax
mov     esp, ebp
pop     ebp
retn


msvc x64

push    rbp
sub     rsp, 0D0h
lea     rbp, [rsp+40h]
and     rbp, 0FFFFFFFFFFFFFFE0h
mov     rax, cs:__security_cookie
xor     rax, rsp
mov     [rbp+90h+var_10], rax
mov     rax, 1809FC1D9BFDB792h
lea     rcx, [rbp+90h+_Format] ; _Format
mov     qword ptr [rbp+90h+_Format], rax
mov     rax, 0E2AD81CB52D6979Dh
mov     qword ptr [rbp+90h+_Format+8], rax
mov     rax, 0AF8ED6916A4373B4h
mov     qword ptr [rbp+90h+_Format+10h], rax
mov     rax, 0A52775F90299642Eh
mov     qword ptr [rbp+90h+_Format+18h], rax
mov     rax, 7CA1DD7A10E37A85h
mov     qword ptr [rbp+90h+var_70], rax
mov     rax, 33F0B2834D03345h
mov     qword ptr [rbp+90h+var_70+8], rax
mov     rax, 203ECA28AFCE85A3h
movdqa  xmm2, [rbp+90h+var_70]
mov     qword ptr [rbp+90h+var_50], rax
mov     rax, 97D4F5B937A1E6A4h
mov     qword ptr [rbp+90h+var_50+8], rax
mov     rax, 0C8E8B2E20B331CDDh
mov     qword ptr [rbp+90h+var_50+10h], rax
mov     rax, 0D3440D836EF20E46h
mov     qword ptr [rbp+90h+var_50+18h], rax
mov     rax, 2786E6543C8E14E7h
vmovdqu ymm0, [rbp+90h+var_50]
vpxor   ymm1, ymm0, ymmword ptr [rbp+90h+_Format]
mov     qword ptr [rbp+90h+var_30], rax
mov     rax, 314541519AD4818h
mov     qword ptr [rbp+90h+var_30+8], rax
vpxor   xmm2, xmm2, [rbp+90h+var_30]
vmovdqa [rbp+90h+var_70], xmm2
vmovdqa ymmword ptr [rbp+90h+_Format], ymm1
vzeroupper
call    printf
xor     eax, eax
mov     rcx, [rbp+90h+var_10]
xor     rcx, rsp        ; StackCookie
call    __security_check_cookie
add     rsp, 0D0h
pop     rbp
retn


msvc x64 with #define JM_XORSTR_DISABLE_AVX_INTRINSICS

mov     r11, rsp
sub     rsp, 88h
mov     rax, 0EF13EE40CA661134h
lea     rcx, [r11-68h]  ; _Format
mov     [r11-68h], rax
mov     rax, 0E215FB621FF9F24Bh
mov     [r11-60h], rax
mov     rax, 0A61E9C4C4E1D1C1Eh
mov     [r11-58h], rax
mov     rax, 10A810B038B439B4h
mov     [r11-50h], rax
mov     rax, 8106E3DF48A2BBFBh
mov     [r11-48h], rax
mov     rax, 1F9FFDE12FAE7D4Bh
mov     [r11-40h], rax
mov     rax, 0D724D875FE552305h
movdqa  xmm0, [rsp+88h+var_68]
movdqa  xmm1, [rsp+88h+var_58]
mov     qword ptr [rsp+88h+var_38], rax
mov     rax, 976C8F107A8E8372h
mov     qword ptr [rsp+88h+var_38+8], rax
mov     rax, 0C178F83F2F6D7377h
pxor    xmm0, [rsp+88h+var_38]
mov     qword ptr [rsp+88h+var_28], rax
mov     rax, 66CB68CA54DF53DCh
mov     qword ptr [rsp+88h+var_28+8], rax
mov     rax, 0DA21D8F164CFD599h
pxor    xmm1, [rsp+88h+var_28]
mov     qword ptr [rsp+88h+var_18], rax
mov     rax, 1FB4A2DC02D30616h
movdqa  [rsp+88h+var_68], xmm0
movdqa  xmm0, [rsp+88h+var_48]
mov     qword ptr [rsp+88h+var_18+8], rax
pxor    xmm0, [rsp+88h+var_18]
movdqa  [rsp+88h+var_48], xmm0
movdqa  [rsp+88h+var_58], xmm1
call    printf
xor     eax, eax
add     rsp, 88h
retn


clang x86 with #define JM_XORSTR_DISABLE_AVX_INTRINSICS

push    ebp
mov     ebp, esp
and     esp, 0FFFFFFF0h
sub     esp, 70h
mov     eax, 9E144474h
mov     ecx, 2FE78350h
mov     dword ptr [esp+70h+var_70+4], ecx
mov     dword ptr [esp+70h+var_70], eax
mov     eax, 845C8B83h
mov     ecx, 0A8692F02h
mov     dword ptr [esp+70h+var_70+0Ch], ecx
mov     dword ptr [esp+70h+var_70+8], eax
mov     eax, 5E1CB6Eh
mov     ecx, 998A2184h
mov     dword ptr [esp+70h+var_60+4], ecx
mov     dword ptr [esp+70h+var_60], eax
mov     eax, 3845F1C4h
mov     ecx, 609EDA28h
mov     dword ptr [esp+70h+var_60+0Ch], ecx
mov     dword ptr [esp+70h+var_60+8], eax
mov     eax, 0CFE1C4EBh
mov     ecx, 2E4D6E97h
mov     dword ptr [esp+70h+var_50+4], ecx
mov     dword ptr [esp+70h+var_50], eax
mov     eax, 0D66CCE23h
mov     ecx, 84887319h
mov     dword ptr [esp+70h+var_50+0Ch], ecx
mov     dword ptr [esp+70h+var_50+8], eax
mov     eax, 0AA277645h
mov     ecx, 17D0B565h
mov     [esp+70h+var_3C], ecx
mov     [esp+70h+var_40], eax
mov     eax, 0E12BFABAh
mov     ecx, 0DD105B70h
mov     [esp+70h+var_34], ecx
mov     [esp+70h+var_38], eax
mov     eax, 6491A407h
mov     ecx, 0FEEC45F7h
mov     [esp+70h+var_2C], ecx
mov     [esp+70h+var_30], eax
mov     eax, 542E9BACh
mov     ecx, 16FDA252h
mov     [esp+70h+var_24], ecx
mov     [esp+70h+var_28], eax
mov     eax, 0E38CAA89h
mov     ecx, 756A55B9h
mov     [esp+70h+var_1C], ecx
mov     [esp+70h+var_20], eax
mov     eax, 0FB11B57Eh
mov     ecx, 84A32C24h
mov     [esp+70h+var_14], ecx
mov     [esp+70h+var_18], eax
lea     eax, [esp+70h+var_40]
xor     ecx, ecx
movaps  xmm0, [esp+70h+var_70]
xorps   xmm0, xmmword ptr [eax]
movaps  [esp+70h+var_70], xmm0
movaps  xmm0, [esp+70h+var_60]
xorps   xmm0, xmmword ptr [eax+10h]
movaps  [esp+70h+var_60], xmm0
movaps  xmm0, [esp+70h+var_50]
xorps   xmm0, xmmword ptr [eax+20h]
mov     eax, esp
movaps  [esp+70h+var_50], xmm0
push    eax             ; char *
call    _printf
add     esp, 4
xor     eax, eax
mov     esp, ebp
pop     ebp
retn


clang x64 with #define JM_XORSTR_DISABLE_AVX_INTRINSICS

sub     rsp, 88h
mov     rax, 0DF3D76B92731B19Eh
mov     qword ptr [rsp+88h+var_68], rax
mov     rax, 834ADCCF63E2147Dh
mov     qword ptr [rsp+88h+var_68+8], rax
mov     rax, 0F187B9B954509878h
mov     qword ptr [rsp+88h+var_58], rax
mov     rax, 833E1251514575E6h
mov     qword ptr [rsp+88h+var_58+8], rax
mov     rax, 0FFECBC5EEE9EAE31h
mov     qword ptr [rsp+88h+var_48], rax
mov     rax, 9ACE682C815200C5h
mov     qword ptr [rsp+88h+var_48+8], rax
mov     rax, 0E70A408C130283AFh
mov     [rsp+88h+var_38], rax
mov     rax, 0F633A8BD06956544h
mov     [rsp+88h+var_30], rax
mov     rax, 96E1DDCA3520F711h
mov     [rsp+88h+var_28], rax
mov     rax, 0F55D6A2B3D2E1F8Eh
mov     [rsp+88h+var_20], rax
mov     rax, 0A4CB8770C2F3C053h
mov     [rsp+88h+var_18], rax
mov     rax, 9AE53711AC2F7B98h
mov     [rsp+88h+var_10], rax
lea     rax, [rsp+88h+var_38]
movaps  xmm0, xmmword ptr [rsp+88h+var_68]
xorps   xmm0, xmmword ptr [rax]
movaps  xmmword ptr [rsp+88h+var_68], xmm0
movaps  xmm0, [rsp+88h+var_58]
xorps   xmm0, xmmword ptr [rax+10h]
movaps  [rsp+88h+var_58], xmm0
movaps  xmm0, [rsp+88h+var_48]
xorps   xmm0, xmmword ptr [rax+20h]
movaps  [rsp+88h+var_48], xmm0
lea     rcx, [rsp+88h+var_68] ; char *
call    printf
xor     eax, eax
add     rsp, 88h
retn


gcc x86 with #define JM_XORSTR_DISABLE_AVX_INTRINSICS

push    ebp
mov     ebp, esp
and     esp, 0FFFFFFF0h
sub     esp, 70h
call    sub_401650
mov     eax, 1A19B2CAh
mov     edx, 0D0D7B851h
mov     dword ptr [esp+70h+var_60], eax
mov     eax, 8366B0DDh
mov     dword ptr [esp+70h+var_60+4], edx
mov     edx, 9C5E0837h
mov     dword ptr [esp+70h+var_60+8], eax
mov     eax, 273D4744h
mov     dword ptr [esp+70h+var_60+0Ch], edx
mov     edx, 0E63FC40Dh
movdqa  xmm0, xmmword ptr [esp+70h+var_60]
mov     dword ptr [esp+70h+var_50], eax
mov     eax, 4E3382F6h
mov     dword ptr [esp+70h+var_50+4], edx
mov     edx, 2BCA435Dh
mov     dword ptr [esp+70h+var_50+8], eax
mov     eax, 0B04B35ADh
mov     dword ptr [esp+70h+var_50+0Ch], edx
mov     edx, 0D799E38Eh
mov     dword ptr [esp+70h+var_40], eax
mov     eax, 1E828DB5h
mov     dword ptr [esp+70h+var_40+4], edx
mov     edx, 0BDC782DCh
mov     dword ptr [esp+70h+var_40+8], eax
mov     eax, 2E2A80FBh
mov     dword ptr [esp+70h+var_40+0Ch], edx
mov     edx, 0E8E08E64h
mov     dword ptr [esp+70h+var_30], eax
mov     eax, 0E611C1E4h
mov     dword ptr [esp+70h+var_30+4], edx
mov     edx, 0E9277C45h
mov     dword ptr [esp+70h+var_30+8], eax
mov     eax, 464D282Dh
mov     dword ptr [esp+70h+var_30+0Ch], edx
pxor    xmm0, [esp+70h+var_30]
mov     edx, 8159A07Eh
movaps  xmmword ptr [esp+70h+var_60], xmm0
movdqa  xmm0, [esp+70h+var_50]
mov     dword ptr [esp+70h+var_20], eax
mov     eax, 2258E89Eh
mov     dword ptr [esp+70h+var_20+4], edx
mov     edx, 5DA93B27h
mov     dword ptr [esp+70h+var_20+8], eax
mov     eax, 9C265BCFh
mov     dword ptr [esp+70h+var_20+0Ch], edx
pxor    xmm0, [esp+70h+var_20]
mov     edx, 8CBED8A0h
movaps  [esp+70h+var_50], xmm0
movdqa  xmm0, [esp+70h+var_40]
mov     dword ptr [esp+70h+var_10], eax
mov     eax, 33FFF6E8h
mov     dword ptr [esp+70h+var_10+4], edx
mov     edx, 0BDECDDE1h
mov     dword ptr [esp+70h+var_10+8], eax
lea     eax, [esp+70h+var_60]
mov     dword ptr [esp+70h+var_10+0Ch], edx
pxor    xmm0, [esp+70h+var_10]
movaps  [esp+70h+var_40], xmm0
mov     [esp+70h+Format], eax ; Format
call    printf
xor     eax, eax
leave
retn


gcc x64 with #define JM_XORSTR_DISABLE_AVX_INTRINSICS

sub     rsp, 88h
call    sub_401600
lea     rcx, [rsp+88h+Format] ; Format
mov     rax, 6335F255388D4FD6h
mov     qword ptr [rsp+88h+Format], rax
mov     rax, 0A2A90DDB719C70ADh
mov     qword ptr [rsp+88h+Format+8], rax
movdqa  xmm0, xmmword ptr [rsp+88h+Format]
mov     rax, 80871D65B03F49F0h
mov     qword ptr [rsp+88h+var_58], rax
mov     rax, 59A58CA578ED42F6h
mov     qword ptr [rsp+88h+var_58+8], rax
mov     rax, 3AEEE05AA042D5C9h
mov     qword ptr [rsp+88h+var_48], rax
mov     rax, 0DC82F1B02D9645E5h
mov     qword ptr [rsp+88h+var_48+8], rax
mov     rax, 5B02C4600CBE7DE7h
mov     qword ptr [rsp+88h+var_38], rax
mov     rax, 0D7D079A914EB0194h
mov     qword ptr [rsp+88h+var_38+8], rax
pxor    xmm0, [rsp+88h+var_38]
mov     rax, 0E7E17916D14F2699h
movaps  xmmword ptr [rsp+88h+Format], xmm0
movdqa  xmm0, [rsp+88h+var_58]
mov     qword ptr [rsp+88h+var_28], rax
mov     rax, 2FC6F4DF1486289Eh
mov     qword ptr [rsp+88h+var_28+8], rax
pxor    xmm0, [rsp+88h+var_28]
mov     rax, 61C9DB748C2FBBABh
movaps  [rsp+88h+var_58], xmm0
movdqa  xmm0, [rsp+88h+var_48]
mov     qword ptr [rsp+88h+var_18], rax
mov     rax, 0DCA9AE8D00EB3EB8h
mov     qword ptr [rsp+88h+var_18+8], rax
pxor    xmm0, xmmword ptr [rsp+88h+var_18]
movaps  [rsp+88h+var_48], xmm0
call    printf
xor     eax, eax
add     rsp, 88h
retn
JustasMasiulis commented 3 years ago

remove (x) (x) at JM_XORSTR_LOAD_FROM_REG; compilation should speed up

👍

remove uint64_v

I use it as a micro optimization. It is smaller / simpler than integral_constant. Not really sure if it makes any difference though 🤷 - would need to benchmark.