Open Knogle opened 2 months ago
Ahoy. There is a legacy variant inside the code, so if there is no support present it will use the variant without SSE and AVX. I have to alter it, so it works for the compile time as well.
if( use_avx2 )
{
// Use AVX2-optimized version
rc4_genrand_4096_to_buf_avx2( (rc4_state_t*) *state, temp_output );
}
else if( use_sse4 )
{
// Use SSE4.2-optimized version
rc4_genrand_4096_to_buf_sse42( (rc4_state_t*) *state, temp_output );
}
else
{
// Fallback to generic version
rc4_genrand_4096_to_buf( (rc4_state_t*) *state, temp_output );
}
Should now compile and work perfectly fine, even from older platforms :) Added pre-processor macro in addition to the instruction checking. Also compiles and works fine on ARM.
What are your thoughts on this? @PartialVolume I've rewritten the AVX2 function in NASM, providing a small performance gain which i have tested. Is this something that's okay from your perspective, or do you prefer to stay in plain C? Regarding this topic, is nwipe meant to be to stay in C, or be migrated to C++ or Rust or something?
; rc4_genrand_4096_to_buf_avx2.asm
; Generates 4096 bytes of pseudorandom data using RC4, optimized with AVX2 instructions.
; This function is designed to work with a modified RC4 algorithm that utilizes AVX2 for
; faster processing of blocks of 32 bytes at a time, reducing overhead and optimizing throughput.
; Function prototype:
; void rc4_genrand_4096_to_buf_avx2(rc4_state_t* state, unsigned char* bufpos);
section .text
global rc4_genrand_4096_to_buf_avx2
rc4_genrand_4096_to_buf_avx2:
; Function parameters:
; rc4_state_t* state -> RDI (pointer to the RC4 state structure)
; unsigned char* bufpos -> RSI (pointer to the output buffer)
; Function prologue to preserve the calling function's state
push rbp ; Save the base pointer
mov rbp, rsp ; Set up a new stack frame
push rbx ; Save general-purpose registers
push r12
push r13
push r14
push r15
; Local variables and register assignments:
; rdi -> state (pointer to RC4 state structure)
; rsi -> bufpos (pointer to the output buffer)
; rax, rbx, rcx, rdx, r8, r9 are used as temporary registers
; Initialize n (the loop counter) to 0
xor rax, rax ; Clear rax, setting it to 0
mov r13, rax ; r13 = n = 0
; Define the output data length (4096 bytes)
mov r12, 4096 ; r12 = OUTPUT_DATA_LENGTH
.loop_start:
; Check if n has reached the output length (4096 bytes)
cmp r13, r12 ; Compare n with OUTPUT_DATA_LENGTH
jge .loop_end ; If n >= OUTPUT_DATA_LENGTH, exit the loop
; Prefetch the S-box data for faster access
mov rax, [rdi] ; Load state->S (pointer to S-box) into rax
movzx ebx, byte [rdi + 256] ; Load state->i into ebx
add ebx, 16 ; Increment state->i by 16 (preparation for AVX2)
and ebx, 0xFF ; Ensure the index is within 0-255 (modulo 256)
lea rcx, [rax + rbx] ; Compute the address of state->S[state->i + 16]
prefetcht0 [rcx] ; Prefetch the S-box data for better cache utilization
; Increment the RC4 counter (used for CTR-mode modifications)
; state->counter++
mov rax, [rdi + 264] ; Load state->counter
inc rax ; Increment the counter
mov [rdi + 264], rax ; Store the updated counter back
; Mix the counter into the S-box (to add variation)
; uint64_t counter_value = state->counter;
mov r14, rax ; r14 = counter_value (the incremented counter)
mov r15, 0 ; Initialize loop counter for mixing (i = 0)
.counter_mix_loop:
; Mix the counter into the state for 8 iterations
cmp r15, 8 ; Compare loop counter with 8
jge .generate_random ; If i >= 8, proceed to generate random bytes
; Increment and wrap state->i
movzx ebx, byte [rdi + 256] ; Load state->i
inc ebx ; Increment state->i by 1
and ebx, 0xFF ; Modulo 256 to wrap within 0-255
mov byte [rdi + 256], bl ; Store updated state->i
; Update state->j based on the new state->i, S-box, and counter
movzx ecx, byte [rdi + 257] ; Load state->j
mov rax, [rdi] ; Load pointer to the S-box
movzx edx, byte [rax + rbx] ; Load state->S[state->i]
movzx edi, dl ; edi = state->S[state->i]
movzx esi, byte r14b ; (counter_value & 0xFF)
add ecx, edi ; Update state->j with state->S[state->i]
add ecx, esi ; Add the counter's least significant byte
and ecx, 0xFF ; Modulo 256 for wrapping
mov byte [rdi + 257], cl ; Store updated state->j
; Swap state->S[state->i] and state->S[state->j]
movzx edx, byte [rax + rbx] ; Load state->S[state->i]
movzx edi, byte [rax + rcx] ; Load state->S[state->j]
mov byte [rax + rbx], dil ; state->S[state->i] = state->S[state->j]
mov byte [rax + rcx], dl ; state->S[state->j] = the previous state->S[state->i]
; Shift the counter to process the next byte
shr r14, 8 ; Right shift the counter_value by 8 bits
inc r15 ; Increment the loop counter
jmp .counter_mix_loop ; Continue mixing for 8 iterations
.generate_random:
; Generate 32 bytes of pseudorandom data in this iteration
mov r15, 0 ; Reset the loop counter for data generation
.random_gen_loop:
; Generate 1 byte at a time until 32 bytes are generated
cmp r15, 32 ; Check if 32 bytes have been generated
jge .store_data ; If 32 bytes are generated, proceed to store
; Increment state->i
movzx ebx, byte [rdi + 256] ; Load state->i
inc ebx ; Increment state->i
and ebx, 0xFF ; Wrap within 0-255 (modulo 256)
mov byte [rdi + 256], bl ; Store updated state->i
; Update state->j based on state->i and the S-box
movzx ecx, byte [rdi + 257] ; Load state->j
mov rax, [rdi] ; Load pointer to the S-box
movzx edx, byte [rax + rbx] ; Load state->S[state->i]
add ecx, edx ; Update state->j with state->S[state->i]
and ecx, 0xFF ; Modulo 256 for wrapping
mov byte [rdi + 257], cl ; Store updated state->j
; Swap state->S[state->i] and state->S[state->j]
movzx edx, byte [rax + rbx] ; Load state->S[state->i]
movzx edi, byte [rax + rcx] ; Load state->S[state->j]
mov byte [rax + rbx], dil ; Swap state->S[state->i] with state->S[state->j]
mov byte [rax + rcx], dl ; Swap state->S[state->j] with state->S[state->i]
; Generate the pseudorandom byte from the S-box
movzx edx, byte [rax + rbx] ; Load state->S[state->i]
movzx edi, byte [rax + rcx] ; Load state->S[state->j]
add edx, edi ; Sum the two values
and edx, 0xFF ; Modulo 256 to wrap
movzx edx, byte [rax + rdx] ; Load the pseudorandom byte from S-box
; Store the byte into the temporary buffer
mov byte [rsp + r15], dl
; Increment the loop counter for the next byte
inc r15
jmp .random_gen_loop ; Continue generating bytes
.store_data:
; Store the generated 32 bytes into the output buffer
vmovdqu ymm0, [rsp] ; Load 32 bytes into the AVX2 register ymm0
vmovdqu [rsi + r13], ymm0 ; Store 32 bytes to the output buffer
; Increment n by 32 (since 32 bytes have been generated in this iteration)
add r13, 32
jmp .loop_start ; Continue the loop to generate the next block
.loop_end:
; Function epilogue to restore the previous state
pop r15 ; Restore saved registers
pop r14
pop r13
pop r12
pop rbx
pop rbp
ret ; Return to the caller
I don't have a problem with assembler unless it reduces the number of CPUs that nwipe can be built for, which I guess is very likely?. Nwipe must be able to run on Intel or AMD processors prior to the AVX2 instructions, right back to circa 2000 and Pentium 4s. Also nwipe currently runs on ARM processors so anything that makes it less compatible with a wide range of systems is not something I would want. I'd rather it was slightly slower but could be used on any computer rather than tuned and compatible with a subset of systems.
As for rust, there is nothing stopping anybody writing some wipe software in rust but I really don't really see any point rewriting an application like nwipe in rust. It just to seems to me it would be a whole load of work possibly resulting in a buggy implementation, after all, memory safety is not the only place bugs might exist. But anybody wanting to re-write nwipe in rust, go for it but it's highly unlikely I would accept rust code so they would need to run their own separate project.
Same goes for C++, once I might have accepted C++ code, I have written a few QT based C++ programs in the past but nwipe is C and that's the way it will probably stay at least while I'm still working on it.
I agree with this stance. Nwipe needs to interface and/or borrow from low-level code, usually written in C, and portability across CPUs is a must as servers are adding ARM processors.
-- Mike
From: PartialVolume @.> Sent: Thursday, September 19, 2024 2:40:37 PM To: martijnvanbrummelen/nwipe @.> Cc: Subscribed @.***> Subject: Re: [martijnvanbrummelen/nwipe] Implement RC4 PRNG with AVX2 and SSE4.2 Optimizations (PR #604)
I don't have a problem with assembler unless it reduces the number of CPUs that nwipe can be built for, which I guess is very likely?. Nwipe must be able to run on Intel or AMD processors prior to the AVX2 instructions, right back to circa 2000 and Pentium 4s. Also nwipe currently runs on ARM processors so anything that makes it less compatible with a wide range of systems is not something I would want. I'd rather it was slightly slower but could be used on any computer rather than tuned and compatible with a subset of systems.
As for rust, there is nothing stopping anybody writing some wipe software in rust but I really don't really see any point rewriting an application like nwipe in rust. It just to seems to me it would be a whole load of work possibly resulting in a buggy implementation, after memory safety is not the only place bugs might exist. But anybody wanting to re-write nwipe in rust, go for it but it's highly unlikely I would accept rust code so they would need to run their own separate project.
Same goes for C++, once I might have accepted C++ code, I have written a few QT based C++ programs in the past but nwipe is C and that's the way it will probably stay at least while I'm still working on it.
— Reply to this email directly, view it on GitHubhttps://github.com/martijnvanbrummelen/nwipe/pull/604#issuecomment-2362037559, or unsubscribehttps://github.com/notifications/unsubscribe-auth/ANGK2PRW26O2V2ZU7DMY23TZXMSDLAVCNFSM6AAAAABOB6DEDSVHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMZDGNRSGAZTONJVHE. You are receiving this because you are subscribed to this thread.Message ID: @.***>
This commit introduces a high-performance RC4-based pseudorandom number generator (PRNG) optimized for modern CPU architectures. Key changes and improvements over the traditional RC4 implementation include:
This RC4 PRNG is now faster and more suitable for generating large volumes of random data, taking full advantage of modern hardware capabilities. It is not intended for cryptographic security purposes.
It also provides insanely high entropy by dropping the first biased 256-bits, and introducing the CTR mode. This algorithm, in comparison to the others, can still be massively optimized.
0 0x0 Rising entropy edge (0.999979)