Closed shpalani closed 1 week ago
As a follow-up, please provide the stack trace of the segmentation violation.
0:000> k
# Child-SP RetAddr Call Site
00 00000022`48afe8e8 00007ff6`367ebc74 VCRUNTIME140!memcpy+0x17d [D:\a\_work\1\s\src\vctools\crt\vcruntime\src\string\amd64\memcpy.asm @ 304]
01 (Inline Function) --------`-------- unit_tests!std::_Copy_memmove+0x14 [C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Tools\MSVC\14.39.33519\include\xutility @ 4498]
02 (Inline Function) --------`-------- unit_tests!std::_Copy_unchecked+0x14 [C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Tools\MSVC\14.39.33519\include\xutility @ 4570]
03 (Inline Function) --------`-------- unit_tests!std::copy+0x14 [C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Tools\MSVC\14.39.33519\include\xutility @ 4589]
04 00000022`48afe8f0 00007ff6`367f13cf unit_tests!_update_map_element_batch+0x2b4 [C:\Users\shpalan\ebpf-for-windows-2024\ebpf-for-windows\libs\api\ebpf_api.cpp @ 780]
05 00000022`48afea10 00007ff6`36795371 unit_tests!ebpf_map_update_element_batch+0x3df [C:\Users\shpalan\ebpf-for-windows-2024\ebpf-for-windows\libs\api\ebpf_api.cpp @ 984]
06 00000022`48afeb00 00007ff6`367aebf5 unit_tests!_test_maps_batch+0x581 [C:\Users\shpalan\ebpf-for-windows-2024\ebpf-for-windows\tests\unit\libbpf_test.cpp @ 3317]
07 00000022`48aff260 00007ff6`367af2c4 unit_tests!Catch::RunContext::invokeActiveTestCase+0x45
08 00000022`48aff290 00007ff6`367af89a unit_tests!Catch::RunContext::runCurrentTest+0x2e4
09 00000022`48aff4a0 00007ff6`36887805 unit_tests!Catch::RunContext::runTest+0x39a
0a 00000022`48aff750 00007ff6`3688822d unit_tests!Catch::Clara::Detail::BasicResult<Catch::Clara::Detail::ParseState>::enforceOk+0xa5
0b 00000022`48aff7e0 00007ff6`3688271a unit_tests!Catch::Session::runInternal+0x24d
0c 00000022`48affac0 00007ff6`3665f818 unit_tests!main+0x9a
0d (Inline Function) --------`-------- unit_tests!invoke_main+0x22 [D:\a\_work\1\s\src\vctools\crt\vcstartup\src\startup\exe_common.inl @ 78]
0e 00000022`48affc80 00007fff`2626257d unit_tests!__scrt_common_main_seh+0x10c [D:\a\_work\1\s\src\vctools\crt\vcstartup\src\startup\exe_common.inl @ 288]
0f 00000022`48affcc0 00007fff`27f4aa48 KERNEL32!BaseThreadInitThunk+0x1d
10 00000022`48affcf0 00000000`00000000 ntdll!RtlUserThreadStart+0x28 [minkernel\ntdll\rtlstrt.c @ 1166]
0:000> g
(13c8.395c): Access violation - code c0000005 (first chance)
First chance exceptions are reported before any exception handling.
This exception may be expected and handled.
VCRUNTIME140!memcpy+0x17d:
00007fff`05d9123d c5fe6f02 vmovdqu ymm0,ymmword ptr [rdx] ds:0000025c`0c60f000=??
VCRUNTIME140!memcpy+0x17d:
00007fff05d9123d c5fe6f02 vmovdqu ymm0,ymmword ptr [rdx] ds:0000025c
0c60f000=??
;
; move memory with AVX loads and stores
;
; Entry:
; r8 - length (> 32 bytes)
; rcx - destination block address
; rdx - source block address
; Exit:
; rax - destination block address
;
; Implementation:
; the operation is based on a loop with eight AVX loads and stores copying 256 bytes in an iteration with the remainder (or blocks themselves)
; handled through a table based dispatch to the code that handles the remaining length with the appropriate number of AVX loads and stores
;
MoveWithYMM:
**vmovdqu ymm0, YMMWORD PTR (-__AVX_STEP_LEN*0)[rdx] ; save the first 32-byte chunk which may be overwritten while copying overlapping blocks with destination not 32-byte aligned**
vmovdqu ymm5, YMMWORD PTR (-__AVX_STEP_LEN*1)[rdx + r8] ; save the last 32-byte chunk which may be overwritten by the next to last AVX store
cmp r8, __AVX_LOOP_LEN ; blocks up to the loop iteration size (256 bytes) go straight to the loop remainder handling code
jbe MovUpTo256WithYMM ; fall-through to the loop code for blocks greater than the loop iteration size (256 bytes)
mov r9, rcx ; it is generally profitable to align the loop stores on the 32-byte boundary, this is done by shifting
and r9, __AVX_STEP_LEN-1 ; the starting source (rdx) and destination (rcx) copy addresses by the number of bytes (complement r9) required
sub r9, __AVX_STEP_LEN ; to make the destination block aligned on the next 32-byte boundary and reducing the loop copy length accordingly,
sub rcx, r9 ; the skipped head bytes will be copied in the remainder handling code;
sub rdx, r9
add r8, r9
cmp r8, __AVX_LOOP_LEN ; compare the block copy length (which may have gotten shorter) to the loop iteration length,
jbe MovUpTo256WithYMM ; if it is less than it go to the loop remainder handling code
cmp r8, __NT_AVX_THRESHOLD ; blocks greater than a certain threshold are best handled with non-temporal stores
ja YmmLoopNT
0:000> dx Debugger.Sessions[0].Processes[5064].Threads[14684].Stack.Frames[4].SwitchTo();dv /t /v
Debugger.Sessions[0].Processes[5064].Threads[14684].Stack.Frames[4].SwitchTo()
00000093`62efe928 int64 map_handle = 0n1
00000093`62efe920 void * key = 0x0000025c`0c5a1da0
00000093`62efe910 unsigned int64 key_size = 4
00000093`62efe930 void * value = 0x0000025c`0c5b5680
00000093`62efe908 unsigned int64 value_size = 0x80
00000093`62efea18 unsigned int * count = 0x00000093`62eff0e0
00000093`62efea20 unsigned int64 flags = 1
00000093`62efe958 struct _ebpf_operation_map_update_element_batch_reply reply = struct _ebpf_operation_map_update_element_batch_reply
00000093`62efe938 unsigned int64 input_count = 0x4e20
00000093`62efe940 class std::vector<unsigned char,std::allocator<unsigned char> > request_buffer = { size=0xffd4 }
@ebx ebpf_result result = EBPF_STALE_ID (0n33)
<unavailable> struct _ebpf_operation_map_update_element_batch_request * request = <value unavailable>
00000093`62efe900 unsigned int64 key_index = 0x9b0
@r12 unsigned int64 index = 0x183
<unavailable> unsigned char * destination_value = <value unavailable>
<unavailable> unsigned char * destination_key = <value unavailable>
index 0x183 = 387 key_index 0x9b0 = 2480 input_count 0x4e20 = 20000
Code:
void
_test_maps_batch(bpf_map_type map_type)
{
...
// Insert keys in batch.
REQUIRE(**bpf_map_update_batch**(map_fd, keys.data(), values.data(), &update_batch_size, &opts) == 0);
...
}
static ebpf_result_t
_update_map_element_batch(
ebpf_handle_t map_handle,
_In_opt_ const void* key,
size_t key_size,
_In_ const void* value,
size_t value_size,
_Inout_ uint32_t* count,
uint64_t flags) NO_EXCEPT_TRY
{
...
for (size_t index = 0; index < entries_to_update; index++) {
uint8_t* source_key = (uint8_t*)key + (key_index + index) * key_size;
uint8_t* source_value = (uint8_t*)value + (key_index + index) * value_size;
uint8_t* destination_key = request->data + index * (key_size + value_size);
uint8_t* destination_value = request->data + index * (key_size + value_size) + key_size;
if (key_size > 0) {
std::copy(source_key, source_key + key_size, destination_key);
}
std::copy(source_value, source_value + value_size, destination_value);
}
...
}
Analysis:
Additional:
Thanks @shpalani . For doxygen, you would need to add the documentation in include\bpf\bpf.h. I see the batch APIs are missing there.
From the discussion, checking the value size and returning EINVAL is impossible in the ebpf platform.. And since it is a user error, the P1 can be removed.
Status: With the modified user-mode applied value-size for per-cpu hash map type, bpf_map_lookup_batch doesn't work after bpf_map_update_batch call. I am still investigating and debugging the ebpf core for a fix.
In PR last week, waiting for reviewers.
Describe the bug
OS information
Windows 11 and above.
Steps taken to reproduce the bug
Add the below test case to the existing batch operation test code in https://github.com/microsoft/ebpf-for-windows/pull/3563
TEST_CASE("libbpf percpu hash map batch", "[libbpf]") { _test_maps_batch(BPF_MAP_TYPE_PERCPU_HASH); } the
Expected behavior
No crash.
Actual outcome
Additional details
No response