rust-lang / rust

Empowering everyone to build reliable and efficient software.
https://www.rust-lang.org
Other
96.92k stars 12.53k forks source link

Rust 1.79 regression: LLVM hang when compiling brotli-decompressor for i686 #130271

Open neivv opened 1 week ago

neivv commented 1 week ago

Since Rust 1.79, compiling the brotli-decompressor crate no longer finishes (in any reasonable time) when it is built with the following parameters:

For reference, rustc 1.78 reports LLVM version: 18.1.2 while rustc 1.79 reports LLVM version: 18.1.7.

Stack trace from the only thread doing work

The function which is taking forever to run is llvm::SelectionDAG::Combine. This specific backtrace is from 1.82 compiler built from commit 9649706, ~two weeks ago, with LLVM 19.1.0 from Rust CI.

rustc_driver_90ddd6af3f105e8d!memcpy+0x236 [D:\a\_work\1\s\src\vctools\crt\vcruntime\src\string\amd64\memcpy.asm @ 355] 
rustc_driver_90ddd6af3f105e8d!llvm::SetVector<llvm::SDNode * __ptr64,llvm::SmallVector<llvm::SDNode * __ptr64,32>,llvm::DenseSet<llvm::SDNode * __ptr64,llvm::DenseMapInfo<llvm::SDNode * __ptr64,void> >,32>::insert+0x60d7f
rustc_driver_90ddd6af3f105e8d!llvm::SetVector<llvm::SDNode * __ptr64,llvm::SmallVector<llvm::SDNode * __ptr64,32>,llvm::DenseSet<llvm::SDNode * __ptr64,llvm::DenseMapInfo<llvm::SDNode * __ptr64,void> >,32>::insert+0x322d
rustc_driver_90ddd6af3f105e8d!llvm::SelectionDAG::Combine+0x78a
rustc_driver_90ddd6af3f105e8d!llvm::SelectionDAGISel::CodeGenAndEmitDAG+0x62c
rustc_driver_90ddd6af3f105e8d!llvm::SelectionDAGISel::SelectBasicBlock+0x1bc
rustc_driver_90ddd6af3f105e8d!llvm::SelectionDAGISel::SelectAllBasicBlocks+0x184c
rustc_driver_90ddd6af3f105e8d!llvm::SelectionDAGISel::runOnMachineFunction+0x14e
rustc_driver_90ddd6af3f105e8d!llvm::<no symbols?>
rustc_driver_90ddd6af3f105e8d!llvm::SelectionDAGISelLegacy::runOnMachineFunction+0xad
rustc_driver_90ddd6af3f105e8d!llvm::MachineFunctionPass::runOnFunction+0x245
rustc_driver_90ddd6af3f105e8d!llvm::FPPassManager::runOnFunction+0x26c
rustc_driver_90ddd6af3f105e8d!llvm::FPPassManager::runOnModule+0x31
rustc_driver_90ddd6af3f105e8d!llvm::legacy::PassManagerImpl::run+0x361
rustc_driver_90ddd6af3f105e8d!LLVMRustWriteOutputFile+0x3b7
rustc_driver_90ddd6af3f105e8d!<no symbols>
rustc_driver_90ddd6af3f105e8d!<no symbols>
rustc_driver_90ddd6af3f105e8d!<no symbols>
rustc_driver_90ddd6af3f105e8d!<no symbols>
rustc_driver_90ddd6af3f105e8d!<no symbols>
std_4dc33288a5442bd4!alloc::boxed::impl$48::call_once+0xb [rust\library\alloc\src\boxed.rs @ 2231] 
std_4dc33288a5442bd4!alloc::boxed::impl$48::call_once+0x16 [rust\library\alloc\src\boxed.rs @ 2231] 
std_4dc33288a5442bd4!std::sys::pal::windows::thread::impl$0::new::thread_start+0x58 [rust\library\std\src\sys\pal\windows\thread.rs @ 55] 
KERNEL32!BaseThreadInitThunk+0x14
ntdll!RtlUserThreadStart+0x21

Minimized reproducer

Because it is simpler, this one finishes within 5~10 minutes, but it still takes massively longer to compile when targeting i686 than it does for other targets.

// rustc -C opt-level=1 --crate-type lib --target i686-pc-windows-msvc a.rs
#![allow(dead_code, unused_variables)]

// Builds an array literal that repeats one expression `n` times:
// `static_array2!(expr; n)` expands to `[expr, expr, ..., expr]`.
//
// `n` must be one of the literal tokens that has an `@accum` rule below
// (0, 1, 2, 4, 8, ..., 2048); any other count matches no rule.
//
// The expansion is a push-down accumulator of the form
// `@accum (count, exprs...) -> (body...)`:
//   * every rule for a count >= 4 writes the expression list out twice and
//     recurses with the count halved;
//   * the rule for 2 appends the (already doubled) list to the body twice;
//   * the rule for 1 appends the list to the body once;
//   * the rule for 0 stops and wraps the accumulated body in `[...]`.
// NOTE: this is part of a minimized compiler-bug reproducer; the token shape
// is left exactly as reported.
macro_rules! static_array2 {
    // Terminal step: the remaining expressions are ignored and the body
    // becomes the array literal.
    (@accum (0, $($_ignored:expr),*) -> ($($body:tt)*))
        => {static_array2!(@as_expr [$($body)*])};
    // Count 1: emit the expression list once, then finish through the 0 rule.
    (@accum (1, $($expr:expr),*) -> ($($body:tt)*))
        => {static_array2!(@accum (0, $($expr),*) -> ($($body)* $($expr,)*))};
    // Count 2: emit the expression list twice, then finish through the 0 rule.
    (@accum (2, $($expr:expr),*) -> ($($body:tt)*))
        => {static_array2!(@accum (0, $($expr),*) -> ($($body)* $($expr,)* $($expr,)*))};
    // Counts 4..=2048: double the expression list and halve the count; nothing
    // is added to the body until the count reaches 2.
    (@accum (4, $($expr:expr),*) -> ($($body:tt)*))
        => {static_array2!(@accum (2, $($expr,)* $($expr),*) -> ($($body)*))};
    (@accum (8, $($expr:expr),*) -> ($($body:tt)*))
        => {static_array2!(@accum (4, $($expr,)* $($expr),*) -> ($($body)*))};
    (@accum (16, $($expr:expr),*) -> ($($body:tt)*))
        => {static_array2!(@accum (8, $($expr,)* $($expr),*) -> ($($body)*))};
    (@accum (32, $($expr:expr),*) -> ($($body:tt)*))
        => {static_array2!(@accum (16, $($expr,)* $($expr),*) -> ($($body)*))};
    (@accum (64, $($expr:expr),*) -> ($($body:tt)*))
        => {static_array2!(@accum (32, $($expr,)* $($expr),*) -> ($($body)*))};
    (@accum (128, $($expr:expr),*) -> ($($body:tt)*))
        => {static_array2!(@accum (64, $($expr,)* $($expr),*) -> ($($body)*))};
    (@accum (256, $($expr:expr),*) -> ($($body:tt)*))
        => {static_array2!(@accum (128, $($expr,)* $($expr),*) -> ($($body)*))};
    (@accum (512, $($expr:expr),*) -> ($($body:tt)*))
        => {static_array2!(@accum (256, $($expr,)* $($expr),*) -> ($($body)*))};
    (@accum (1024, $($expr:expr),*) -> ($($body:tt)*))
        => {static_array2!(@accum (512, $($expr,)* $($expr),*) -> ($($body)*))};
    (@accum (2048, $($expr:expr),*) -> ($($body:tt)*))
        => {static_array2!(@accum (1024, $($expr,)* $($expr),*) -> ($($body)*))};
    // Re-parses the bracketed token list as a single expression.
    (@as_expr $expr:expr) => {$expr};

    // Public entry point: start with one copy of `$expr` and an empty body.
    ($expr:expr; $n:tt) => { static_array2!(@accum ($n, $expr) -> ()) };
}

// Holds a fixed-size table of 512 mutable byte slices borrowed for `'a`.
// Nothing in this reproducer reads the table after it is built; it is only
// constructed and moved.
pub struct StackAllocator<'a> {
    // 512 entries; the reproducer initializes each one to an empty slice.
    freelist : [&'a mut [u8]; 512],
}

// Entry point of the reproducer.
//
// Builds a `StackAllocator` whose 512 slots are all empty slices, moves it
// into a `BrotliState`, and then inspects the first input byte.
//
// Panics:
//   * with "a" when `input[0] == 0`;
//   * on the `input[0]` bounds check when `input` is empty.
// `brotli_state` is still alive at both panic sites, so its `Drop` impl runs
// either on normal return or while unwinding.
pub fn brotli_decode_prealloc(input: &[u8])  {
    let stack_hc_allocator = StackAllocator {
        // Expands to an array literal of 512 `&mut []` elements.
        freelist : static_array2!(&mut[]; 512),
    };
    // Never read afterwards; kept so the value is dropped at the end of scope.
    let brotli_state = BrotliState::new(stack_hc_allocator);
    if input[0] == 0 { panic!("a") }
}

// Wrapper that owns a `StackAllocator`. It has a `Drop` impl in this file
// that calls the external `BrotliStateCleanup` function.
pub struct BrotliState<'c> {
  // The allocator moved in by `BrotliState::new`.
  pub alloc_hc: StackAllocator<'c>,
}

// Constructor for `BrotliState`.
// NOTE: `'a` and `'b` are declared on this impl but not used anywhere in it;
// they are kept because the reproducer is left exactly as reported.
impl <'a, 'b, 'c> BrotliState<'c> {
    // Moves `alloc_hc` (an array of 512 slice references, by value) into a new
    // `BrotliState`. `#[inline(never)]` asks the compiler to keep this as an
    // out-of-line call.
    #[inline(never)]
    pub fn new(alloc_hc : StackAllocator<'c>) -> Self {
        BrotliState {
            alloc_hc,
        }
    }
}

// Foreign function declaration; no definition is provided in this file.
// The reproducer is built with `--crate-type lib`, so the symbol is only
// referenced, never resolved here.
extern {
    // Called from `BrotliState`'s `Drop` impl with a pointer to the state.
    fn BrotliStateCleanup(x: *mut u32);
}

// Runs the external cleanup hook whenever a `BrotliState` is dropped.
impl <'c> Drop for BrotliState<'c> {
    fn drop(&mut self) {
        // Passes the address of `self`, reinterpreted as `*mut u32`, to the
        // foreign function.
        // NOTE(review): what the callee does with the pointer is not visible
        // here, so the soundness of this call cannot be confirmed from this
        // file.
        unsafe { BrotliStateCleanup(self as *mut _ as *mut u32); }
    }
}
neivv commented 1 week ago

Reproducing this issue on the full brotli-decompressor crate seems finicky. I was able to do it while trying to get the minified example, and it still hangs while being compiled as a dependency, but at the moment just cloning source and building it with opt-level 1 started to work fine 🤷

nikic commented 1 week ago

Doesn't reproduce on i686-unknown-linux-gnu, but does on i686-pc-windows-msvc. From a quick look, it seems like the time is spent in the store merging DAGCombine. I think there's a PR somewhere to improve that, I'll try to find it again...

nikic commented 1 week ago

The PR I had in mind is https://github.com/llvm/llvm-project/pull/106949. Without that PR the test case takes ~1 min, with it about ~1 second. I'll request a backport.