pola-rs / polars

Dataframes powered by a multithreaded, vectorized query engine, written in Rust
https://docs.pola.rs
Other
30.68k stars 1.99k forks source link

StackOverflow when using `collect_all` but not `collect` #14079

Open TylerGrantSmith opened 10 months ago

TylerGrantSmith commented 10 months ago

Checks

Reproducible example

import polars as pl

# Number of join columns (and therefore of chained joins) in the query.
# The reporter notes the crash threshold varies by machine; on theirs the
# commented-out value below works while 118 crashes.
# NVARS = 117 # works
NVARS = 118
COLUMNS = list(map(str, range(NVARS)))
FACTOR_COL = "factor"

# One single-row lookup LazyFrame per column, keyed on that column.
lookups = {c: pl.LazyFrame({c: ["a"], FACTOR_COL: [1.0]}) for c in COLUMNS}
ldf = pl.LazyFrame({c: ["a"] for c in COLUMNS})
# Chain one left join per column; each join takes the previous result as its
# left input and adds a uniquely named "<col>_factor" column.
for col, lookup in lookups.items():
    ldf = ldf.join(lookup.rename({FACTOR_COL: col + "_" + FACTOR_COL}), on=col, how="left")

# runs: collect_all on the full (unprojected) frame
print(pl.collect_all([ldf]))

# runs fine with collect
print(ldf.select(COLUMNS[0]).collect())

# seg faults when using collect_all
print(pl.collect_all([ldf.select(COLUMNS[0])]))

Log output

join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
join parallel: true
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished
LEFT join dataframes finished

Issue description

This is a reprex distilled from a more complicated internal process. The value of NVARS at which the errors start varies by machine. On my current machine, that threshold decreased when going from Polars 0.20.4 to 0.20.6.

Expected behavior

To run without segfaulting.

Installed versions

```
--------Version info---------
Polars:               0.20.6
Index type:           UInt32
Platform:             Linux-4.14.326-245.539.amzn2.x86_64-x86_64-with-glibc2.26
Python:               3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0]

----Optional dependencies----
adbc_driver_manager:
cloudpickle:
connectorx:
deltalake:
fsspec:               2023.12.2
gevent:
hvplot:
matplotlib:           3.8.2
numpy:                1.26.2
openpyxl:             3.1.2
pandas:               2.1.4
pyarrow:              13.0.0
pydantic:             2.5.2
pyiceberg:
pyxlsb:
sqlalchemy:
xlsx2csv:
xlsxwriter:
```
cmdlineluser commented 10 months ago

Can reproduce.

If it helps with debugging: it runs when projection_pushdown is disabled.

>>> print(pl.collect_all([ldf.select(COLUMNS[0])], projection_pushdown=False))
[shape: (1, 1)
┌─────┐
│ 0   │
│ --- │
│ str │
╞═════╡
│ a   │
└─────┘]
MarcNuebel commented 10 months ago

Can reproduce. For me, projection_pushdown=False only seems to raise the value of NVARS that is possible before the segfault occurs.

ritchie46 commented 10 months ago

The segfault is caused by a stack overflow on our side. This happens once NVARS reaches a certain value.

saoudm commented 10 months ago

I’m facing the same issue, and it seems to happen also with collect. I’ll try to work on a reproduction.

TylerGrantSmith commented 8 months ago

Thanks all, it looks like the issue was resolved along with many others as of 0.20.17

TylerGrantSmith commented 8 months ago

That was premature... the example as posted passes, but it still overflows now at a larger size (at ~N=500).

mmore500 commented 19 hours ago

I believe I may be encountering a similar issue, as of Polars 1.16.0 on Python 3.12.7. My crash also gives error code 139 (segfault). In case it's useful, I've attached an example of where I'm encountering a crash. Output from mwb.py is

attempting S=128 collect_chunked_=False
didn't crash!
attempting S=128 collect_chunked_=True
didn't crash!
attempting S=256 collect_chunked_=False
didn't crash!
attempting S=256 collect_chunked_=True

(and then it crashes)

mwb.py:

```
import itertools as it

import numpy as np
import polars as pl


def bitlen_pl(col: pl.Expr) -> pl.Expr:
    """Create Polars expression for the bit length of integers."""
    return pl.lit(64) - col.cast(pl.UInt64).bitwise_leading_zeros()


def collect_chunked(df: pl.LazyFrame, num_rows: int) -> pl.LazyFrame:
    """Collect a Polars LazyFrame, collecting in chunks for multithreaded processing."""
    n_chunks = max(pl.thread_pool_size() - 1, 1)
    chunk_size = max(num_rows // n_chunks, 1)
    chunks = it.pairwise([*range(0, n_chunks, chunk_size), num_rows])
    # collect_all uses polars threadpool to collect chunks in parallel
    collected = pl.collect_all([df[slice(*chunk)] for chunk in chunks])
    concatenated = pl.concat([c.lazy() for c in collected], rechunk=False)
    return concatenated


def _steady_lookup_ingest_times_batched_polars(
    S: int,
    T: np.ndarray,
    collect_chunked_: bool = True,
) -> np.ndarray:
    """Implementation detail for steady_lookup_ingest_times_batched."""
    assert S > 1 and int(S).bit_count() == 1
    c_, l_ = pl.col, pl.lit

    df = pl.LazyFrame({"T": T})

    s = int(S).bit_length() - 1
    t = bitlen_pl(c_("T")) - l_(s)
    df = df.with_columns(t=t)

    b = 0  # Bunch physical index (left-to right)
    m_b__ = 1  # Countdown on segments traversed within bunch
    b_star = True  # Have traversed all segments in bunch?
    k_m__ = s + 1  # Countdown on sites traversed within segment

    for k in range(S):  # Iterate over buffer sites, except unused last one
        # Calculate info about current segment...
        epsilon_w = b == 0  # Correction on segment width if first segment
        # Number of sites in current segment (i.e., segment size)
        w = s - b + epsilon_w
        m = (1 << b) - m_b__  # Calc left-to-right index of current segment
        # Max possible hanoi value in segment during epoch
        h_max = c_("t") + l_(w - 1)
        df = df.with_columns(h_max=h_max)

        # Calculate candidate hanoi value...
        h_ = c_("h_max") - (c_("h_max") + l_(k_m__)) % l_(w)
        df = df.with_columns(h_=h_)

        # Decode ingest time of assigned h.v. from segment index g, ...
        # ... i.e., how many instances of that h.v. seen before
        T_bar_k_ = np.left_shift(l_(2 * m + 1), c_("h_")) - l_(1)
        df = df.with_columns(T_bar_k_=T_bar_k_)
        # ^^^ Guess ingest time
        epsilon_h = (c_("T_bar_k_") >= c_("T")) * l_(w)
        df = df.with_columns(epsilon_h=epsilon_h)
        # ^^^ Correction on h.v. if not yet seen
        h = c_("h_") - c_("epsilon_h")  # Corrected true resident h.v.
        df = df.with_columns(h=h)
        T_bar_k = np.left_shift(l_(2 * m + 1), c_("h")) - l_(1)
        # ^^^ True ingest time
        df = df.with_columns(T_bar_k.alias(f"{k}"))

        # Update within-segment state for next site...
        k_m__ = (k_m__ or w) - 1  # Bump to next site within segment

        # Update h for next site...
        # ... only needed if not calculating h fresh every iter [[see above]]
        h_ = c_("h_") + l_(1) - (c_("h_") >= c_("h_max")) * l_(w)
        df = df.with_columns(h_=h_)

        # Update within-bunch state for next site...
        m_b__ -= not k_m__  # Bump to next segment within bunch
        b_star = not (m_b__ or k_m__)  # Should bump to next bunch?
        b += b_star  # Do bump to next bunch, if any
        # Set within-bunch segment countdown, if bumping to next bunch
        m_b__ = m_b__ or (1 << (b - 1))

    df = df.select("^[0-9]+$")  # select only numbered Tbar_k "result" columns
    if collect_chunked_:
        df = collect_chunked(df, len(T))

    return df.collect().to_numpy()


if __name__ == "__main__":
    for S, collect_chunked_ in it.product([128, 256], [False, True]):
        print(f"attempting {S=} {collect_chunked_=}")
        _steady_lookup_ingest_times_batched_polars(
            S,  # S: int
            np.arange(20),  # T: np.ndarray
            collect_chunked_=collect_chunked_,  # use threadpool to collect?
        )
        print("didn't crash!")
```