Great job sleuthing a minimal repro 🎉
The issue originates from a bug in arrow-json v33.0.0, which we're currently on.
Namely, when iterating over the record batches in arrow_json::writer::record_batches_to_json_rows, an auxiliary vector doesn't get sliced properly, leading to an out-of-bounds access attempt. This has been fixed in newer arrow-json versions (see https://github.com/apache/arrow-rs/pull/3924 and https://github.com/apache/arrow-rs/pull/3934), so we'll pick the fix up eventually (it should be in v36.0.0).
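For reference, a minimal standalone sketch of the kind of input described above: two single-row timestamp batches (standing in for the UNION ALL output) written through arrow-json's line-delimited writer. This is untested and not taken from the issue itself, and the `write_batches` signature shown matches the arrow 33.x API in use here:

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, TimestampNanosecondArray};
use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
use arrow::error::ArrowError;
use arrow::json::LineDelimitedWriter;
use arrow::record_batch::RecordBatch;

fn main() -> Result<(), ArrowError> {
    // One nullable timestamp column, mirroring the query result described in the report.
    let schema = Arc::new(Schema::new(vec![Field::new(
        "t",
        DataType::Timestamp(TimeUnit::Nanosecond, None),
        true,
    )]));

    // Two single-row batches, similar to what a UNION ALL of two SELECTs can produce.
    let make_batch = |v: i64| -> Result<RecordBatch, ArrowError> {
        let col: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![v]));
        RecordBatch::try_new(schema.clone(), vec![col])
    };
    let batches = vec![make_batch(1)?, make_batch(2)?];

    let mut buf = Vec::new();
    {
        // On arrow-json 33.0.0, writing several batches like this goes through the
        // code path described above; newer versions handle it correctly.
        let mut writer = LineDelimitedWriter::new(&mut buf);
        writer.write_batches(&batches)?;
        writer.finish()?;
    }
    println!("{}", String::from_utf8_lossy(&buf));
    Ok(())
}
```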
In the meantime I can add something along the following lines as a mitigation (it could pose a problem for very large outputs, as it roughly doubles the memory taken up by the record batch rows/columns):
```diff
@@ -106,11 +107,12 @@ async fn physical_plan_to_json(
     context: Arc<DefaultSeafowlContext>,
     physical: Arc<dyn ExecutionPlan>,
 ) -> Result<Vec<u8>, DataFusionError> {
+    let schema_ref = physical.schema();
     let batches = context.collect(physical).await?;
     let mut buf = Vec::new();
     let mut writer = LineDelimitedWriter::new(&mut buf);
     writer
-        .write_batches(&batches)
+        .write_batches(&[concat_batches(&schema_ref, batches.iter())?])
         .map_err(DataFusionError::ArrowError)?;
     writer.finish().map_err(DataFusionError::ArrowError)?;
     Ok(buf)
```
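Here is the same workaround as a self-contained sketch outside of Seafowl's physical_plan_to_json (the helper name batches_to_json and its signature are illustrative, not Seafowl's API): all collected batches are merged into a single RecordBatch with arrow::compute::concat_batches, so the JSON writer only ever sees one batch.

```rust
use std::sync::Arc;

use arrow::compute::concat_batches;
use arrow::datatypes::Schema;
use arrow::error::ArrowError;
use arrow::json::LineDelimitedWriter;
use arrow::record_batch::RecordBatch;

fn batches_to_json(schema: Arc<Schema>, batches: &[RecordBatch]) -> Result<Vec<u8>, ArrowError> {
    // A single concatenated batch side-steps the per-batch slicing bug in arrow-json 33.0.0.
    let merged = concat_batches(&schema, batches.iter())?;

    let mut buf = Vec::new();
    {
        let mut writer = LineDelimitedWriter::new(&mut buf);
        writer.write_batches(&[merged])?;
        writer.finish()?;
    }
    Ok(buf)
}
```

The concatenated copy coexists with the original batches for the duration of the call, which is where the doubled memory usage mentioned above comes from.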
Running this query:
results in a `Trying to access an element at index 1 from a PrimitiveArray of length 1` panic. If I remove the `UNION ALL` or cast the timestamp to text before returning it, the error doesn't happen: