apache / arrow-adbc

Database connectivity API standard and libraries for Apache Arrow
https://arrow.apache.org/adbc/
Apache License 2.0
385 stars 97 forks source link

go: `runtime: marked free object in span` is still possible #1931

Open lidavidm opened 5 months ago

lidavidm commented 5 months ago

What happened?

https://github.com/apache/arrow-adbc/issues/729 is still possible because there are other arguments that need to be sanitized before passing on to go.

The gist is the same as earlier, but this time the GC write barrier is not involved. Instead the GC can run and eventually discovers/marks an invalid Go pointer just as part of its normal operation. Hence the symptoms look slightly different.

runtime: marked free object in span 0x7f8b1afb0ac8, elemsize=24 freeindex=0 (bad use of unsafe.Pointer? try -d=checkptr)
...snip...
fatal error: found pointer to free object

goroutine 18 gp=0xc0001a7a40 m=7 mp=0xc0000c3808 [running, locked to thread]:
runtime.throw({0x7f8b1a6be91c?, 0xc00038c258?})
    /home/conda/feedstock_root/build_artifacts/arrow-adbc-split_1708616262548/_build_env/go/src/runtime/panic.go:1023 +0x5e fp=0xc0000aba70 sp=0xc0000aba40 pc=0x7f8b19fd6c5e
runtime.(*mspan).reportZombies(0x7f8b1afb0ac8)
    /home/conda/feedstock_root/build_artifacts/arrow-adbc-split_1708616262548/_build_env/go/src/runtime/mgcsweep.go:872 +0x2ea fp=0xc0000abaf0 sp=0xc0000aba70 pc=0x7f8b19fc69ca
runtime.(*sweepLocked).sweep(0x7f8b1af1cfe0?, 0x0)
    /home/conda/feedstock_root/build_artifacts/arrow-adbc-split_1708616262548/_build_env/go/src/runtime/mgcsweep.go:643 +0xb88 fp=0xc0000abc08 sp=0xc0000abaf0 pc=0x7f8b19fc6168
runtime.sweepone()
    /home/conda/feedstock_root/build_artifacts/arrow-adbc-split_1708616262548/_build_env/go/src/runtime/mgcsweep.go:390 +0xdd fp=0xc0000abc58 sp=0xc0000abc08 pc=0x7f8b19fc531d
runtime.gcStart({0xc000000918?, 0xc0000abd58?, 0x0?})
    /home/conda/feedstock_root/build_artifacts/arrow-adbc-split_1708616262548/_build_env/go/src/runtime/mgc.go:618 +0xdf fp=0xc0000abd18 sp=0xc0000abc58 pc=0x7f8b19fb9e9f
runtime.GC()
    /home/conda/feedstock_root/build_artifacts/arrow-adbc-split_1708616262548/_build_env/go/src/runtime/mgc.go:466 +0x3e fp=0xc0000abd50 sp=0xc0000abd18 pc=0x7f8b19fb9afe
main.FlightSQLStatementRelease.func2()
    /home/conda/feedstock_root/build_artifacts/arrow-adbc-split_1708616262548/work/go/adbc/pkg/flightsql/driver.go:1474 +0xdb fp=0xc0000abd98 sp=0xc0000abd50 pc=0x7f8b1a6a175b
main.FlightSQLStatementRelease(0x7f8b1bc59998, 0x7f8a9cff82e0)
    /home/conda/feedstock_root/build_artifacts/arrow-adbc-split_1708616262548/work/go/adbc/pkg/flightsql/driver.go:1479 +0x17b fp=0xc0000abe38 sp=0xc0000abd98 pc=0x7f8b1a6a155b
_cgoexp_aa58dd989240_FlightSQLStatementRelease(0x7f8a9cff81e0)
    _cgo_gotypes.go:1072 +0x25 fp=0xc0000abe58 sp=0xc0000abe38 pc=0x7f8b1a6a58c5
runtime.cgocallbackg1(0x7f8b1a6a58a0, 0x7f8a9cff81e0, 0x0)
    /home/conda/feedstock_root/build_artifacts/arrow-adbc-split_1708616262548/_build_env/go/src/runtime/cgocall.go:403 +0x2a5 fp=0xc0000abf18 sp=0xc0000abe58 pc=0x7f8b19fa3ac5
runtime.cgocallbackg(0x7f8b1a6a58a0, 0x7f8a9cff81e0, 0x0)
    /home/conda/feedstock_root/build_artifacts/arrow-adbc-split_1708616262548/_build_env/go/src/runtime/cgocall.go:322 +0x138 fp=0xc0000abf90 sp=0xc0000abf18 pc=0x7f8b19fa3778
runtime.cgocallbackg(0x7f8b1a6a58a0, 0x7f8a9cff81e0, 0x0)
    <autogenerated>:1 +0x2b fp=0xc0000abfb8 sp=0xc0000abf90 pc=0x7f8b1a00e54b
runtime.cgocallback(0x0, 0x0, 0x0)
    /home/conda/feedstock_root/build_artifacts/arrow-adbc-split_1708616262548/_build_env/go/src/runtime/asm_amd64.s:1079 +0xcd fp=0xc0000abfe0 sp=0xc0000abfb8 pc=0x7f8b1a00bc8d
runtime.goexit({})
    /home/conda/feedstock_root/build_artifacts/arrow-adbc-split_1708616262548/_build_env/go/src/runtime/asm_amd64.s:1695 +0x1 fp=0xc0000abfe8 sp=0xc0000abfe0 pc=0x7f8b1a00bee1

How can we reproduce the bug?

Unfortunately, the reproduction is private, but you can patch the driver manager to force the bug to happen

diff --git a/go/adbc/pkg/flightsql/driver.go b/go/adbc/pkg/flightsql/driver.go
index 9a8535fe..7ce2bb16 100644
--- a/go/adbc/pkg/flightsql/driver.go
+++ b/go/adbc/pkg/flightsql/driver.go
@@ -17,8 +17,6 @@
 // specific language governing permissions and limitations
 // under the License.

-//go:build driverlib
-
 package main

 // ADBC_EXPORTING is required on Windows, or else the symbols
diff --git a/python/adbc_driver_manager/adbc_driver_manager/_lib.pxd b/python/adbc_driver_manager/adbc_driver_manager/_lib.pxd
index 5c572f78..fe9ef0dd 100644
--- a/python/adbc_driver_manager/adbc_driver_manager/_lib.pxd
+++ b/python/adbc_driver_manager/adbc_driver_manager/_lib.pxd
@@ -28,6 +28,9 @@ cdef extern from "adbc.h" nogil:

     cdef struct CArrowSchema"ArrowSchema":
         CArrowSchemaRelease release
+        CArrowSchema** children
+        CArrowSchema* dictionary
+        void* private_data

     cdef struct CArrowArray"ArrowArray":
         CArrowArrayRelease release
diff --git a/python/adbc_driver_manager/adbc_driver_manager/_lib.pyx b/python/adbc_driver_manager/adbc_driver_manager/_lib.pyx
index cf85e226..d78cc58a 100644
--- a/python/adbc_driver_manager/adbc_driver_manager/_lib.pyx
+++ b/python/adbc_driver_manager/adbc_driver_manager/_lib.pyx
@@ -1217,6 +1217,13 @@ cdef class AdbcStatement(_AdbcHandle):
             0, NULL, NULL, NULL, NULL)
         cdef int64_t rows_affected = 0

+        import random
+        a = 0xc000000000 + random.randint(0, 0xFFFFFF)
+        print("testing", hex(a))
+        schema.schema.private_data = <void*> (<uintptr_t> a)
+        # schema.schema.children = <CArrowSchema**> (<uintptr_t> a)
+        schema.schema.dictionary = <CArrowSchema*> (<uintptr_t> a)
+
         with nogil:
             status = AdbcStatementExecutePartitions(
                 &self.statement,

Environment/Setup

Linux x64; ADBC (Python) 0.10.0 and 1.0.0 were tested and it seems all versions should exhibit this

lidavidm commented 5 months ago

So while this repro forces symptoms close to what was reported, CPython zero-initializes Python objects so there should be no way the ArrowSchemaHandle has an uninitialized struct inside it

So there may be another place where we are allocating and not initializing a struct that gets passed to Go in the code; or there may be a problem with the FFI or C Data code within the driver/Arrow-Go itself

lidavidm commented 5 months ago

Here's another potential issue: we use C.malloc at a few points to allocate a struct that we fill. Well, malloc doesn't initialize the memory so perhaps it's possible a Go-pointer-alike sneaks in there.

This diff is also able to force a crash:

diff --git a/go/adbc/pkg/flightsql/driver.go b/go/adbc/pkg/flightsql/driver.go
index 9a8535fe..cb3810ab 100644
--- a/go/adbc/pkg/flightsql/driver.go
+++ b/go/adbc/pkg/flightsql/driver.go
@@ -17,8 +17,6 @@
 // specific language governing permissions and limitations
 // under the License.

-//go:build driverlib
-
 package main

 // ADBC_EXPORTING is required on Windows, or else the symbols
@@ -46,6 +44,7 @@ package main
 // int FlightSQLArrayStreamGetNextTrampoline(struct ArrowArrayStream*, struct ArrowArray*);
 //
 // void releasePartitions(struct AdbcPartitions* partitions);
+// void* allocError();
 //
 import "C"
 import (
@@ -102,7 +101,8 @@ func setErrWithDetails(err *C.struct_AdbcError, adbcError adbc.Error) {
                return
        }

-       cErrPtr := C.malloc(C.sizeof_struct_FlightSQLError)
+       // cErrPtr := C.malloc(C.sizeof_struct_FlightSQLError)
+       cErrPtr := C.allocError()
        cErr := (*C.struct_FlightSQLError)(cErrPtr)
        cErr.message = C.CString(adbcError.Msg)
        err.message = cErr.message
diff --git a/go/adbc/pkg/flightsql/utils.c b/go/adbc/pkg/flightsql/utils.c
index 95920aa4..d84e1e79 100644
--- a/go/adbc/pkg/flightsql/utils.c
+++ b/go/adbc/pkg/flightsql/utils.c
@@ -24,6 +24,7 @@
 #include "utils.h"

 #include <string.h>
+#include <stdio.h>

 #ifdef __cplusplus
 extern "C" {
@@ -440,6 +441,16 @@ int FlightSQLArrayStreamGetNextTrampoline(struct ArrowArrayStream* stream,
   return FlightSQLArrayStreamGetNext(stream, out);
 }

+  void* allocError(void) {
+    struct FlightSQLError* error = (struct FlightSQLError*)malloc(sizeof(struct FlightSQLError));
+    uintptr_t bad = 0xc000000000;
+    bad += rand() % 0xFFFFFF;
+    error->lengths = (void*)bad;
+    error->message = (void*)bad;
+    printf("tester2 %#010lx\n", bad);
+    return error;
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/go/adbc/pkg/flightsql/utils.h b/go/adbc/pkg/flightsql/utils.h
index fbdbe89a..090b471d 100644
--- a/go/adbc/pkg/flightsql/utils.h
+++ b/go/adbc/pkg/flightsql/utils.h
@@ -179,3 +179,5 @@ int FlightSQLArrayStreamGetSchemaTrampoline(struct ArrowArrayStream* stream,
                                             struct ArrowSchema* out);
 int FlightSQLArrayStreamGetNextTrampoline(struct ArrowArrayStream* stream,
                                           struct ArrowArray* out);
+
+void* allocError(void);
lidavidm commented 5 months ago

If we accept this as the potential cause, we should audit upstream as well. I see a couple of similar patterns in the C Data Interface implementation.

I think we should also audit all cgo definitions and helper C code to look for anything potentially stack allocated or otherwise uninitialized.