elastic / apm-data

apm-data holds definitions and code for manipulating Elastic APM data
Apache License 2.0
12 stars 25 forks source link

Panic in stacktrace mapping #326

Closed lahsivjar closed 2 months ago

lahsivjar commented 2 months ago
{"log.level":"error","@timestamp":"2024-07-22T15:24:51.682+0100","log.logger":"request","log.origin":{"function":"github.com/elastic/apm-server/internal/beater/api.apmMiddleware.LogMiddleware.func1.1","file.name":"middleware/log_middleware.go","file.line":59},"message":"internal error","service.name":"apm-server","url.original":"/intake/v2/events","http.request.method":"POST","user_agent.original":"Go-http-client/1.1","source.address":"127.0.0.1","http.request.id":"a390e2cf-1bbc-4485-9251-4f082c5b2cbe","event.duration":449042,"http.request.body.bytes":4096,"http.response.status_code":500,"error.message":"internal error: runtime error: invalid memory address or nil pointer dereference","error.stack_trace":"goroutine 180 [running]:
    runtime/debug.Stack()
        runtime/debug/stack.go:24 +0x64
    github.com/elastic/apm-server/internal/beater/api.apmMiddleware.RecoverPanicMiddleware.func2.1.1()
        github.com/elastic/apm-server/internal/beater/middleware/recover_panic_middleware.go:55 +0x134
    panic({0x106a3dd20?, 0x108025640?})
        runtime/panic.go:770 +0x124
    github.com/elastic/apm-data/input/elasticapm/internal/modeldecoder/v2.mapToStracktraceModel({0x1400f88a008?, 0x1a, 0x140028c2120?}, {0x14018f18f00, 0x1a, 0x140198f5ada?})
        github.com/elastic/apm-data@v1.9.0/input/elasticapm/internal/modeldecoder/v2/decoder.go:1320 +0x80
    github.com/elastic/apm-data/input/elasticapm/internal/modeldecoder/v2.mapToSpanModel(0x140115ec608, 0x1401860dce0)
        github.com/elastic/apm-data@v1.9.0/input/elasticapm/internal/modeldecoder/v2/decoder.go:1282 +0x13f0
    github.com/elastic/apm-data/input/elasticapm/internal/modeldecoder/v2.DecodeNestedSpan({0x106d5b2e0, 0x14018c351e0}, 0x140000bceb8, 0x14018c62c18)
        github.com/elastic/apm-data@v1.9.0/input/elasticapm/internal/modeldecoder/v2/decoder.go:218 +0x118
    github.com/elastic/apm-data/input/elasticapm.(*Processor).readBatch(0x1400062ea50, {0x1082a85b8?, 0x50?}, 0x14018f80580, 0x14018c114a0?, 0x14018c62c18, 0x14018c351e0, 0x14018bff380)
        github.com/elastic/apm-data@v1.9.0/input/elasticapm/processor.go:206 +0x1d8
    github.com/elastic/apm-data/input/elasticapm.(*Processor).handleStream(0x1400062ea50, {0x106d69418, 0x14018c0b470}, 0x14018c62c18, 0x0?, 0x0?, 0x0?, {0x106d54668, 0x140004001c8}, 0x14018bff380)
        github.com/elastic/apm-data@v1.9.0/input/elasticapm/processor.go:310 +0x4c
    github.com/elastic/apm-data/input/elasticapm.(*Processor).HandleStream(0x1400062ea50, {0x106d69418?, 0x14018c0b470?}, 0x14018f80580, {0x10f2ef308, 0x14018c11400}, 0xa, {0x106d54668, 0x140004001c8}, 0x14018bff380)
        github.com/elastic/apm-data@v1.9.0/input/elasticapm/processor.go:291 +0x480
    github.com/elastic/apm-server/internal/beater/api.(*routeBuilder).backendIntakeHandler.Handler.func2(0x14014b97c80)
        github.com/elastic/apm-server/internal/beater/api/intake/handler.go:77 +0x158
    github.com/elastic/apm-server/internal/beater/api.backendMiddleware.AnonymousRateLimitMiddleware.func3.1(0x14018bf6d80?)
        github.com/elastic/apm-server/internal/beater/middleware/rate_limit_middleware.go:49 +0xc4
    github.com/elastic/apm-server/internal/beater/api.backendMiddleware.AuthMiddleware.func2.1(0x14014b97c80)
        github.com/elastic/apm-server/internal/beater/middleware/auth_middleware.go:70 +0x1ac
    github.com/elastic/apm-server/internal/beater/api.backendMiddleware.ResponseHeadersMiddleware.func1.1(0x14014b97c80)
        github.com/elastic/apm-server/internal/beater/middleware/response_headers_middleware.go:33 +0xa4
    github.com/elastic/apm-server/internal/beater/middleware.MonitoringMiddleware.(*monitoringMiddleware).Middleware.func1.1(0x14014b97c80)
        github.com/elastic/apm-server/internal/beater/middleware/monitoring_middleware.go:53 +0xac
    github.com/elastic/apm-server/internal/beater/api.apmMiddleware.RecoverPanicMiddleware.func2.1(0x106923ce0?)
        github.com/elastic/apm-server/internal/beater/middleware/recover_panic_middleware.go:60 +0x50
    github.com/elastic/apm-server/internal/beater/middleware.TimeoutMiddleware.func1.1(0x14014b97c80)
        github.com/elastic/apm-server/internal/beater/middleware/timeout_middleware.go:35 +0x3c
    github.com/elastic/apm-server/internal/beater/api.apmMiddleware.LogMiddleware.func1.1(0x14014b97c80)
        github.com/elastic/apm-server/internal/beater/middleware/log_middleware.go:47 +0xa0
    github.com/elastic/apm-server/internal/beater/api.NewMux.(*ContextPool).HTTPHandler.func11({0x106d633d0, 0x14013e417a0}, 0x14018bf6d80)
        github.com/elastic/apm-server/internal/beater/request/context_pool.go:47 +0xe0
    net/http.HandlerFunc.ServeHTTP(0x14018bf6d80?, {0x106d633d0?, 0x14013e417a0?}, 0x14018bf6d80?)
        net/http/server.go:2166 +0x38
    go.elastic.co/apm/module/apmhttp/v2.(*handler).ServeHTTP(0x14018bf6c60?, {0x106d633d0?, 0x14013e417a0?}, 0x1082a85b8?)
        go.elastic.co/apm/module/apmhttp/v2@v2.6.0/handler.go:74 +0xac
    github.com/gorilla/mux.(*Router).ServeHTTP(0x140005ce240, {0x106d633d0, 0x14013e417a0}, 0x14018bf6b40)
        github.com/gorilla/mux@v1.8.1/mux.go:212 +0x194
    github.com/elastic/gmux.ConfigureServer.(*mux).withGRPCInsecure.func3({0x106d633d0?, 0x14013e417a0?}, 0x0?)
        github.com/elastic/gmux@v0.3.2/mux.go:119 +0x120
    net/http.HandlerFunc.ServeHTTP(0x10?, {0x106d633d0?, 0x14013e417a0?}, 0x14013e417a0?)
        net/http/server.go:2166 +0x38
    net/http.serverHandler.ServeHTTP({0x106d5ec28?}, {0x106d633d0?, 0x14013e417a0?}, 0x6?)
        net/http/server.go:3137 +0xbc
    net/http.(*conn).serve(0x14002bf2090, {0x106d69418, 0x1400028f3e0})
        net/http/server.go:2039 +0x508
    created by net/http.(*Server).Serve in goroutine 169
        net/http/server.go:3285 +0x3f0
    ","ecs.version":"1.6.0"}
lahsivjar commented 2 months ago

The root cause of this is that the Reslice function (ref) in APM data falsely assumes that the whole capacity of the slice is non-nil. The below test case can be used to reproduce the panic:

func TestBadReslice(t *testing.T) {
    metadata := `{"metadata": {"service": {"name": "1234_service-12a3","node": {"configured_name": "node-123"},"version": "5.1.3","environment": "staging","language": {"name": "ecmascript","version": "8"},"runtime": {"name": "node","version": "8.0.0"},"framework": {"name": "Express","version": "1.2.3"},"agent": {"name": "elastic-node","version": "3.14.0"}},"user": {"id": "123user", "username": "bar", "email": "bar@user.com"}, "labels": {"tag0": null, "tag1": "one", "tag2": 2}, "process": {"pid": 1234,"ppid": 6789,"title": "node","argv": ["node","server.js"]},"system": {"hostname": "prod1.example.com","architecture": "x64","platform": "darwin", "container": {"id": "container-id"}, "kubernetes": {"namespace": "namespace1", "pod": {"uid": "pod-uid", "name": "pod-name"}, "node": {"name": "node-name"}}},"cloud":{"account":{"id":"account_id","name":"account_name"},"availability_zone":"cloud_availability_zone","instance":{"id":"instance_id","name":"instance_name"},"machine":{"type":"machine_type"},"project":{"id":"project_id","name":"project_name"},"provider":"cloud_provider","region":"cloud_region","service":{"name":"lambda"}}}}`
    errstacktrace := `{"error": {"id": "cdefab0123456789", "trace_id": null, "timestamp": 1533826745999000,"exception": {"message": "Cannot read property 'baz' no defined", "stacktrace": [{"classname": "cl1", "abs_path": "abs_path1"}, {"classname": "cl2", "abs_path": "abs_path2"}, {"classname": "cl3", "abs_path": "abs_path3"}, {"classname": "cl4", "abs_path": "abs_path4"}, {"classname": "cl5", "abs_path": "abs_path5"}]}}}`

    batchProcessor := modelpb.ProcessBatchFunc(func(ctx context.Context, batch *modelpb.Batch) error {
        return nil
    })
    p := NewProcessor(Config{
        MaxEventSize: 100 * 1024,
        Semaphore:    semaphore.NewWeighted(1),
    })

    // Below, we try to emulate a case where all of the capacity of Error#Exception#Stacktrace slice
    // is not populated with non-nil values. In practical cases, this could happen with OTLP handling
    // of exceptions: https://github.com/elastic/apm-data/blob/cee5ac3fc5f2ee66f156133992882cae758e2b66/input/otlp/exceptions.go#L165
    event1 := modelpb.APMEventFromVTPool()
    event1.Error = modelpb.ErrorFromVTPool()
    event1.Error.Exception = modelpb.ExceptionFromVTPool()
    for i := 0; i < 3; i++ {
        // Add 3 so that reallocation makes the capacity 4
        // this will cause the 4th element to be nil
        event1.Error.Exception.Stacktrace = append(event1.Error.Exception.Stacktrace, modelpb.StacktraceFrameFromVTPool())
    }
    require.Equal(t, 4, cap(event1.Error.Exception.Stacktrace))

    event1.ReturnToVTPool()
    event2 := modelpb.APMEventFromVTPool()
    payload2 := strings.Join([]string{metadata, errstacktrace}, "\n")
    err := p.HandleStream(
        context.Background(), event2,
        strings.NewReader(payload2), 10, batchProcessor,
        &Result{},
    )
    require.NoError(t, err)
}

I am working on a fix.