open-telemetry / opentelemetry-collector-contrib

Contrib repository for the OpenTelemetry Collector
https://opentelemetry.io
Apache License 2.0

[receiver/elasticsearch] TestIntegration test times out intermittently #32656

Open crobert-1 opened 3 months ago

crobert-1 commented 3 months ago

Component(s)

receiver/elasticsearch

Describe the issue you're reporting

Failing CI/CD link

It looks like the test may be looking for a .dockerignore file that doesn't exist, and that's what causes the timeout.

Failure output:

Running target 'mod-integration-test' in module 'receiver/elasticsearchreceiver' as part of group 'receiver-0'
make --no-print-directory -C receiver/elasticsearchreceiver mod-integration-test
running go integration test ./... in /home/runner/work/opentelemetry-collector-contrib/opentelemetry-collector-contrib/receiver/elasticsearchreceiver
/home/runner/work/opentelemetry-collector-contrib/opentelemetry-collector-contrib/.tools/gotestsum --rerun-fails=1 --packages="./..." -- -race -timeout 360s -parallel 4 -tags=integration,"" -run=Integration
∅  internal/mocks
∅  internal/model
∅  internal/metadata (1.02s)
✖  . (6m0.053s)

=== Failed
=== FAIL: .  (0.00s)
panic: test timed out after 6m0s
running tests:
    TestIntegration (6m0s)
    TestIntegration/7.9.3 (6m0s)

goroutine 98 [running]:
testing.(*M).startAlarm.func1()
    /opt/hostedtoolcache/go/1.21.9/x64/src/testing/testing.go:2259 +0x259
created by time.goFunc
    /opt/hostedtoolcache/go/1.21.9/x64/src/time/sleep.go:176 +0x45

goroutine 1 [chan receive, 6 minutes]:
testing.(*T).Run(0xc000007a00, {0x15f1287, 0xf}, 0x1648ed0)
    /opt/hostedtoolcache/go/1.21.9/x64/src/testing/testing.go:1649 +0x871
testing.runTests.func1(0x0?)
    /opt/hostedtoolcache/go/1.21.9/x64/src/testing/testing.go:2054 +0x85
testing.tRunner(0xc000007a00, 0xc00059f998)
    /opt/hostedtoolcache/go/1.21.9/x64/src/testing/testing.go:1595 +0x262
testing.runTests(0xc00036afa0?, {0x1f55ce0, 0x2a, 0x2a}, {0xc00059fa68?, 0x448805?, 0x1f62e00?})
    /opt/hostedtoolcache/go/1.21.9/x64/src/testing/testing.go:2052 +0x8ae
testing.(*M).Run(0xc00036afa0)
    /opt/hostedtoolcache/go/1.21.9/x64/src/testing/testing.go:1925 +0xcd8
go.uber.org/goleak.VerifyTestMain({0x17cba40, 0xc00036afa0}, {0x0, 0x0, 0x0})
    /home/runner/go/pkg/mod/go.uber.org/goleak@v1.3.0/testmain.go:53 +0x65
github.com/open-telemetry/opentelemetry-collector-contrib/receiver/elasticsearchreceiver.TestMain(...)
    /home/runner/work/opentelemetry-collector-contrib/opentelemetry-collector-contrib/receiver/elasticsearchreceiver/generated_package_test.go:12
main.main()
    _testmain.go:131 +0x317

goroutine 5 [chan receive, 6 minutes]:
testing.(*T).Run(0xc000007ba0, {0x15e7177, 0x5}, 0xc000110050)
    /opt/hostedtoolcache/go/1.21.9/x64/src/testing/testing.go:1649 +0x871
github.com/open-telemetry/opentelemetry-collector-contrib/receiver/elasticsearchreceiver.TestIntegration(0x0?)
    /home/runner/work/opentelemetry-collector-contrib/opentelemetry-collector-contrib/receiver/elasticsearchreceiver/integration_test.go:25 +0x47
testing.tRunner(0xc000007ba0, 0x1648ed0)
    /opt/hostedtoolcache/go/1.21.9/x64/src/testing/testing.go:1595 +0x262
created by testing.(*T).Run in goroutine 1
    /opt/hostedtoolcache/go/1.21.9/x64/src/testing/testing.go:1648 +0x846

goroutine 34 [semacquire, 6 minutes]:
sync.runtime_Semacquire(0xc000408058?)
    /opt/hostedtoolcache/go/1.21.9/x64/src/runtime/sema.go:62 +0x25
sync.(*WaitGroup).Wait(0xc000408050)
    /opt/hostedtoolcache/go/1.21.9/x64/src/sync/waitgroup.go:116 +0xa5
github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/scraperinttest.(*IntegrationTest).createContainers(0xc000428000, 0xc000336000)
    /home/runner/work/opentelemetry-collector-contrib/opentelemetry-collector-contrib/internal/coreinternal/scraperinttest/scraperint.go:162 +0x40f
github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/scraperinttest.(*IntegrationTest).Run(0xc000428000, 0xc000336000)
    /home/runner/work/opentelemetry-collector-contrib/opentelemetry-collector-contrib/internal/coreinternal/scraperinttest/scraperint.go:67 +0x65
testing.tRunner(0xc000336000, 0xc000110050)
    /opt/hostedtoolcache/go/1.21.9/x64/src/testing/testing.go:1595 +0x262
created by testing.(*T).Run in goroutine 5
    /opt/hostedtoolcache/go/1.21.9/x64/src/testing/testing.go:1648 +0x846

goroutine 36 [IO wait]:
internal/poll.runtime_pollWait(0x7f8ffc1c8c68, 0x72)
    /opt/hostedtoolcache/go/1.21.9/x64/src/runtime/netpoll.go:343 +0x85
internal/poll.(*pollDesc).wait(0xc0004952a0, 0xc0000d5000?, 0x0)
    /opt/hostedtoolcache/go/1.21.9/x64/src/internal/poll/fd_poll_runtime.go:84 +0xb1
internal/poll.(*pollDesc).waitRead(...)
    /opt/hostedtoolcache/go/1.21.9/x64/src/internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Read(0xc000495280, {0xc0000d5000, 0x1000, 0x1000})
    /opt/hostedtoolcache/go/1.21.9/x64/src/internal/poll/fd_unix.go:164 +0x405
net.(*netFD).Read(0xc000495280, {0xc0000d5000, 0x1000, 0x1000})
    /opt/hostedtoolcache/go/1.21.9/x64/src/net/fd_posix.go:55 +0x4b
net.(*conn).Read(0xc000492250, {0xc0000d5000, 0x1000, 0x1000})
    /opt/hostedtoolcache/go/1.21.9/x64/src/net/net.go:179 +0xad
net/http.(*persistConn).Read(0xc000477320, {0xc0000d5000, 0x1000, 0x1000})
    /opt/hostedtoolcache/go/1.21.9/x64/src/net/http/transport.go:1954 +0x105
bufio.(*Reader).fill(0xc000433680)
    /opt/hostedtoolcache/go/1.21.9/x64/src/bufio/bufio.go:113 +0x29a
bufio.(*Reader).ReadSlice(0xc000433680, 0x91?)
    /opt/hostedtoolcache/go/1.21.9/x64/src/bufio/bufio.go:379 +0x85
net/http/internal.readChunkLine(0xc0000d5000?)
    /opt/hostedtoolcache/go/1.21.9/x64/src/net/http/internal/chunked.go:156 +0x34
net/http/internal.(*chunkedReader).beginChunk(0xc0004743c0)
    /opt/hostedtoolcache/go/1.21.9/x64/src/net/http/internal/chunked.go:49 +0x5a
net/http/internal.(*chunkedReader).Read(0xc0004743c0, {0xc0003c899b, 0x1665, 0x1665})
    /opt/hostedtoolcache/go/1.21.9/x64/src/net/http/internal/chunked.go:125 +0x2b4
net/http.(*body).readLocked(0xc0001127c0, {0xc0003c899b, 0x1665, 0x1665})
    /opt/hostedtoolcache/go/1.21.9/x64/src/net/http/transfer.go:839 +0xa9
net/http.(*body).Read(0xc0001127c0, {0xc0003c899b, 0x1665, 0x1665})
    /opt/hostedtoolcache/go/1.21.9/x64/src/net/http/transfer.go:831 +0x185
net/http.(*bodyEOFSignal).Read(0xc000112800, {0xc0003c899b, 0x1665, 0x1665})
    /opt/hostedtoolcache/go/1.21.9/x64/src/net/http/transport.go:2810 +0x143
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp.(*wrappedBody).Read(0xc0001128c0, {0xc0003c899b, 0x1665, 0x1665})
    /home/runner/go/pkg/mod/go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp@v0.50.0/transport.go:239 +0x7a
bytes.(*Buffer).ReadFrom(0xc0005128b0, {0x7f8ffbf75c10, 0xc000110200})
    /opt/hostedtoolcache/go/1.21.9/x64/src/bytes/buffer.go:211 +0x110
github.com/testcontainers/testcontainers-go.(*DockerProvider).BuildImage(0xc00035c8f0, {0x17d7298, 0x1f93900}, {0x17dc0a0, 0xc000307680})
    /home/runner/go/pkg/mod/github.com/testcontainers/testcontainers-go@v0.30.0/docker.go:912 +0x525
github.com/testcontainers/testcontainers-go.(*DockerProvider).CreateContainer(_, {_, _}, {{{0xc0003a4030, 0x14}, {0x0, 0x0}, {0xc00016a120, 0x1e}, {0x0, ...}, ...}, ...})
    /home/runner/go/pkg/mod/github.com/testcontainers/testcontainers-go@v0.30.0/docker.go:1012 +0x1c8a
github.com/testcontainers/testcontainers-go.GenericContainer({_, _}, {{{{0xc0003a4030, 0x14}, {0x0, 0x0}, {0xc00016a120, 0x1e}, {0x0, 0x0}, ...}, ...}, ...})
    /home/runner/go/pkg/mod/github.com/testcontainers/testcontainers-go@v0.30.0/generic.go:73 +0x350
github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/scraperinttest.(*IntegrationTest).createContainers.func1.1()
    /home/runner/work/opentelemetry-collector-contrib/opentelemetry-collector-contrib/internal/coreinternal/scraperinttest/scraperint.go:146 +0xcf
github.com/stretchr/testify/assert.Eventually.func1()
    /home/runner/go/pkg/mod/github.com/stretchr/testify@v1.9.0/assert/assertions.go:1902 +0x34
created by github.com/stretchr/testify/assert.Eventually in goroutine 35
    /home/runner/go/pkg/mod/github.com/stretchr/testify@v1.9.0/assert/assertions.go:1902 +0x3b6

goroutine 82 [select, 5 minutes]:
net/http.(*persistConn).readLoop(0xc000477320)
    /opt/hostedtoolcache/go/1.21.9/x64/src/net/http/transport.go:2238 +0x153d
created by net/http.(*Transport).dialConn in goroutine 65
    /opt/hostedtoolcache/go/1.21.9/x64/src/net/http/transport.go:1776 +0x25da

goroutine 63 [chan receive, 5 minutes]:
github.com/testcontainers/testcontainers-go.(*Reaper).Connect.func1({0x17db778?, 0xc0004920e0})
    /home/runner/go/pkg/mod/github.com/testcontainers/testcontainers-go@v0.30.0/reaper.go:358 +0xa0e
created by github.com/testcontainers/testcontainers-go.(*Reaper).Connect in goroutine 36
    /home/runner/go/pkg/mod/github.com/testcontainers/testcontainers-go@v0.30.0/reaper.go:323 +0x2ec

goroutine 83 [select, 5 minutes]:
net/http.(*persistConn).writeLoop(0xc000477320)
    /opt/hostedtoolcache/go/1.21.9/x64/src/net/http/transport.go:2421 +0x1bc
created by net/http.(*Transport).dialConn in goroutine 65
    /opt/hostedtoolcache/go/1.21.9/x64/src/net/http/transport.go:1777 +0x266b
FAIL    github.com/open-telemetry/opentelemetry-collector-contrib/receiver/elasticsearchreceiver    360.053s

=== FAIL: . TestIntegration (unknown)
2024/04/23 20:23:43 github.com/testcontainers/testcontainers-go - Connected to docker: 
  Server Version: 24.0.9
  API Version: 1.43
  Operating System: Ubuntu 22.04.4 LTS
  Total Memory: 15981 MB
  Resolved Docker Host: unix:///var/run/docker.sock
  Resolved Docker Socket Path: /var/run/docker.sock
  Test SessionID: 792e60a8157f651dfd53bc6e649f645c80e82e592de8b5b1e66c2910bb330dad
  Test ProcessID: d8505f59-7b32-41c6-9455-4c6add531b51
2024/04/23 20:23:43 🐳 Creating container for image testcontainers/ryuk:0.7.0
2024/04/23 20:23:43 ✅ Container created: c7fa1e05eb78
2024/04/23 20:23:43 🐳 Starting container: c7fa1e05eb78
2024/04/23 20:23:43 ✅ Container started: c7fa1e05eb78
2024/04/23 20:23:43 🚧 Waiting for container id c7fa1e05eb78 image: testcontainers/ryuk:0.7.0. Waiting for: &{Port:8080/tcp timeout:<nil> PollInterval:100ms}
2024/04/23 20:23:43 🔔 Container is ready: c7fa1e05eb78
time="2024-04-23T20:23:43Z" level=error msg="Tar: Can't stat file /home/runner/work/opentelemetry-collector-contrib/opentelemetry-collector-contrib/receiver/elasticsearchreceiver/testdata/integration to tar: lstat /home/runner/work/opentelemetry-collector-contrib/opentelemetry-collector-contrib/receiver/elasticsearchreceiver/testdata/integration//home/runner/work/opentelemetry-collector-contrib/opentelemetry-collector-contrib/receiver/elasticsearchreceiver/testdata/integration/.dockerignore: no such file or directory"
    scraperint.go:145: 
            Error Trace:    /home/runner/work/opentelemetry-collector-contrib/opentelemetry-collector-contrib/internal/coreinternal/scraperinttest/scraperint.go:145
                                        /opt/hostedtoolcache/go/1.21.9/x64/src/runtime/asm_amd64.s:1650
            Error:          Condition never satisfied
            Test:           TestIntegration/7.9.3
            Messages:       create container timeout: <nil>

=== FAIL: . TestIntegration/7.9.3 (unknown)
2024/04/23 20:23:43 github.com/testcontainers/testcontainers-go - Connected to docker: 
  Server Version: 24.0.9
  API Version: 1.43
  Operating System: Ubuntu 22.04.4 LTS
  Total Memory: 15981 MB
  Resolved Docker Host: unix:///var/run/docker.sock
  Resolved Docker Socket Path: /var/run/docker.sock
  Test SessionID: 792e60a8157f651dfd53bc6e649f645c80e82e592de8b5b1e66c2910bb330dad
  Test ProcessID: d8505f59-7b32-41c6-9455-4c6add531b51
2024/04/23 20:23:43 🐳 Creating container for image testcontainers/ryuk:0.7.0
2024/04/23 20:23:43 ✅ Container created: c7fa1e05eb78
2024/04/23 20:23:43 🐳 Starting container: c7fa1e05eb78
2024/04/23 20:23:43 ✅ Container started: c7fa1e05eb78
2024/04/23 20:23:43 🚧 Waiting for container id c7fa1e05eb78 image: testcontainers/ryuk:0.7.0. Waiting for: &{Port:8080/tcp timeout:<nil> PollInterval:100ms}
2024/04/23 20:23:43 🔔 Container is ready: c7fa1e05eb78
time="2024-04-23T20:23:43Z" level=error msg="Tar: Can't stat file /home/runner/work/opentelemetry-collector-contrib/opentelemetry-collector-contrib/receiver/elasticsearchreceiver/testdata/integration to tar: lstat /home/runner/work/opentelemetry-collector-contrib/opentelemetry-collector-contrib/receiver/elasticsearchreceiver/testdata/integration//home/runner/work/opentelemetry-collector-contrib/opentelemetry-collector-contrib/receiver/elasticsearchreceiver/testdata/integration/.dockerignore: no such file or directory"
    scraperint.go:145: 
            Error Trace:    /home/runner/work/opentelemetry-collector-contrib/opentelemetry-collector-contrib/internal/coreinternal/scraperinttest/scraperint.go:145
                                        /opt/hostedtoolcache/go/1.21.9/x64/src/runtime/asm_amd64.s:1650
            Error:          Condition never satisfied
            Test:           TestIntegration/7.9.3
            Messages:       create container timeout: <nil>
ERROR rerun aborted because previous run had a suspected panic and some test may not have run
make[2]: *** [../../Makefile.Common:142: mod-integration-test] Error 3
make[1]: *** [Makefile:169: receiver/elasticsearchreceiver] Error 2
make: *** [Makefile:126: gointegration-test] Error 2

DONE 2 tests, 3 failures in 363.065s
make[1]: Leaving directory '/home/runner/work/opentelemetry-collector-contrib/opentelemetry-collector-contrib'
github-actions[bot] commented 3 months ago

Pinging code owners:

See Adding Labels via Comments if you do not have permissions to add labels yourself.

crobert-1 commented 3 months ago

+1 freq: https://github.com/open-telemetry/opentelemetry-collector-contrib/actions/runs/8806610147/job/24171994179

rogercoll commented 2 months ago

I am not sure it is related to the missing .dockerignore file, as that error log is also shown in a successful execution:

2024/05/10 08:14:23 🐳 Creating container for image testcontainers/ryuk:0.7.0
2024/05/10 08:14:23 ✅ Container created: 8218e5e5d53c
2024/05/10 08:14:23 🐳 Starting container: 8218e5e5d53c
2024/05/10 08:14:23 ✅ Container started: 8218e5e5d53c
2024/05/10 08:14:23 🚧 Waiting for container id 8218e5e5d53c image: testcontainers/ryuk:0.7.0. Waiting for: &{Port:8080/tcp timeout:<nil> PollInterval:100ms}
2024/05/10 08:14:23 🔔 Container is ready: 8218e5e5d53c
ERRO[0001] Tar: Can't stat file ../opentelemetry-collector-contrib/receiver/elasticsearchreceiver/testdata/integration to tar: lstat ../opentelemetry-collector-contrib/receiver/elasticsearchreceiver/testdata/integration/../opentelemetry-collector-contrib/receiver/elasticsearchreceiver/testdata/integration/.dockerignore: no such file or directory
2024/05/10 08:14:23 🐳 Creating container for image
2024/05/10 08:14:23 ✅ Container created: b6cc6dd779de
2024/05/10 08:14:23 🐳 Starting container: b6cc6dd779de
2024/05/10 08:14:24 ✅ Container started: b6cc6dd779de
2024/05/10 08:14:24 🚧 Waiting for container id b6cc6dd779de image: 909b3c36-3860-41ea-b156-43e09cd91dd0:8fbe8d00-ff1e-4e62-9e99-243bd6caa753. Waiting for: &{Port:9200 timeout:0xc000223490 PollInterval:100ms}
2024/05/10 08:14:59 🔔 Container is ready: b6cc6dd779de
2024/05/10 08:15:05 🐳 Terminating container: b6cc6dd779de
2024/05/10 08:15:06 🚫 Container terminated: b6cc6dd779de
ERRO[0045] Tar: Can't stat file ../opentelemetry-collector-contrib/receiver/elasticsearchreceiver/testdata/integration to tar: lstat ../opentelemetry-collector-contrib/receiver/elasticsearchreceiver/testdata/integration/../opentelemetry-collector-contrib/receiver/elasticsearchreceiver/testdata/integration/.dockerignore: no such file or directory
2024/05/10 08:15:10 🐳 Creating container for image
2024/05/10 08:15:10 ✅ Container created: 5fe2aa314b47
2024/05/10 08:15:10 🐳 Starting container: 5fe2aa314b47
2024/05/10 08:15:10 ✅ Container started: 5fe2aa314b47
2024/05/10 08:15:10 🚧 Waiting for container id 5fe2aa314b47 image: 39f06d00-f189-4648-bd59-ab3158d3d7e1:bd18957d-b93e-4bec-99d5-1f98c551524f. Waiting for: &{Port:9200 timeout:0xc00070c2f0 PollInterval:100ms}
2024/05/10 08:15:55 🔔 Container is ready: 5fe2aa314b47
2024/05/10 08:16:08 🐳 Terminating container: 5fe2aa314b47
2024/05/10 08:16:08 🚫 Container terminated: 5fe2aa314b47
PASS
ok      github.com/open-telemetry/opentelemetry-collector-contrib/receiver/elasticsearchreceiver    106.773s

As it fails intermittently, and not only for the elasticsearchreceiver, it might be related to the testcontainers framework itself. The panic occurs because a WaitGroup wait never returns and the test binary is eventually killed by the testing timeout (internal/coreinternal/scraperinttest/scraperint.go:162 +0x40f):

func (it *IntegrationTest) createContainers(t *testing.T) *ContainerInfo {
    var wg sync.WaitGroup
    ci := &ContainerInfo{
        containers: make(map[string]testcontainers.Container, len(it.containerRequests)),
    }
    wg.Add(len(it.containerRequests))
    for _, cr := range it.containerRequests {
        go func(req testcontainers.ContainerRequest) {
            var errs error
            require.Eventuallyf(t, func() bool {
                c, err := testcontainers.GenericContainer(
                    context.Background(),
                    testcontainers.GenericContainerRequest{
                        ContainerRequest: req,
                        Started:          true,
                    })
                if err != nil {
                    errs = multierr.Append(errs, fmt.Errorf("execute container request: %w", err))
                    return false
                }
                ci.add(req.Name, c)
                return true
            }, it.createContainerTimeout, time.Second, "create container timeout: %v", errs)
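            // If the condition is never met, require.Eventuallyf calls
            // t.FailNow(), which ends this goroutine via runtime.Goexit()
            // before the wg.Done() below can run.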
            wg.Done()
        }(cr)
    }
    wg.Wait()
    return ci
}

require.Eventuallyf calls t.FailNow if the condition is not met; t.FailNow ends the calling goroutine via runtime.Goexit, so wg.Done() is never reached and wg.Wait() blocks until the test binary times out. Although I was not able to reproduce the issue in my local environment, I would suggest two modifications:
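For illustration, here is a minimal sketch of a fix along those lines (an illustration of the analysis above, not necessarily the exact suggestions): defer the wg.Done() call so it always runs, and use assert.Eventuallyf from github.com/stretchr/testify/assert, which records the failure without calling runtime.Goexit:

// Sketch only, same logic as the quoted function with two changes.
func (it *IntegrationTest) createContainers(t *testing.T) *ContainerInfo {
    var wg sync.WaitGroup
    ci := &ContainerInfo{
        containers: make(map[string]testcontainers.Container, len(it.containerRequests)),
    }
    wg.Add(len(it.containerRequests))
    for _, cr := range it.containerRequests {
        go func(req testcontainers.ContainerRequest) {
            // Change 1: deferred, so the WaitGroup is released even if
            // the goroutine exits early for any reason.
            defer wg.Done()
            var errs error
            // Change 2: assert instead of require, so a failed wait
            // marks the test as failed but lets this goroutine return
            // normally instead of being ended by runtime.Goexit.
            assert.Eventuallyf(t, func() bool {
                c, err := testcontainers.GenericContainer(
                    context.Background(),
                    testcontainers.GenericContainerRequest{
                        ContainerRequest: req,
                        Started:          true,
                    })
                if err != nil {
                    errs = multierr.Append(errs, fmt.Errorf("execute container request: %w", err))
                    return false
                }
                ci.add(req.Name, c)
                return true
            }, it.createContainerTimeout, time.Second, "create container timeout: %v", errs)
        }(cr)
    }
    wg.Wait()
    return ci
}

With this shape, a container that never comes up still fails the test, but wg.Wait() unblocks and the failure surfaces as an ordinary assertion failure instead of a 6-minute panic.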

github-actions[bot] commented 2 weeks ago

This issue has been inactive for 60 days. It will be closed in 60 days if there is no activity. To ping code owners by adding a component label, see Adding Labels via Comments, or if you are unsure of which component this issue relates to, please ping @open-telemetry/collector-contrib-triagers. If this issue is still relevant, please ping the code owners or leave a comment explaining why it is still relevant. Otherwise, please close it.

Pinging code owners:

See Adding Labels via Comments if you do not have permissions to add labels yourself.