quickwit-oss / quickwit

Cloud-native search engine for observability. An open-source alternative to Datadog, Elasticsearch, Loki, and Tempo.
https://quickwit.io
Other
8.24k stars 336 forks source link

panic on merge of date term in a json field with indexed position #4284

Closed trinity-1686a closed 10 months ago

trinity-1686a commented 11 months ago
thread 'merge_thread_0' panicked at 'assertion failed: `(left == right)`
  left: `0`,
 right: `1`', /home/trinity/dev/tantivy/bff7c58/src/postings/serializer.rs:203:13
stack backtrace:
   0: rust_begin_unwind
             at /rustc/d5c2e9c342b358556da91d61ed4133f6f50fc0c3/library/std/src/panicking.rs:593:5
   1: core::panicking::panic_fmt
             at /rustc/d5c2e9c342b358556da91d61ed4133f6f50fc0c3/library/core/src/panicking.rs:67:14
   2: core::panicking::assert_failed_inner
   3: core::panicking::assert_failed
             at /rustc/d5c2e9c342b358556da91d61ed4133f6f50fc0c3/library/core/src/panicking.rs:229:5
   4: tantivy::postings::serializer::FieldSerializer::write_doc
   5: tantivy::indexer::merger::IndexMerger::write_postings_for_field
             at /home/trinity/dev/tantivy/bff7c58/src/indexer/merger.rs:626:29
   6: tantivy::indexer::merger::IndexMerger::write_postings
             at /home/trinity/dev/tantivy/bff7c58/src/indexer/merger.rs:658:17
   7: tantivy::indexer::merger::IndexMerger::write
             at /home/trinity/dev/tantivy/bff7c58/src/indexer/merger.rs:766:9
   8: tantivy::indexer::segment_updater::merge
             at /home/trinity/dev/tantivy/bff7c58/src/indexer/segment_updater.rs:125:20
   9: tantivy::indexer::segment_updater::SegmentUpdater::start_merge::{{closure}}
             at /home/trinity/dev/tantivy/bff7c58/src/indexer/segment_updater.rs:518:19
  10: <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once
             at /rustc/d5c2e9c342b358556da91d61ed4133f6f50fc0c3/library/core/src/panic/unwind_safe.rs:271:9
  11: std::panicking::try::do_call
             at /rustc/d5c2e9c342b358556da91d61ed4133f6f50fc0c3/library/std/src/panicking.rs:500:40
  12: std::panicking::try
             at /rustc/d5c2e9c342b358556da91d61ed4133f6f50fc0c3/library/std/src/panicking.rs:464:19
  13: std::panic::catch_unwind
             at /rustc/d5c2e9c342b358556da91d61ed4133f6f50fc0c3/library/std/src/panic.rs:142:14
  14: rayon_core::unwind::halt_unwinding
             at /home/trinity/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.0/src/unwind.rs:17:5
  15: rayon_core::registry::Registry::catch_unwind
             at /home/trinity/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.0/src/registry.rs:366:27
  16: rayon_core::spawn::spawn_job::{{closure}}
             at /home/trinity/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.0/src/spawn/mod.rs:97:13
  17: <rayon_core::job::HeapJob<BODY> as rayon_core::job::Job>::execute
             at /home/trinity/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.0/src/job.rs:169:9
  18: rayon_core::job::JobRef::execute
             at /home/trinity/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.0/src/job.rs:64:9
  19: rayon_core::registry::WorkerThread::execute
             at /home/trinity/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.0/src/registry.rs:859:13
  20: rayon_core::registry::WorkerThread::wait_until_cold
             at /home/trinity/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.0/src/registry.rs:793:26
  21: rayon_core::registry::WorkerThread::wait_until
             at /home/trinity/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.0/src/registry.rs:768:13
  22: rayon_core::registry::WorkerThread::wait_until_out_of_work
             at /home/trinity/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.0/src/registry.rs:817:9
  23: rayon_core::registry::main_loop
             at /home/trinity/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.0/src/registry.rs:922:5
  24: rayon_core::registry::ThreadBuilder::run
             at /home/trinity/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.0/src/registry.rs:52:18
  25: <rayon_core::registry::DefaultSpawn as rayon_core::registry::ThreadSpawn>::spawn::{{closure}}
             at /home/trinity/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.0/src/registry.rs:97:20

with some printf debugging I found the document causing the crash to be

{
  "id": "21559270770",
  "type": "IssueCommentEvent",
  "actor": {
    "id": 26881592,
    "login": "cenk1cenk2",
    "display_login": "cenk1cenk2",
    "gravatar_id": "",
    "url": "https://api.github.com/users/cenk1cenk2",
    "avatar_url": "https://avatars.githubusercontent.com/u/26881592?"
  },
  "repo": {
    "id": 320717183,
    "name": "cenk1cenk2/nestjs-tools",
    "url": "https://api.github.com/repos/cenk1cenk2/nestjs-tools"
  },
  "payload": {
    "action": "created",
    "issue": {
      "url": "https://api.github.com/repos/cenk1cenk2/nestjs-tools/issues/675",
      "repository_url": "https://api.github.com/repos/cenk1cenk2/nestjs-tools",
      "labels_url": "https://api.github.com/repos/cenk1cenk2/nestjs-tools/issues/675/labels{/name}",
      "comments_url": "https://api.github.com/repos/cenk1cenk2/nestjs-tools/issues/675/comments",
      "events_url": "https://api.github.com/repos/cenk1cenk2/nestjs-tools/issues/675/events",
      "html_url": "https://github.com/cenk1cenk2/nestjs-tools/pull/675",
      "id": 1124351973,
      "node_id": "PR_kwDOEx3Bf84yF0kH",
      "number": 675,
      "title": "chore(deps): update dependency @webundsoehne/nestjs-util to v5 [skip ci]",
      "user": {
        "login": "renovate[bot]",
        "id": 29139614,
        "node_id": "MDM6Qm90MjkxMzk2MTQ=",
        "avatar_url": "https://avatars.githubusercontent.com/in/2740?v=4",
        "gravatar_id": "",
        "url": "https://api.github.com/users/renovate%5Bbot%5D",
        "html_url": "https://github.com/apps/renovate",
        "followers_url": "https://api.github.com/users/renovate%5Bbot%5D/followers",
        "following_url": "https://api.github.com/users/renovate%5Bbot%5D/following{/other_user}",
        "gists_url": "https://api.github.com/users/renovate%5Bbot%5D/gists{/gist_id}",
        "starred_url": "https://api.github.com/users/renovate%5Bbot%5D/starred{/owner}{/repo}",
        "subscriptions_url": "https://api.github.com/users/renovate%5Bbot%5D/subscriptions",
        "organizations_url": "https://api.github.com/users/renovate%5Bbot%5D/orgs",
        "repos_url": "https://api.github.com/users/renovate%5Bbot%5D/repos",
        "events_url": "https://api.github.com/users/renovate%5Bbot%5D/events{/privacy}",
        "received_events_url": "https://api.github.com/users/renovate%5Bbot%5D/received_events",
        "type": "Bot",
        "site_admin": false
      },
      "labels": [
        {
          "id": 2578400219,
          "node_id": "MDU6TGFiZWwyNTc4NDAwMjE5",
          "url": "https://api.github.com/repos/cenk1cenk2/nestjs-tools/labels/renovate",
          "name": "renovate",
          "color": "ededed",
          "default": false,
          "description": null
        },
        {
          "id": 2578400227,
          "node_id": "MDU6TGFiZWwyNTc4NDAwMjI3",
          "url": "https://api.github.com/repos/cenk1cenk2/nestjs-tools/labels/automerge",
          "name": "automerge",
          "color": "ededed",
          "default": false,
          "description": null
        },
        {
          "id": 2611901016,
          "node_id": "MDU6TGFiZWwyNjExOTAxMDE2",
          "url": "https://api.github.com/repos/cenk1cenk2/nestjs-tools/labels/peer-deps",
          "name": "peer-deps",
          "color": "ededed",
          "default": false,
          "description": null
        }
      ],
      "state": "closed",
      "locked": false,
      "assignee": null,
      "assignees": [],
      "milestone": null,
      "comments": 1,
      "created_at": "2022-02-04T15:53:10Z",
      "updated_at": "2022-05-01T00:00:01Z",
      "closed_at": "2022-02-04T15:53:12Z",
      "author_association": "NONE",
      "active_lock_reason": null,
      "draft": false,
      "pull_request": {
        "url": "https://api.github.com/repos/cenk1cenk2/nestjs-tools/pulls/675",
        "html_url": "https://github.com/cenk1cenk2/nestjs-tools/pull/675",
        "diff_url": "https://github.com/cenk1cenk2/nestjs-tools/pull/675.diff",
        "patch_url": "https://github.com/cenk1cenk2/nestjs-tools/pull/675.patch",
        "merged_at": "2022-02-04T15:53:12Z"
      },
      "body": "[![WhiteSource Renovate](https://app.renovatebot.com/images/banner.svg)](https://renovatebot.com)\n\nThis PR contains the following updates:\n\n| Package | Change | Age | Adoption | Passing | Confidence |\n|---|---|---|---|---|---|\n| @&#8203;webundsoehne/nestjs-util | [`>= 2.3.2 < 5` -> `>= 2.3.2 < 6`](https://renovatebot.com/diffs/npm/@webundsoehne%2fnestjs-util/4.3.2/5.0.1) | [![age](https://badges.renovateapi.com/packages/npm/@webundsoehne%2fnestjs-util/5.0.1/age-slim)](https://docs.renovatebot.com/merge-confidence/) | [![adoption](https://badges.renovateapi.com/packages/npm/@webundsoehne%2fnestjs-util/5.0.1/adoption-slim)](https://docs.renovatebot.com/merge-confidence/) | [![passing](https://badges.renovateapi.com/packages/npm/@webundsoehne%2fnestjs-util/5.0.1/compatibility-slim/4.3.2)](https://docs.renovatebot.com/merge-confidence/) | [![confidence](https://badges.renovateapi.com/packages/npm/@webundsoehne%2fnestjs-util/5.0.1/confidence-slim/4.3.2)](https://docs.renovatebot.com/merge-confidence/) |\n\n---\n\n### Configuration\n\n📅 **Schedule**: At any time (no schedule defined).\n\n🚦 **Automerge**: Enabled.\n\n♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox.\n\n🔕 **Ignore**: Close this PR and you won't be reminded about this update again.\n\n---\n\n - [ ] <!-- rebase-check -->If you want to rebase/retry this PR, click this checkbox.\n\n---\n\nThis PR has been generated by [WhiteSource Renovate](https://renovate.whitesourcesoftware.com). View repository job log [here](https://app.renovatebot.com/dashboard#github/cenk1cenk2/nestjs-tools).",
      "reactions": {
        "url": "https://api.github.com/repos/cenk1cenk2/nestjs-tools/issues/675/reactions",
        "total_count": 0,
        "+1": 0,
        "-1": 0,
        "laugh": 0,
        "hooray": 0,
        "confused": 0,
        "heart": 0,
        "rocket": 0,
        "eyes": 0
      },
      "timeline_url": "https://api.github.com/repos/cenk1cenk2/nestjs-tools/issues/675/timeline",
      "performed_via_github_app": null
    },
    "comment": {
      "url": "https://api.github.com/repos/cenk1cenk2/nestjs-tools/issues/comments/1114074806",
      "html_url": "https://github.com/cenk1cenk2/nestjs-tools/pull/675#issuecomment-1114074806",
      "issue_url": "https://api.github.com/repos/cenk1cenk2/nestjs-tools/issues/675",
      "id": 1114074806,
      "node_id": "IC_kwDOEx3Bf85CZ262",
      "user": {
        "login": "cenk1cenk2",
        "id": 26881592,
        "node_id": "MDQ6VXNlcjI2ODgxNTky",
        "avatar_url": "https://avatars.githubusercontent.com/u/26881592?v=4",
        "gravatar_id": "",
        "url": "https://api.github.com/users/cenk1cenk2",
        "html_url": "https://github.com/cenk1cenk2",
        "followers_url": "https://api.github.com/users/cenk1cenk2/followers",
        "following_url": "https://api.github.com/users/cenk1cenk2/following{/other_user}",
        "gists_url": "https://api.github.com/users/cenk1cenk2/gists{/gist_id}",
        "starred_url": "https://api.github.com/users/cenk1cenk2/starred{/owner}{/repo}",
        "subscriptions_url": "https://api.github.com/users/cenk1cenk2/subscriptions",
        "organizations_url": "https://api.github.com/users/cenk1cenk2/orgs",
        "repos_url": "https://api.github.com/users/cenk1cenk2/repos",
        "events_url": "https://api.github.com/users/cenk1cenk2/events{/privacy}",
        "received_events_url": "https://api.github.com/users/cenk1cenk2/received_events",
        "type": "User",
        "site_admin": false
      },
      "created_at": "2022-05-01T00:00:01Z",
      "updated_at": "2022-05-01T00:00:01Z",
      "author_association": "OWNER",
      "body": ":tada: This PR is included in version 4.0.0 :tada:\n\nThe release is available on [npm package (@latest dist-tag)](https://www.npmjs.com/package/@cenk1cenk2/nestjs-transactions/v/4.0.0)\n\nYour **[semantic-release](https://github.com/semantic-release/semantic-release)** bot :package::rocket:",
      "reactions": {
        "url": "https://api.github.com/repos/cenk1cenk2/nestjs-tools/issues/comments/1114074806/reactions",
        "total_count": 0,
        "+1": 0,
        "-1": 0,
        "laugh": 0,
        "hooray": 0,
        "confused": 0,
        "heart": 0,
        "rocket": 0,
        "eyes": 0
      },
      "performed_via_github_app": null
    }
  },
  "public": true,
  "created_at": "2022-05-01T00:00:02Z"
}

and more specifically its term payload.comment.created_at:2022-05-01T00:00:01Z, which is actually stored as a d (date == signed integer; you have to flip the 1st bit to decode it): [150, 234, 210, 20, 254, 193, 202, 0]

doc mapper is:

#
# Index config file for gh-archive dataset.
#
version: 0.4

index_id: gh-archive

doc_mapping:
  field_mappings:
    - name: id
      type: text
      fast: true
    - name: type
      type: text
      fast: true
      tokenizer: raw
    - name: public
      type: bool
      fast: true
    - name: payload
      type: json
      record: position
      tokenizer: default
    - name: body
      type: text
      record: position
      tokenizer: default
    - name: org
      type: json
      tokenizer: default
    - name: repo
      type: json
      tokenizer: default
    - name: actor
      type: json
      tokenizer: default
    - name: other
      type: json
      tokenizer: default
    - name: created_at
      type: datetime
      fast: true
      input_formats:
        - rfc3339
      precision: seconds
    - name: repo.id
      type: u64
      indexed: true
      stored: true
  timestamp_field: created_at

indexing_settings:
  commit_timeout_secs: 10
  merge_policy:
    type: "stable_log"
    min_level_num_docs: 100000
    merge_factor: 2
    max_merge_factor: 3
    maturation_period: 6h

build with quickwit 163ed7ef50051dd5cc1709675421a64102047cda and tantivy bff7c58497964f947dc94e2e45dfe9962e1d10c3

To reproduce: ingest enough of github archive to trigger a merge, the 1st document of type IssueCommentEvent will cause a panic (or possibly some other kind of document before)

this looks a lot like a variant of https://github.com/quickwit-oss/tantivy/issues/2251, which is supposed to be fixed by https://github.com/quickwit-oss/tantivy/pull/2253

PSeitz commented 10 months ago

Minimal Error Case:

    #[test]
    fn test_json_date_with_id_regression() {
        let mut schema_builder = Schema::builder();
        let json = schema_builder.add_json_field("json", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut writer = index.writer_for_tests().unwrap();
        let doc = json!({"field": "a"});
        writer.add_document(doc!(json=>doc)).unwrap();
        writer.commit().unwrap();
        let doc = json!({"field": "a", "id": 1});
        writer.add_document(doc!(json=>doc.clone())).unwrap();
        writer.commit().unwrap();

        // Force Merge
        writer.wait_merging_threads().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        let segment_ids = index
            .searchable_segment_ids()
            .expect("Searchable segments failed.");
        index_writer.merge(&segment_ids).wait().unwrap();
        assert!(index_writer.wait_merging_threads().is_ok());
    }

If the Term without freq is ordered before the text field it does not panic

let doc = json!({"field": "a", "aid": 1}); // aid so it gets ordered before