rode / grafeas-elasticsearch

An implementation of the Grafeas storage backend based on Elasticsearch
Apache License 2.0

Allow for paging through more results than the limit set by `index.max_result_window` #67

Open · alexashley opened this issue 3 years ago

alexashley commented 3 years ago

Came out of this discussion.

The documentation that Elasticsearch provides on pagination makes it sound like there is a hard cap on the number of results that can be paged through using `from` and `size`:

> By default, you cannot use `from` and `size` to page through more than 10,000 hits. This limit is a safeguard set by the `index.max_result_window` index setting. If you need to page through more than 10,000 hits, use the `search_after` parameter instead.

We need to determine whether that's actually the case by loading more notes or occurrences than `index.max_result_window` allows and attempting to page through them.

If it is, we'll need to grab the sort value from the last hit in the results, encode it in the page token, and send it along in future requests as the `search_after` parameter.
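For reference, a minimal sketch of that idea, not the actual implementation: the helper names and the sort field below are illustrative, and it assumes we sort on a unique keyword field and serialize the last hit's `sort` values into the opaque page token:

```go
package paging

import (
	"encoding/base64"
	"encoding/json"
)

// encodePageToken serializes the last hit's sort values (whatever
// Elasticsearch returned in the hit's "sort" array) into an opaque
// token the client echoes back on the next request.
func encodePageToken(lastHitSort []interface{}) (string, error) {
	raw, err := json.Marshal(lastHitSort)
	if err != nil {
		return "", err
	}
	return base64.URLEncoding.EncodeToString(raw), nil
}

// decodePageToken recovers the sort values to replay as search_after.
func decodePageToken(token string) ([]interface{}, error) {
	raw, err := base64.URLEncoding.DecodeString(token)
	if err != nil {
		return nil, err
	}
	var sortValues []interface{}
	if err := json.Unmarshal(raw, &sortValues); err != nil {
		return nil, err
	}
	return sortValues, nil
}

// buildSearchBody assembles the _search request body. Unlike from/size,
// search_after only needs the page size plus the previous page's sort
// values, so it isn't subject to index.max_result_window.
func buildSearchBody(pageSize int, pageToken string) (map[string]interface{}, error) {
	body := map[string]interface{}{
		"size": pageSize,
		// search_after requires a deterministic sort; sorting on a
		// unique field ("name" here is an assumption) keeps pages stable.
		"sort": []interface{}{
			map[string]interface{}{"name": "asc"},
		},
	}
	if pageToken != "" {
		sortValues, err := decodePageToken(pageToken)
		if err != nil {
			return nil, err
		}
		body["search_after"] = sortValues
	}
	return body, nil
}
```

The tradeoff is that `search_after` can only move forward through results, but since the page-token API is already forward-only, that should be fine.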

alexashley commented 3 years ago

Used this script to bulk load 15,000 occurrences and try to page through them:

bulk.go

```go
package main

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"time"

	"github.com/brianvoe/gofakeit/v6"
	"github.com/grafeas/grafeas/proto/v1beta1/build_go_proto"
	"github.com/grafeas/grafeas/proto/v1beta1/common_go_proto"
	"github.com/grafeas/grafeas/proto/v1beta1/grafeas_go_proto"
	"github.com/grafeas/grafeas/proto/v1beta1/provenance_go_proto"
	"github.com/grafeas/grafeas/proto/v1beta1/source_go_proto"
	"google.golang.org/grpc"
)

const (
	chunkSize           = 1000
	numberOfOccurrences = 15000
	project             = "rode"
	grafeasUrl          = "localhost:8080"
)

var (
	fake = gofakeit.New(0)
)

func main() {
	conn, client := createGrafeasClient()
	defer conn.Close()

	createProject()
	log.Println("created project")

	loadOccurrences(client)
	log.Println("loaded occurrences")

	pageThroughOccurrences(client)
}

// createProject creates the parent project through the Grafeas REST API.
func createProject() {
	client := http.Client{
		Timeout: time.Minute,
	}
	projectPayload := map[string]string{
		"name": "projects/" + project,
	}

	response, err := client.Post(fmt.Sprintf("%s/v1beta1/projects", "http://"+grafeasUrl), "application/json", jsonBody(&projectPayload))
	if err != nil {
		log.Fatal("error creating project", err)
	}

	if response.StatusCode != http.StatusOK {
		log.Fatal("unexpected response creating project", response.StatusCode)
	}
}

func createGrafeasClient() (*grpc.ClientConn, grafeas_go_proto.GrafeasV1Beta1Client) {
	connection, err := grpc.DialContext(context.Background(), grafeasUrl, grpc.WithInsecure(), grpc.WithBlock())
	if err != nil {
		log.Fatal("error creating grafeas client", err)
	}

	grafeasClient := grafeas_go_proto.NewGrafeasV1Beta1Client(connection)

	return connection, grafeasClient
}

// loadOccurrences generates random build occurrences and creates them
// in batches of chunkSize.
func loadOccurrences(client grafeas_go_proto.GrafeasV1Beta1Client) {
	occurrences := make([]*grafeas_go_proto.Occurrence, numberOfOccurrences)
	for i := 0; i < len(occurrences); i++ {
		occurrences[i] = createRandomBuildOccurrence()
	}

	var occurrenceChunks [][]*grafeas_go_proto.Occurrence
	for i := 0; i < len(occurrences); i += chunkSize {
		end := i + chunkSize

		if end > len(occurrences) {
			end = len(occurrences)
		}

		occurrenceChunks = append(occurrenceChunks, occurrences[i:end])
	}

	for i := range occurrenceChunks {
		o := occurrenceChunks[i]
		_, err := client.BatchCreateOccurrences(context.Background(), &grafeas_go_proto.BatchCreateOccurrencesRequest{
			Parent:      "projects/" + project,
			Occurrences: o,
		})
		if err != nil {
			log.Fatal("error batch creating occurrences", err)
		}
	}
}

// pageThroughOccurrences lists occurrences 1,000 at a time, following the
// page token until it receives an empty page.
func pageThroughOccurrences(client grafeas_go_proto.GrafeasV1Beta1Client) {
	currentPage := 1
	pageToken := ""
	for {
		log.Println("requesting page", currentPage)
		request := &grafeas_go_proto.ListOccurrencesRequest{
			Parent:    "projects/" + project,
			Filter:    "",
			PageSize:  1000,
			PageToken: pageToken,
		}
		response, err := client.ListOccurrences(context.Background(), request)
		if err != nil {
			log.Fatal("failed to list occurrences", err)
		}

		currentPage++
		pageToken = response.NextPageToken
		log.Printf("got %d occurrences\n", len(response.Occurrences))
		if len(response.Occurrences) == 0 {
			log.Println("reached the end of the result set")
			break
		}
	}
}

// createRandomBuildOccurrence fakes a build occurrence with gofakeit.
func createRandomBuildOccurrence() *grafeas_go_proto.Occurrence {
	return &grafeas_go_proto.Occurrence{
		Name: fake.Name(),
		Resource: &grafeas_go_proto.Resource{
			Uri: fake.URL(),
		},
		NoteName:    fmt.Sprintf("projects/%s/notes/%s", project, fake.UUID()),
		Kind:        common_go_proto.NoteKind_BUILD,
		Remediation: "",
		CreateTime:  nil,
		UpdateTime:  nil,
		Details: &grafeas_go_proto.Occurrence_Build{
			Build: &build_go_proto.Details{
				Provenance: &provenance_go_proto.BuildProvenance{
					Id:        fake.UUID(),
					ProjectId: "projects/rode",
					Commands:  nil,
					BuiltArtifacts: []*provenance_go_proto.Artifact{
						{
							Checksum: fake.LetterN(5),
							Id:       fake.UUID(),
							Names: []string{
								fake.URL(),
								fake.URL(),
							},
						},
					},
					SourceProvenance: &provenance_go_proto.Source{
						ArtifactStorageSourceUri: fake.URL(),
						Context: &source_go_proto.SourceContext{
							Context: &source_go_proto.SourceContext_Git{
								Git: &source_go_proto.GitSourceContext{
									Url:        fake.URL(),
									RevisionId: fake.LetterN(7),
								},
							},
							Labels: nil,
						},
					},
				},
			},
		},
	}
}

// jsonBody serializes a value so it can be used as an HTTP request body.
func jsonBody(val interface{}) io.Reader {
	jsonBytes, err := json.Marshal(val)
	if err != nil {
		log.Fatal("serialization error", err)
	}

	return bytes.NewReader(jsonBytes)
}
```
output

```shell
$ go run bulk.go
WARNING: Package "github.com/golang/protobuf/protoc-gen-go/generator" is deprecated.
	A future release of golang/protobuf will delete this package,
	which has long been excluded from the compatibility promise.

2021/04/09 16:52:11 requesting page 1
2021/04/09 16:52:11 got 1000 occurrences
2021/04/09 16:52:11 requesting page 2
2021/04/09 16:52:12 got 1000 occurrences
2021/04/09 16:52:12 requesting page 3
2021/04/09 16:52:12 got 1000 occurrences
2021/04/09 16:52:12 requesting page 4
2021/04/09 16:52:12 got 1000 occurrences
2021/04/09 16:52:12 requesting page 5
2021/04/09 16:52:12 got 1000 occurrences
2021/04/09 16:52:12 requesting page 6
2021/04/09 16:52:12 got 1000 occurrences
2021/04/09 16:52:12 requesting page 7
2021/04/09 16:52:12 got 1000 occurrences
2021/04/09 16:52:12 requesting page 8
2021/04/09 16:52:13 got 1000 occurrences
2021/04/09 16:52:13 requesting page 9
2021/04/09 16:52:13 got 1000 occurrences
2021/04/09 16:52:13 requesting page 10
2021/04/09 16:52:13 got 1000 occurrences
2021/04/09 16:52:13 requesting page 11
2021/04/09 16:52:13 failed to list occurrencesrpc error: code = Internal desc = unexpected response from elasticsearch
exit status 1
```

Requesting the 11th page means `from` is 10,000 and `size` is 1,000, so `from + size` = 11,000 exceeds the window, and Elasticsearch returns this error:

```json
{
  "error": {
    "root_cause": [
      {
        "type": "illegal_argument_exception",
        "reason": "Result window is too large, from + size must be less than or equal to: [10000] but was [11000]. See the scroll api for a more efficient way to request large data sets. This limit can be set by changing the [index.max_result_window] index level setting."
      }
    ],
    "type": "search_phase_execution_exception",
    "reason": "all shards failed",
    "phase": "query",
    "grouped": true,
    "failed_shards": [
      {
        "shard": 0,
        "index": "grafeas-v1beta2-rode-occurrences",
        "node": "y40fPpNDRm648olC-Ut-tA",
        "reason": {
          "type": "illegal_argument_exception",
          "reason": "Result window is too large, from + size must be less than or equal to: [10000] but was [11000]. See the scroll api for a more efficient way to request large data sets. This limit can be set by changing the [index.max_result_window] index level setting."
        }
      }
    ],
    "caused_by": {
      "type": "illegal_argument_exception",
      "reason": "Result window is too large, from + size must be less than or equal to: [10000] but was [11000]. See the scroll api for a more efficient way to request large data sets. This limit can be set by changing the [index.max_result_window] index level setting.",
      "caused_by": {
        "type": "illegal_argument_exception",
        "reason": "Result window is too large, from + size must be less than or equal to: [10000] but was [11000]. See the scroll api for a more efficient way to request large data sets. This limit can be set by changing the [index.max_result_window] index level setting."
      }
    }
  },
  "status": 400
}
```
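As the error message notes, `index.max_result_window` is an index-level setting, so bumping it is a possible stopgap at the cost of more memory and CPU per deep query. For example (assuming Elasticsearch is reachable on `localhost:9200`):

```shell
# Stopgap only: raise the result window on the occurrences index.
# The index name matches the failed shard in the error above.
curl -X PUT "localhost:9200/grafeas-v1beta2-rode-occurrences/_settings" \
  -H 'Content-Type: application/json' \
  -d '{"index": {"max_result_window": 20000}}'
```

That only moves the cliff further out, though; switching pagination to `search_after` as described above is the real fix.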