projecthorus / sondehub-infra

GNU General Public License v3.0
26 stars 4 forks source link

Uploaders Statistics #49

Closed LukePrior closed 2 years ago

LukePrior commented 2 years ago

We can generate graphs showing the number and breakdown of receivers using the Visualise tool in ElasticSearch however it can be hard to make these publically accessible. The solution to this would be to create an API that returns the necessary data to generate these graphs.

The following ElasticSearch requests gets the last 7 days of telemetry data (~44 million packets) and returns the software names and versions according to the unique number of callsigns for each. The response also includes the raw number of packets which match each software and version.

{
  "aggs": {
    "2": {
      "terms": {
        "field": "software_name.keyword",
        "order": {
          "1": "desc"
        },
        "size": 10
      },
      "aggs": {
        "1": {
          "cardinality": {
            "field": "uploader_callsign.keyword"
          }
        },
        "3": {
          "terms": {
            "field": "software_version.keyword",
            "order": {
              "1": "desc"
            },
            "size": 10
          },
          "aggs": {
            "1": {
              "cardinality": {
                "field": "uploader_callsign.keyword"
              }
            }
          }
        }
      }
    }
  },
  "size": 0,
  "stored_fields": [
    "*"
  ],
  "script_fields": {},
  "docvalue_fields": [
    {
      "field": "@timestamp",
      "format": "date_time"
    },
    {
      "field": "datetime",
      "format": "date_time"
    },
    {
      "field": "time_received",
      "format": "date_time"
    },
    {
      "field": "time_server",
      "format": "date_time"
    }
  ],
  "_source": {
    "excludes": []
  },
  "query": {
    "bool": {
      "must": [],
      "filter": [
        {
          "match_all": {}
        },
        {
          "range": {
            "datetime": {
              "gte": "2022-01-05T07:50:52.795Z",
              "lte": "2022-01-12T07:50:52.795Z",
              "format": "strict_date_optional_time"
            }
          }
        }
      ],
      "should": [],
      "must_not": []
    }
  }
}

I'm not sure if this can easily be made into an API, it shouldn't require any input fields.

This is the ElasticSearch response from the above request:

{
  "took": 7063,
  "timed_out": false,
  "_shards": {
    "total": 3,
    "successful": 3,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 44772779,
    "max_score": null,
    "hits": []
  },
  "aggregations": {
    "2": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "1": {
            "value": 541
          },
          "3": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 1053782,
            "buckets": [
              {
                "1": {
                  "value": 312
                },
                "key": "1.5.8",
                "doc_count": 23576665
              },
              {
                "1": {
                  "value": 91
                },
                "key": "1.5.7",
                "doc_count": 4688112
              },
              {
                "1": {
                  "value": 53
                },
                "key": "1.5.6",
                "doc_count": 3088206
              },
              {
                "1": {
                  "value": 28
                },
                "key": "1.5.5",
                "doc_count": 1642882
              },
              {
                "1": {
                  "value": 22
                },
                "key": "1.5.3",
                "doc_count": 1510429
              },
              {
                "1": {
                  "value": 13
                },
                "key": "1.5.1",
                "doc_count": 909441
              },
              {
                "1": {
                  "value": 13
                },
                "key": "1.5.4",
                "doc_count": 435886
              },
              {
                "1": {
                  "value": 11
                },
                "key": "1.5.2",
                "doc_count": 608260
              },
              {
                "1": {
                  "value": 5
                },
                "key": "1.5.0",
                "doc_count": 390165
              },
              {
                "1": {
                  "value": 3
                },
                "key": "1.5.8-beta2",
                "doc_count": 273166
              }
            ]
          },
          "key": "radiosonde_auto_rx",
          "doc_count": 38176994
        },
        {
          "1": {
            "value": 147
          },
          "3": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "1": {
                  "value": 98
                },
                "key": "devel20211101",
                "doc_count": 4381623
              },
              {
                "1": {
                  "value": 36
                },
                "key": "master_v0.9.0",
                "doc_count": 1211087
              },
              {
                "1": {
                  "value": 4
                },
                "key": "devel20211003",
                "doc_count": 101988
              },
              {
                "1": {
                  "value": 3
                },
                "key": "devel20211005",
                "doc_count": 188027
              },
              {
                "1": {
                  "value": 3
                },
                "key": "devel20211010",
                "doc_count": 91851
              },
              {
                "1": {
                  "value": 2
                },
                "key": "devel20211013",
                "doc_count": 137199
              },
              {
                "1": {
                  "value": 2
                },
                "key": "devel20211016",
                "doc_count": 110658
              },
              {
                "1": {
                  "value": 2
                },
                "key": "devel20211023",
                "doc_count": 55280
              },
              {
                "1": {
                  "value": 1
                },
                "key": "devel20210913",
                "doc_count": 137183
              },
              {
                "1": {
                  "value": 1
                },
                "key": "devel20210914",
                "doc_count": 68819
              }
            ]
          },
          "key": "rdzTTGOsonde",
          "doc_count": 6483715
        },
        {
          "1": {
            "value": 8
          },
          "3": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "1": {
                  "value": 6
                },
                "key": "6.2.5.8",
                "doc_count": 44004
              },
              {
                "1": {
                  "value": 1
                },
                "key": "6.2.4.7",
                "doc_count": 66646
              },
              {
                "1": {
                  "value": 1
                },
                "key": "6.2.5.3",
                "doc_count": 1420
              }
            ]
          },
          "key": "SondeMonitor",
          "doc_count": 112070
        }
      ]
    }
  }
}

The API would need to return the following information:

{
  "time_generated": UTC,
  "total_packets": 44772779,
  "data": {
    "radiosonde_auto_rx": {"count": 38176994, "unique_count": 541, "versions": {"1.5.8":{"count": 23576665, "unique_count": 312}, etc}},
    "rdzTTGOsonde": {},
    "SondeMonitor": {}
  }
}

The ElasticSearch query takes about 7000ms so I would suggest caching it heavily using Cloudfront.

LukePrior commented 2 years ago

Here is a simple prototype graph page: https://jsfiddle.net/1yLqhca6/

TheSkorm commented 2 years ago

GET https://api.v2.sondehub.org/listener/stats now returns stats as below. It is cached for 5 minutes. Does this meet your requirements @LukePrior ?

{
  "radiosonde_auto_rx": {
    "telemetry_count": 39540539,
    "unique_callsigns": 553,
    "versions": {
      "1.5.8": {
        "telemetry_count": 25482738,
        "unique_callsigns": 328
      },
      "1.5.7": {
        "telemetry_count": 4657769,
        "unique_callsigns": 83
      },
      "1.5.6": {
        "telemetry_count": 3011163,
        "unique_callsigns": 52
      },
      "1.5.5": {
        "telemetry_count": 1668648,
        "unique_callsigns": 30
      },
      "1.5.3": {
        "telemetry_count": 1545693,
        "unique_callsigns": 21
      },
      "1.5.4": {
        "telemetry_count": 398691,
        "unique_callsigns": 16
      },
      "1.5.1": {
        "telemetry_count": 840853,
        "unique_callsigns": 14
      },
      "1.5.2": {
        "telemetry_count": 516157,
        "unique_callsigns": 10
      },
      "1.5.0": {
        "telemetry_count": 299957,
        "unique_callsigns": 6
      },
      "1.5.8-Z3": {
        "telemetry_count": 110221,
        "unique_callsigns": 2
      }
    }
  },
  "rdzTTGOsonde": {
    "telemetry_count": 6316474,
    "unique_callsigns": 154,
    "versions": {
      "devel20211101": {
        "telemetry_count": 4358131,
        "unique_callsigns": 97
      },
      "master_v0.9.0": {
        "telemetry_count": 1138525,
        "unique_callsigns": 38
      },
      "devel20211023": {
        "telemetry_count": 61601,
        "unique_callsigns": 7
      },
      "devel20211003": {
        "telemetry_count": 86372,
        "unique_callsigns": 3
      },
      "devel20211016": {
        "telemetry_count": 144274,
        "unique_callsigns": 3
      },
      "devel20211005": {
        "telemetry_count": 164342,
        "unique_callsigns": 2
      },
      "devel20211010": {
        "telemetry_count": 116775,
        "unique_callsigns": 2
      },
      "devel20211031": {
        "telemetry_count": 19485,
        "unique_callsigns": 2
      },
      "devel20210913": {
        "telemetry_count": 126205,
        "unique_callsigns": 1
      },
      "devel20210914": {
        "telemetry_count": 46328,
        "unique_callsigns": 1
      }
    }
  },
  "SondeMonitor": {
    "telemetry_count": 87314,
    "unique_callsigns": 11,
    "versions": {
      "6.2.5.8": {
        "telemetry_count": 36082,
        "unique_callsigns": 9
      },
      "6.2.4.7": {
        "telemetry_count": 50975,
        "unique_callsigns": 1
      },
      "6.2.5.3": {
        "telemetry_count": 257,
        "unique_callsigns": 1
      }
    }
  },
  "rdzTTGO": {
    "telemetry_count": 131422,
    "unique_callsigns": 2,
    "versions": {
      "DL2MF-20220111": {
        "telemetry_count": 131422,
        "unique_callsigns": 2
      }
    }
  }
}
LukePrior commented 2 years ago

Yeah that looks perfect to me, I do have one question and I'm assuming it's an ElasticSearch problem that we can't do much about but the sum of versions doesn't always equal the top sum. I think ElasticSearch creates an other category at least in the case of visualisations but that doesn't even make sense as the sum of versions is sometimes greater than the total sum for auto rx: 553 total vs 562 sum of top 10 versions, I'm assuming this is from people upgrading and getting counted twice?

TheSkorm commented 2 years ago

Yes that's right, when people upgrade they'll be counted twice. Not a lot we can do about that. I'm not exactly how Kibana manages displaying this, but I'd guess that they just create their own sum rather than using the top level agg, and only use the top level agg for labels.