mozilla / bigquery-etl

Bigquery ETL
https://mozilla.github.io/bigquery-etl
Mozilla Public License 2.0
253 stars 100 forks source link

Example config files for BigEye and Monte Carlo #5737

Closed scholtzan closed 3 months ago

scholtzan commented 4 months ago

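Add example config files for BigEye (`bigconfig.yaml`) and Monte Carlo (`montecarlo.yaml`, `montecarlo/monitors.yaml`) for the `fenix_derived` tables in `data-observability-dev`.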

Checklist for reviewer:

For modifications to schemas in restricted namespaces (see CODEOWNERS):

Issue is synchronized with this Jira Task

dataops-ci-bot commented 4 months ago

Integration report for "Allow multiple monte carlo configs"

sql.diff

Click to expand! ```diff diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/dags/bqetl_unified.py /tmp/workspace/generated-sql/dags/bqetl_unified.py --- /tmp/workspace/main-generated-sql/dags/bqetl_unified.py 2024-06-04 21:17:28.000000000 +0000 +++ /tmp/workspace/generated-sql/dags/bqetl_unified.py 2024-06-04 21:18:25.000000000 +0000 @@ -188,6 +188,13 @@ execution_date="{{ (execution_date - macros.timedelta(seconds=10800)).isoformat() }}", ) + ExternalTaskMarker( + task_id="kpi_forecasting__wait_for_unified_metrics", + external_dag_id="kpi_forecasting", + external_task_id="wait_for_unified_metrics", + execution_date="{{ (execution_date + macros.timedelta(seconds=3600)).isoformat() }}", + ) + checks__fail_telemetry_derived__unified_metrics__v1_external.set_upstream( checks__fail_telemetry_derived__unified_metrics__v1 ) @@ -210,6 +217,20 @@ retries=0, ) + with TaskGroup( + "checks__warn_telemetry_derived__unified_metrics__v1_external", + ) as checks__warn_telemetry_derived__unified_metrics__v1_external: + ExternalTaskMarker( + task_id="kpi_forecasting__wait_for_unified_metrics", + external_dag_id="kpi_forecasting", + external_task_id="wait_for_unified_metrics", + execution_date="{{ (execution_date + macros.timedelta(seconds=3600)).isoformat() }}", + ) + + checks__warn_telemetry_derived__unified_metrics__v1_external.set_upstream( + checks__warn_telemetry_derived__unified_metrics__v1 + ) + telemetry_derived__rolling_cohorts__v1 = bigquery_etl_query( task_id="telemetry_derived__rolling_cohorts__v1", destination_table="rolling_cohorts_v1", @@ -257,6 +278,20 @@ depends_on_past=False, ) + with TaskGroup( + "telemetry_derived__unified_metrics__v1_external", + ) as telemetry_derived__unified_metrics__v1_external: + ExternalTaskMarker( + task_id="kpi_forecasting__wait_for_unified_metrics", + external_dag_id="kpi_forecasting", + external_task_id="wait_for_unified_metrics", + execution_date="{{ (execution_date + macros.timedelta(seconds=3600)).isoformat() 
}}", + ) + + telemetry_derived__unified_metrics__v1_external.set_upstream( + telemetry_derived__unified_metrics__v1 + ) + checks__fail_telemetry_derived__unified_metrics__v1.set_upstream( telemetry_derived__unified_metrics__v1 ) Only in /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1: bigconfig.yaml Only in /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1: montecarlo Only in /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1: montecarlo.yaml Only in /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1: bigconfig.yaml Only in /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1: montecarlo Only in /tmp/workspace/generated-sql/sql/data-observability-dev: montecarlo.yaml diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/bigconfig.yaml /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/bigconfig.yaml --- /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/bigconfig.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/bigconfig.yaml 2024-06-04 21:13:53.000000000 +0000 @@ -0,0 +1,30 @@ +type: BIGCONFIG_FILE +auto_apply_on_indexing: True + +table_deployments: + - collection: + name: Fenix_derived + deployments: + - fq_table_name: data-observability-dev.data-observability-dev.fenix_derived.events_daily_v1 + row_creation_time: submission_date + table_metrics: + - metric_type: + predefined_metric: COUNT_ROWS + columns: + - column_name: client_id + metrics: + - metric_type: + predefined_metric: PERCENT_NULL + - metric_type: + predefined_metric: COUNT_DUPLICATES + - column_name: channel + metrics: + - metric_type: + predefined_metric: PERCENT_NULL + - metric_type: + 
predefined_metric: COUNT_DISTINCT + - metric_type: + predefined_metric: PERCENT_VALUE_IN_LIST + parameters: + - key: list + string_value: "release,beta,nightly" diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/montecarlo/monitors.yaml /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/montecarlo/monitors.yaml --- /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/montecarlo/monitors.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/montecarlo/monitors.yaml 2024-06-04 21:13:53.000000000 +0000 @@ -0,0 +1,50 @@ +namespace: fenix_derived +montecarlo: + field_health: + - table: data-observability-dev:fenix_derived.events_daily_v1 + timestamp_field: submission_date + name: submission_date_nulls + description: "Check for nulls in submission_date" + comparisons: + - type: threshold + operator: AUTO + metric: NULL_RATE + - table: data-observability-dev:fenix_derived.events_daily_v1 + timestamp_field: submission_date + name: client_id_uniqueness + description: client_id duplicates + comparisons: + - type: threshold + operator: LT + fields: + - client_id + threshold_value: 1 + metric: DUPLICATE_COUNT + - table: data-observability-dev:fenix_derived.events_daily_v1 + timestamp_field: submission_date + name: client_id_nullness + description: client_id NULLs + comparisons: + - type: threshold + operator: LT + metric: NULL_COUNT + threshold_value: 1 + fields: + - client_id + custom_sql: + - sql: | + select COUNT(*) from {{table}} where channel not in ('release', 'beta', 'nightly') + variables: + table: + - data-observability-dev:fenix_derived.events_daily_v1 + name: channel_value_check + description: Check channel values + query_result_type: SINGLE_NUMERIC + schedule: + type: fixed + start_time: "2024-06-02T19:00:00" + interval_minutes: 1440 + 
comparisons: + - type: threshold + operator: LT + threshold_value: 0 diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/montecarlo.yaml /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/montecarlo.yaml --- /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/montecarlo.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/montecarlo.yaml 2024-06-04 21:13:53.000000000 +0000 @@ -0,0 +1,2 @@ +version: 1 +namespace: fenix_derived diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1/bigconfig.yaml /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1/bigconfig.yaml --- /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1/bigconfig.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1/bigconfig.yaml 2024-06-04 21:13:53.000000000 +0000 @@ -0,0 +1,12 @@ +type: BIGCONFIG_FILE +auto_apply_on_indexing: True + +table_deployments: + - collection: + name: Fenix_derived + deployments: + - fq_table_name: data-observability-dev.data-observability-dev.fenix_derived.event_types_v1 + row_creation_time: first_timestamp + table_metrics: + - metric_type: + predefined_metric: COUNT_ROWS diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1/montecarlo/monitors.yaml /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1/montecarlo/monitors.yaml --- /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1/montecarlo/monitors.yaml 1970-01-01 00:00:00.000000000 +0000 +++ 
/tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1/montecarlo/monitors.yaml 2024-06-04 21:13:53.000000000 +0000 @@ -0,0 +1,11 @@ +namespace: fenix_derived +montecarlo: + field_health: + - table: data-observability-dev:fenix_derived.event_types_v1 + timestamp_field: first_timestamp + name: first_timestamp_nulls + description: "Check for nulls in first_timestamp" + comparisons: + - type: threshold + operator: AUTO + metric: NULL_RATE diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/firefox_android_clients_v1/query.sql /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/firefox_android_clients_v1/query.sql --- /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/firefox_android_clients_v1/query.sql 2024-06-04 21:13:56.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/firefox_android_clients_v1/query.sql 2024-06-04 21:13:53.000000000 +0000 @@ -1,9 +1,4 @@ SELECT - *, - -- add some additional fields to test schema changes - CAST(NULL AS STRING) AS additional_field_1, - CAST(NULL AS INT64) AS additional_field_2, - CAST(NULL AS STRING) AS additional_field_3, - CAST(NULL AS STRING) AS additional_field_4, + * FROM `moz-fx-data-shared-prod.fenix_derived.firefox_android_clients_v1` diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/data-observability-dev/montecarlo.yaml /tmp/workspace/generated-sql/sql/data-observability-dev/montecarlo.yaml --- /tmp/workspace/main-generated-sql/sql/data-observability-dev/montecarlo.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/data-observability-dev/montecarlo.yaml 2024-06-04 21:13:53.000000000 +0000 @@ -0,0 +1,3 @@ +version: 1 +include_file_patterns: + - "*/*/*/monitors.yaml" diff -bur --no-dereference --new-file 
/tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/firefox_ios_derived/retention_v1/backfill.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/firefox_ios_derived/retention_v1/backfill.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/firefox_ios_derived/retention_v1/backfill.yaml 2024-06-04 21:13:56.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/firefox_ios_derived/retention_v1/backfill.yaml 2024-06-04 21:13:54.000000000 +0000 @@ -4,4 +4,4 @@ reason: The table is created, this is to populate it with data. watchers: - kik@mozilla.com - status: Complete + status: Initiate diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/desktop_retention_v1/backfill.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/desktop_retention_v1/backfill.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/desktop_retention_v1/backfill.yaml 2024-06-04 21:13:56.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/desktop_retention_v1/backfill.yaml 2024-06-04 21:13:54.000000000 +0000 @@ -6,4 +6,4 @@ - mhirose@mozilla.com - anicholson@mozilla.com - wichan@mozilla.com - status: Complete + status: Initiate diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/unified_metrics_v1/metadata.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/unified_metrics_v1/metadata.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/unified_metrics_v1/metadata.yaml 2024-06-04 21:15:35.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/unified_metrics_v1/metadata.yaml 2024-06-04 21:15:40.000000000 +0000 @@ -19,6 +19,10 @@ owner1: loines scheduling: dag_name: bqetl_unified + 
external_downstream_tasks: + - task_id: wait_for_unified_metrics + dag_name: kpi_forecasting + execution_delta: 1h bigquery: time_partitioning: type: day ```

Link to full diff

dataops-ci-bot commented 3 months ago

Integration report for "Merge branch 'main' into bigconfig-example"

sql.diff

Click to expand! ```diff Only in /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1: bigconfig.yaml Only in /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1: montecarlo Only in /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1: montecarlo.yaml Only in /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1: bigconfig.yaml Only in /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1: montecarlo Only in /tmp/workspace/generated-sql/sql/data-observability-dev: montecarlo.yaml diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/bigconfig.yaml /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/bigconfig.yaml --- /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/bigconfig.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/bigconfig.yaml 2024-06-05 17:43:29.000000000 +0000 @@ -0,0 +1,30 @@ +type: BIGCONFIG_FILE +auto_apply_on_indexing: True + +table_deployments: + - collection: + name: Fenix_derived + deployments: + - fq_table_name: data-observability-dev.data-observability-dev.fenix_derived.events_daily_v1 + row_creation_time: submission_date + table_metrics: + - metric_type: + predefined_metric: COUNT_ROWS + columns: + - column_name: client_id + metrics: + - metric_type: + predefined_metric: PERCENT_NULL + - metric_type: + predefined_metric: COUNT_DUPLICATES + - column_name: channel + metrics: + - metric_type: + predefined_metric: PERCENT_NULL + - metric_type: + predefined_metric: COUNT_DISTINCT + - metric_type: + predefined_metric: PERCENT_VALUE_IN_LIST + parameters: + - key: list + string_value: "release,beta,nightly" diff -bur --no-dereference --new-file 
/tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/montecarlo/monitors.yaml /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/montecarlo/monitors.yaml --- /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/montecarlo/monitors.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/montecarlo/monitors.yaml 2024-06-05 17:43:29.000000000 +0000 @@ -0,0 +1,50 @@ +namespace: fenix_derived +montecarlo: + field_health: + - table: data-observability-dev:fenix_derived.events_daily_v1 + timestamp_field: submission_date + name: submission_date_nulls + description: "Check for nulls in submission_date" + comparisons: + - type: threshold + operator: AUTO + metric: NULL_RATE + - table: data-observability-dev:fenix_derived.events_daily_v1 + timestamp_field: submission_date + name: client_id_uniqueness + description: client_id duplicates + comparisons: + - type: threshold + operator: LT + fields: + - client_id + threshold_value: 1 + metric: DUPLICATE_COUNT + - table: data-observability-dev:fenix_derived.events_daily_v1 + timestamp_field: submission_date + name: client_id_nullness + description: client_id NULLs + comparisons: + - type: threshold + operator: LT + metric: NULL_COUNT + threshold_value: 1 + fields: + - client_id + custom_sql: + - sql: | + select COUNT(*) from {{table}} where channel not in ('release', 'beta', 'nightly') + variables: + table: + - data-observability-dev:fenix_derived.events_daily_v1 + name: channel_value_check + description: Check channel values + query_result_type: SINGLE_NUMERIC + schedule: + type: fixed + start_time: "2024-06-02T19:00:00" + interval_minutes: 1440 + comparisons: + - type: threshold + operator: LT + threshold_value: 0 diff -bur --no-dereference --new-file 
/tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/montecarlo.yaml /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/montecarlo.yaml --- /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/montecarlo.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/events_daily_v1/montecarlo.yaml 2024-06-05 17:43:29.000000000 +0000 @@ -0,0 +1,2 @@ +version: 1 +namespace: fenix_derived diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1/bigconfig.yaml /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1/bigconfig.yaml --- /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1/bigconfig.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1/bigconfig.yaml 2024-06-05 17:43:29.000000000 +0000 @@ -0,0 +1,12 @@ +type: BIGCONFIG_FILE +auto_apply_on_indexing: True + +table_deployments: + - collection: + name: Fenix_derived + deployments: + - fq_table_name: data-observability-dev.data-observability-dev.fenix_derived.event_types_v1 + row_creation_time: first_timestamp + table_metrics: + - metric_type: + predefined_metric: COUNT_ROWS diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1/montecarlo/monitors.yaml /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1/montecarlo/monitors.yaml --- /tmp/workspace/main-generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1/montecarlo/monitors.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/data-observability-dev/fenix_derived/event_types_v1/montecarlo/monitors.yaml 2024-06-05 17:43:29.000000000 +0000 @@ -0,0 
+1,11 @@ +namespace: fenix_derived +montecarlo: + field_health: + - table: data-observability-dev:fenix_derived.event_types_v1 + timestamp_field: first_timestamp + name: first_timestamp_nulls + description: "Check for nulls in first_timestamp" + comparisons: + - type: threshold + operator: AUTO + metric: NULL_RATE diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/data-observability-dev/montecarlo.yaml /tmp/workspace/generated-sql/sql/data-observability-dev/montecarlo.yaml --- /tmp/workspace/main-generated-sql/sql/data-observability-dev/montecarlo.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/data-observability-dev/montecarlo.yaml 2024-06-05 17:43:29.000000000 +0000 @@ -0,0 +1,3 @@ +version: 1 +include_file_patterns: + - "*/*/*/monitors.yaml" ```

Link to full diff