mozilla / bigquery-etl

Bigquery ETL
https://mozilla.github.io/bigquery-etl
Mozilla Public License 2.0
241 stars 98 forks source link

DENG-3273 Add check to validate backend event counts in accounts_events vs events pings #5831

Closed akkomar closed 6 days ago

akkomar commented 6 days ago

Checklist for reviewer:

For modifications to schemas in restricted namespaces (see CODEOWNERS):

┆Issue is synchronized with this Jira Task

dataops-ci-bot commented 6 days ago

Integration report for "DENG-3273 Add check to validate backend event counts in accounts_events vs events pings"

sql.diff

Click to expand! ```diff diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/dags/bqetl_accounts_derived.py /tmp/workspace/generated-sql/dags/bqetl_accounts_derived.py --- /tmp/workspace/main-generated-sql/dags/bqetl_accounts_derived.py 2024-06-25 07:28:34.000000000 +0000 +++ /tmp/workspace/generated-sql/dags/bqetl_accounts_derived.py 2024-06-25 07:29:12.000000000 +0000 @@ -88,6 +88,19 @@ depends_on_past=False, ) + checks__warn_accounts_backend_derived__users_services_daily__v1 = bigquery_dq_check( + task_id="checks__warn_accounts_backend_derived__users_services_daily__v1", + source_table="users_services_daily_v1", + dataset_id="accounts_backend_derived", + project_id="moz-fx-data-shared-prod", + is_dq_check_fail=False, + owner="ksiegler@mozilla.com", + email=["akomar@mozilla.com", "ksiegler@mozilla.com"], + depends_on_past=False, + parameters=["submission_date:DATE:{{ds}}"], + retries=0, + ) + accounts_backend_derived__users_services_daily__v1.set_upstream( wait_for_copy_deduplicate_all ) @@ -95,3 +108,11 @@ accounts_backend_derived__users_services_last_seen__v1.set_upstream( accounts_backend_derived__users_services_daily__v1 ) + + checks__warn_accounts_backend_derived__users_services_daily__v1.set_upstream( + accounts_backend_derived__users_services_daily__v1 + ) + + checks__warn_accounts_backend_derived__users_services_daily__v1.set_upstream( + wait_for_copy_deduplicate_all + ) Only in /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1: checks.sql diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1/checks.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1/checks.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1/checks.sql 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1/checks.sql 2024-06-25 07:24:22.000000000 +0000 @@ -0,0 +1,55 @@ +#warn +-- check if we're receiving the same number of events in `accounts_events` and `events` pings +-- this is a temporary check that will be removed once migration is completed (https://mozilla-hub.atlassian.net/browse/DENG-2407) +WITH events_new AS ( + SELECT + DATE(e.submission_timestamp) AS day, + CONCAT(event.category, "_", event.name) AS event_name, + COUNT(*) AS count_new + FROM + `moz-fx-data-shared-prod.accounts_backend_stable.events_v1` AS e + CROSS JOIN + UNNEST(e.events) AS event + WHERE + DATE(submission_timestamp) = @submission_date + GROUP BY + ALL + ORDER BY + 1 +), +events_old AS ( + SELECT + DATE(submission_timestamp) AS day, + metrics.string.event_name AS event_name, + COUNT(*) AS count_old + FROM + `moz-fx-data-shared-prod.accounts_backend_stable.accounts_events_v1` + WHERE + DATE(submission_timestamp) = @submission_date + GROUP BY + ALL + ORDER BY + 1 +), +check_results AS ( + SELECT + *, + events_new.count_new - events_old.count_old AS count_diff + FROM + events_new + FULL OUTER JOIN + events_old + USING (day, event_name) + WHERE + events_new.count_new IS NULL + OR events_old.count_old IS NULL + OR ( + (event_name NOT LIKE 'access_token_%' AND events_new.count_new - events_old.count_old > 1) + -- access_token_checked is sent frequently, 300M per day and due to small time differences some events might end up in a different day's parition + OR (event_name LIKE 'access_token_%' AND events_new.count_new - events_old.count_old > 50) + ) +) +SELECT + IF(COUNT(*) > 0, ERROR('Events count mismatch between accounts_events and events pings'), NULL) +FROM + check_results; ```

Link to full diff

dataops-ci-bot commented 6 days ago

Integration report for "Remove group by all"

sql.diff

Click to expand! ```diff diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/dags/bqetl_accounts_derived.py /tmp/workspace/generated-sql/dags/bqetl_accounts_derived.py --- /tmp/workspace/main-generated-sql/dags/bqetl_accounts_derived.py 2024-06-25 09:18:19.000000000 +0000 +++ /tmp/workspace/generated-sql/dags/bqetl_accounts_derived.py 2024-06-25 09:20:18.000000000 +0000 @@ -88,6 +88,19 @@ depends_on_past=False, ) + checks__warn_accounts_backend_derived__users_services_daily__v1 = bigquery_dq_check( + task_id="checks__warn_accounts_backend_derived__users_services_daily__v1", + source_table="users_services_daily_v1", + dataset_id="accounts_backend_derived", + project_id="moz-fx-data-shared-prod", + is_dq_check_fail=False, + owner="ksiegler@mozilla.com", + email=["akomar@mozilla.com", "ksiegler@mozilla.com"], + depends_on_past=False, + parameters=["submission_date:DATE:{{ds}}"], + retries=0, + ) + accounts_backend_derived__users_services_daily__v1.set_upstream( wait_for_copy_deduplicate_all ) @@ -95,3 +108,11 @@ accounts_backend_derived__users_services_last_seen__v1.set_upstream( accounts_backend_derived__users_services_daily__v1 ) + + checks__warn_accounts_backend_derived__users_services_daily__v1.set_upstream( + accounts_backend_derived__users_services_daily__v1 + ) + + checks__warn_accounts_backend_derived__users_services_daily__v1.set_upstream( + wait_for_copy_deduplicate_all + ) Only in /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1: checks.sql diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1/checks.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1/checks.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1/checks.sql 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1/checks.sql 2024-06-25 09:14:27.000000000 +0000 @@ -0,0 +1,53 @@ +#warn +-- check if we're receiving the same number of events in `accounts_events` and `events` pings +-- this is a temporary check that will be removed once migration is completed (https://mozilla-hub.atlassian.net/browse/DENG-2407) +WITH events_new AS ( + SELECT + DATE(e.submission_timestamp) AS day, + CONCAT(event.category, "_", event.name) AS event_name, + COUNT(*) AS count_new + FROM + `moz-fx-data-shared-prod.accounts_backend_stable.events_v1` AS e + CROSS JOIN + UNNEST(e.events) AS event + WHERE + DATE(submission_timestamp) = @submission_date + GROUP BY + DATE(e.submission_timestamp), + CONCAT(event.category, "_", event.name) +), +events_old AS ( + SELECT + DATE(submission_timestamp) AS day, + metrics.string.event_name AS event_name, + COUNT(*) AS count_old + FROM + `moz-fx-data-shared-prod.accounts_backend_stable.accounts_events_v1` + WHERE + DATE(submission_timestamp) = @submission_date + GROUP BY + DATE(submission_timestamp), + metrics.string.event_name +), +check_results AS ( + SELECT + *, + events_new.count_new - events_old.count_old AS count_diff + FROM + events_new + FULL OUTER JOIN + events_old + USING (day, event_name) + WHERE + events_new.count_new IS NULL + OR events_old.count_old IS NULL + OR ( + (event_name NOT LIKE 'access_token_%' AND events_new.count_new - events_old.count_old > 1) + -- access_token_checked is sent frequently, 300M per day and due to small time differences some events might end up in a different day's parition + OR (event_name LIKE 'access_token_%' AND events_new.count_new - events_old.count_old > 50) + ) +) +SELECT + IF(COUNT(*) > 0, ERROR('Events count mismatch between accounts_events and events pings'), NULL) +FROM + check_results; ```

Link to full diff