[ ] Commits should reference a bug or github issue, if relevant (if a bug is referenced, the pull request should include the bug number in the title).
[ ] If the PR comes from a fork, trigger integration CI tests by running the Push to upstream workflow and provide the <username>:<branch> of the fork as parameter. The parameter will also show up
in the logs of the manual-trigger-required-for-fork CI task together with more detailed instructions.
[ ] If adding a new field to a query, ensure that the schema and dependent downstream schemas have been updated.
[ ] When adding a new derived dataset, ensure that data is not available already (fully or partially) and recommend extending an existing dataset in favor of creating new ones. Data can be available in the bigquery-etl repository, looker-hub or in looker-spoke-default.
For modifications to schemas in restricted namespaces (see CODEOWNERS):
Integration report for "DENG-3273 Add check to validate backend event counts in accounts_events vs events pings"
sql.diff
Click to expand!
```diff
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/dags/bqetl_accounts_derived.py /tmp/workspace/generated-sql/dags/bqetl_accounts_derived.py
--- /tmp/workspace/main-generated-sql/dags/bqetl_accounts_derived.py 2024-06-25 07:28:34.000000000 +0000
+++ /tmp/workspace/generated-sql/dags/bqetl_accounts_derived.py 2024-06-25 07:29:12.000000000 +0000
@@ -88,6 +88,19 @@
depends_on_past=False,
)
+ checks__warn_accounts_backend_derived__users_services_daily__v1 = bigquery_dq_check(
+ task_id="checks__warn_accounts_backend_derived__users_services_daily__v1",
+ source_table="users_services_daily_v1",
+ dataset_id="accounts_backend_derived",
+ project_id="moz-fx-data-shared-prod",
+ is_dq_check_fail=False,
+ owner="ksiegler@mozilla.com",
+ email=["akomar@mozilla.com", "ksiegler@mozilla.com"],
+ depends_on_past=False,
+ parameters=["submission_date:DATE:{{ds}}"],
+ retries=0,
+ )
+
accounts_backend_derived__users_services_daily__v1.set_upstream(
wait_for_copy_deduplicate_all
)
@@ -95,3 +108,11 @@
accounts_backend_derived__users_services_last_seen__v1.set_upstream(
accounts_backend_derived__users_services_daily__v1
)
+
+ checks__warn_accounts_backend_derived__users_services_daily__v1.set_upstream(
+ accounts_backend_derived__users_services_daily__v1
+ )
+
+ checks__warn_accounts_backend_derived__users_services_daily__v1.set_upstream(
+ wait_for_copy_deduplicate_all
+ )
Only in /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1: checks.sql
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1/checks.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1/checks.sql
--- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1/checks.sql 1970-01-01 00:00:00.000000000 +0000
+++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1/checks.sql 2024-06-25 07:24:22.000000000 +0000
@@ -0,0 +1,55 @@
+#warn
+-- check if we're receiving the same number of events in `accounts_events` and `events` pings
+-- this is a temporary check that will be removed once migration is completed (https://mozilla-hub.atlassian.net/browse/DENG-2407)
+WITH events_new AS (
+ SELECT
+ DATE(e.submission_timestamp) AS day,
+ CONCAT(event.category, "_", event.name) AS event_name,
+ COUNT(*) AS count_new
+ FROM
+ `moz-fx-data-shared-prod.accounts_backend_stable.events_v1` AS e
+ CROSS JOIN
+ UNNEST(e.events) AS event
+ WHERE
+ DATE(submission_timestamp) = @submission_date
+ GROUP BY
+ ALL
+ ORDER BY
+ 1
+),
+events_old AS (
+ SELECT
+ DATE(submission_timestamp) AS day,
+ metrics.string.event_name AS event_name,
+ COUNT(*) AS count_old
+ FROM
+ `moz-fx-data-shared-prod.accounts_backend_stable.accounts_events_v1`
+ WHERE
+ DATE(submission_timestamp) = @submission_date
+ GROUP BY
+ ALL
+ ORDER BY
+ 1
+),
+check_results AS (
+ SELECT
+ *,
+ events_new.count_new - events_old.count_old AS count_diff
+ FROM
+ events_new
+ FULL OUTER JOIN
+ events_old
+ USING (day, event_name)
+ WHERE
+ events_new.count_new IS NULL
+ OR events_old.count_old IS NULL
+ OR (
+ (event_name NOT LIKE 'access_token_%' AND events_new.count_new - events_old.count_old > 1)
+ -- access_token_checked is sent frequently, 300M per day and due to small time differences some events might end up in a different day's parition
+ OR (event_name LIKE 'access_token_%' AND events_new.count_new - events_old.count_old > 50)
+ )
+)
+SELECT
+ IF(COUNT(*) > 0, ERROR('Events count mismatch between accounts_events and events pings'), NULL)
+FROM
+ check_results;
```
Click to expand!
```diff
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/dags/bqetl_accounts_derived.py /tmp/workspace/generated-sql/dags/bqetl_accounts_derived.py
--- /tmp/workspace/main-generated-sql/dags/bqetl_accounts_derived.py 2024-06-25 09:18:19.000000000 +0000
+++ /tmp/workspace/generated-sql/dags/bqetl_accounts_derived.py 2024-06-25 09:20:18.000000000 +0000
@@ -88,6 +88,19 @@
depends_on_past=False,
)
+ checks__warn_accounts_backend_derived__users_services_daily__v1 = bigquery_dq_check(
+ task_id="checks__warn_accounts_backend_derived__users_services_daily__v1",
+ source_table="users_services_daily_v1",
+ dataset_id="accounts_backend_derived",
+ project_id="moz-fx-data-shared-prod",
+ is_dq_check_fail=False,
+ owner="ksiegler@mozilla.com",
+ email=["akomar@mozilla.com", "ksiegler@mozilla.com"],
+ depends_on_past=False,
+ parameters=["submission_date:DATE:{{ds}}"],
+ retries=0,
+ )
+
accounts_backend_derived__users_services_daily__v1.set_upstream(
wait_for_copy_deduplicate_all
)
@@ -95,3 +108,11 @@
accounts_backend_derived__users_services_last_seen__v1.set_upstream(
accounts_backend_derived__users_services_daily__v1
)
+
+ checks__warn_accounts_backend_derived__users_services_daily__v1.set_upstream(
+ accounts_backend_derived__users_services_daily__v1
+ )
+
+ checks__warn_accounts_backend_derived__users_services_daily__v1.set_upstream(
+ wait_for_copy_deduplicate_all
+ )
Only in /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1: checks.sql
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1/checks.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1/checks.sql
--- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1/checks.sql 1970-01-01 00:00:00.000000000 +0000
+++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/users_services_daily_v1/checks.sql 2024-06-25 09:14:27.000000000 +0000
@@ -0,0 +1,53 @@
+#warn
+-- check if we're receiving the same number of events in `accounts_events` and `events` pings
+-- this is a temporary check that will be removed once migration is completed (https://mozilla-hub.atlassian.net/browse/DENG-2407)
+WITH events_new AS (
+ SELECT
+ DATE(e.submission_timestamp) AS day,
+ CONCAT(event.category, "_", event.name) AS event_name,
+ COUNT(*) AS count_new
+ FROM
+ `moz-fx-data-shared-prod.accounts_backend_stable.events_v1` AS e
+ CROSS JOIN
+ UNNEST(e.events) AS event
+ WHERE
+ DATE(submission_timestamp) = @submission_date
+ GROUP BY
+ DATE(e.submission_timestamp),
+ CONCAT(event.category, "_", event.name)
+),
+events_old AS (
+ SELECT
+ DATE(submission_timestamp) AS day,
+ metrics.string.event_name AS event_name,
+ COUNT(*) AS count_old
+ FROM
+ `moz-fx-data-shared-prod.accounts_backend_stable.accounts_events_v1`
+ WHERE
+ DATE(submission_timestamp) = @submission_date
+ GROUP BY
+ DATE(submission_timestamp),
+ metrics.string.event_name
+),
+check_results AS (
+ SELECT
+ *,
+ events_new.count_new - events_old.count_old AS count_diff
+ FROM
+ events_new
+ FULL OUTER JOIN
+ events_old
+ USING (day, event_name)
+ WHERE
+ events_new.count_new IS NULL
+ OR events_old.count_old IS NULL
+ OR (
+ (event_name NOT LIKE 'access_token_%' AND events_new.count_new - events_old.count_old > 1)
+ -- access_token_checked is sent frequently, 300M per day and due to small time differences some events might end up in a different day's parition
+ OR (event_name LIKE 'access_token_%' AND events_new.count_new - events_old.count_old > 50)
+ )
+)
+SELECT
+ IF(COUNT(*) > 0, ERROR('Events count mismatch between accounts_events and events pings'), NULL)
+FROM
+ check_results;
```
Checklist for reviewer:
<username>:<branch>
of the fork as parameter. The parameter will also show up in the logs of themanual-trigger-required-for-fork
CI task together with more detailed instructions.For modifications to schemas in restricted namespaces (see
CODEOWNERS
):┆Issue is synchronized with this Jira Task