mozilla / bigquery-etl

Bigquery ETL
https://mozilla.github.io/bigquery-etl
Mozilla Public License 2.0
246 stars 98 forks source link

[DENG-3889] Rewrite events_stream_v1 for better performance #5659

Closed BenWu closed 2 months ago

BenWu commented 2 months ago

DENG-3889

Changes:

performance

1% sample (sample_id = 1) on firefox desktop for 2024-05-22: changes slot hours job id (backfill-2)
base 24.5 bquxjob_7a85e33a_18fab4919e3
rearranged 5.7 bquxjob_4ababcd5_18fab3b62ea
sql udfs 4.3 bquxjob_129253ac_18fab52d5c9
rearranged + sql udfs 3.4 bquxjob_4145760d_18fab765f03

100% of firefox desktop for 2024-05-22 (both overwriting a clustered partition):

changes slot hours job id (backfill-2)
base (airflow run) 3910 😵‍💫 bqjob_r6deaccf950e8d7f5_0000018fa35c4fce_1
rearranged + sql udfs 699 bquxjob_5293faeb_18fab5ab707

I'm mostly confident the output is equivalent because this doesn't throw any errors:

SELECT
  mozfun.assert.json_equals(from_map_event_extra(event.extra), mozfun.json.from_map(event.extra)),
  mozfun.assert.json_equals(from_map_experiment(ping_info.experiments), mozfun.json.from_map(ping_info.experiments)),
FROM
  `moz-fx-data-shared-prod.firefox_desktop_stable.events_v1`
CROSS JOIN 
  UNNEST(events) AS event
WHERE 
  DATE(submission_timestamp) IN ('2024-05-22')
  AND sample_id = 1

Also ran this for fenix and ios

Checklist for reviewer:

For modifications to schemas in restricted namespaces (see CODEOWNERS):

┆Issue is synchronized with this Jira Task

BenWu commented 2 months ago

cc @badboy as a query owner

badboy commented 2 months ago

Fantastic!

dataops-ci-bot commented 2 months ago

Integration report for "Merge branch 'main' into benwu/events-stream-udf"

sql.diff

Click to expand! ```diff Only in /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry/desktop_retention_clients: schema.yaml diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/event_monitoring_live_v1/materialized_view.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/event_monitoring_live_v1/materialized_view.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/event_monitoring_live_v1/materialized_view.sql 2024-05-27 14:55:32.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/event_monitoring_live_v1/materialized_view.sql 2024-05-27 14:57:24.000000000 +0000 @@ -60,7 +60,7 @@ LEFT JOIN UNNEST(event.extra) AS event_extra WHERE - DATE(submission_timestamp) >= "2024-05-24" + DATE(submission_timestamp) >= "2024-05-27" GROUP BY submission_date, window_start, diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/events_stream_v1/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/events_stream_v1/query.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/events_stream_v1/query.sql 2024-05-27 14:55:32.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_backend_derived/events_stream_v1/query.sql 2024-05-27 14:57:25.000000000 +0000 @@ -1,7 +1,73 @@ -- Generated via bigquery_etl.glean_usage +-- convert array of key value pairs to a json object, cast numbers and booleans if possible +CREATE TEMP FUNCTION from_map_event_extra(input ARRAY>) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT( + ARRAY(SELECT key FROM UNNEST(input)), + ARRAY( + SELECT + CASE + WHEN SAFE_CAST(value AS NUMERIC) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS NUMERIC)) + WHEN SAFE_CAST(value AS BOOL) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS BOOL)) + ELSE TO_JSON(value) + END + FROM + UNNEST(input) + ) + ) + ) +); + +-- convert array of key value pairs to a json object +-- values are nested structs and will be converted to json objects +CREATE TEMP FUNCTION from_map_experiment( + input ARRAY< + STRUCT>> + > +) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT(ARRAY(SELECT key FROM UNNEST(input)), ARRAY(SELECT value FROM UNNEST(input))) + ) +); + +CREATE TEMP FUNCTION metrics_to_json(metrics JSON) +RETURNS JSON AS ( + JSON_STRIP_NULLS( + JSON_REMOVE( + -- labeled_* are the only ones that SHOULD show up as context for events pings, + -- thus we special-case them + -- + -- The JSON_SET/JSON_EXTRACT shenanigans are needed + -- because those subfields might not exist, so accessing the columns would fail. + -- but accessing non-existent fields in a JSON object simply gives us NULL. + JSON_SET( + metrics, + '$.labeled_counter', + mozfun.json.from_nested_map(metrics.labeled_counter), + '$.labeled_string', + mozfun.json.from_nested_map(metrics.labeled_string), + '$.labeled_boolean', + mozfun.json.from_nested_map(metrics.labeled_boolean), + '$.url', + metrics.url2 + ), + '$.url2' + ), + remove_empty => TRUE + ) +); + WITH base AS ( SELECT - * EXCEPT (metrics, events, name, category, extra, timestamp) REPLACE( + * REPLACE ( STRUCT( client_info.app_build AS app_build, client_info.app_channel AS app_channel, @@ -25,24 +91,14 @@ ping_info.end_time, ping_info.parsed_end_time, ping_info.ping_type - ) AS ping_info + ) AS ping_info, + metrics_to_json(TO_JSON(metrics)) AS metrics ), client_info.client_id AS client_id, ping_info.reason AS reason, - `mozfun.json.from_map`(ping_info.experiments) AS experiments, - COALESCE( - SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), - SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) - ) AS event_timestamp, - event.category AS event_category, - event.name AS event_name, - ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better - `mozfun.json.from_map`(event.extra) AS event_extra, - TO_JSON(metrics) AS metrics + from_map_experiment(ping_info.experiments) AS experiments, FROM - `accounts_backend.events` AS e - CROSS JOIN - UNNEST(events) AS event + `accounts_backend.events` WHERE {% if is_init() %} DATE(submission_timestamp) >= '2023-11-01' @@ -52,34 +108,16 @@ ) -- SELECT - * REPLACE ( - -- expose as easy to access JSON column, - -- strip nulls, - -- translate nested array records into a JSON object, - -- rename url2 -> url - JSON_STRIP_NULLS( - JSON_REMOVE( - -- labeled_* are the only ones that SHOULD show up as context for events pings, - -- thus we special-case them - -- - -- The JSON_SET/JSON_EXTRACT shenanigans are needed - -- because those subfields might not exist, so accessing the columns would fail. - -- but accessing non-existent fields in a JSON object simply gives us NULL. - JSON_SET( - metrics, - '$.labeled_counter', - mozfun.json.from_nested_map(metrics.labeled_counter), - '$.labeled_string', - mozfun.json.from_nested_map(metrics.labeled_string), - '$.labeled_boolean', - mozfun.json.from_nested_map(metrics.labeled_boolean), - '$.url', - metrics.url2 - ), - '$.url2' - ), - remove_empty => TRUE - ) AS metrics - ) + base.* EXCEPT (events), + COALESCE( + SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), + SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) + ) AS event_timestamp, + event.category AS event_category, + event.name AS event_name, + ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better + from_map_event_extra(event.extra) AS event_extra, FROM base +CROSS JOIN + UNNEST(events) AS event diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_cirrus_derived/events_stream_v1/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_cirrus_derived/events_stream_v1/query.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_cirrus_derived/events_stream_v1/query.sql 2024-05-27 14:55:32.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_cirrus_derived/events_stream_v1/query.sql 2024-05-27 14:57:26.000000000 +0000 @@ -1,7 +1,73 @@ -- Generated via bigquery_etl.glean_usage +-- convert array of key value pairs to a json object, cast numbers and booleans if possible +CREATE TEMP FUNCTION from_map_event_extra(input ARRAY>) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT( + ARRAY(SELECT key FROM UNNEST(input)), + ARRAY( + SELECT + CASE + WHEN SAFE_CAST(value AS NUMERIC) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS NUMERIC)) + WHEN SAFE_CAST(value AS BOOL) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS BOOL)) + ELSE TO_JSON(value) + END + FROM + UNNEST(input) + ) + ) + ) +); + +-- convert array of key value pairs to a json object +-- values are nested structs and will be converted to json objects +CREATE TEMP FUNCTION from_map_experiment( + input ARRAY< + STRUCT>> + > +) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT(ARRAY(SELECT key FROM UNNEST(input)), ARRAY(SELECT value FROM UNNEST(input))) + ) +); + +CREATE TEMP FUNCTION metrics_to_json(metrics JSON) +RETURNS JSON AS ( + JSON_STRIP_NULLS( + JSON_REMOVE( + -- labeled_* are the only ones that SHOULD show up as context for events pings, + -- thus we special-case them + -- + -- The JSON_SET/JSON_EXTRACT shenanigans are needed + -- because those subfields might not exist, so accessing the columns would fail. + -- but accessing non-existent fields in a JSON object simply gives us NULL. + JSON_SET( + metrics, + '$.labeled_counter', + mozfun.json.from_nested_map(metrics.labeled_counter), + '$.labeled_string', + mozfun.json.from_nested_map(metrics.labeled_string), + '$.labeled_boolean', + mozfun.json.from_nested_map(metrics.labeled_boolean), + '$.url', + metrics.url2 + ), + '$.url2' + ), + remove_empty => TRUE + ) +); + WITH base AS ( SELECT - * EXCEPT (metrics, events, name, category, extra, timestamp) REPLACE( + * REPLACE ( STRUCT( client_info.app_build AS app_build, client_info.app_channel AS app_channel, @@ -25,24 +91,14 @@ ping_info.end_time, ping_info.parsed_end_time, ping_info.ping_type - ) AS ping_info + ) AS ping_info, + metrics_to_json(TO_JSON(metrics)) AS metrics ), client_info.client_id AS client_id, ping_info.reason AS reason, - `mozfun.json.from_map`(ping_info.experiments) AS experiments, - COALESCE( - SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), - SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) - ) AS event_timestamp, - event.category AS event_category, - event.name AS event_name, - ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better - `mozfun.json.from_map`(event.extra) AS event_extra, - TO_JSON(metrics) AS metrics + from_map_experiment(ping_info.experiments) AS experiments, FROM - `accounts_cirrus.events` AS e - CROSS JOIN - UNNEST(events) AS event + `accounts_cirrus.events` WHERE {% if is_init() %} DATE(submission_timestamp) >= '2023-11-01' @@ -52,34 +108,16 @@ ) -- SELECT - * REPLACE ( - -- expose as easy to access JSON column, - -- strip nulls, - -- translate nested array records into a JSON object, - -- rename url2 -> url - JSON_STRIP_NULLS( - JSON_REMOVE( - -- labeled_* are the only ones that SHOULD show up as context for events pings, - -- thus we special-case them - -- - -- The JSON_SET/JSON_EXTRACT shenanigans are needed - -- because those subfields might not exist, so accessing the columns would fail. - -- but accessing non-existent fields in a JSON object simply gives us NULL. - JSON_SET( - metrics, - '$.labeled_counter', - mozfun.json.from_nested_map(metrics.labeled_counter), - '$.labeled_string', - mozfun.json.from_nested_map(metrics.labeled_string), - '$.labeled_boolean', - mozfun.json.from_nested_map(metrics.labeled_boolean), - '$.url', - metrics.url2 - ), - '$.url2' - ), - remove_empty => TRUE - ) AS metrics - ) + base.* EXCEPT (events), + COALESCE( + SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), + SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) + ) AS event_timestamp, + event.category AS event_category, + event.name AS event_name, + ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better + from_map_event_extra(event.extra) AS event_extra, FROM base +CROSS JOIN + UNNEST(events) AS event diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_frontend_derived/event_monitoring_live_v1/materialized_view.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_frontend_derived/event_monitoring_live_v1/materialized_view.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_frontend_derived/event_monitoring_live_v1/materialized_view.sql 2024-05-27 14:55:32.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_frontend_derived/event_monitoring_live_v1/materialized_view.sql 2024-05-27 14:57:24.000000000 +0000 @@ -60,7 +60,7 @@ LEFT JOIN UNNEST(event.extra) AS event_extra WHERE - DATE(submission_timestamp) >= "2024-05-24" + DATE(submission_timestamp) >= "2024-05-27" GROUP BY submission_date, window_start, diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_frontend_derived/events_stream_v1/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_frontend_derived/events_stream_v1/query.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/accounts_frontend_derived/events_stream_v1/query.sql 2024-05-27 14:55:32.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/accounts_frontend_derived/events_stream_v1/query.sql 2024-05-27 14:57:27.000000000 +0000 @@ -1,7 +1,73 @@ -- Generated via bigquery_etl.glean_usage +-- convert array of key value pairs to a json object, cast numbers and booleans if possible +CREATE TEMP FUNCTION from_map_event_extra(input ARRAY>) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT( + ARRAY(SELECT key FROM UNNEST(input)), + ARRAY( + SELECT + CASE + WHEN SAFE_CAST(value AS NUMERIC) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS NUMERIC)) + WHEN SAFE_CAST(value AS BOOL) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS BOOL)) + ELSE TO_JSON(value) + END + FROM + UNNEST(input) + ) + ) + ) +); + +-- convert array of key value pairs to a json object +-- values are nested structs and will be converted to json objects +CREATE TEMP FUNCTION from_map_experiment( + input ARRAY< + STRUCT>> + > +) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT(ARRAY(SELECT key FROM UNNEST(input)), ARRAY(SELECT value FROM UNNEST(input))) + ) +); + +CREATE TEMP FUNCTION metrics_to_json(metrics JSON) +RETURNS JSON AS ( + JSON_STRIP_NULLS( + JSON_REMOVE( + -- labeled_* are the only ones that SHOULD show up as context for events pings, + -- thus we special-case them + -- + -- The JSON_SET/JSON_EXTRACT shenanigans are needed + -- because those subfields might not exist, so accessing the columns would fail. + -- but accessing non-existent fields in a JSON object simply gives us NULL. + JSON_SET( + metrics, + '$.labeled_counter', + mozfun.json.from_nested_map(metrics.labeled_counter), + '$.labeled_string', + mozfun.json.from_nested_map(metrics.labeled_string), + '$.labeled_boolean', + mozfun.json.from_nested_map(metrics.labeled_boolean), + '$.url', + metrics.url2 + ), + '$.url2' + ), + remove_empty => TRUE + ) +); + WITH base AS ( SELECT - * EXCEPT (metrics, events, name, category, extra, timestamp) REPLACE( + * REPLACE ( STRUCT( client_info.app_build AS app_build, client_info.app_channel AS app_channel, @@ -25,24 +91,14 @@ ping_info.end_time, ping_info.parsed_end_time, ping_info.ping_type - ) AS ping_info + ) AS ping_info, + metrics_to_json(TO_JSON(metrics)) AS metrics ), client_info.client_id AS client_id, ping_info.reason AS reason, - `mozfun.json.from_map`(ping_info.experiments) AS experiments, - COALESCE( - SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), - SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) - ) AS event_timestamp, - event.category AS event_category, - event.name AS event_name, - ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better - `mozfun.json.from_map`(event.extra) AS event_extra, - TO_JSON(metrics) AS metrics + from_map_experiment(ping_info.experiments) AS experiments, FROM - `accounts_frontend.events` AS e - CROSS JOIN - UNNEST(events) AS event + `accounts_frontend.events` WHERE {% if is_init() %} DATE(submission_timestamp) >= '2023-11-01' @@ -52,34 +108,16 @@ ) -- SELECT - * REPLACE ( - -- expose as easy to access JSON column, - -- strip nulls, - -- translate nested array records into a JSON object, - -- rename url2 -> url - JSON_STRIP_NULLS( - JSON_REMOVE( - -- labeled_* are the only ones that SHOULD show up as context for events pings, - -- thus we special-case them - -- - -- The JSON_SET/JSON_EXTRACT shenanigans are needed - -- because those subfields might not exist, so accessing the columns would fail. - -- but accessing non-existent fields in a JSON object simply gives us NULL. - JSON_SET( - metrics, - '$.labeled_counter', - mozfun.json.from_nested_map(metrics.labeled_counter), - '$.labeled_string', - mozfun.json.from_nested_map(metrics.labeled_string), - '$.labeled_boolean', - mozfun.json.from_nested_map(metrics.labeled_boolean), - '$.url', - metrics.url2 - ), - '$.url2' - ), - remove_empty => TRUE - ) AS metrics - ) + base.* EXCEPT (events), + COALESCE( + SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), + SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) + ) AS event_timestamp, + event.category AS event_category, + event.name AS event_name, + ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better + from_map_event_extra(event.extra) AS event_extra, FROM base +CROSS JOIN + UNNEST(events) AS event diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/bedrock_derived/event_monitoring_live_v1/materialized_view.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/bedrock_derived/event_monitoring_live_v1/materialized_view.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/bedrock_derived/event_monitoring_live_v1/materialized_view.sql 2024-05-27 14:55:32.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/bedrock_derived/event_monitoring_live_v1/materialized_view.sql 2024-05-27 14:57:24.000000000 +0000 @@ -50,7 +50,7 @@ client_info.app_display_version AS version, ping_info FROM - `moz-fx-data-shared-prod.bedrock_live.events_v1` + `moz-fx-data-shared-prod.bedrock_live.non_interaction_v1` UNION ALL SELECT submission_timestamp, @@ -70,7 +70,7 @@ client_info.app_display_version AS version, ping_info FROM - `moz-fx-data-shared-prod.bedrock_live.non_interaction_v1` + `moz-fx-data-shared-prod.bedrock_live.events_v1` ) CROSS JOIN UNNEST(events) AS event, @@ -80,7 +80,7 @@ LEFT JOIN UNNEST(event.extra) AS event_extra WHERE - DATE(submission_timestamp) >= "2024-05-24" + DATE(submission_timestamp) >= "2024-05-27" GROUP BY submission_date, window_start, diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/bedrock_derived/events_stream_v1/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/bedrock_derived/events_stream_v1/query.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/bedrock_derived/events_stream_v1/query.sql 2024-05-27 14:55:32.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/bedrock_derived/events_stream_v1/query.sql 2024-05-27 14:57:27.000000000 +0000 @@ -1,7 +1,73 @@ -- Generated via bigquery_etl.glean_usage +-- convert array of key value pairs to a json object, cast numbers and booleans if possible +CREATE TEMP FUNCTION from_map_event_extra(input ARRAY>) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT( + ARRAY(SELECT key FROM UNNEST(input)), + ARRAY( + SELECT + CASE + WHEN SAFE_CAST(value AS NUMERIC) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS NUMERIC)) + WHEN SAFE_CAST(value AS BOOL) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS BOOL)) + ELSE TO_JSON(value) + END + FROM + UNNEST(input) + ) + ) + ) +); + +-- convert array of key value pairs to a json object +-- values are nested structs and will be converted to json objects +CREATE TEMP FUNCTION from_map_experiment( + input ARRAY< + STRUCT>> + > +) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT(ARRAY(SELECT key FROM UNNEST(input)), ARRAY(SELECT value FROM UNNEST(input))) + ) +); + +CREATE TEMP FUNCTION metrics_to_json(metrics JSON) +RETURNS JSON AS ( + JSON_STRIP_NULLS( + JSON_REMOVE( + -- labeled_* are the only ones that SHOULD show up as context for events pings, + -- thus we special-case them + -- + -- The JSON_SET/JSON_EXTRACT shenanigans are needed + -- because those subfields might not exist, so accessing the columns would fail. + -- but accessing non-existent fields in a JSON object simply gives us NULL. + JSON_SET( + metrics, + '$.labeled_counter', + mozfun.json.from_nested_map(metrics.labeled_counter), + '$.labeled_string', + mozfun.json.from_nested_map(metrics.labeled_string), + '$.labeled_boolean', + mozfun.json.from_nested_map(metrics.labeled_boolean), + '$.url', + metrics.url2 + ), + '$.url2' + ), + remove_empty => TRUE + ) +); + WITH base AS ( SELECT - * EXCEPT (metrics, events, name, category, extra, timestamp) REPLACE( + * REPLACE ( STRUCT( client_info.app_build AS app_build, client_info.app_channel AS app_channel, @@ -25,24 +91,14 @@ ping_info.end_time, ping_info.parsed_end_time, ping_info.ping_type - ) AS ping_info + ) AS ping_info, + metrics_to_json(TO_JSON(metrics)) AS metrics ), client_info.client_id AS client_id, ping_info.reason AS reason, - `mozfun.json.from_map`(ping_info.experiments) AS experiments, - COALESCE( - SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), - SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) - ) AS event_timestamp, - event.category AS event_category, - event.name AS event_name, - ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better - `mozfun.json.from_map`(event.extra) AS event_extra, - TO_JSON(metrics) AS metrics + from_map_experiment(ping_info.experiments) AS experiments, FROM - `bedrock.events` AS e - CROSS JOIN - UNNEST(events) AS event + `bedrock.events` WHERE {% if is_init() %} DATE(submission_timestamp) >= '2023-11-01' @@ -52,34 +108,16 @@ ) -- SELECT - * REPLACE ( - -- expose as easy to access JSON column, - -- strip nulls, - -- translate nested array records into a JSON object, - -- rename url2 -> url - JSON_STRIP_NULLS( - JSON_REMOVE( - -- labeled_* are the only ones that SHOULD show up as context for events pings, - -- thus we special-case them - -- - -- The JSON_SET/JSON_EXTRACT shenanigans are needed - -- because those subfields might not exist, so accessing the columns would fail. - -- but accessing non-existent fields in a JSON object simply gives us NULL. - JSON_SET( - metrics, - '$.labeled_counter', - mozfun.json.from_nested_map(metrics.labeled_counter), - '$.labeled_string', - mozfun.json.from_nested_map(metrics.labeled_string), - '$.labeled_boolean', - mozfun.json.from_nested_map(metrics.labeled_boolean), - '$.url', - metrics.url2 - ), - '$.url2' - ), - remove_empty => TRUE - ) AS metrics - ) + base.* EXCEPT (events), + COALESCE( + SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), + SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) + ) AS event_timestamp, + event.category AS event_category, + event.name AS event_name, + ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better + from_map_event_extra(event.extra) AS event_extra, FROM base +CROSS JOIN + UNNEST(events) AS event diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/burnham_derived/events_stream_v1/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/burnham_derived/events_stream_v1/query.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/burnham_derived/events_stream_v1/query.sql 2024-05-27 14:55:32.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/burnham_derived/events_stream_v1/query.sql 2024-05-27 14:57:27.000000000 +0000 @@ -1,7 +1,73 @@ -- Generated via bigquery_etl.glean_usage +-- convert array of key value pairs to a json object, cast numbers and booleans if possible +CREATE TEMP FUNCTION from_map_event_extra(input ARRAY>) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT( + ARRAY(SELECT key FROM UNNEST(input)), + ARRAY( + SELECT + CASE + WHEN SAFE_CAST(value AS NUMERIC) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS NUMERIC)) + WHEN SAFE_CAST(value AS BOOL) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS BOOL)) + ELSE TO_JSON(value) + END + FROM + UNNEST(input) + ) + ) + ) +); + +-- convert array of key value pairs to a json object +-- values are nested structs and will be converted to json objects +CREATE TEMP FUNCTION from_map_experiment( + input ARRAY< + STRUCT>> + > +) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT(ARRAY(SELECT key FROM UNNEST(input)), ARRAY(SELECT value FROM UNNEST(input))) + ) +); + +CREATE TEMP FUNCTION metrics_to_json(metrics JSON) +RETURNS JSON AS ( + JSON_STRIP_NULLS( + JSON_REMOVE( + -- labeled_* are the only ones that SHOULD show up as context for events pings, + -- thus we special-case them + -- + -- The JSON_SET/JSON_EXTRACT shenanigans are needed + -- because those subfields might not exist, so accessing the columns would fail. + -- but accessing non-existent fields in a JSON object simply gives us NULL. + JSON_SET( + metrics, + '$.labeled_counter', + mozfun.json.from_nested_map(metrics.labeled_counter), + '$.labeled_string', + mozfun.json.from_nested_map(metrics.labeled_string), + '$.labeled_boolean', + mozfun.json.from_nested_map(metrics.labeled_boolean), + '$.url', + metrics.url2 + ), + '$.url2' + ), + remove_empty => TRUE + ) +); + WITH base AS ( SELECT - * EXCEPT (metrics, events, name, category, extra, timestamp) REPLACE( + * REPLACE ( STRUCT( client_info.app_build AS app_build, client_info.app_channel AS app_channel, @@ -25,24 +91,14 @@ ping_info.end_time, ping_info.parsed_end_time, ping_info.ping_type - ) AS ping_info + ) AS ping_info, + metrics_to_json(TO_JSON(metrics)) AS metrics ), client_info.client_id AS client_id, ping_info.reason AS reason, - `mozfun.json.from_map`(ping_info.experiments) AS experiments, - COALESCE( - SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), - SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) - ) AS event_timestamp, - event.category AS event_category, - event.name AS event_name, - ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better - `mozfun.json.from_map`(event.extra) AS event_extra, - TO_JSON(metrics) AS metrics + from_map_experiment(ping_info.experiments) AS experiments, FROM - `burnham.events` AS e - CROSS JOIN - UNNEST(events) AS event + `burnham.events` WHERE {% if is_init() %} DATE(submission_timestamp) >= '2023-11-01' @@ -52,34 +108,16 @@ ) -- SELECT - * REPLACE ( - -- expose as easy to access JSON column, - -- strip nulls, - -- translate nested array records into a JSON object, - -- rename url2 -> url - JSON_STRIP_NULLS( - JSON_REMOVE( - -- labeled_* are the only ones that SHOULD show up as context for events pings, - -- thus we special-case them - -- - -- The JSON_SET/JSON_EXTRACT shenanigans are needed - -- because those subfields might not exist, so accessing the columns would fail. - -- but accessing non-existent fields in a JSON object simply gives us NULL. - JSON_SET( - metrics, - '$.labeled_counter', - mozfun.json.from_nested_map(metrics.labeled_counter), - '$.labeled_string', - mozfun.json.from_nested_map(metrics.labeled_string), - '$.labeled_boolean', - mozfun.json.from_nested_map(metrics.labeled_boolean), - '$.url', - metrics.url2 - ), - '$.url2' - ), - remove_empty => TRUE - ) AS metrics - ) + base.* EXCEPT (events), + COALESCE( + SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), + SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) + ) AS event_timestamp, + event.category AS event_category, + event.name AS event_name, + ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better + from_map_event_extra(event.extra) AS event_extra, FROM base +CROSS JOIN + UNNEST(events) AS event diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/contextual_services/event_aggregates/schema.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/contextual_services/event_aggregates/schema.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/contextual_services/event_aggregates/schema.yaml 2024-05-27 14:54:51.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/contextual_services/event_aggregates/schema.yaml 2024-05-27 15:03:47.000000000 +0000 @@ -1,49 +1,49 @@ fields: -- mode: NULLABLE - name: submission_date +- name: submission_date type: DATE -- mode: NULLABLE - name: source + mode: NULLABLE +- name: source type: STRING -- mode: NULLABLE - name: event_type + mode: NULLABLE +- name: event_type type: STRING -- mode: NULLABLE - name: form_factor + mode: NULLABLE +- name: form_factor type: STRING -- mode: NULLABLE - name: country + mode: NULLABLE +- name: country type: STRING -- mode: NULLABLE - name: subdivision1 + mode: NULLABLE +- name: subdivision1 type: STRING -- mode: NULLABLE - name: advertiser + mode: NULLABLE +- name: advertiser type: STRING -- mode: NULLABLE - name: release_channel + mode: NULLABLE +- name: release_channel type: STRING -- mode: NULLABLE - name: position + mode: NULLABLE +- name: position type: INTEGER -- mode: NULLABLE - name: provider + mode: NULLABLE +- name: provider type: STRING -- mode: NULLABLE - name: match_type + mode: NULLABLE +- name: match_type type: STRING -- mode: NULLABLE - name: normalized_os + mode: NULLABLE +- name: normalized_os type: STRING -- mode: NULLABLE - name: suggest_data_sharing_enabled + mode: NULLABLE +- name: suggest_data_sharing_enabled type: BOOLEAN -- mode: NULLABLE - name: event_count + mode: NULLABLE +- name: event_count type: INTEGER -- mode: NULLABLE - name: user_count + mode: NULLABLE +- name: user_count type: INTEGER -- mode: NULLABLE - name: query_type + mode: NULLABLE +- name: query_type type: STRING + mode: NULLABLE diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/contextual_services/event_aggregates_suggest/schema.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/contextual_services/event_aggregates_suggest/schema.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/contextual_services/event_aggregates_suggest/schema.yaml 2024-05-27 14:54:51.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/contextual_services/event_aggregates_suggest/schema.yaml 2024-05-27 15:03:47.000000000 +0000 @@ -1,40 +1,40 @@ fields: -- mode: NULLABLE - name: submission_date +- name: submission_date type: DATE -- mode: NULLABLE - name: form_factor + mode: NULLABLE +- name: form_factor type: STRING -- mode: NULLABLE - name: country + mode: NULLABLE +- name: country type: STRING -- mode: NULLABLE - name: advertiser + mode: NULLABLE +- name: advertiser type: STRING -- mode: NULLABLE - name: normalized_os + mode: NULLABLE +- name: normalized_os type: STRING -- mode: NULLABLE - name: release_channel + mode: NULLABLE +- name: release_channel type: STRING -- mode: NULLABLE - name: position + mode: NULLABLE +- name: position type: INTEGER -- mode: NULLABLE - name: provider + mode: NULLABLE +- name: provider type: STRING -- mode: NULLABLE - name: match_type + mode: NULLABLE +- name: match_type type: STRING -- mode: NULLABLE - name: suggest_data_sharing_enabled + mode: NULLABLE +- name: suggest_data_sharing_enabled type: BOOLEAN -- mode: NULLABLE - name: impression_count + mode: NULLABLE +- name: impression_count type: INTEGER -- mode: NULLABLE - name: click_count + mode: NULLABLE +- name: click_count type: INTEGER -- mode: NULLABLE - name: query_type + mode: NULLABLE +- name: query_type type: STRING + mode: NULLABLE diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/debug_ping_view_derived/event_monitoring_live_v1/materialized_view.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/debug_ping_view_derived/event_monitoring_live_v1/materialized_view.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/debug_ping_view_derived/event_monitoring_live_v1/materialized_view.sql 2024-05-27 14:55:26.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/debug_ping_view_derived/event_monitoring_live_v1/materialized_view.sql 2024-05-27 14:57:25.000000000 +0000 @@ -60,7 +60,7 @@ LEFT JOIN UNNEST(event.extra) AS event_extra WHERE - DATE(submission_timestamp) >= "2024-05-24" + DATE(submission_timestamp) >= "2024-05-27" GROUP BY submission_date, window_start, diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/debug_ping_view_derived/events_stream_v1/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/debug_ping_view_derived/events_stream_v1/query.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/debug_ping_view_derived/events_stream_v1/query.sql 2024-05-27 14:55:26.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/debug_ping_view_derived/events_stream_v1/query.sql 2024-05-27 14:57:28.000000000 +0000 @@ -1,7 +1,73 @@ -- Generated via bigquery_etl.glean_usage +-- convert array of key value pairs to a json object, cast numbers and booleans if possible +CREATE TEMP FUNCTION from_map_event_extra(input ARRAY>) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT( + ARRAY(SELECT key FROM UNNEST(input)), + ARRAY( + SELECT + CASE + WHEN SAFE_CAST(value AS NUMERIC) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS NUMERIC)) + WHEN SAFE_CAST(value AS BOOL) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS BOOL)) + ELSE TO_JSON(value) + END + FROM + UNNEST(input) + ) + ) + ) +); + +-- convert array of key value pairs to a json object +-- values are nested structs and will be converted to json objects +CREATE TEMP FUNCTION from_map_experiment( + input ARRAY< + STRUCT>> + > +) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT(ARRAY(SELECT key FROM UNNEST(input)), ARRAY(SELECT value FROM UNNEST(input))) + ) +); + +CREATE TEMP FUNCTION metrics_to_json(metrics JSON) +RETURNS JSON AS ( + JSON_STRIP_NULLS( + JSON_REMOVE( + -- labeled_* are the only ones that SHOULD show up as context for events pings, + -- thus we special-case them + -- + -- The JSON_SET/JSON_EXTRACT shenanigans are needed + -- because those subfields might not exist, so accessing the columns would fail. + -- but accessing non-existent fields in a JSON object simply gives us NULL. + JSON_SET( + metrics, + '$.labeled_counter', + mozfun.json.from_nested_map(metrics.labeled_counter), + '$.labeled_string', + mozfun.json.from_nested_map(metrics.labeled_string), + '$.labeled_boolean', + mozfun.json.from_nested_map(metrics.labeled_boolean), + '$.url', + metrics.url2 + ), + '$.url2' + ), + remove_empty => TRUE + ) +); + WITH base AS ( SELECT - * EXCEPT (metrics, events, name, category, extra, timestamp) REPLACE( + * REPLACE ( STRUCT( client_info.app_build AS app_build, client_info.app_channel AS app_channel, @@ -25,24 +91,14 @@ ping_info.end_time, ping_info.parsed_end_time, ping_info.ping_type - ) AS ping_info + ) AS ping_info, + metrics_to_json(TO_JSON(metrics)) AS metrics ), client_info.client_id AS client_id, ping_info.reason AS reason, - `mozfun.json.from_map`(ping_info.experiments) AS experiments, - COALESCE( - SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), - SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) - ) AS event_timestamp, - event.category AS event_category, - event.name AS event_name, - ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better - `mozfun.json.from_map`(event.extra) AS event_extra, - TO_JSON(metrics) AS metrics + from_map_experiment(ping_info.experiments) AS experiments, FROM - `debug_ping_view.events` AS e - CROSS JOIN - UNNEST(events) AS event + `debug_ping_view.events` WHERE {% if is_init() %} DATE(submission_timestamp) >= '2023-11-01' @@ -52,34 +108,16 @@ ) -- SELECT - * REPLACE ( - -- expose as easy to access JSON column, - -- strip nulls, - -- translate nested array records into a JSON object, - -- rename url2 -> url - JSON_STRIP_NULLS( - JSON_REMOVE( - -- labeled_* are the only ones that SHOULD show up as context for events pings, - -- thus we special-case them - -- - -- The JSON_SET/JSON_EXTRACT shenanigans are needed - -- because those subfields might not exist, so accessing the columns would fail. - -- but accessing non-existent fields in a JSON object simply gives us NULL. - JSON_SET( - metrics, - '$.labeled_counter', - mozfun.json.from_nested_map(metrics.labeled_counter), - '$.labeled_string', - mozfun.json.from_nested_map(metrics.labeled_string), - '$.labeled_boolean', - mozfun.json.from_nested_map(metrics.labeled_boolean), - '$.url', - metrics.url2 - ), - '$.url2' - ), - remove_empty => TRUE - ) AS metrics - ) + base.* EXCEPT (events), + COALESCE( + SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), + SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) + ) AS event_timestamp, + event.category AS event_category, + event.name AS event_name, + ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better + from_map_event_extra(event.extra) AS event_extra, FROM base +CROSS JOIN + UNNEST(events) AS event diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/fenix/funnel_retention_clients/schema.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/fenix/funnel_retention_clients/schema.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/fenix/funnel_retention_clients/schema.yaml 2024-05-27 14:54:51.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/fenix/funnel_retention_clients/schema.yaml 2024-05-27 15:02:38.000000000 +0000 @@ -26,6 +26,9 @@ - name: adjust_network type: STRING mode: NULLABLE +- name: install_source + type: STRING + mode: NULLABLE - name: retained_week_2 type: BOOLEAN mode: NULLABLE diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/fenix/funnel_retention_week_4/schema.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/fenix/funnel_retention_week_4/schema.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/fenix/funnel_retention_week_4/schema.yaml 2024-05-27 14:54:51.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/fenix/funnel_retention_week_4/schema.yaml 2024-05-27 15:02:38.000000000 +0000 @@ -48,6 +48,10 @@ description: 'The type of source of a client installation. ' +- name: install_source + type: STRING + mode: NULLABLE + description: null - name: new_profiles type: INTEGER mode: NULLABLE diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_background_defaultagent_derived/events_stream_v1/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_background_defaultagent_derived/events_stream_v1/query.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_background_defaultagent_derived/events_stream_v1/query.sql 2024-05-27 14:55:32.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_background_defaultagent_derived/events_stream_v1/query.sql 2024-05-27 14:57:29.000000000 +0000 @@ -1,7 +1,73 @@ -- Generated via bigquery_etl.glean_usage +-- convert array of key value pairs to a json object, cast numbers and booleans if possible +CREATE TEMP FUNCTION from_map_event_extra(input ARRAY>) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT( + ARRAY(SELECT key FROM UNNEST(input)), + ARRAY( + SELECT + CASE + WHEN SAFE_CAST(value AS NUMERIC) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS NUMERIC)) + WHEN SAFE_CAST(value AS BOOL) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS BOOL)) + ELSE TO_JSON(value) + END + FROM + UNNEST(input) + ) + ) + ) +); + +-- convert array of key value pairs to a json object +-- values are nested structs and will be converted to json objects +CREATE TEMP FUNCTION from_map_experiment( + input ARRAY< + STRUCT>> + > +) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT(ARRAY(SELECT key FROM UNNEST(input)), ARRAY(SELECT value FROM UNNEST(input))) + ) +); + +CREATE TEMP FUNCTION metrics_to_json(metrics JSON) +RETURNS JSON AS ( + JSON_STRIP_NULLS( + JSON_REMOVE( + -- labeled_* are the only ones that SHOULD show up as context for events pings, + -- thus we special-case them + -- + -- The JSON_SET/JSON_EXTRACT shenanigans are needed + -- because those subfields might not exist, so accessing the columns would fail. + -- but accessing non-existent fields in a JSON object simply gives us NULL. + JSON_SET( + metrics, + '$.labeled_counter', + mozfun.json.from_nested_map(metrics.labeled_counter), + '$.labeled_string', + mozfun.json.from_nested_map(metrics.labeled_string), + '$.labeled_boolean', + mozfun.json.from_nested_map(metrics.labeled_boolean), + '$.url', + metrics.url2 + ), + '$.url2' + ), + remove_empty => TRUE + ) +); + WITH base AS ( SELECT - * EXCEPT (metrics, events, name, category, extra, timestamp) REPLACE( + * REPLACE ( STRUCT( client_info.app_build AS app_build, client_info.app_channel AS app_channel, @@ -25,24 +91,14 @@ ping_info.end_time, ping_info.parsed_end_time, ping_info.ping_type - ) AS ping_info + ) AS ping_info, + metrics_to_json(TO_JSON(metrics)) AS metrics ), client_info.client_id AS client_id, ping_info.reason AS reason, - `mozfun.json.from_map`(ping_info.experiments) AS experiments, - COALESCE( - SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), - SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) - ) AS event_timestamp, - event.category AS event_category, - event.name AS event_name, - ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better - `mozfun.json.from_map`(event.extra) AS event_extra, - TO_JSON(metrics) AS metrics + from_map_experiment(ping_info.experiments) AS experiments, FROM - `firefox_desktop_background_defaultagent.events` AS e - CROSS JOIN - UNNEST(events) AS event + `firefox_desktop_background_defaultagent.events` WHERE {% if is_init() %} DATE(submission_timestamp) >= '2023-11-01' @@ -52,34 +108,16 @@ ) -- SELECT - * REPLACE ( - -- expose as easy to access JSON column, - -- strip nulls, - -- translate nested array records into a JSON object, - -- rename url2 -> url - JSON_STRIP_NULLS( - JSON_REMOVE( - -- labeled_* are the only ones that SHOULD show up as context for events pings, - -- thus we special-case them - -- - -- The JSON_SET/JSON_EXTRACT shenanigans are needed - -- because those subfields might not exist, so accessing the columns would fail. - -- but accessing non-existent fields in a JSON object simply gives us NULL. - JSON_SET( - metrics, - '$.labeled_counter', - mozfun.json.from_nested_map(metrics.labeled_counter), - '$.labeled_string', - mozfun.json.from_nested_map(metrics.labeled_string), - '$.labeled_boolean', - mozfun.json.from_nested_map(metrics.labeled_boolean), - '$.url', - metrics.url2 - ), - '$.url2' - ), - remove_empty => TRUE - ) AS metrics - ) + base.* EXCEPT (events), + COALESCE( + SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), + SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) + ) AS event_timestamp, + event.category AS event_category, + event.name AS event_name, + ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better + from_map_event_extra(event.extra) AS event_extra, FROM base +CROSS JOIN + UNNEST(events) AS event diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_background_tasks_derived/event_monitoring_live_v1/materialized_view.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_background_tasks_derived/event_monitoring_live_v1/materialized_view.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_background_tasks_derived/event_monitoring_live_v1/materialized_view.sql 2024-05-27 14:55:32.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_background_tasks_derived/event_monitoring_live_v1/materialized_view.sql 2024-05-27 14:57:25.000000000 +0000 @@ -50,7 +50,7 @@ client_info.app_display_version AS version, ping_info FROM - `moz-fx-data-shared-prod.firefox_desktop_background_tasks_live.events_v1` + `moz-fx-data-shared-prod.firefox_desktop_background_tasks_live.background_tasks_v1` UNION ALL SELECT submission_timestamp, @@ -60,7 +60,7 @@ client_info.app_display_version AS version, ping_info FROM - `moz-fx-data-shared-prod.firefox_desktop_background_tasks_live.background_tasks_v1` + `moz-fx-data-shared-prod.firefox_desktop_background_tasks_live.events_v1` ) CROSS JOIN UNNEST(events) AS event, @@ -70,7 +70,7 @@ LEFT JOIN UNNEST(event.extra) AS event_extra WHERE - DATE(submission_timestamp) >= "2024-05-24" + DATE(submission_timestamp) >= "2024-05-27" GROUP BY submission_date, window_start, diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_background_tasks_derived/events_stream_v1/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_background_tasks_derived/events_stream_v1/query.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_background_tasks_derived/events_stream_v1/query.sql 2024-05-27 14:55:32.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_background_tasks_derived/events_stream_v1/query.sql 2024-05-27 14:57:30.000000000 +0000 @@ -1,7 +1,73 @@ -- Generated via bigquery_etl.glean_usage +-- convert array of key value pairs to a json object, cast numbers and booleans if possible +CREATE TEMP FUNCTION from_map_event_extra(input ARRAY>) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT( + ARRAY(SELECT key FROM UNNEST(input)), + ARRAY( + SELECT + CASE + WHEN SAFE_CAST(value AS NUMERIC) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS NUMERIC)) + WHEN SAFE_CAST(value AS BOOL) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS BOOL)) + ELSE TO_JSON(value) + END + FROM + UNNEST(input) + ) + ) + ) +); + +-- convert array of key value pairs to a json object +-- values are nested structs and will be converted to json objects +CREATE TEMP FUNCTION from_map_experiment( + input ARRAY< + STRUCT>> + > +) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT(ARRAY(SELECT key FROM UNNEST(input)), ARRAY(SELECT value FROM UNNEST(input))) + ) +); + +CREATE TEMP FUNCTION metrics_to_json(metrics JSON) +RETURNS JSON AS ( + JSON_STRIP_NULLS( + JSON_REMOVE( + -- labeled_* are the only ones that SHOULD show up as context for events pings, + -- thus we special-case them + -- + -- The JSON_SET/JSON_EXTRACT shenanigans are needed + -- because those subfields might not exist, so accessing the columns would fail. + -- but accessing non-existent fields in a JSON object simply gives us NULL. + JSON_SET( + metrics, + '$.labeled_counter', + mozfun.json.from_nested_map(metrics.labeled_counter), + '$.labeled_string', + mozfun.json.from_nested_map(metrics.labeled_string), + '$.labeled_boolean', + mozfun.json.from_nested_map(metrics.labeled_boolean), + '$.url', + metrics.url2 + ), + '$.url2' + ), + remove_empty => TRUE + ) +); + WITH base AS ( SELECT - * EXCEPT (metrics, events, name, category, extra, timestamp) REPLACE( + * REPLACE ( STRUCT( client_info.app_build AS app_build, client_info.app_channel AS app_channel, @@ -25,24 +91,14 @@ ping_info.end_time, ping_info.parsed_end_time, ping_info.ping_type - ) AS ping_info + ) AS ping_info, + metrics_to_json(TO_JSON(metrics)) AS metrics ), client_info.client_id AS client_id, ping_info.reason AS reason, - `mozfun.json.from_map`(ping_info.experiments) AS experiments, - COALESCE( - SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), - SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) - ) AS event_timestamp, - event.category AS event_category, - event.name AS event_name, - ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better - `mozfun.json.from_map`(event.extra) AS event_extra, - TO_JSON(metrics) AS metrics + from_map_experiment(ping_info.experiments) AS experiments, FROM - `firefox_desktop_background_tasks.events` AS e - CROSS JOIN - UNNEST(events) AS event + `firefox_desktop_background_tasks.events` WHERE {% if is_init() %} DATE(submission_timestamp) >= '2023-11-01' @@ -52,34 +108,16 @@ ) -- SELECT - * REPLACE ( - -- expose as easy to access JSON column, - -- strip nulls, - -- translate nested array records into a JSON object, - -- rename url2 -> url - JSON_STRIP_NULLS( - JSON_REMOVE( - -- labeled_* are the only ones that SHOULD show up as context for events pings, - -- thus we special-case them - -- - -- The JSON_SET/JSON_EXTRACT shenanigans are needed - -- because those subfields might not exist, so accessing the columns would fail. - -- but accessing non-existent fields in a JSON object simply gives us NULL. - JSON_SET( - metrics, - '$.labeled_counter', - mozfun.json.from_nested_map(metrics.labeled_counter), - '$.labeled_string', - mozfun.json.from_nested_map(metrics.labeled_string), - '$.labeled_boolean', - mozfun.json.from_nested_map(metrics.labeled_boolean), - '$.url', - metrics.url2 - ), - '$.url2' - ), - remove_empty => TRUE - ) AS metrics - ) + base.* EXCEPT (events), + COALESCE( + SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), + SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) + ) AS event_timestamp, + event.category AS event_category, + event.name AS event_name, + ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better + from_map_event_extra(event.extra) AS event_extra, FROM base +CROSS JOIN + UNNEST(events) AS event diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_background_update_derived/events_stream_v1/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_background_update_derived/events_stream_v1/query.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_background_update_derived/events_stream_v1/query.sql 2024-05-27 14:55:26.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_background_update_derived/events_stream_v1/query.sql 2024-05-27 14:57:30.000000000 +0000 @@ -1,7 +1,73 @@ -- Generated via bigquery_etl.glean_usage +-- convert array of key value pairs to a json object, cast numbers and booleans if possible +CREATE TEMP FUNCTION from_map_event_extra(input ARRAY>) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT( + ARRAY(SELECT key FROM UNNEST(input)), + ARRAY( + SELECT + CASE + WHEN SAFE_CAST(value AS NUMERIC) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS NUMERIC)) + WHEN SAFE_CAST(value AS BOOL) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS BOOL)) + ELSE TO_JSON(value) + END + FROM + UNNEST(input) + ) + ) + ) +); + +-- convert array of key value pairs to a json object +-- values are nested structs and will be converted to json objects +CREATE TEMP FUNCTION from_map_experiment( + input ARRAY< + STRUCT>> + > +) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT(ARRAY(SELECT key FROM UNNEST(input)), ARRAY(SELECT value FROM UNNEST(input))) + ) +); + +CREATE TEMP FUNCTION metrics_to_json(metrics JSON) +RETURNS JSON AS ( + JSON_STRIP_NULLS( + JSON_REMOVE( + -- labeled_* are the only ones that SHOULD show up as context for events pings, + -- thus we special-case them + -- + -- The JSON_SET/JSON_EXTRACT shenanigans are needed + -- because those subfields might not exist, so accessing the columns would fail. + -- but accessing non-existent fields in a JSON object simply gives us NULL. + JSON_SET( + metrics, + '$.labeled_counter', + mozfun.json.from_nested_map(metrics.labeled_counter), + '$.labeled_string', + mozfun.json.from_nested_map(metrics.labeled_string), + '$.labeled_boolean', + mozfun.json.from_nested_map(metrics.labeled_boolean), + '$.url', + metrics.url2 + ), + '$.url2' + ), + remove_empty => TRUE + ) +); + WITH base AS ( SELECT - * EXCEPT (metrics, events, name, category, extra, timestamp) REPLACE( + * REPLACE ( STRUCT( client_info.app_build AS app_build, client_info.app_channel AS app_channel, @@ -25,24 +91,14 @@ ping_info.end_time, ping_info.parsed_end_time, ping_info.ping_type - ) AS ping_info + ) AS ping_info, + metrics_to_json(TO_JSON(metrics)) AS metrics ), client_info.client_id AS client_id, ping_info.reason AS reason, - `mozfun.json.from_map`(ping_info.experiments) AS experiments, - COALESCE( - SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), - SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) - ) AS event_timestamp, - event.category AS event_category, - event.name AS event_name, - ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better - `mozfun.json.from_map`(event.extra) AS event_extra, - TO_JSON(metrics) AS metrics + from_map_experiment(ping_info.experiments) AS experiments, FROM - `firefox_desktop_background_update.events` AS e - CROSS JOIN - UNNEST(events) AS event + `firefox_desktop_background_update.events` WHERE {% if is_init() %} DATE(submission_timestamp) >= '2023-11-01' @@ -52,34 +108,16 @@ ) -- SELECT - * REPLACE ( - -- expose as easy to access JSON column, - -- strip nulls, - -- translate nested array records into a JSON object, - -- rename url2 -> url - JSON_STRIP_NULLS( - JSON_REMOVE( - -- labeled_* are the only ones that SHOULD show up as context for events pings, - -- thus we special-case them - -- - -- The JSON_SET/JSON_EXTRACT shenanigans are needed - -- because those subfields might not exist, so accessing the columns would fail. - -- but accessing non-existent fields in a JSON object simply gives us NULL. - JSON_SET( - metrics, - '$.labeled_counter', - mozfun.json.from_nested_map(metrics.labeled_counter), - '$.labeled_string', - mozfun.json.from_nested_map(metrics.labeled_string), - '$.labeled_boolean', - mozfun.json.from_nested_map(metrics.labeled_boolean), - '$.url', - metrics.url2 - ), - '$.url2' - ), - remove_empty => TRUE - ) AS metrics - ) + base.* EXCEPT (events), + COALESCE( + SAFE.TIMESTAMP_MILLIS(SAFE_CAST(mozfun.map.get_key(event.extra, 'glean_timestamp') AS INT64)), + SAFE.TIMESTAMP_ADD(ping_info.parsed_start_time, INTERVAL event.timestamp MILLISECOND) + ) AS event_timestamp, + event.category AS event_category, + event.name AS event_name, + ARRAY_TO_STRING([event.category, event.name], '.') AS event, -- handles NULL values better + from_map_event_extra(event.extra) AS event_extra, FROM base +CROSS JOIN + UNNEST(events) AS event diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_derived/event_monitoring_live_v1/materialized_view.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_derived/event_monitoring_live_v1/materialized_view.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_derived/event_monitoring_live_v1/materialized_view.sql 2024-05-27 14:55:26.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_derived/event_monitoring_live_v1/materialized_view.sql 2024-05-27 14:57:26.000000000 +0000 @@ -50,7 +50,7 @@ client_info.app_display_version AS version, ping_info FROM - `moz-fx-data-shared-prod.firefox_desktop_live.urlbar_potential_exposure_v1` + `moz-fx-data-shared-prod.firefox_desktop_live.newtab_v1` UNION ALL SELECT submission_timestamp, @@ -80,7 +80,7 @@ client_info.app_display_version AS version, ping_info FROM - `moz-fx-data-shared-prod.firefox_desktop_live.newtab_v1` + `moz-fx-data-shared-prod.firefox_desktop_live.urlbar_potential_exposure_v1` ) CROSS JOIN UNNEST(events) AS event, @@ -90,7 +90,7 @@ LEFT JOIN UNNEST(event.extra) AS event_extra WHERE - DATE(submission_timestamp) >= "2024-05-24" + DATE(submission_timestamp) >= "2024-05-27" GROUP BY submission_date, window_start, diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_derived/events_stream_v1/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_derived/events_stream_v1/query.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_derived/events_stream_v1/query.sql 2024-05-27 14:55:26.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_derived/events_stream_v1/query.sql 2024-05-27 14:57:31.000000000 +0000 @@ -1,7 +1,73 @@ -- Generated via bigquery_etl.glean_usage +-- convert array of key value pairs to a json object, cast numbers and booleans if possible +CREATE TEMP FUNCTION from_map_event_extra(input ARRAY>) +RETURNS json AS ( + IF( + ARRAY_LENGTH(input) = 0, + NULL, + JSON_OBJECT( + ARRAY(SELECT key FROM UNNEST(input)), + ARRAY( + SELECT + CASE + WHEN SAFE_CAST(value AS NUMERIC) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS NUMERIC)) + WHEN SAFE_CAST(value AS BOOL) IS NOT NULL + THEN TO_JSON(SAFE_CAST(value AS BOOL)) + ELSE TO_JSON(value) + END + FROM + UNNEST(input) + ) + ) + ) +); + +-- convert array of key value pairs to a json object +-- values are nested structs and ```

⚠️ Only part of the diff is displayed.

Link to full diff