mozilla / bigquery-etl

Bigquery ETL
https://mozilla.github.io/bigquery-etl
Mozilla Public License 2.0
253 stars 100 forks source link

copy code for metadata, query, schema from marketing-prod to shared-prod #6020

Closed Marlene-M-Hirose closed 2 months ago

Marlene-M-Hirose commented 2 months ago

Checklist for reviewer:

For modifications to schemas in restricted namespaces (see CODEOWNERS):

┆Issue is synchronized with this Jira Task

dataops-ci-bot commented 2 months ago

Integration report for "copy code for metadata, query, schema from marketing-prod to shared-prod"

sql.diff

Click to expand! ```diff diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/dags/bqetl_google_analytics_derived_ga4.py /tmp/workspace/generated-sql/dags/bqetl_google_analytics_derived_ga4.py --- /tmp/workspace/main-generated-sql/dags/bqetl_google_analytics_derived_ga4.py 2024-08-06 22:01:10.000000000 +0000 +++ /tmp/workspace/generated-sql/dags/bqetl_google_analytics_derived_ga4.py 2024-08-06 22:02:46.000000000 +0000 @@ -233,6 +233,23 @@ retries=0, ) + checks__warn_mozilla_org_derived__www_site_hits__v2 = bigquery_dq_check( + task_id="checks__warn_mozilla_org_derived__www_site_hits__v2", + source_table="www_site_hits_v2", + dataset_id="mozilla_org_derived", + project_id="moz-fx-data-shared-prod", + is_dq_check_fail=False, + owner="mhirose@mozilla.com", + email=[ + "kwindau@mozilla.com", + "mhirose@mozilla.com", + "telemetry-alerts@mozilla.com", + ], + depends_on_past=False, + parameters=["submission_date:DATE:{{ds}}"], + retries=0, + ) + ga_derived__blogs_daily_summary__v2 = bigquery_etl_query( task_id="ga_derived__blogs_daily_summary__v2", destination_table="blogs_daily_summary_v2", @@ -421,6 +438,21 @@ depends_on_past=False, ) + mozilla_org_derived__www_site_hits__v2 = bigquery_etl_query( + task_id="mozilla_org_derived__www_site_hits__v2", + destination_table="www_site_hits_v2", + dataset_id="mozilla_org_derived", + project_id="moz-fx-data-shared-prod", + owner="mhirose@mozilla.com", + email=[ + "kwindau@mozilla.com", + "mhirose@mozilla.com", + "telemetry-alerts@mozilla.com", + ], + date_partition_parameter="submission_date", + depends_on_past=False, + ) + mozilla_org_derived__www_site_metrics_summary__v2 = bigquery_etl_query( task_id="mozilla_org_derived__www_site_metrics_summary__v2", destination_table="www_site_metrics_summary_v2", @@ -469,6 +501,10 @@ mozilla_org_derived__ga_sessions__v2 ) + checks__warn_mozilla_org_derived__www_site_hits__v2.set_upstream( + mozilla_org_derived__www_site_hits__v2 + ) + ga_derived__blogs_daily_summary__v2.set_upstream(ga_derived__blogs_goals__v2) ga_derived__blogs_daily_summary__v2.set_upstream(ga_derived__blogs_sessions__v2) @@ -531,6 +567,8 @@ mozilla_org_derived__www_site_downloads__v2.set_upstream(wait_for_wmo_events_table) + mozilla_org_derived__www_site_hits__v2.set_upstream(wait_for_wmo_events_table) + mozilla_org_derived__www_site_metrics_summary__v2.set_upstream( wait_for_wmo_events_table ) Only in /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived: www_site_hits_v2 diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/checks.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/checks.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/checks.sql 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/checks.sql 2024-08-06 21:55:06.000000000 +0000 @@ -0,0 +1,2 @@ +#warn +{{ matches_pattern(column="visit_identifier", pattern="^[0-9]+\\.{1}[0-9]+\\-{1}[0-9]+$", where="date = @submission_date", threshold_fail_percentage=0, message="Warn - some visit_identifier not matching expected pattern") }} diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/metadata.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/metadata.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/metadata.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/metadata.yaml 2024-08-06 21:57:08.000000000 +0000 @@ -0,0 +1,31 @@ +friendly_name: WWW Site Hits V2 +description: |- + Normalized events from Google Analytics 4 (GA4) for www.mozilla.org. The events triggered at the same timestamp are given the same hit number. +owners: +- mhirose@mozilla.com +labels: + incremental: true + owner1: mhirose + dag: bqetl_google_analytics_derived_ga4 +scheduling: + dag_name: bqetl_google_analytics_derived_ga4 + depends_on: + - task_id: wait_for_wmo_events_table + dag_name: bqetl_google_analytics_derived_ga4 +bigquery: + time_partitioning: + type: day + field: date + require_partition_filter: true + expiration_days: null + range_partitioning: null + clustering: + fields: + - country + - language + - event_name +workgroup_access: +- role: roles/bigquery.dataViewer + members: + - workgroup:mozilla-confidential +references: {} diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/query.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/query.sql 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/query.sql 2024-08-06 21:55:06.000000000 +0000 @@ -0,0 +1,321 @@ +WITH get_session_start_time AS ( + SELECT + SAFE.PARSE_DATE('%Y%m%d', a.event_date) AS date, + a.user_pseudo_id || '-' || CAST( + ( + SELECT + `value` + FROM + UNNEST(event_params) + WHERE + key = 'ga_session_id' + LIMIT + 1 + ).int_value AS STRING + ) AS visit_identifier, + a.user_pseudo_id AS full_visitor_id, + MIN(event_timestamp) AS visit_start_time + FROM + `moz-fx-data-marketing-prod.analytics_313696158.events_*` a + JOIN + UNNEST(event_params) e + WHERE + e.key = 'ga_session_id' + AND e.value.int_value IS NOT NULL + AND _TABLE_SUFFIX = SAFE.FORMAT_DATE('%Y%m%d', @submission_date) + GROUP BY + date, + visit_identifier, + full_visitor_id +), +get_all_events_in_each_session_staging AS ( + SELECT + SAFE.PARSE_DATE('%Y%m%d', a.event_date) AS date, + a.user_pseudo_id || '-' || CAST( + ( + SELECT + `value` + FROM + UNNEST(event_params) + WHERE + key = 'ga_session_id' + LIMIT + 1 + ).int_value AS string + ) AS visit_identifier, + a.user_pseudo_id AS full_visitor_id, + a.event_name, + a.event_timestamp, + device.category AS device_category, + device.operating_system AS operating_system, + device.language AS language, + device.web_info.browser AS browser, + device.web_info.browser_version AS browser_version, + geo.country AS country, + traffic_source.name AS traffic_source_name, + traffic_source.medium AS traffic_source_medium, + traffic_source.source AS traffic_source_source, + collected_traffic_source.manual_campaign_id AS manual_campaign_id, + collected_traffic_source.manual_term AS manual_term, + collected_traffic_source.manual_source AS source, + collected_traffic_source.manual_medium AS medium, + collected_traffic_source.manual_campaign_name AS campaign, + collected_traffic_source.manual_content AS ad_content, + CASE + WHEN event_name = 'product_download' + THEN (SELECT `value` FROM UNNEST(event_params) WHERE key = 'product' LIMIT 1).string_value + ELSE NULL + END AS product_type, + CASE + WHEN event_name = 'product_download' + THEN (SELECT `value` FROM UNNEST(event_params) WHERE key = 'platform' LIMIT 1).string_value + ELSE NULL + END AS platform_type, + ( + SELECT + `value` + FROM + UNNEST(event_params) + WHERE + key = 'engagement_time_msec' + LIMIT + 1 + ).int_value AS engagement_time_msec, --not sure if this is the same as hit time yet or not + ( + SELECT + `value` + FROM + UNNEST(event_params) + WHERE + key = 'engaged_session_event' + LIMIT + 1 + ).int_value AS engaged_session_event, + ( + SELECT + `value` + FROM + UNNEST(event_params) + WHERE + key = 'page_location' + LIMIT + 1 + ).string_value AS page_location, + COALESCE( + (SELECT `value` FROM UNNEST(event_params) WHERE key = 'entrances' LIMIT 1).int_value, + 0 + ) AS is_entrance + FROM + `moz-fx-data-marketing-prod.analytics_313696158.events_*` a + WHERE + _TABLE_SUFFIX = SAFE.FORMAT_DATE('%Y%m%d', @submission_date) +), +--if there are multiple events at the same timestamp in the same session, assign them all the same hit number +get_all_events_in_each_session AS ( + SELECT + a.date, + a.visit_identifier, + a.full_visitor_id, + a.event_name, + a.event_timestamp, + a.device_category, + a.operating_system, + a.language, + a.browser, + a.browser_version, + a.country, + a.traffic_source_name, + a.traffic_source_medium, + a.traffic_source_source, + a.manual_campaign_id, + a.manual_term, + a.source, + a.medium, + a.campaign, + a.ad_content, + a.product_type, + a.platform_type, + a.engagement_time_msec, + a.engaged_session_event, + SPLIT(a.page_location, '?')[OFFSET(0)] AS page_location, + a.is_entrance, + DENSE_RANK() OVER (PARTITION BY visit_identifier ORDER BY event_timestamp ASC) AS hit_number, + ROW_NUMBER() OVER ( + PARTITION BY + visit_identifier + ORDER BY + event_timestamp, + event_name, + page_location ASC + ) AS row_nbr + FROM + get_all_events_in_each_session_staging a +), +--get the row number associated with the last page view in each session +row_nbr_of_last_page_view_in_each_session AS ( + SELECT + visit_identifier, + COUNT(1) AS nbr_page_view_events, + MAX(row_nbr) AS max_row_number + FROM + get_all_events_in_each_session + WHERE + event_name = 'page_view' + GROUP BY + visit_identifier +), +first_and_last_interaction AS ( + SELECT + visit_identifier, + MAX(CASE WHEN engaged_session_event = 1 THEN 1 ELSE 0 END) AS session_had_an_engaged_event, + MIN(CASE WHEN engaged_session_event = 1 THEN hit_number ELSE NULL END) AS first_interaction, + MAX(CASE WHEN engaged_session_event = 1 THEN hit_number ELSE NULL END) AS last_interaction + FROM + get_all_events_in_each_session + GROUP BY + visit_identifier +), +final_staging AS ( + SELECT + all_sessions.date, + all_sessions.visit_identifier, + all_sessions.full_visitor_id, + all_sessions.visit_start_time, + REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', '') AS page_path, + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(1) + ] AS page_path_level1, + CASE + WHEN all_events.event_name = 'page_view' + THEN 'PAGE' + ELSE 'EVENT' + END AS hit_type, + CAST(CASE WHEN exits.max_row_number = all_events.row_nbr THEN 1 ELSE 0 END AS bool) AS is_exit, + CAST(all_events.is_entrance AS bool) AS is_entrance, + all_events.hit_number, + all_events.event_timestamp AS hit_timestamp, + CAST(NULL AS string) AS event_category, --GA4 has no notion of event_label, unlike GA3 (UA360) + all_events.event_name, + CAST(NULL AS string) AS event_label, --GA4 has no notion of event_label, unlike GA3 (UA360) + CAST(NULL AS string) AS event_action, --GA4 has no notion of event_action, unlike GA3 (UA360) + all_events.device_category, + all_events.operating_system, + all_events.language, + all_events.browser, + all_events.browser_version, + all_events.country, + all_events.traffic_source_name, + all_events.traffic_source_medium, + all_events.traffic_source_source, + all_events.manual_campaign_id, + all_events.manual_term, + all_events.source, + all_events.medium, + all_events.campaign, + all_events.ad_content, + all_events.product_type, + all_events.platform_type, + engmgt.session_had_an_engaged_event AS visits, --this is the equivalent logic to totals.visits in UA + CASE + WHEN exits.nbr_page_view_events = 1 + THEN TRUE + ELSE FALSE + END AS single_page_session, + CASE + WHEN engmgt.session_had_an_engaged_event = 0 + THEN 1 + ELSE 0 + END AS bounces, --if the session did not have an engaged event, then the session is considered a bounce, else it is not + (all_events.event_timestamp - all_sessions.visit_start_time) / 1000000 AS hit_time, + SAFE_DIVIDE(engagement_time_msec, 1000) AS engagement_time, + engmgt.first_interaction, + CAST(engmgt.last_interaction AS float64) AS last_interaction, + all_events.is_entrance AS entrances, + COALESCE(CASE WHEN exits.max_row_number = all_events.row_nbr THEN 1 ELSE 0 END, 0) AS exits, + CAST( + NULL AS string + ) AS event_id, --old table defined this from event category, action, and label, which no longer exist in GA4 + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(2) + ] AS page_level_1, + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(3) + ] AS page_level_2, + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(4) + ] AS page_level_3, + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(5) + ] AS page_level_4, + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(6) + ] AS page_level_5 + FROM + get_session_start_time all_sessions + LEFT OUTER JOIN + get_all_events_in_each_session all_events + ON all_sessions.date = all_events.date + AND all_sessions.visit_identifier = all_events.visit_identifier + AND all_sessions.full_visitor_id = all_events.full_visitor_id + LEFT OUTER JOIN + row_nbr_of_last_page_view_in_each_session exits + ON all_sessions.visit_identifier = exits.visit_identifier + LEFT OUTER JOIN + first_and_last_interaction engmgt + ON all_sessions.visit_identifier = engmgt.visit_identifier +) +SELECT + final.date, + final.visit_identifier, + final.full_visitor_id, + final.visit_start_time, + final.page_path, + final.page_path_level1, + final.hit_type, + final.is_exit, + final.is_entrance, + final.hit_number, + final.hit_timestamp, + final.event_category, + final.event_name, + final.event_label, + final.event_action, + final.device_category, + final.operating_system, + final.language, + final.browser, + final.browser_version, + final.country, + final.traffic_source_name, + final.traffic_source_medium, + final.traffic_source_source, + final.manual_campaign_id, + final.manual_term, + final.source, + final.medium, + final.campaign, + final.ad_content, + final.visits, + final.bounces, + final.hit_time, + final.engagement_time, + final.first_interaction, + final.last_interaction, + final.entrances, + final.exits, + final.event_id, + final.page_level_1, + final.page_level_2, + final.page_level_3, + final.page_level_4, + final.page_level_5, + IF( + page_level_2 IS NULL, + CONCAT('/', page_level_1, '/'), + ARRAY_TO_STRING(['', page_level_1, page_level_2, page_level_3, page_level_4, page_level_5], '/') + ) AS page_name, + final.single_page_session, + final.product_type, + final.platform_type +FROM + final_staging final diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/schema.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/schema.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/schema.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/schema.yaml 2024-08-06 21:55:06.000000000 +0000 @@ -0,0 +1,193 @@ +fields: +- mode: NULLABLE + name: date + type: DATE + description: Date of the visit +- mode: NULLABLE + name: visit_identifier + type: STRING + description: Visit Identifier - Uniquely identifies a visit; concatenation of user_pseudo_id and ga_session_id +- mode: NULLABLE + name: full_visitor_id + type: STRING + description: Full Visitor ID - Uniquely identifies a visitor - this is the same as GA4 user_pseudo_id +- mode: NULLABLE + name: visit_start_time + type: INT64 + description: Visit Start Time - The event timestamp from the first event in the visit +- mode: NULLABLE + name: page_path + type: STRING + description: Page Path +- mode: NULLABLE + name: page_path_level1 + type: STRING + description: Page Path Level 1 +- mode: NULLABLE + name: hit_type + type: STRING + description: Hit Type - PAGE if event_name is a page view, EVENT for all other event types +- mode: NULLABLE + name: is_exit + type: BOOLEAN + description: Is Exit - The specific page view event that is considered the exit to the visit +- mode: NULLABLE + name: is_entrance + type: BOOLEAN + description: Is Entrance - The specific page view event that is considered the entrance to the visit +- mode: NULLABLE + name: hit_number + type: INT64 + description: Hit Number - Densely ranked, since there can be multiple events at the same time for a user +- mode: NULLABLE + name: hit_timestamp + type: INT64 + description: Hit Timestamp - Same as the "Event Timestamp" +- mode: NULLABLE + name: event_category + type: STRING + description: Event Category +- mode: NULLABLE + name: event_name + type: STRING + description: Event Name +- mode: NULLABLE + name: event_label + type: STRING + description: Event Label +- mode: NULLABLE + name: event_action + type: STRING + description: Event Action +- mode: NULLABLE + name: device_category + type: STRING + description: Device Category - The device category the visitor used to visit the site +- mode: NULLABLE + name: operating_system + type: STRING + description: Operating System - The operating system the visitor used to visit the site +- mode: NULLABLE + name: language + type: STRING + description: Language - The language the visiting device was using when it visited the site +- mode: NULLABLE + name: browser + type: STRING + description: Browser - The browser the visiting device was using when it visited the site +- mode: NULLABLE + name: browser_version + type: STRING + description: Browser Version - The version that the visiting device's browser was using when it visited the site +- mode: NULLABLE + name: country + type: STRING + description: Country - The country from which events were reported, based on IP address +- mode: NULLABLE + name: traffic_source_name + type: STRING + description: Traffic Source - Name +- mode: NULLABLE + name: traffic_source_medium + type: STRING + description: Traffic Source - Medium +- mode: NULLABLE + name: traffic_source_source + type: STRING + description: Traffic Source - Source +- mode: NULLABLE + name: manual_campaign_id + type: STRING + description: Manual Campaign ID (comes from collected traffic source) +- mode: NULLABLE + name: manual_term + type: STRING + description: Manual Term (comes from collected traffic source) +- mode: NULLABLE + name: source + type: STRING + description: Source - Referring partner domain (comes from collected traffic source) +- mode: NULLABLE + name: medium + type: STRING + description: Medium - Category of the source, such as 'organic' for a search engine (comes from collected traffic source) +- mode: NULLABLE + name: campaign + type: STRING + description: Campaign - Identifier for the marketing campaign (comes from collected traffic source) +- mode: NULLABLE + name: ad_content + type: STRING + description: Ad Content - Indicates the particular link within a campaign (comes from collected traffic source) +- mode: NULLABLE + name: visits + type: INT64 + description: Visits - A flag that indicates if the hit belongs to a visit that had 1 or more engaged event during the visit +- mode: NULLABLE + name: bounces + type: INT64 + description: Bounces - Indicator that displays a 1 for each hit in a bounce session (a session with only 1 page view) +- mode: NULLABLE + name: hit_time + type: FLOAT + description: Hit Time - The number of seconds after the visitStartTime that the hit was registered +- mode: NULLABLE + name: engagement_time + type: FLOAT + description: Engagement Time - Engagement Time in Seconds +- mode: NULLABLE + name: first_interaction + type: INT64 + description: First Interaction - The first hit number in the visit associated with an engaged session event +- mode: NULLABLE + name: last_interaction + type: FLOAT + description: Last Interaction - The last hit number in the visit associated with an engaged session event +- mode: NULLABLE + name: entrances + type: INT64 + description: Entrances - Denotes which pageview was the first in the session +- mode: NULLABLE + name: exits + type: INT64 + description: Exits - Denotes which page view was the last in the session +- mode: NULLABLE + name: event_id + type: STRING + description: Event ID +- mode: NULLABLE + name: page_level_1 + type: STRING + description: Page Level 1 +- mode: NULLABLE + name: page_level_2 + type: STRING + description: Page Level 2 +- mode: NULLABLE + name: page_level_3 + type: STRING + description: Page Level 3 +- mode: NULLABLE + name: page_level_4 + type: STRING + description: Page Level 4 +- mode: NULLABLE + name: page_level_5 + type: STRING + description: Page Level 5 +- mode: NULLABLE + name: page_name + type: STRING + description: Page Name - Consists of page levels 1 to 5 concatenated together +- mode: NULLABLE + name: single_page_session + type: BOOLEAN + description: Single Page Session - Indicator if the hit belongs to a single page session +- mode: NULLABLE + name: product_type + type: STRING + description: Product Type - For product download events, what product type was downloaded +- mode: NULLABLE + name: platform_type + type: STRING + description: Platform Type - For product download events, what platform type was the download requested from ```

Link to full diff

dataops-ci-bot commented 2 months ago

Integration report for "Merge branch 'main' into deng_4202_move_site_hits_v2_to_shared_prod"

sql.diff

Click to expand! ```diff diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/dags/bqetl_google_analytics_derived_ga4.py /tmp/workspace/generated-sql/dags/bqetl_google_analytics_derived_ga4.py --- /tmp/workspace/main-generated-sql/dags/bqetl_google_analytics_derived_ga4.py 2024-08-07 16:33:56.000000000 +0000 +++ /tmp/workspace/generated-sql/dags/bqetl_google_analytics_derived_ga4.py 2024-08-07 16:34:04.000000000 +0000 @@ -233,6 +233,23 @@ retries=0, ) + checks__warn_mozilla_org_derived__www_site_hits__v2 = bigquery_dq_check( + task_id="checks__warn_mozilla_org_derived__www_site_hits__v2", + source_table="www_site_hits_v2", + dataset_id="mozilla_org_derived", + project_id="moz-fx-data-shared-prod", + is_dq_check_fail=False, + owner="mhirose@mozilla.com", + email=[ + "kwindau@mozilla.com", + "mhirose@mozilla.com", + "telemetry-alerts@mozilla.com", + ], + depends_on_past=False, + parameters=["submission_date:DATE:{{ds}}"], + retries=0, + ) + ga_derived__blogs_daily_summary__v2 = bigquery_etl_query( task_id="ga_derived__blogs_daily_summary__v2", destination_table="blogs_daily_summary_v2", @@ -421,6 +438,21 @@ depends_on_past=False, ) + mozilla_org_derived__www_site_hits__v2 = bigquery_etl_query( + task_id="mozilla_org_derived__www_site_hits__v2", + destination_table="www_site_hits_v2", + dataset_id="mozilla_org_derived", + project_id="moz-fx-data-shared-prod", + owner="mhirose@mozilla.com", + email=[ + "kwindau@mozilla.com", + "mhirose@mozilla.com", + "telemetry-alerts@mozilla.com", + ], + date_partition_parameter="submission_date", + depends_on_past=False, + ) + mozilla_org_derived__www_site_metrics_summary__v2 = bigquery_etl_query( task_id="mozilla_org_derived__www_site_metrics_summary__v2", destination_table="www_site_metrics_summary_v2", @@ -469,6 +501,10 @@ mozilla_org_derived__ga_sessions__v2 ) + checks__warn_mozilla_org_derived__www_site_hits__v2.set_upstream( + mozilla_org_derived__www_site_hits__v2 + ) + ga_derived__blogs_daily_summary__v2.set_upstream(ga_derived__blogs_goals__v2) ga_derived__blogs_daily_summary__v2.set_upstream(ga_derived__blogs_sessions__v2) @@ -531,6 +567,8 @@ mozilla_org_derived__www_site_downloads__v2.set_upstream(wait_for_wmo_events_table) + mozilla_org_derived__www_site_hits__v2.set_upstream(wait_for_wmo_events_table) + mozilla_org_derived__www_site_metrics_summary__v2.set_upstream( wait_for_wmo_events_table ) Only in /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived: www_site_hits_v2 diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/checks.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/checks.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/checks.sql 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/checks.sql 2024-08-07 16:26:48.000000000 +0000 @@ -0,0 +1,2 @@ +#warn +{{ matches_pattern(column="visit_identifier", pattern="^[0-9]+\\.{1}[0-9]+\\-{1}[0-9]+$", where="date = @submission_date", threshold_fail_percentage=0, message="Warn - some visit_identifier not matching expected pattern") }} diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/metadata.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/metadata.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/metadata.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/metadata.yaml 2024-08-07 16:28:38.000000000 +0000 @@ -0,0 +1,31 @@ +friendly_name: WWW Site Hits V2 +description: |- + Normalized events from Google Analytics 4 (GA4) for www.mozilla.org. The events triggered at the same timestamp are given the same hit number. +owners: +- mhirose@mozilla.com +labels: + incremental: true + owner1: mhirose + dag: bqetl_google_analytics_derived_ga4 +scheduling: + dag_name: bqetl_google_analytics_derived_ga4 + depends_on: + - task_id: wait_for_wmo_events_table + dag_name: bqetl_google_analytics_derived_ga4 +bigquery: + time_partitioning: + type: day + field: date + require_partition_filter: true + expiration_days: null + range_partitioning: null + clustering: + fields: + - country + - language + - event_name +workgroup_access: +- role: roles/bigquery.dataViewer + members: + - workgroup:mozilla-confidential +references: {} diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/query.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/query.sql 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/query.sql 2024-08-07 16:26:48.000000000 +0000 @@ -0,0 +1,321 @@ +WITH get_session_start_time AS ( + SELECT + SAFE.PARSE_DATE('%Y%m%d', a.event_date) AS date, + a.user_pseudo_id || '-' || CAST( + ( + SELECT + `value` + FROM + UNNEST(event_params) + WHERE + key = 'ga_session_id' + LIMIT + 1 + ).int_value AS STRING + ) AS visit_identifier, + a.user_pseudo_id AS full_visitor_id, + MIN(event_timestamp) AS visit_start_time + FROM + `moz-fx-data-marketing-prod.analytics_313696158.events_*` a + JOIN + UNNEST(event_params) e + WHERE + e.key = 'ga_session_id' + AND e.value.int_value IS NOT NULL + AND _TABLE_SUFFIX = SAFE.FORMAT_DATE('%Y%m%d', @submission_date) + GROUP BY + date, + visit_identifier, + full_visitor_id +), +get_all_events_in_each_session_staging AS ( + SELECT + SAFE.PARSE_DATE('%Y%m%d', a.event_date) AS date, + a.user_pseudo_id || '-' || CAST( + ( + SELECT + `value` + FROM + UNNEST(event_params) + WHERE + key = 'ga_session_id' + LIMIT + 1 + ).int_value AS string + ) AS visit_identifier, + a.user_pseudo_id AS full_visitor_id, + a.event_name, + a.event_timestamp, + device.category AS device_category, + device.operating_system AS operating_system, + device.language AS language, + device.web_info.browser AS browser, + device.web_info.browser_version AS browser_version, + geo.country AS country, + traffic_source.name AS traffic_source_name, + traffic_source.medium AS traffic_source_medium, + traffic_source.source AS traffic_source_source, + collected_traffic_source.manual_campaign_id AS manual_campaign_id, + collected_traffic_source.manual_term AS manual_term, + collected_traffic_source.manual_source AS source, + collected_traffic_source.manual_medium AS medium, + collected_traffic_source.manual_campaign_name AS campaign, + collected_traffic_source.manual_content AS ad_content, + CASE + WHEN event_name = 'product_download' + THEN (SELECT `value` FROM UNNEST(event_params) WHERE key = 'product' LIMIT 1).string_value + ELSE NULL + END AS product_type, + CASE + WHEN event_name = 'product_download' + THEN (SELECT `value` FROM UNNEST(event_params) WHERE key = 'platform' LIMIT 1).string_value + ELSE NULL + END AS platform_type, + ( + SELECT + `value` + FROM + UNNEST(event_params) + WHERE + key = 'engagement_time_msec' + LIMIT + 1 + ).int_value AS engagement_time_msec, --not sure if this is the same as hit time yet or not + ( + SELECT + `value` + FROM + UNNEST(event_params) + WHERE + key = 'engaged_session_event' + LIMIT + 1 + ).int_value AS engaged_session_event, + ( + SELECT + `value` + FROM + UNNEST(event_params) + WHERE + key = 'page_location' + LIMIT + 1 + ).string_value AS page_location, + COALESCE( + (SELECT `value` FROM UNNEST(event_params) WHERE key = 'entrances' LIMIT 1).int_value, + 0 + ) AS is_entrance + FROM + `moz-fx-data-marketing-prod.analytics_313696158.events_*` a + WHERE + _TABLE_SUFFIX = SAFE.FORMAT_DATE('%Y%m%d', @submission_date) +), +--if there are multiple events at the same timestamp in the same session, assign them all the same hit number +get_all_events_in_each_session AS ( + SELECT + a.date, + a.visit_identifier, + a.full_visitor_id, + a.event_name, + a.event_timestamp, + a.device_category, + a.operating_system, + a.language, + a.browser, + a.browser_version, + a.country, + a.traffic_source_name, + a.traffic_source_medium, + a.traffic_source_source, + a.manual_campaign_id, + a.manual_term, + a.source, + a.medium, + a.campaign, + a.ad_content, + a.product_type, + a.platform_type, + a.engagement_time_msec, + a.engaged_session_event, + SPLIT(a.page_location, '?')[OFFSET(0)] AS page_location, + a.is_entrance, + DENSE_RANK() OVER (PARTITION BY visit_identifier ORDER BY event_timestamp ASC) AS hit_number, + ROW_NUMBER() OVER ( + PARTITION BY + visit_identifier + ORDER BY + event_timestamp, + event_name, + page_location ASC + ) AS row_nbr + FROM + get_all_events_in_each_session_staging a +), +--get the row number associated with the last page view in each session +row_nbr_of_last_page_view_in_each_session AS ( + SELECT + visit_identifier, + COUNT(1) AS nbr_page_view_events, + MAX(row_nbr) AS max_row_number + FROM + get_all_events_in_each_session + WHERE + event_name = 'page_view' + GROUP BY + visit_identifier +), +first_and_last_interaction AS ( + SELECT + visit_identifier, + MAX(CASE WHEN engaged_session_event = 1 THEN 1 ELSE 0 END) AS session_had_an_engaged_event, + MIN(CASE WHEN engaged_session_event = 1 THEN hit_number ELSE NULL END) AS first_interaction, + MAX(CASE WHEN engaged_session_event = 1 THEN hit_number ELSE NULL END) AS last_interaction + FROM + get_all_events_in_each_session + GROUP BY + visit_identifier +), +final_staging AS ( + SELECT + all_sessions.date, + all_sessions.visit_identifier, + all_sessions.full_visitor_id, + all_sessions.visit_start_time, + REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', '') AS page_path, + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(1) + ] AS page_path_level1, + CASE + WHEN all_events.event_name = 'page_view' + THEN 'PAGE' + ELSE 'EVENT' + END AS hit_type, + CAST(CASE WHEN exits.max_row_number = all_events.row_nbr THEN 1 ELSE 0 END AS bool) AS is_exit, + CAST(all_events.is_entrance AS bool) AS is_entrance, + all_events.hit_number, + all_events.event_timestamp AS hit_timestamp, + CAST(NULL AS string) AS event_category, --GA4 has no notion of event_label, unlike GA3 (UA360) + all_events.event_name, + CAST(NULL AS string) AS event_label, --GA4 has no notion of event_label, unlike GA3 (UA360) + CAST(NULL AS string) AS event_action, --GA4 has no notion of event_action, unlike GA3 (UA360) + all_events.device_category, + all_events.operating_system, + all_events.language, + all_events.browser, + all_events.browser_version, + all_events.country, + all_events.traffic_source_name, + all_events.traffic_source_medium, + all_events.traffic_source_source, + all_events.manual_campaign_id, + all_events.manual_term, + all_events.source, + all_events.medium, + all_events.campaign, + all_events.ad_content, + all_events.product_type, + all_events.platform_type, + engmgt.session_had_an_engaged_event AS visits, --this is the equivalent logic to totals.visits in UA + CASE + WHEN exits.nbr_page_view_events = 1 + THEN TRUE + ELSE FALSE + END AS single_page_session, + CASE + WHEN engmgt.session_had_an_engaged_event = 0 + THEN 1 + ELSE 0 + END AS bounces, --if the session did not have an engaged event, then the session is considered a bounce, else it is not + (all_events.event_timestamp - all_sessions.visit_start_time) / 1000000 AS hit_time, + SAFE_DIVIDE(engagement_time_msec, 1000) AS engagement_time, + engmgt.first_interaction, + CAST(engmgt.last_interaction AS float64) AS last_interaction, + all_events.is_entrance AS entrances, + COALESCE(CASE WHEN exits.max_row_number = all_events.row_nbr THEN 1 ELSE 0 END, 0) AS exits, + CAST( + NULL AS string + ) AS event_id, --old table defined this from event category, action, and label, which no longer exist in GA4 + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(2) + ] AS page_level_1, + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(3) + ] AS page_level_2, + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(4) + ] AS page_level_3, + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(5) + ] AS page_level_4, + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(6) + ] AS page_level_5 + FROM + get_session_start_time all_sessions + LEFT OUTER JOIN + get_all_events_in_each_session all_events + ON all_sessions.date = all_events.date + AND all_sessions.visit_identifier = all_events.visit_identifier + AND all_sessions.full_visitor_id = all_events.full_visitor_id + LEFT OUTER JOIN + row_nbr_of_last_page_view_in_each_session exits + ON all_sessions.visit_identifier = exits.visit_identifier + LEFT OUTER JOIN + first_and_last_interaction engmgt + ON all_sessions.visit_identifier = engmgt.visit_identifier +) +SELECT + final.date, + final.visit_identifier, + final.full_visitor_id, + final.visit_start_time, + final.page_path, + final.page_path_level1, + final.hit_type, + final.is_exit, + final.is_entrance, + final.hit_number, + final.hit_timestamp, + final.event_category, + final.event_name, + final.event_label, + final.event_action, + final.device_category, + final.operating_system, + final.language, + final.browser, + final.browser_version, + final.country, + final.traffic_source_name, + final.traffic_source_medium, + final.traffic_source_source, + final.manual_campaign_id, + final.manual_term, + final.source, + final.medium, + final.campaign, + final.ad_content, + final.visits, + final.bounces, + final.hit_time, + final.engagement_time, + final.first_interaction, + final.last_interaction, + final.entrances, + final.exits, + final.event_id, + final.page_level_1, + final.page_level_2, + final.page_level_3, + final.page_level_4, + final.page_level_5, + IF( + page_level_2 IS NULL, + CONCAT('/', page_level_1, '/'), + ARRAY_TO_STRING(['', page_level_1, page_level_2, page_level_3, page_level_4, page_level_5], '/') + ) AS page_name, + final.single_page_session, + final.product_type, + final.platform_type +FROM + final_staging final diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/schema.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/schema.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/schema.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/schema.yaml 2024-08-07 16:26:48.000000000 +0000 @@ -0,0 +1,193 @@ +fields: +- mode: NULLABLE + name: date + type: DATE + description: Date of the visit +- mode: NULLABLE + name: visit_identifier + type: STRING + description: Visit Identifier - Uniquely identifies a visit; concatenation of user_pseudo_id and ga_session_id +- mode: NULLABLE + name: full_visitor_id + type: STRING + description: Full Visitor ID - Uniquely identifies a visitor - this is the same as GA4 user_pseudo_id +- mode: NULLABLE + name: visit_start_time + type: INT64 + description: Visit Start Time - The event timestamp from the first event in the visit +- mode: NULLABLE + name: page_path + type: STRING + description: Page Path +- mode: NULLABLE + name: page_path_level1 + type: STRING + description: Page Path Level 1 +- mode: NULLABLE + name: hit_type + type: STRING + description: Hit Type - PAGE if event_name is a page view, EVENT for all other event types +- mode: NULLABLE + name: is_exit + type: BOOLEAN + description: Is Exit - The specific page view event that is considered the exit to the visit +- mode: NULLABLE + name: is_entrance + type: BOOLEAN + description: Is Entrance - The specific page view event that is considered the entrance to the visit +- mode: NULLABLE + name: hit_number + type: INT64 + description: Hit Number - Densely ranked, since there can be multiple events at the same time for a user +- mode: NULLABLE + name: hit_timestamp + type: INT64 + description: Hit Timestamp - Same as the "Event Timestamp" +- mode: NULLABLE + name: event_category + type: STRING + description: Event Category +- mode: NULLABLE + name: event_name + type: STRING + description: Event Name +- mode: NULLABLE + name: event_label + type: STRING + description: Event Label +- mode: NULLABLE + name: event_action + type: STRING + description: Event Action +- mode: NULLABLE + name: device_category + type: STRING + description: Device Category - The device category the visitor used to visit the site +- mode: NULLABLE + name: operating_system + type: STRING + description: Operating System - The operating system the visitor used to visit the site +- mode: NULLABLE + name: language + type: STRING + description: Language - The language the visiting device was using when it visited the site +- mode: NULLABLE + name: browser + type: STRING + description: Browser - The browser the visiting device was using when it visited the site +- mode: NULLABLE + name: browser_version + type: STRING + description: Browser Version - The version that the visiting device's browser was using when it visited the site +- mode: NULLABLE + name: country + type: STRING + description: Country - The country from which events were reported, based on IP address +- mode: NULLABLE + name: traffic_source_name + type: STRING + description: Traffic Source - Name +- mode: NULLABLE + name: traffic_source_medium + type: STRING + description: Traffic Source - Medium +- mode: NULLABLE + name: traffic_source_source + type: STRING + description: Traffic Source - Source +- mode: NULLABLE + name: manual_campaign_id + type: STRING + description: Manual Campaign ID (comes from collected traffic source) +- mode: NULLABLE + name: manual_term + type: STRING + description: Manual Term (comes from collected traffic source) +- mode: NULLABLE + name: source + type: STRING + description: Source - Referring partner domain (comes from collected traffic source) +- mode: NULLABLE + name: medium + type: STRING + description: Medium - Category of the source, such as 'organic' for a search engine (comes from collected traffic source) +- mode: NULLABLE + name: campaign + type: STRING + description: Campaign - Identifier for the marketing campaign (comes from collected traffic source) +- mode: NULLABLE + name: ad_content + type: STRING + description: Ad Content - Indicates the particular link within a campaign (comes from collected traffic source) +- mode: NULLABLE + name: visits + type: INT64 + description: Visits - A flag that indicates if the hit belongs to a visit that had 1 or more engaged event during the visit +- mode: NULLABLE + name: bounces + type: INT64 + description: Bounces - Indicator that displays a 1 for each hit in a bounce session (a session with only 1 page view) +- mode: NULLABLE + name: hit_time + type: FLOAT + description: Hit Time - The number of seconds after the visitStartTime that the hit was registered +- mode: NULLABLE + name: engagement_time + type: FLOAT + description: Engagement Time - Engagement Time in Seconds +- mode: NULLABLE + name: first_interaction + type: INT64 + description: First Interaction - The first hit number in the visit associated with an engaged session event +- mode: NULLABLE + name: last_interaction + type: FLOAT + description: Last Interaction - The last hit number in the visit associated with an engaged session event +- mode: NULLABLE + name: entrances + type: INT64 + description: Entrances - Denotes which pageview was the first in the session +- mode: NULLABLE + name: exits + type: INT64 + description: Exits - Denotes which page view was the last in the session +- mode: NULLABLE + name: event_id + type: STRING + description: Event ID +- mode: NULLABLE + name: page_level_1 + type: STRING + description: Page Level 1 +- mode: NULLABLE + name: page_level_2 + type: STRING + description: Page Level 2 +- mode: NULLABLE + name: page_level_3 + type: STRING + description: Page Level 3 +- mode: NULLABLE + name: page_level_4 + type: STRING + description: Page Level 4 +- mode: NULLABLE + name: page_level_5 + type: STRING + description: Page Level 5 +- mode: NULLABLE + name: page_name + type: STRING + description: Page Name - Consists of page levels 1 to 5 concatenated together +- mode: NULLABLE + name: single_page_session + type: BOOLEAN + description: Single Page Session - Indicator if the hit belongs to a single page session +- mode: NULLABLE + name: product_type + type: STRING + description: Product Type - For product download events, what product type was downloaded +- mode: NULLABLE + name: platform_type + type: STRING + description: Platform Type - For product download events, what platform type was the download requested from ```

Link to full diff

dataops-ci-bot commented 2 months ago

Integration report for "Merge branch 'main' into deng_4202_move_site_hits_v2_to_shared_prod"

sql.diff

Click to expand! ```diff diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/dags/bqetl_google_analytics_derived_ga4.py /tmp/workspace/generated-sql/dags/bqetl_google_analytics_derived_ga4.py --- /tmp/workspace/main-generated-sql/dags/bqetl_google_analytics_derived_ga4.py 2024-08-07 17:06:58.000000000 +0000 +++ /tmp/workspace/generated-sql/dags/bqetl_google_analytics_derived_ga4.py 2024-08-07 17:08:26.000000000 +0000 @@ -233,6 +233,23 @@ retries=0, ) + checks__warn_mozilla_org_derived__www_site_hits__v2 = bigquery_dq_check( + task_id="checks__warn_mozilla_org_derived__www_site_hits__v2", + source_table="www_site_hits_v2", + dataset_id="mozilla_org_derived", + project_id="moz-fx-data-shared-prod", + is_dq_check_fail=False, + owner="mhirose@mozilla.com", + email=[ + "kwindau@mozilla.com", + "mhirose@mozilla.com", + "telemetry-alerts@mozilla.com", + ], + depends_on_past=False, + parameters=["submission_date:DATE:{{ds}}"], + retries=0, + ) + ga_derived__blogs_daily_summary__v2 = bigquery_etl_query( task_id="ga_derived__blogs_daily_summary__v2", destination_table="blogs_daily_summary_v2", @@ -421,6 +438,21 @@ depends_on_past=False, ) + mozilla_org_derived__www_site_hits__v2 = bigquery_etl_query( + task_id="mozilla_org_derived__www_site_hits__v2", + destination_table="www_site_hits_v2", + dataset_id="mozilla_org_derived", + project_id="moz-fx-data-shared-prod", + owner="mhirose@mozilla.com", + email=[ + "kwindau@mozilla.com", + "mhirose@mozilla.com", + "telemetry-alerts@mozilla.com", + ], + date_partition_parameter="submission_date", + depends_on_past=False, + ) + mozilla_org_derived__www_site_metrics_summary__v2 = bigquery_etl_query( task_id="mozilla_org_derived__www_site_metrics_summary__v2", destination_table="www_site_metrics_summary_v2", @@ -469,6 +501,10 @@ mozilla_org_derived__ga_sessions__v2 ) + checks__warn_mozilla_org_derived__www_site_hits__v2.set_upstream( + mozilla_org_derived__www_site_hits__v2 + ) + ga_derived__blogs_daily_summary__v2.set_upstream(ga_derived__blogs_goals__v2) ga_derived__blogs_daily_summary__v2.set_upstream(ga_derived__blogs_sessions__v2) @@ -531,6 +567,8 @@ mozilla_org_derived__www_site_downloads__v2.set_upstream(wait_for_wmo_events_table) + mozilla_org_derived__www_site_hits__v2.set_upstream(wait_for_wmo_events_table) + mozilla_org_derived__www_site_metrics_summary__v2.set_upstream( wait_for_wmo_events_table ) Only in /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived: www_site_hits_v2 diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/checks.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/checks.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/checks.sql 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/checks.sql 2024-08-07 17:00:48.000000000 +0000 @@ -0,0 +1,2 @@ +#warn +{{ matches_pattern(column="visit_identifier", pattern="^[0-9]+\\.{1}[0-9]+\\-{1}[0-9]+$", where="date = @submission_date", threshold_fail_percentage=0, message="Warn - some visit_identifier not matching expected pattern") }} diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/metadata.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/metadata.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/metadata.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/metadata.yaml 2024-08-07 17:02:53.000000000 +0000 @@ -0,0 +1,31 @@ +friendly_name: WWW Site Hits V2 +description: |- + Normalized events from Google Analytics 4 (GA4) for www.mozilla.org. The events triggered at the same timestamp are given the same hit number. +owners: +- mhirose@mozilla.com +labels: + incremental: true + owner1: mhirose + dag: bqetl_google_analytics_derived_ga4 +scheduling: + dag_name: bqetl_google_analytics_derived_ga4 + depends_on: + - task_id: wait_for_wmo_events_table + dag_name: bqetl_google_analytics_derived_ga4 +bigquery: + time_partitioning: + type: day + field: date + require_partition_filter: true + expiration_days: null + range_partitioning: null + clustering: + fields: + - country + - language + - event_name +workgroup_access: +- role: roles/bigquery.dataViewer + members: + - workgroup:mozilla-confidential +references: {} diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/query.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/query.sql 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/query.sql 2024-08-07 17:00:48.000000000 +0000 @@ -0,0 +1,321 @@ +WITH get_session_start_time AS ( + SELECT + SAFE.PARSE_DATE('%Y%m%d', a.event_date) AS date, + a.user_pseudo_id || '-' || CAST( + ( + SELECT + `value` + FROM + UNNEST(event_params) + WHERE + key = 'ga_session_id' + LIMIT + 1 + ).int_value AS STRING + ) AS visit_identifier, + a.user_pseudo_id AS full_visitor_id, + MIN(event_timestamp) AS visit_start_time + FROM + `moz-fx-data-marketing-prod.analytics_313696158.events_*` a + JOIN + UNNEST(event_params) e + WHERE + e.key = 'ga_session_id' + AND e.value.int_value IS NOT NULL + AND _TABLE_SUFFIX = SAFE.FORMAT_DATE('%Y%m%d', @submission_date) + GROUP BY + date, + visit_identifier, + full_visitor_id +), +get_all_events_in_each_session_staging AS ( + SELECT + SAFE.PARSE_DATE('%Y%m%d', a.event_date) AS date, + a.user_pseudo_id || '-' || CAST( + ( + SELECT + `value` + FROM + UNNEST(event_params) + WHERE + key = 'ga_session_id' + LIMIT + 1 + ).int_value AS string + ) AS visit_identifier, + a.user_pseudo_id AS full_visitor_id, + a.event_name, + a.event_timestamp, + device.category AS device_category, + device.operating_system AS operating_system, + device.language AS language, + device.web_info.browser AS browser, + device.web_info.browser_version AS browser_version, + geo.country AS country, + traffic_source.name AS traffic_source_name, + traffic_source.medium AS traffic_source_medium, + traffic_source.source AS traffic_source_source, + collected_traffic_source.manual_campaign_id AS manual_campaign_id, + collected_traffic_source.manual_term AS manual_term, + collected_traffic_source.manual_source AS source, + collected_traffic_source.manual_medium AS medium, + collected_traffic_source.manual_campaign_name AS campaign, + collected_traffic_source.manual_content AS ad_content, + CASE + WHEN event_name = 'product_download' + THEN (SELECT `value` FROM UNNEST(event_params) WHERE key = 'product' LIMIT 1).string_value + ELSE NULL + END AS product_type, + CASE + WHEN event_name = 'product_download' + THEN (SELECT `value` FROM UNNEST(event_params) WHERE key = 'platform' LIMIT 1).string_value + ELSE NULL + END AS platform_type, + ( + SELECT + `value` + FROM + UNNEST(event_params) + WHERE + key = 'engagement_time_msec' + LIMIT + 1 + ).int_value AS engagement_time_msec, --not sure if this is the same as hit time yet or not + ( + SELECT + `value` + FROM + UNNEST(event_params) + WHERE + key = 'engaged_session_event' + LIMIT + 1 + ).int_value AS engaged_session_event, + ( + SELECT + `value` + FROM + UNNEST(event_params) + WHERE + key = 'page_location' + LIMIT + 1 + ).string_value AS page_location, + COALESCE( + (SELECT `value` FROM UNNEST(event_params) WHERE key = 'entrances' LIMIT 1).int_value, + 0 + ) AS is_entrance + FROM + `moz-fx-data-marketing-prod.analytics_313696158.events_*` a + WHERE + _TABLE_SUFFIX = SAFE.FORMAT_DATE('%Y%m%d', @submission_date) +), +--if there are multiple events at the same timestamp in the same session, assign them all the same hit number +get_all_events_in_each_session AS ( + SELECT + a.date, + a.visit_identifier, + a.full_visitor_id, + a.event_name, + a.event_timestamp, + a.device_category, + a.operating_system, + a.language, + a.browser, + a.browser_version, + a.country, + a.traffic_source_name, + a.traffic_source_medium, + a.traffic_source_source, + a.manual_campaign_id, + a.manual_term, + a.source, + a.medium, + a.campaign, + a.ad_content, + a.product_type, + a.platform_type, + a.engagement_time_msec, + a.engaged_session_event, + SPLIT(a.page_location, '?')[OFFSET(0)] AS page_location, + a.is_entrance, + DENSE_RANK() OVER (PARTITION BY visit_identifier ORDER BY event_timestamp ASC) AS hit_number, + ROW_NUMBER() OVER ( + PARTITION BY + visit_identifier + ORDER BY + event_timestamp, + event_name, + page_location ASC + ) AS row_nbr + FROM + get_all_events_in_each_session_staging a +), +--get the row number associated with the last page view in each session +row_nbr_of_last_page_view_in_each_session AS ( + SELECT + visit_identifier, + COUNT(1) AS nbr_page_view_events, + MAX(row_nbr) AS max_row_number + FROM + get_all_events_in_each_session + WHERE + event_name = 'page_view' + GROUP BY + visit_identifier +), +first_and_last_interaction AS ( + SELECT + visit_identifier, + MAX(CASE WHEN engaged_session_event = 1 THEN 1 ELSE 0 END) AS session_had_an_engaged_event, + MIN(CASE WHEN engaged_session_event = 1 THEN hit_number ELSE NULL END) AS first_interaction, + MAX(CASE WHEN engaged_session_event = 1 THEN hit_number ELSE NULL END) AS last_interaction + FROM + get_all_events_in_each_session + GROUP BY + visit_identifier +), +final_staging AS ( + SELECT + all_sessions.date, + all_sessions.visit_identifier, + all_sessions.full_visitor_id, + all_sessions.visit_start_time, + REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', '') AS page_path, + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(1) + ] AS page_path_level1, + CASE + WHEN all_events.event_name = 'page_view' + THEN 'PAGE' + ELSE 'EVENT' + END AS hit_type, + CAST(CASE WHEN exits.max_row_number = all_events.row_nbr THEN 1 ELSE 0 END AS bool) AS is_exit, + CAST(all_events.is_entrance AS bool) AS is_entrance, + all_events.hit_number, + all_events.event_timestamp AS hit_timestamp, + CAST(NULL AS string) AS event_category, --GA4 has no notion of event_label, unlike GA3 (UA360) + all_events.event_name, + CAST(NULL AS string) AS event_label, --GA4 has no notion of event_label, unlike GA3 (UA360) + CAST(NULL AS string) AS event_action, --GA4 has no notion of event_action, unlike GA3 (UA360) + all_events.device_category, + all_events.operating_system, + all_events.language, + all_events.browser, + all_events.browser_version, + all_events.country, + all_events.traffic_source_name, + all_events.traffic_source_medium, + all_events.traffic_source_source, + all_events.manual_campaign_id, + all_events.manual_term, + all_events.source, + all_events.medium, + all_events.campaign, + all_events.ad_content, + all_events.product_type, + all_events.platform_type, + engmgt.session_had_an_engaged_event AS visits, --this is the equivalent logic to totals.visits in UA + CASE + WHEN exits.nbr_page_view_events = 1 + THEN TRUE + ELSE FALSE + END AS single_page_session, + CASE + WHEN engmgt.session_had_an_engaged_event = 0 + THEN 1 + ELSE 0 + END AS bounces, --if the session did not have an engaged event, then the session is considered a bounce, else it is not + (all_events.event_timestamp - all_sessions.visit_start_time) / 1000000 AS hit_time, + SAFE_DIVIDE(engagement_time_msec, 1000) AS engagement_time, + engmgt.first_interaction, + CAST(engmgt.last_interaction AS float64) AS last_interaction, + all_events.is_entrance AS entrances, + COALESCE(CASE WHEN exits.max_row_number = all_events.row_nbr THEN 1 ELSE 0 END, 0) AS exits, + CAST( + NULL AS string + ) AS event_id, --old table defined this from event category, action, and label, which no longer exist in GA4 + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(2) + ] AS page_level_1, + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(3) + ] AS page_level_2, + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(4) + ] AS page_level_3, + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(5) + ] AS page_level_4, + SPLIT(REGEXP_REPLACE(all_events.page_location, '^https://www.mozilla.org', ''), '/')[ + SAFE_OFFSET(6) + ] AS page_level_5 + FROM + get_session_start_time all_sessions + LEFT OUTER JOIN + get_all_events_in_each_session all_events + ON all_sessions.date = all_events.date + AND all_sessions.visit_identifier = all_events.visit_identifier + AND all_sessions.full_visitor_id = all_events.full_visitor_id + LEFT OUTER JOIN + row_nbr_of_last_page_view_in_each_session exits + ON all_sessions.visit_identifier = exits.visit_identifier + LEFT OUTER JOIN + first_and_last_interaction engmgt + ON all_sessions.visit_identifier = engmgt.visit_identifier +) +SELECT + final.date, + final.visit_identifier, + final.full_visitor_id, + final.visit_start_time, + final.page_path, + final.page_path_level1, + final.hit_type, + final.is_exit, + final.is_entrance, + final.hit_number, + final.hit_timestamp, + final.event_category, + final.event_name, + final.event_label, + final.event_action, + final.device_category, + final.operating_system, + final.language, + final.browser, + final.browser_version, + final.country, + final.traffic_source_name, + final.traffic_source_medium, + final.traffic_source_source, + final.manual_campaign_id, + final.manual_term, + final.source, + final.medium, + final.campaign, + final.ad_content, + final.visits, + final.bounces, + final.hit_time, + final.engagement_time, + final.first_interaction, + final.last_interaction, + final.entrances, + final.exits, + final.event_id, + final.page_level_1, + final.page_level_2, + final.page_level_3, + final.page_level_4, + final.page_level_5, + IF( + page_level_2 IS NULL, + CONCAT('/', page_level_1, '/'), + ARRAY_TO_STRING(['', page_level_1, page_level_2, page_level_3, page_level_4, page_level_5], '/') + ) AS page_name, + final.single_page_session, + final.product_type, + final.platform_type +FROM + final_staging final diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/schema.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/schema.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/schema.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/www_site_hits_v2/schema.yaml 2024-08-07 17:00:48.000000000 +0000 @@ -0,0 +1,193 @@ +fields: +- mode: NULLABLE + name: date + type: DATE + description: Date of the visit +- mode: NULLABLE + name: visit_identifier + type: STRING + description: Visit Identifier - Uniquely identifies a visit; concatenation of user_pseudo_id and ga_session_id +- mode: NULLABLE + name: full_visitor_id + type: STRING + description: Full Visitor ID - Uniquely identifies a visitor - this is the same as GA4 user_pseudo_id +- mode: NULLABLE + name: visit_start_time + type: INT64 + description: Visit Start Time - The event timestamp from the first event in the visit +- mode: NULLABLE + name: page_path + type: STRING + description: Page Path +- mode: NULLABLE + name: page_path_level1 + type: STRING + description: Page Path Level 1 +- mode: NULLABLE + name: hit_type + type: STRING + description: Hit Type - PAGE if event_name is a page view, EVENT for all other event types +- mode: NULLABLE + name: is_exit + type: BOOLEAN + description: Is Exit - The specific page view event that is considered the exit to the visit +- mode: NULLABLE + name: is_entrance + type: BOOLEAN + description: Is Entrance - The specific page view event that is considered the entrance to the visit +- mode: NULLABLE + name: hit_number + type: INT64 + description: Hit Number - Densely ranked, since there can be multiple events at the same time for a user +- mode: NULLABLE + name: hit_timestamp + type: INT64 + description: Hit Timestamp - Same as the "Event Timestamp" +- mode: NULLABLE + name: event_category + type: STRING + description: Event Category +- mode: NULLABLE + name: event_name + type: STRING + description: Event Name +- mode: NULLABLE + name: event_label + type: STRING + description: Event Label +- mode: NULLABLE + name: event_action + type: STRING + description: Event Action +- mode: NULLABLE + name: device_category + type: STRING + description: Device Category - The device category the visitor used to visit the site +- mode: NULLABLE + name: operating_system + type: STRING + description: Operating System - The operating system the visitor used to visit the site +- mode: NULLABLE + name: language + type: STRING + description: Language - The language the visiting device was using when it visited the site +- mode: NULLABLE + name: browser + type: STRING + description: Browser - The browser the visiting device was using when it visited the site +- mode: NULLABLE + name: browser_version + type: STRING + description: Browser Version - The version that the visiting device's browser was using when it visited the site +- mode: NULLABLE + name: country + type: STRING + description: Country - The country from which events were reported, based on IP address +- mode: NULLABLE + name: traffic_source_name + type: STRING + description: Traffic Source - Name +- mode: NULLABLE + name: traffic_source_medium + type: STRING + description: Traffic Source - Medium +- mode: NULLABLE + name: traffic_source_source + type: STRING + description: Traffic Source - Source +- mode: NULLABLE + name: manual_campaign_id + type: STRING + description: Manual Campaign ID (comes from collected traffic source) +- mode: NULLABLE + name: manual_term + type: STRING + description: Manual Term (comes from collected traffic source) +- mode: NULLABLE + name: source + type: STRING + description: Source - Referring partner domain (comes from collected traffic source) +- mode: NULLABLE + name: medium + type: STRING + description: Medium - Category of the source, such as 'organic' for a search engine (comes from collected traffic source) +- mode: NULLABLE + name: campaign + type: STRING + description: Campaign - Identifier for the marketing campaign (comes from collected traffic source) +- mode: NULLABLE + name: ad_content + type: STRING + description: Ad Content - Indicates the particular link within a campaign (comes from collected traffic source) +- mode: NULLABLE + name: visits + type: INT64 + description: Visits - A flag that indicates if the hit belongs to a visit that had 1 or more engaged event during the visit +- mode: NULLABLE + name: bounces + type: INT64 + description: Bounces - Indicator that displays a 1 for each hit in a bounce session (a session with only 1 page view) +- mode: NULLABLE + name: hit_time + type: FLOAT + description: Hit Time - The number of seconds after the visitStartTime that the hit was registered +- mode: NULLABLE + name: engagement_time + type: FLOAT + description: Engagement Time - Engagement Time in Seconds +- mode: NULLABLE + name: first_interaction + type: INT64 + description: First Interaction - The first hit number in the visit associated with an engaged session event +- mode: NULLABLE + name: last_interaction + type: FLOAT + description: Last Interaction - The last hit number in the visit associated with an engaged session event +- mode: NULLABLE + name: entrances + type: INT64 + description: Entrances - Denotes which pageview was the first in the session +- mode: NULLABLE + name: exits + type: INT64 + description: Exits - Denotes which page view was the last in the session +- mode: NULLABLE + name: event_id + type: STRING + description: Event ID +- mode: NULLABLE + name: page_level_1 + type: STRING + description: Page Level 1 +- mode: NULLABLE + name: page_level_2 + type: STRING + description: Page Level 2 +- mode: NULLABLE + name: page_level_3 + type: STRING + description: Page Level 3 +- mode: NULLABLE + name: page_level_4 + type: STRING + description: Page Level 4 +- mode: NULLABLE + name: page_level_5 + type: STRING + description: Page Level 5 +- mode: NULLABLE + name: page_name + type: STRING + description: Page Name - Consists of page levels 1 to 5 concatenated together +- mode: NULLABLE + name: single_page_session + type: BOOLEAN + description: Single Page Session - Indicator if the hit belongs to a single page session +- mode: NULLABLE + name: product_type + type: STRING + description: Product Type - For product download events, what product type was downloaded +- mode: NULLABLE + name: platform_type + type: STRING + description: Platform Type - For product download events, what platform type was the download requested from ```

Link to full diff