mozilla / bigquery-etl

Bigquery ETL
https://mozilla.github.io/bigquery-etl
Mozilla Public License 2.0
241 stars 98 forks source link

GROWTH-143 backfill new table starting 12/1/2023 #5766

Closed kwindau closed 4 weeks ago

kwindau commented 4 weeks ago

Checklist for reviewer:

For modifications to schemas in restricted namespaces (see CODEOWNERS):

┆Issue is synchronized with this Jira Task

dataops-ci-bot commented 4 weeks ago

Integration report for "Merge branch 'main' into backfill-new-conv-evnts-table"

sql.diff

Click to expand! ```diff Only in /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/google_ads_derived/conversion_event_categorization_v1: backfill.yaml diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/google_ads_derived/conversion_event_categorization_v1/backfill.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/google_ads_derived/conversion_event_categorization_v1/backfill.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/google_ads_derived/conversion_event_categorization_v1/backfill.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/google_ads_derived/conversion_event_categorization_v1/backfill.yaml 2024-06-06 22:03:11.000000000 +0000 @@ -0,0 +1,7 @@ +2024-06-06: + start_date: 2023-12-01 + end_date: 2024-06-05 + reason: GROWTH-143 - backfilling a new table + watchers: + - kwindau@mozilla.com + status: Initiate ```

Link to full diff

dataops-ci-bot commented 4 weeks ago

Integration report for "GROWTH-143 backfill new table starting 12/1/2023"

sql.diff

Click to expand! ```diff Only in /tmp/workspace/main-generated-sql/dags/: bqetl_census_feed.py diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/dags/bqetl_census_feed.py /tmp/workspace/generated-sql/dags/bqetl_census_feed.py --- /tmp/workspace/main-generated-sql/dags/bqetl_census_feed.py 2024-06-06 22:06:47.000000000 +0000 +++ /tmp/workspace/generated-sql/dags/bqetl_census_feed.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,82 +0,0 @@ -# Generated via https://github.com/mozilla/bigquery-etl/blob/main/bigquery_etl/query_scheduling/generate_airflow_dags.py - -from airflow import DAG -from airflow.sensors.external_task import ExternalTaskMarker -from airflow.sensors.external_task import ExternalTaskSensor -from airflow.utils.task_group import TaskGroup -import datetime -from operators.gcp_container_operator import GKEPodOperator -from utils.constants import ALLOWED_STATES, FAILED_STATES -from utils.gcp import bigquery_etl_query, bigquery_dq_check - -docs = """ -### bqetl_census_feed - -Built from bigquery-etl repo, [`dags/bqetl_census_feed.py`](https://github.com/mozilla/bigquery-etl/blob/generated-sql/dags/bqetl_census_feed.py) - -#### Description - -Loads the desktop conversion event tables -#### Owner - -kwindau@mozilla.com - -#### Tags - -* impact/tier_2 -* repo/bigquery-etl -""" - - -default_args = { - "owner": "kwindau@mozilla.com", - "start_date": datetime.datetime(2024, 6, 10, 0, 0), - "end_date": None, - "email": ["kwindau@mozilla.com", "telemetry-alerts@mozilla.com"], - "depends_on_past": False, - "retry_delay": datetime.timedelta(seconds=1800), - "email_on_failure": True, - "email_on_retry": False, - "retries": 2, -} - -tags = ["impact/tier_2", "repo/bigquery-etl"] - -with DAG( - "bqetl_census_feed", - default_args=default_args, - schedule_interval="0 17 * * *", - doc_md=docs, - tags=tags, -) as dag: - - wait_for_checks__fail_mozilla_org_derived__gclid_conversions__v2 = ( - ExternalTaskSensor( - task_id="wait_for_checks__fail_mozilla_org_derived__gclid_conversions__v2", - external_dag_id="bqetl_google_analytics_derived_ga4", - external_task_id="checks__fail_mozilla_org_derived__gclid_conversions__v2", - execution_delta=datetime.timedelta(seconds=18000), - check_existence=True, - mode="reschedule", - allowed_states=ALLOWED_STATES, - failed_states=FAILED_STATES, - pool="DATA_ENG_EXTERNALTASKSENSOR", - ) - ) - - mozilla_org_derived__ga_desktop_conversions__v1 = bigquery_etl_query( - task_id="mozilla_org_derived__ga_desktop_conversions__v1", - destination_table='ga_desktop_conversions_v1${{ macros.ds_format(macros.ds_add(ds, -2), "%Y-%m-%d", "%Y%m%d") }}', - dataset_id="mozilla_org_derived", - project_id="moz-fx-data-shared-prod", - owner="kwindau@mozilla.com", - email=["kwindau@mozilla.com", "telemetry-alerts@mozilla.com"], - date_partition_parameter=None, - depends_on_past=False, - parameters=["activity_date:DATE:{{macros.ds_add(ds, -2)}}"] - + ["submission_date:DATE:{{ds}}"], - ) - - mozilla_org_derived__ga_desktop_conversions__v1.set_upstream( - wait_for_checks__fail_mozilla_org_derived__gclid_conversions__v2 - ) diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/dags/bqetl_google_analytics_derived_ga4.py /tmp/workspace/generated-sql/dags/bqetl_google_analytics_derived_ga4.py --- /tmp/workspace/main-generated-sql/dags/bqetl_google_analytics_derived_ga4.py 2024-06-06 22:06:47.000000000 +0000 +++ /tmp/workspace/generated-sql/dags/bqetl_google_analytics_derived_ga4.py 2024-06-06 22:07:49.000000000 +0000 @@ -153,20 +153,6 @@ retries=0, ) - with TaskGroup( - "checks__fail_mozilla_org_derived__gclid_conversions__v2_external", - ) as checks__fail_mozilla_org_derived__gclid_conversions__v2_external: - ExternalTaskMarker( - task_id="bqetl_census_feed__wait_for_checks__fail_mozilla_org_derived__gclid_conversions__v2", - external_dag_id="bqetl_census_feed", - external_task_id="wait_for_checks__fail_mozilla_org_derived__gclid_conversions__v2", - execution_date="{{ (execution_date - macros.timedelta(days=-1, seconds=68400)).isoformat() }}", - ) - - checks__fail_mozilla_org_derived__gclid_conversions__v2_external.set_upstream( - checks__fail_mozilla_org_derived__gclid_conversions__v2 - ) - checks__warn_ga_derived__blogs_goals__v2 = bigquery_dq_check( task_id="checks__warn_ga_derived__blogs_goals__v2", source_table="blogs_goals_v2", Only in /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/google_ads_derived/conversion_event_categorization_v1: backfill.yaml Only in /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org: ga_desktop_conversions Only in /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived: ga_desktop_conversions_v1 diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/google_ads_derived/conversion_event_categorization_v1/backfill.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/google_ads_derived/conversion_event_categorization_v1/backfill.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/google_ads_derived/conversion_event_categorization_v1/backfill.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/google_ads_derived/conversion_event_categorization_v1/backfill.yaml 2024-06-06 22:03:00.000000000 +0000 @@ -0,0 +1,7 @@ +2024-06-06: + start_date: 2023-12-01 + end_date: 2024-06-05 + reason: GROWTH-143 - backfilling a new table + watchers: + - kwindau@mozilla.com + status: Initiate diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org/ga_desktop_conversions/metadata.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org/ga_desktop_conversions/metadata.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org/ga_desktop_conversions/metadata.yaml 2024-06-06 22:04:44.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org/ga_desktop_conversions/metadata.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,13 +0,0 @@ -friendly_name: Ga Desktop Conversions -description: |- - Please provide a description for the query -owners: [] -labels: {} -bigquery: null -workgroup_access: -- role: roles/bigquery.dataViewer - members: - - workgroup:mozilla-confidential -references: - view.sql: - - moz-fx-data-shared-prod.mozilla_org_derived.ga_desktop_conversions_v1 diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org/ga_desktop_conversions/view.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org/ga_desktop_conversions/view.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org/ga_desktop_conversions/view.sql 2024-06-06 22:03:04.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org/ga_desktop_conversions/view.sql 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -CREATE OR REPLACE VIEW - `moz-fx-data-shared-prod.mozilla_org.ga_desktop_conversions` -AS -SELECT - * -FROM - `moz-fx-data-shared-prod.mozilla_org_derived.ga_desktop_conversions_v1` diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_desktop_conversions_v1/metadata.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_desktop_conversions_v1/metadata.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_desktop_conversions_v1/metadata.yaml 2024-06-06 22:04:46.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_desktop_conversions_v1/metadata.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,30 +0,0 @@ -friendly_name: Google Analytics Desktop Conversions Feed -description: |- - Query that pulls conversion event data so Census feed can pull and send to Google for marketing campaigns -owners: -- kwindau@mozilla.com -labels: - incremental: true - owner1: kwindau - dag: bqetl_census_feed -scheduling: - dag_name: bqetl_census_feed - date_partition_parameter: activity_date - date_partition_offset: -2 - parameters: - - submission_date:DATE:{{ds}} -bigquery: - time_partitioning: - type: day - field: activity_date - require_partition_filter: false - expiration_days: null - range_partitioning: null - clustering: - fields: - - conversion_name -workgroup_access: -- role: roles/bigquery.dataViewer - members: - - workgroup:mozilla-confidential -references: {} diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_desktop_conversions_v1/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_desktop_conversions_v1/query.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_desktop_conversions_v1/query.sql 2024-06-06 22:03:04.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_desktop_conversions_v1/query.sql 1970-01-01 00:00:00.000000000 +0000 @@ -1,31 +0,0 @@ -SELECT - activity_date, - FORMAT_DATETIME("%F %T", DATETIME(activity_date, TIME(23, 59, 59))) AS activity_datetime, - @submission_date AS run_date, - gclid, - -- Names as represented in Google Ads - -- https://docs.google.com/spreadsheets/d/1YzhhvbpOlqPLORRJUZ55BIb0H20hwFqQFApR-r0UMfI - CASE - conversion_name - WHEN "did_firefox_first_run" - THEN "firefox_first_run" - WHEN "did_search" - THEN "firefox_first_search" - WHEN "did_click_ad" - THEN "firefox_first_ad_click" - WHEN "did_returned_second_day" - THEN "firefox_second_run" - ELSE NULL - END AS conversion_name, -FROM - `moz-fx-data-shared-prod.mozilla_org_derived.gclid_conversions_v2` UNPIVOT( - did_conversion FOR conversion_name IN ( - did_firefox_first_run, - did_search, - did_click_ad, - did_returned_second_day - ) - ) -WHERE - did_conversion - AND activity_date = @activity_date diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_desktop_conversions_v1/schema.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_desktop_conversions_v1/schema.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_desktop_conversions_v1/schema.yaml 2024-06-06 22:03:04.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_desktop_conversions_v1/schema.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,21 +0,0 @@ -fields: -- name: activity_date - mode: NULLABLE - type: DATE - description: Activity Date -- name: activity_datetime - mode: NULLABLE - type: STRING - description: Activity Datetime -- name: run_date - mode: NULLABLE - type: DATE - description: Run Date - The submission date -- name: gclid - mode: NULLABLE - type: STRING - description: Google Click ID -- name: conversion_name - mode: NULLABLE - type: STRING - description: Conversion Name ```

Link to full diff

wwyc commented 4 weeks ago

as discussed offline, this backfill may need to be done at a lower parallelism (current default is 16) cc @ANich

kwindau commented 4 weeks ago

Going to run locally for now, this one is a pretty large query, think it would be better if I ran locally with lower parallelism