mozilla / bigquery-etl

Bigquery ETL
https://mozilla.github.io/bigquery-etl
Mozilla Public License 2.0
253 stars 100 forks source link

add blogs_goals_v2 checks, metadata, query, schema files to shared_prod #6016

Closed Marlene-M-Hirose closed 2 months ago

Marlene-M-Hirose commented 2 months ago

Checklist for reviewer:

For modifications to schemas in restricted namespaces (see CODEOWNERS):

┆Issue is synchronized with this Jira Task

dataops-ci-bot commented 2 months ago

Integration report for "add blogs_goals_v2 checks, metadata, query, schema files to shared_prod"

sql.diff

Click to expand! ```diff diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/dags/bqetl_default.py /tmp/workspace/generated-sql/dags/bqetl_default.py --- /tmp/workspace/main-generated-sql/dags/bqetl_default.py 2024-08-06 19:14:49.000000000 +0000 +++ /tmp/workspace/generated-sql/dags/bqetl_default.py 2024-08-06 19:16:13.000000000 +0000 @@ -62,3 +62,32 @@ date_partition_parameter="submission_date", depends_on_past=False, ) + + checks__warn_mozilla_org_derived__blogs_goals__v2 = bigquery_dq_check( + task_id="checks__warn_mozilla_org_derived__blogs_goals__v2", + source_table="blogs_goals_v2", + dataset_id="mozilla_org_derived", + project_id="moz-fx-data-shared-prod", + is_dq_check_fail=False, + owner="example@mozilla.com", + email=["example@mozilla.com", "telemetry-alerts@mozilla.com"], + depends_on_past=False, + parameters=["submission_date:DATE:{{ds}}"], + retries=0, + ) + + mozilla_org_derived__blogs_goals__v2 = bigquery_etl_query( + #### WARNING: This task has been scheduled in the default DAG. It can be moved to a more suitable DAG using `bqetl query schedule`. + task_id="mozilla_org_derived__blogs_goals__v2", + destination_table="blogs_goals_v2", + dataset_id="mozilla_org_derived", + project_id="moz-fx-data-shared-prod", + owner="example@mozilla.com", + email=["example@mozilla.com", "telemetry-alerts@mozilla.com"], + date_partition_parameter="submission_date", + depends_on_past=False, + ) + + checks__warn_mozilla_org_derived__blogs_goals__v2.set_upstream( + mozilla_org_derived__blogs_goals__v2 + ) Only in /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived: blogs_goals_v2 diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/checks.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/checks.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/checks.sql 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/checks.sql 2024-08-06 19:09:03.000000000 +0000 @@ -0,0 +1,5 @@ +#warn +{{ is_unique(["date", "visit_identifier"], "date = @submission_date") }} + +#warn +{{ matches_pattern(column="visit_identifier", pattern="^[0-9]+\\.{1}[0-9]+\\-{1}[0-9]+$", where="date = @submission_date", threshold_fail_percentage=0, message="Warn - some visit_identifier not matching expected pattern") }} diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/metadata.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/metadata.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/metadata.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/metadata.yaml 2024-08-06 19:11:07.000000000 +0000 @@ -0,0 +1,25 @@ +friendly_name: Blogs Goals +description: |- + Please provide a description for the query +owners: +- example@mozilla.com +labels: + incremental: true + owner1: example + dag: bqetl_default +scheduling: + dag_name: bqetl_default +bigquery: + time_partitioning: + type: day + field: '' + require_partition_filter: true + expiration_days: null + range_partitioning: null + clustering: + fields: [] +workgroup_access: +- role: roles/bigquery.dataViewer + members: + - workgroup:mozilla-confidential +references: {} diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/query.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/query.sql 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/query.sql 2024-08-06 19:13:05.000000000 +0000 @@ -0,0 +1,9 @@ +-- Query for mozilla_org_derived.blogs_goals_v2 + -- For more information on writing queries see: + -- https://docs.telemetry.mozilla.org/cookbooks/bigquery/querying.html +SELECT + * +FROM + `moz-fx-data-shared-prod.mozilla_org_derived.table` +WHERE + submission_date = @submission_date diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/schema.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/schema.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/schema.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/schema.yaml 2024-08-06 19:09:03.000000000 +0000 @@ -0,0 +1,21 @@ +fields: +- mode: NULLABLE + name: date + type: DATE + description: Date of the visit +- mode: NULLABLE + name: visit_identifier + type: STRING + description: Visit Identifier - Uniquely identifies a visit; concatenation of user_pseudo_id and ga_session_id +- mode: NULLABLE + name: downloads + type: INT64 + description: Downloads - The number of download click events for this visit and date +- mode: NULLABLE + name: social_share + type: INT64 + description: Social Share - The number of social share events for this visit and date +- mode: NULLABLE + name: newsletter_subscription + type: INT64 + description: Newsletter Subscription - The number of newsletter subscription events for this visit and date ```

Link to full diff

dataops-ci-bot commented 2 months ago

Integration report for "actually overwrite metadata and query files. also change owner to mhirose"

sql.diff

Click to expand! ```diff diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/dags/bqetl_google_analytics_derived_ga4.py /tmp/workspace/generated-sql/dags/bqetl_google_analytics_derived_ga4.py --- /tmp/workspace/main-generated-sql/dags/bqetl_google_analytics_derived_ga4.py 2024-08-06 20:11:14.000000000 +0000 +++ /tmp/workspace/generated-sql/dags/bqetl_google_analytics_derived_ga4.py 2024-08-06 20:12:41.000000000 +0000 @@ -206,6 +206,19 @@ retries=0, ) + checks__warn_mozilla_org_derived__blogs_goals__v2 = bigquery_dq_check( + task_id="checks__warn_mozilla_org_derived__blogs_goals__v2", + source_table="blogs_goals_v2", + dataset_id="mozilla_org_derived", + project_id="moz-fx-data-shared-prod", + is_dq_check_fail=False, + owner="kwindau@mozilla.com", + email=["kwindau@mozilla.com", "telemetry-alerts@mozilla.com"], + depends_on_past=False, + parameters=["submission_date:DATE:{{ds}}"], + retries=0, + ) + checks__warn_mozilla_org_derived__ga_sessions__v2 = bigquery_dq_check( task_id="checks__warn_mozilla_org_derived__ga_sessions__v2", source_table="ga_sessions_v2", @@ -341,6 +354,17 @@ depends_on_past=False, ) + mozilla_org_derived__blogs_goals__v2 = bigquery_etl_query( + task_id="mozilla_org_derived__blogs_goals__v2", + destination_table="blogs_goals_v2", + dataset_id="mozilla_org_derived", + project_id="moz-fx-data-shared-prod", + owner="kwindau@mozilla.com", + email=["kwindau@mozilla.com", "telemetry-alerts@mozilla.com"], + date_partition_parameter="submission_date", + depends_on_past=False, + ) + mozilla_org_derived__ga_clients__v2 = bigquery_etl_query( task_id="mozilla_org_derived__ga_clients__v2", destination_table="ga_clients_v2", @@ -422,6 +446,10 @@ ga_derived__www_site_hits__v2 ) + checks__warn_mozilla_org_derived__blogs_goals__v2.set_upstream( + mozilla_org_derived__blogs_goals__v2 + ) + checks__warn_mozilla_org_derived__ga_sessions__v2.set_upstream( mozilla_org_derived__ga_sessions__v2 ) @@ -458,6 +486,8 @@ ga_derived__www_site_page_metrics__v2.set_upstream(ga_derived__www_site_hits__v2) + mozilla_org_derived__blogs_goals__v2.set_upstream(wait_for_blogs_events_table) + mozilla_org_derived__ga_clients__v2.set_upstream( mozilla_org_derived__ga_sessions__v2 ) Only in /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived: blogs_goals_v2 diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/checks.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/checks.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/checks.sql 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/checks.sql 2024-08-06 20:04:52.000000000 +0000 @@ -0,0 +1,5 @@ +#warn +{{ is_unique(["date", "visit_identifier"], "date = @submission_date") }} + +#warn +{{ matches_pattern(column="visit_identifier", pattern="^[0-9]+\\.{1}[0-9]+\\-{1}[0-9]+$", where="date = @submission_date", threshold_fail_percentage=0, message="Warn - some visit_identifier not matching expected pattern") }} diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/metadata.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/metadata.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/metadata.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/metadata.yaml 2024-08-06 20:07:04.000000000 +0000 @@ -0,0 +1,29 @@ +friendly_name: Blogs Goals V2 +description: |- + Intermediate table containing key metrics for blog.mozilla.org sourced from Google Analytics 4 +owners: +- kwindau@mozilla.com +labels: + incremental: true + owner1: kwindau + dag: bqetl_google_analytics_derived_ga4 +scheduling: + dag_name: bqetl_google_analytics_derived_ga4 + depends_on: + - task_id: wait_for_blogs_events_table + dag_name: bqetl_google_analytics_derived_ga4 +bigquery: + time_partitioning: + type: day + field: date + require_partition_filter: true + expiration_days: null + range_partitioning: null + clustering: + fields: + - visit_identifier +workgroup_access: +- role: roles/bigquery.dataViewer + members: + - workgroup:mozilla-confidential +references: {} diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/query.sql --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/query.sql 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/query.sql 2024-08-06 20:04:52.000000000 +0000 @@ -0,0 +1,17 @@ +SELECT + PARSE_DATE('%Y%m%d', event_date) AS `date`, + user_pseudo_id || '-' || CAST(e.value.int_value AS STRING) AS visit_identifier, + COUNTIF(event_name = 'download_click') AS downloads, + COUNTIF(event_name = 'social_share') AS social_share, + COUNTIF(event_name = 'newsletter_subscribe') AS newsletter_subscription +FROM + `moz-fx-data-marketing-prod.analytics_314399816.events_*` +JOIN + UNNEST(event_params) e +WHERE + _TABLE_SUFFIX = FORMAT_DATE('%Y%m%d', @submission_date) + AND e.key = 'ga_session_id' + AND e.value.int_value IS NOT NULL +GROUP BY + `date`, + visit_identifier diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/schema.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/schema.yaml --- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/schema.yaml 1970-01-01 00:00:00.000000000 +0000 +++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/mozilla_org_derived/blogs_goals_v2/schema.yaml 2024-08-06 20:04:52.000000000 +0000 @@ -0,0 +1,21 @@ +fields: +- mode: NULLABLE + name: date + type: DATE + description: Date of the visit +- mode: NULLABLE + name: visit_identifier + type: STRING + description: Visit Identifier - Uniquely identifies a visit; concatenation of user_pseudo_id and ga_session_id +- mode: NULLABLE + name: downloads + type: INT64 + description: Downloads - The number of download click events for this visit and date +- mode: NULLABLE + name: social_share + type: INT64 + description: Social Share - The number of social share events for this visit and date +- mode: NULLABLE + name: newsletter_subscription + type: INT64 + description: Newsletter Subscription - The number of newsletter subscription events for this visit and date ```

Link to full diff