HTTPArchive / cwv-tech-report

Core Web Vitals Technology Report
https://cwvtech.report

Add a way to distinguish between CWV 2023 and 2024 #31

Closed: rviscomi closed this issue 5 months ago

rviscomi commented 7 months ago

Prep the dataset for the CWV'24 swap, making INP the new responsiveness metric.

rviscomi commented 7 months ago

First I copied httparchive.core_web_vitals.technologies to technologies_copy to make sure I didn't screw anything up.
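
The copy step itself isn't shown; in BigQuery it would look something like this (a hypothetical reconstruction, not the statement that was actually run):

# Hypothetical reconstruction of the backup step described above.
CREATE TABLE httparchive.core_web_vitals.technologies_copy
COPY httparchive.core_web_vitals.technologies;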

Then I ran this query to regenerate the technologies table with the new year-specific fields added:

CREATE OR REPLACE TABLE
  httparchive.core_web_vitals.technologies
PARTITION BY
  date
CLUSTER BY
  geo,
  app,
  rank,
  client

AS

SELECT
  date,
  geo,
  rank,
  category,
  app,
  client,
  origins,
  origins_with_good_fid,
  origins_with_good_cls,
  origins_with_good_lcp,
  origins_with_good_fcp,
  origins_with_good_ttfb,
  origins_with_good_inp,
  origins_with_any_fid,
  origins_with_any_cls,
  origins_with_any_lcp,
  origins_with_any_fcp,
  origins_with_any_ttfb,
  origins_with_any_inp,
  origins_with_good_cwv,
  # Placeholders for the CWV'24 columns, to be filled in by a later UPDATE;
  # the existing values carry over as the CWV'23 columns.
  NULL AS origins_with_good_cwv_2024,
  origins_with_good_cwv AS origins_with_good_cwv_2023,
  origins_eligible_for_cwv,
  pct_eligible_origins_with_good_cwv,
  CAST(NULL AS FLOAT64) AS pct_eligible_origins_with_good_cwv_2024,
  pct_eligible_origins_with_good_cwv AS pct_eligible_origins_with_good_cwv_2023,
  median_lighthouse_score_accessibility,
  median_lighthouse_score_best_practices,
  median_lighthouse_score_performance,
  median_lighthouse_score_pwa,
  median_lighthouse_score_seo,
  median_bytes_total,
  median_bytes_js,
  median_bytes_image
FROM
  httparchive.core_web_vitals.technologies_copy
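
A quick sanity check along these lines (hypothetical; not shown in the issue) would confirm that the regeneration kept every row:

# Hypothetical sanity check: the regenerated table should report the same
# number of rows per date as the backup copy.
SELECT 'original' AS source, date, COUNT(0) AS num_rows
FROM httparchive.core_web_vitals.technologies
GROUP BY source, date
UNION ALL
SELECT 'copy' AS source, date, COUNT(0) AS num_rows
FROM httparchive.core_web_vitals.technologies_copy
GROUP BY source, date
ORDER BY date, source
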
rviscomi commented 7 months ago

Next I ran this abridged all.sql to generate only the CrUX data, dating back to 2020-01:

# An origin passes a metric if at least 75% of its experiences are good.
CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
  SAFE_DIVIDE(good, good + needs_improvement + poor) >= 0.75
);

# An origin has data for a metric if it has any experiences at all.
CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
  good + needs_improvement + poor > 0
);

# Carried over from all.sql; unused in this abridged run.
CREATE TEMP FUNCTION GET_LIGHTHOUSE_CATEGORY_SCORES(categories STRING)
RETURNS STRUCT<accessibility NUMERIC, best_practices NUMERIC, performance NUMERIC, pwa NUMERIC, seo NUMERIC>
LANGUAGE js AS '''
try {
  const $ = JSON.parse(categories);
  return {
    accessibility: $.accessibility.score,
    best_practices: $['best-practices'].score,  // key must match the STRUCT field name
    performance: $.performance.score,
    pwa: $.pwa.score,
    seo: $.seo.score
  };
} catch (e) {
  return {};
}
''';

CREATE OR REPLACE TABLE httparchive.core_web_vitals.technologies_2024
PARTITION BY date
CLUSTER BY geo, app, rank, client AS

WITH latest_technologies AS (
  SELECT
    category,
    app
  FROM
    `httparchive.technologies.2023_11_01_mobile`
), geo_summary AS (
  SELECT
    CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), r'(\d{4})(\d{2})', r'\1-\2-01') AS DATE) AS date,
    * EXCEPT (country_code),
    `chrome-ux-report`.experimental.GET_COUNTRY(country_code) AS geo
  FROM
    `chrome-ux-report.materialized.country_summary`
UNION ALL
  SELECT
    * EXCEPT (yyyymmdd, p75_fid_origin, p75_cls_origin, p75_lcp_origin, p75_inp_origin),
    'ALL' AS geo
  FROM
    `chrome-ux-report.materialized.device_summary`
), crux AS (
  SELECT
    date,
    geo,
    CASE _rank
      WHEN 100000000 THEN 'ALL'
      WHEN 10000000 THEN 'Top 10M'
      WHEN 1000000 THEN 'Top 1M'
      WHEN 100000 THEN 'Top 100k'
      WHEN 10000 THEN 'Top 10k'
      WHEN 1000 THEN 'Top 1k'
    END AS rank,
    CONCAT(origin, '/') AS url,
    IF(device = 'desktop', 'desktop', 'mobile') AS client,

    # CWV
    IS_NON_ZERO(fast_fid, avg_fid, slow_fid) AS any_fid,
    IS_GOOD(fast_fid, avg_fid, slow_fid) AS good_fid,
    IS_NON_ZERO(small_cls, medium_cls, large_cls) AS any_cls,
    IS_GOOD(small_cls, medium_cls, large_cls) AS good_cls,
    IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AS any_lcp,
    IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_lcp,

    # CWV'24: INP replaces FID as the responsiveness metric. Origins with no
    # responsiveness data can still pass on good LCP and CLS alone.
    (IS_GOOD(fast_inp, avg_inp, slow_inp) OR fast_inp IS NULL) AND
    IS_GOOD(small_cls, medium_cls, large_cls) AND
    IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2024,

    # CWV'23: the same assessment using FID.
    (IS_GOOD(fast_fid, avg_fid, slow_fid) OR fast_fid IS NULL) AND
    IS_GOOD(small_cls, medium_cls, large_cls) AND
    IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2023,

    # WV
    IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp) AS any_fcp,
    IS_GOOD(fast_fcp, avg_fcp, slow_fcp) AS good_fcp,
    IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb) AS any_ttfb,
    IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb) AS good_ttfb,
    IS_NON_ZERO(fast_inp, avg_inp, slow_inp) AS any_inp,
    IS_GOOD(fast_inp, avg_inp, slow_inp) AS good_inp
  FROM
    geo_summary,
    # Fan each origin out to every rank bucket at or above its own, so e.g.
    # a top-1k origin is also counted in Top 10k through ALL.
    UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS _rank
  WHERE
    date >= '2020-01-01' AND
    device IN ('desktop', 'phone') AND
    # Months that predate CrUX rank data have NULL rank; count those in ALL only.
    (rank <= _rank OR (rank IS NULL AND _rank = 100000000))
), technologies AS (
  SELECT DISTINCT
    CAST(REGEXP_REPLACE(_TABLE_SUFFIX, r'(\d)_(\d{2})_(\d{2}).*', r'202\1-\2-\3') AS DATE) AS date,
    app,
    IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
    url
  FROM
    `httparchive.technologies.202*`
  WHERE
    app IS NOT NULL AND
    app != ''
UNION ALL
  SELECT
    CAST(REGEXP_REPLACE(_TABLE_SUFFIX, r'(\d)_(\d{2})_(\d{2}).*', r'202\1-\2-\3') AS DATE) AS date,
    'ALL' AS app,
    IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
    url
  FROM
    `httparchive.summary_pages.202*`
), categories AS (
  SELECT
    app,
    ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category
  FROM
    latest_technologies
  GROUP BY
    app
UNION ALL
  SELECT
    'ALL' AS app,
    ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category
  FROM
    latest_technologies
), summary_stats AS (
  SELECT
    CAST(REGEXP_REPLACE(_TABLE_SUFFIX, r'(\d)_(\d{2})_(\d{2}).*', r'202\1-\2-\3') AS DATE) AS date,
    IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
    url,
    bytesTotal,
    bytesJS,
    bytesImg
  FROM
    `httparchive.summary_pages.202*`
)

SELECT
  date,
  geo,
  rank,
  app,
  client,
  COUNT(0) AS origins,

  # CrUX data; the unsuffixed column keeps the CWV'23 definition until the swap.
  COUNTIF(good_cwv_2023) AS origins_with_good_cwv,
  COUNTIF(good_cwv_2024) AS origins_with_good_cwv_2024,
  COUNTIF(good_cwv_2023) AS origins_with_good_cwv_2023,

  SAFE_DIVIDE(COUNTIF(good_cwv_2023), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv,
  SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2024,
  SAFE_DIVIDE(COUNTIF(good_cwv_2023), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2023,

FROM
  technologies
JOIN
  categories
USING
  (app)
JOIN
  summary_stats
USING
  (date, client, url)
JOIN
  crux
USING
  (date, client, url)
GROUP BY
  date,
  rank,
  geo,
  app,
  client
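
For illustration, a spot check like the following (the app choice is arbitrary) would show the two definitions side by side in the new table:

# Illustrative spot check of the new table; any technology would do.
SELECT
  date,
  origins_with_good_cwv_2023,
  origins_with_good_cwv_2024
FROM
  httparchive.core_web_vitals.technologies_2024
WHERE
  geo = 'ALL' AND
  rank = 'ALL' AND
  app = 'WordPress' AND
  client = 'mobile'
ORDER BY
  date
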
rviscomi commented 7 months ago

Then I merged the 2024-specific fields generated above into the NULL columns of the new table schema:

UPDATE
  `httparchive.core_web_vitals.technologies` AS original
SET
  original.origins_with_good_cwv_2024 = updated.origins_with_good_cwv_2024,
  original.pct_eligible_origins_with_good_cwv_2024 = updated.pct_eligible_origins_with_good_cwv_2024
FROM
  `httparchive.core_web_vitals.technologies_2024` AS updated
WHERE
  original.date = updated.date AND
  original.client = updated.client AND
  original.app = updated.app AND
  original.geo = updated.geo AND
  original.rank = updated.rank
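
A follow-up check (again hypothetical) would count the rows per date where the 2024 columns are still NULL; any remaining NULLs should correspond to dates the abridged query didn't cover:

# Hypothetical post-merge check: remaining NULLs should correspond to
# date/geo/rank/app/client combinations absent from technologies_2024.
SELECT
  date,
  COUNTIF(origins_with_good_cwv_2024 IS NULL) AS null_2024_rows,
  COUNT(0) AS total_rows
FROM
  httparchive.core_web_vitals.technologies
GROUP BY
  date
ORDER BY
  date
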
rviscomi commented 7 months ago

The all.sql version used above only covers home pages, since the legacy tables contain nothing else. To get the most accurate data using home+secondary pages, we need to run it again against the all.pages table:

CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
  SAFE_DIVIDE(good, good + needs_improvement + poor) >= 0.75
);

CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
  good + needs_improvement + poor > 0
);

# Carried over from all.sql; unused in this abridged run.
CREATE TEMP FUNCTION GET_LIGHTHOUSE_CATEGORY_SCORES(categories STRING)
RETURNS STRUCT<accessibility NUMERIC, best_practices NUMERIC, performance NUMERIC, pwa NUMERIC, seo NUMERIC>
LANGUAGE js AS '''
try {
  const $ = JSON.parse(categories);
  return {
    accessibility: $.accessibility.score,
    best_practices: $['best-practices'].score,
    performance: $.performance.score,
    pwa: $.pwa.score,
    seo: $.seo.score
  };
} catch (e) {
  return {};
}
''';

CREATE OR REPLACE TABLE httparchive.core_web_vitals.technologies_2024b
PARTITION BY date
CLUSTER BY geo, app, rank, client AS

WITH geo_summary AS (
  SELECT
    CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), r'(\d{4})(\d{2})', r'\1-\2-01') AS DATE) AS date,
    * EXCEPT (country_code),
    `chrome-ux-report`.experimental.GET_COUNTRY(country_code) AS geo
  FROM
    `chrome-ux-report.materialized.country_summary`
  WHERE
    yyyymm >= 202201 AND
    device IN ('desktop', 'phone')
UNION ALL
  SELECT
    date,
    * EXCEPT (date, yyyymmdd, p75_fid_origin, p75_cls_origin, p75_lcp_origin, p75_inp_origin),
    'ALL' AS geo
  FROM
    `chrome-ux-report.materialized.device_summary`
  WHERE
    date >= '2022-01-01' AND
    device IN ('desktop', 'phone')
),

crux AS (
  SELECT
    date,
    geo,
    CASE _rank
      WHEN 100000000 THEN 'ALL'
      WHEN 10000000 THEN 'Top 10M'
      WHEN 1000000 THEN 'Top 1M'
      WHEN 100000 THEN 'Top 100k'
      WHEN 10000 THEN 'Top 10k'
      WHEN 1000 THEN 'Top 1k'
    END AS rank,
    CONCAT(origin, '/') AS root_page_url,
    IF(device = 'desktop', 'desktop', 'mobile') AS client,

    # CWV
    IS_NON_ZERO(fast_fid, avg_fid, slow_fid) AS any_fid,
    IS_GOOD(fast_fid, avg_fid, slow_fid) AS good_fid,
    IS_NON_ZERO(small_cls, medium_cls, large_cls) AS any_cls,
    IS_GOOD(small_cls, medium_cls, large_cls) AS good_cls,
    IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AS any_lcp,
    IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_lcp,
    # CWV'24 definition (INP-based), same as good_cwv_2024 above.
    (IS_GOOD(fast_inp, avg_inp, slow_inp) OR fast_inp IS NULL) AND
    IS_GOOD(small_cls, medium_cls, large_cls) AND
    IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv,

    # WV
    IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp) AS any_fcp,
    IS_GOOD(fast_fcp, avg_fcp, slow_fcp) AS good_fcp,
    IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb) AS any_ttfb,
    IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb) AS good_ttfb,
    IS_NON_ZERO(fast_inp, avg_inp, slow_inp) AS any_inp,
    IS_GOOD(fast_inp, avg_inp, slow_inp) AS good_inp
  FROM
    geo_summary,
    UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS _rank
  WHERE
    rank <= _rank
),

technologies AS (
  SELECT
    date,
    technology.technology AS app,
    client,
    page AS url
  FROM
    `httparchive.all.pages`,
    UNNEST(technologies) AS technology
  WHERE
    date >= '2020-01-01' AND
    technology.technology IS NOT NULL AND
    technology.technology != ''
UNION ALL
  SELECT
    date,
    'ALL' AS app,
    client,
    page AS url
  FROM
    `httparchive.all.pages`
  WHERE
    date >= '2020-01-01'
),

categories AS (
  SELECT
    technology.technology AS app,
    ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category
  FROM
    `httparchive.all.pages`,
    UNNEST(technologies) AS technology,
    UNNEST(technology.categories) AS category
  WHERE
    date = '2023-11-01'
  GROUP BY
    app
UNION ALL
  SELECT
    'ALL' AS app,
    ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category
  FROM
    `httparchive.all.pages`,
    UNNEST(technologies) AS technology,
    UNNEST(technology.categories) AS category
  WHERE
    date = '2023-11-01' AND
    client = 'mobile'
),

summary_stats AS (
  SELECT
    date,
    client,
    page AS url,
    root_page AS root_page_url,
  FROM
    `httparchive.all.pages`
  WHERE
    date >= '2020-01-01'
),

lab_data AS (
  SELECT
    date,
    client,
    root_page_url,
    app,
    ANY_VALUE(category) AS category,
  FROM
    summary_stats
  JOIN
    technologies
  USING
    (date, client, url)
  JOIN
    categories
  USING
    (app)
  GROUP BY
    date,
    client,
    root_page_url,
    app
)

SELECT
  date,
  geo,
  rank,
  ANY_VALUE(category) AS category,
  app,
  client,
  COUNT(0) AS origins,

  # CrUX data
  COUNTIF(good_cwv) AS origins_with_good_cwv_2024,

  SAFE_DIVIDE(COUNTIF(good_cwv), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2024,

FROM
  lab_data
JOIN
  crux
USING
  (date, client, root_page_url)
GROUP BY
  date,
  app,
  geo,
  rank,
  client

rviscomi commented 7 months ago

Overwriting the technologies table with the technologies_2024b results should leave us with home+secondary page data wherever we have it, falling back to home page-only data where we don't.

UPDATE
  `httparchive.core_web_vitals.technologies` AS original
SET
  original.origins_with_good_cwv_2024 = updated.origins_with_good_cwv_2024,
  original.pct_eligible_origins_with_good_cwv_2024 = updated.pct_eligible_origins_with_good_cwv_2024
FROM
  `httparchive.core_web_vitals.technologies_2024b` AS updated
WHERE
  original.date = updated.date AND
  original.client = updated.client AND
  original.app = updated.app AND
  original.geo = updated.geo AND
  original.rank = updated.rank
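
To see which dates the home+secondary overwrite actually covered, and which still fall back to home page-only values, counting rows per date in the intermediate table is enough; a sketch, assuming technologies_2024b is still around:

# Hypothetical coverage check: dates present here were overwritten with
# home+secondary data; earlier dates keep the home page-only values.
SELECT
  date,
  COUNT(0) AS rows_overwritten
FROM
  httparchive.core_web_vitals.technologies_2024b
GROUP BY
  date
ORDER BY
  date
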
rviscomi commented 5 months ago

The next time we need to run this (March 2024), the SQL will already be pointing to CWV'24 with INP. I've also added a way to automatically append the results to the technologies table using INSERT INTO.
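
The append itself isn't shown here; its shape would be an INSERT INTO with the table's column list. A minimal sketch, assuming the new month's results land in a table shaped like technologies_2024b (the date and source table are assumptions; the actual query lives in this repo):

# Sketch of the append pattern: the staging source and date are assumptions,
# and unlisted target columns default to NULL.
INSERT INTO httparchive.core_web_vitals.technologies
  (date, geo, rank, category, app, client, origins,
   origins_with_good_cwv_2024, pct_eligible_origins_with_good_cwv_2024)
SELECT
  date, geo, rank, category, app, client, origins,
  origins_with_good_cwv_2024, pct_eligible_origins_with_good_cwv_2024
FROM
  httparchive.core_web_vitals.technologies_2024b
WHERE
  date = '2024-03-01'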