Closed — rviscomi closed this issue 5 months ago.
First I copied httparchive.core_web_vitals.technologies
to technologies_copy
to make sure I didn't screw anything up
Then I ran this query to regenerate a new version of the original technologies
table to also include the new year-specific fields:
# Rebuild the technologies table from the untouched backup copy, adding
# placeholder columns for the new year-specific CWV fields. The existing
# FID-based CWV columns double as the 2023 values; the INP-based 2024
# values start as NULL and are backfilled by the UPDATE statements below.
CREATE OR REPLACE TABLE
httparchive.core_web_vitals.technologies
PARTITION BY
date
CLUSTER BY
geo,
app,
rank,
client
AS
SELECT
date,
geo,
rank,
category,
app,
client,
origins,
origins_with_good_fid,
origins_with_good_cls,
origins_with_good_lcp,
origins_with_good_fcp,
origins_with_good_ttfb,
origins_with_good_inp,
origins_with_any_fid,
origins_with_any_cls,
origins_with_any_lcp,
origins_with_any_fcp,
origins_with_any_ttfb,
origins_with_any_inp,
origins_with_good_cwv,
# Explicitly typed NULL placeholder: a bare NULL would also default to
# INT64 in BigQuery, but the CAST pins the schema deliberately and is
# consistent with the FLOAT64 placeholder below.
CAST(NULL AS INT64) AS origins_with_good_cwv_2024,
origins_with_good_cwv AS origins_with_good_cwv_2023,
origins_eligible_for_cwv,
pct_eligible_origins_with_good_cwv,
CAST(NULL AS FLOAT64) AS pct_eligible_origins_with_good_cwv_2024,
pct_eligible_origins_with_good_cwv AS pct_eligible_origins_with_good_cwv_2023,
median_lighthouse_score_accessibility,
median_lighthouse_score_best_practices,
median_lighthouse_score_performance,
median_lighthouse_score_pwa,
median_lighthouse_score_seo,
median_bytes_total,
median_bytes_js,
median_bytes_image
FROM
httparchive.core_web_vitals.technologies_copy
Next I ran this abridged all.sql
file to generate only the CrUX data, dating back to 2020-01:
# TRUE when at least 75% of loads fall in the "good" bucket. SAFE_DIVIDE
# returns NULL (not an error) when all three buckets are zero, so origins
# with no data for the metric yield NULL rather than FALSE.
CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
SAFE_DIVIDE(good, good + needs_improvement + poor) >= 0.75
);
# TRUE when the origin has any data at all for the metric.
CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
good + needs_improvement + poor > 0
);
# Parses the Lighthouse `categories` JSON blob into a struct of category
# scores. Returns an empty object (all struct fields NULL) when the JSON
# is missing or malformed.
# Bug fix: the returned JS object key must match the declared struct field
# name `best_practices` for BigQuery to map the value — the original used
# the quoted key 'best-practices' (the Lighthouse JSON spelling), which
# has no corresponding struct field and would leave it NULL. This also
# matches the second copy of this function later in the issue.
CREATE TEMP FUNCTION GET_LIGHTHOUSE_CATEGORY_SCORES(categories STRING)
RETURNS STRUCT<accessibility NUMERIC, best_practices NUMERIC, performance NUMERIC, pwa NUMERIC, seo NUMERIC>
LANGUAGE js AS '''
try {
const $ = JSON.parse(categories);
return {
accessibility: $.accessibility.score,
best_practices: $['best-practices'].score,
performance: $.performance.score,
pwa: $.pwa.score,
seo: $.seo.score
};
} catch (e) {
return {};
}
''';
# Abridged all.sql: rebuilds the per-technology CrUX summary back to
# 2020-01 into a staging table, using the legacy (home-page-only) tables.
# Computes origin counts under both the 2023 (FID) and 2024 (INP) CWV
# definitions.
CREATE OR REPLACE TABLE httparchive.core_web_vitals.technologies_2024
PARTITION BY date
CLUSTER BY geo, app, rank, client AS
# Latest known technology -> category mapping (most recent mobile crawl).
WITH latest_technologies AS (
SELECT
category,
app
FROM
`httparchive.technologies.2023_11_01_mobile`
# Monthly per-country CrUX summaries, plus a worldwide ('ALL') slice from
# the device-level table.
), geo_summary AS (
SELECT
# yyyymm (e.g. 202401) -> first-of-month DATE.
CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), r'(\d{4})(\d{2})', r'\1-\2-01') AS DATE) AS date,
* EXCEPT (country_code),
`chrome-ux-report`.experimental.GET_COUNTRY(country_code) AS geo
FROM
`chrome-ux-report.materialized.country_summary`
UNION ALL
# NOTE(review): UNION ALL matches columns by position. The first branch
# prepends a computed date; this branch relies on device_summary's own
# column order lining up after the EXCEPT — confirm against the table
# schemas.
SELECT
* EXCEPT (yyyymmdd, p75_fid_origin, p75_cls_origin, p75_lcp_origin, p75_inp_origin),
'ALL' AS geo
FROM
`chrome-ux-report.materialized.device_summary`
# Per-origin CWV pass/fail flags, fanned out to every rank bucket the
# origin qualifies for.
), crux AS (
SELECT
date,
geo,
CASE _rank
WHEN 100000000 THEN 'ALL'
WHEN 10000000 THEN 'Top 10M'
WHEN 1000000 THEN 'Top 1M'
WHEN 100000 THEN 'Top 100k'
WHEN 10000 THEN 'Top 10k'
WHEN 1000 THEN 'Top 1k'
END AS rank,
# CrUX origins have no trailing slash; append one to match HTTP Archive
# home-page URLs.
CONCAT(origin, '/') AS url,
IF(device = 'desktop', 'desktop', 'mobile') AS client,
# CWV
IS_NON_ZERO(fast_fid, avg_fid, slow_fid) AS any_fid,
IS_GOOD(fast_fid, avg_fid, slow_fid) AS good_fid,
IS_NON_ZERO(small_cls, medium_cls, large_cls) AS any_cls,
IS_GOOD(small_cls, medium_cls, large_cls) AS good_cls,
IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AS any_lcp,
IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_lcp,
# CWV 2024 definition: INP is the responsiveness metric; an origin with
# no INP data can still pass on CLS + LCP alone.
(IS_GOOD(fast_inp, avg_inp, slow_inp) OR fast_inp IS NULL) AND
IS_GOOD(small_cls, medium_cls, large_cls) AND
IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2024,
# CWV 2023 definition: FID is the responsiveness metric.
(IS_GOOD(fast_fid, avg_fid, slow_fid) OR fast_fid IS NULL) AND
IS_GOOD(small_cls, medium_cls, large_cls) AND
IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2023,
# WV
IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp) AS any_fcp,
IS_GOOD(fast_fcp, avg_fcp, slow_fcp) AS good_fcp,
IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb) AS any_ttfb,
IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb) AS good_ttfb,
IS_NON_ZERO(fast_inp, avg_inp, slow_inp) AS any_inp,
IS_GOOD(fast_inp, avg_inp, slow_inp) AS good_inp
FROM
# Cross join duplicates each origin row once per rank bucket; the WHERE
# below keeps an origin in every bucket at or above its own rank.
geo_summary,
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS _rank
WHERE
date >= '2020-01-01' AND
device IN ('desktop', 'phone') AND
# Origins with no rank (older CrUX months) are counted under 'ALL' only.
(rank <= _rank OR (rank IS NULL AND _rank = 100000000))
# One row per (date, app, client, url), plus an 'ALL' pseudo-technology
# row for every crawled page.
), technologies AS (
SELECT DISTINCT
# Table suffix like 4_01_01_mobile -> DATE '2024-01-01'.
CAST(REGEXP_REPLACE(_TABLE_SUFFIX, r'(\d)_(\d{2})_(\d{2}).*', r'202\1-\2-\3') AS DATE) AS date,
app,
IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
url
FROM
`httparchive.technologies.202*`
WHERE
app IS NOT NULL AND
app != ''
UNION ALL
SELECT
CAST(REGEXP_REPLACE(_TABLE_SUFFIX, r'(\d)_(\d{2})_(\d{2}).*', r'202\1-\2-\3') AS DATE) AS date,
'ALL' AS app,
IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
url
FROM
`httparchive.summary_pages.202*`
# Comma-separated, sorted category list per technology, plus one row
# covering all categories for the 'ALL' pseudo-technology.
), categories AS (
SELECT
app,
ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category
FROM
latest_technologies
GROUP BY
app
UNION ALL
SELECT
'ALL' AS app,
ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category
FROM
latest_technologies
# Per-page summary stats. In this abridged query the byte columns are
# never read; the CTE effectively restricts the join to crawled pages.
), summary_stats AS (
SELECT
CAST(REGEXP_REPLACE(_TABLE_SUFFIX, r'(\d)_(\d{2})_(\d{2}).*', r'202\1-\2-\3') AS DATE) AS date,
IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
url,
bytesTotal,
bytesJS,
bytesImg
FROM
`httparchive.summary_pages.202*`
)
# One row per date/geo/rank/app/client. Note only the CrUX-derived columns
# are produced here; `category` and the Lighthouse/byte medians are
# intentionally omitted because only the 2024 columns get merged back.
SELECT
date,
geo,
rank,
app,
client,
COUNT(0) AS origins,
# CrUX data
COUNTIF(good_cwv_2023) AS origins_with_good_cwv,
COUNTIF(good_cwv_2024) AS origins_with_good_cwv_2024,
COUNTIF(good_cwv_2023) AS origins_with_good_cwv_2023,
# "Eligible" origins are those with both LCP and CLS data in CrUX.
SAFE_DIVIDE(COUNTIF(good_cwv_2023), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv,
SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2024,
SAFE_DIVIDE(COUNTIF(good_cwv_2023), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2023,
FROM
technologies
JOIN
categories
USING
(app)
JOIN
summary_stats
USING
(date, client, url)
JOIN
crux
USING
(date, client, url)
GROUP BY
date,
rank,
geo,
app,
client
Then I merged the 2024-specific fields generated above with the null columns in the new table schema:
# Backfill the NULL 2024-specific columns in the live table from the
# staging table, matching on the full grain of the table
# (date/client/app/geo/rank). Rows with no staging match keep NULL.
UPDATE
`httparchive.core_web_vitals.technologies` AS original
SET
original.origins_with_good_cwv_2024 = updated.origins_with_good_cwv_2024,
original.pct_eligible_origins_with_good_cwv_2024 = updated.pct_eligible_origins_with_good_cwv_2024
FROM
`httparchive.core_web_vitals.technologies_2024` AS updated
WHERE
original.date = updated.date AND
original.client = updated.client AND
original.app = updated.app AND
original.geo = updated.geo AND
original.rank = updated.rank
The all.sql
version used above only works on home pages in the legacy tables. To get the most accurate data using home+secondary pages, we need to run it again using the all.pages
table:
# TRUE when at least 75% of loads fall in the "good" bucket. SAFE_DIVIDE
# returns NULL (not an error) when all three buckets are zero, so origins
# with no data for the metric yield NULL rather than FALSE.
CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
SAFE_DIVIDE(good, good + needs_improvement + poor) >= 0.75
);
# TRUE when the origin has any data at all for the metric.
CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
good + needs_improvement + poor > 0
);
# Parses the Lighthouse `categories` JSON blob into a struct of category
# scores. Returns an empty object (all struct fields NULL) when the JSON
# is missing or malformed. The JS key `best_practices` deliberately
# matches the struct field name, while the Lighthouse JSON itself uses
# the hyphenated 'best-practices' key on input.
CREATE TEMP FUNCTION GET_LIGHTHOUSE_CATEGORY_SCORES(categories STRING)
RETURNS STRUCT<accessibility NUMERIC, best_practices NUMERIC, performance NUMERIC, pwa NUMERIC, seo NUMERIC>
LANGUAGE js AS '''
try {
const $ = JSON.parse(categories);
return {
accessibility: $.accessibility.score,
best_practices: $['best-practices'].score,
performance: $.performance.score,
pwa: $.pwa.score,
seo: $.seo.score
};
} catch (e) {
return {};
}
''';
# Re-run of the CrUX summary against the unified all.pages table, which
# includes home AND secondary pages (available from 2022 onward). Results
# land in a second staging table that is layered over the previous
# home-page-only backfill.
CREATE OR REPLACE TABLE httparchive.core_web_vitals.technologies_2024b
PARTITION BY date
CLUSTER BY geo, app, rank, client AS
# Monthly per-country CrUX summaries plus a worldwide ('ALL') slice,
# limited to 2022+ to match all.pages coverage.
WITH geo_summary AS (
SELECT
# yyyymm (e.g. 202401) -> first-of-month DATE.
CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), r'(\d{4})(\d{2})', r'\1-\2-01') AS DATE) AS date,
* EXCEPT (country_code),
`chrome-ux-report`.experimental.GET_COUNTRY(country_code) AS geo
FROM
`chrome-ux-report.materialized.country_summary`
WHERE
yyyymm >= 202201 AND
device IN ('desktop', 'phone')
UNION ALL
# NOTE(review): UNION ALL matches columns by position; confirm the
# remaining device_summary columns line up with country_summary's.
SELECT
date,
* EXCEPT (date, yyyymmdd, p75_fid_origin, p75_cls_origin, p75_lcp_origin, p75_inp_origin),
'ALL' AS geo
FROM
`chrome-ux-report.materialized.device_summary`
WHERE
date >= '2022-01-01' AND
device IN ('desktop', 'phone')
),
# Per-origin CWV pass/fail flags, fanned out to every rank bucket the
# origin qualifies for. Keyed by root_page_url so secondary pages can be
# joined through their root page.
crux AS (
SELECT
date,
geo,
CASE _rank
WHEN 100000000 THEN 'ALL'
WHEN 10000000 THEN 'Top 10M'
WHEN 1000000 THEN 'Top 1M'
WHEN 100000 THEN 'Top 100k'
WHEN 10000 THEN 'Top 10k'
WHEN 1000 THEN 'Top 1k'
END AS rank,
# CrUX origins have no trailing slash; append one to match the
# root_page URLs in all.pages.
CONCAT(origin, '/') AS root_page_url,
IF(device = 'desktop', 'desktop', 'mobile') AS client,
# CWV
IS_NON_ZERO(fast_fid, avg_fid, slow_fid) AS any_fid,
IS_GOOD(fast_fid, avg_fid, slow_fid) AS good_fid,
IS_NON_ZERO(small_cls, medium_cls, large_cls) AS any_cls,
IS_GOOD(small_cls, medium_cls, large_cls) AS good_cls,
IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AS any_lcp,
IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_lcp,
# CWV 2024 definition: INP is the responsiveness metric; an origin with
# no INP data can still pass on CLS + LCP alone.
(IS_GOOD(fast_inp, avg_inp, slow_inp) OR fast_inp IS NULL) AND
IS_GOOD(small_cls, medium_cls, large_cls) AND
IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv,
# WV
IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp) AS any_fcp,
IS_GOOD(fast_fcp, avg_fcp, slow_fcp) AS good_fcp,
IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb) AS any_ttfb,
IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb) AS good_ttfb,
IS_NON_ZERO(fast_inp, avg_inp, slow_inp) AS any_inp,
IS_GOOD(fast_inp, avg_inp, slow_inp) AS good_inp
FROM
geo_summary,
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS _rank
WHERE
# NOTE(review): unlike the home-page query, there is no fallback for
# NULL ranks here (those rows are dropped by three-valued logic).
# Presumably safe because rank is always populated from 2022 onward —
# confirm against the CrUX tables.
rank <= _rank
),
# One row per (date, app, client, page), plus an 'ALL' pseudo-technology
# row for every page, from the unified all.pages table.
technologies AS (
SELECT
date,
technology.technology AS app,
client,
page AS url
FROM
`httparchive.all.pages`,
UNNEST(technologies) AS technology
WHERE
date >= '2020-01-01' AND
technology.technology IS NOT NULL AND
technology.technology != ''
UNION ALL
SELECT
date,
'ALL' AS app,
client,
page AS url
FROM
`httparchive.all.pages`
),
# Comma-separated, sorted category list per technology from the latest
# crawl, plus one row covering all categories for 'ALL'.
categories AS (
SELECT
technology.technology AS app,
ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category
FROM
`httparchive.all.pages`,
UNNEST(technologies) AS technology,
UNNEST(technology.categories) AS category
WHERE
date = '2023-11-01'
GROUP BY
app
UNION ALL
# NOTE(review): this 'ALL' branch is mobile-only while the per-app
# branch above aggregates both clients — confirm the asymmetry is
# intentional (the legacy query used the mobile table for both).
SELECT
'ALL' AS app,
ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category
FROM
`httparchive.all.pages`,
UNNEST(technologies) AS technology,
UNNEST(technology.categories) AS category
WHERE
date = '2023-11-01' AND
client = 'mobile'
),
# Maps each page URL to its root page so secondary pages can inherit the
# origin-level CrUX data of their root.
summary_stats AS (
SELECT
date,
client,
page AS url,
root_page AS root_page_url,
FROM
`httparchive.all.pages`
WHERE
date >= '2020-01-01'
),
# Deduplicates to one row per origin (root page) per technology, so an
# origin with many secondary pages using the same app counts once.
lab_data AS (
SELECT
date,
client,
root_page_url,
app,
ANY_VALUE(category) AS category,
FROM
summary_stats
JOIN
technologies
USING
(date, client, url)
JOIN
categories
USING
(app)
GROUP BY
date,
client,
root_page_url,
app
)
# One row per date/geo/rank/app/client with origin counts under the
# 2024 (INP-based) CWV definition only.
SELECT
date,
geo,
rank,
ANY_VALUE(category) AS category,
app,
client,
COUNT(0) AS origins,
# CrUX data
COUNTIF(good_cwv) AS origins_with_good_cwv_2024,
# "Eligible" origins are those with both LCP and CLS data in CrUX.
SAFE_DIVIDE(COUNTIF(good_cwv), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2024,
FROM
lab_data
JOIN
crux
USING
(date, client, root_page_url)
GROUP BY
date,
app,
geo,
rank,
client
Overwriting the results of the technologies_2024b
table on top of the last update to the technologies
table should leave us with home+secondary page data for as long as we have it, falling back to home page-only data when we don't.
# Overwrite the 2024-specific columns with home+secondary-page numbers
# wherever the all.pages-based staging table has a matching row; rows
# without a match keep the home-page-only values from the prior UPDATE.
UPDATE
`httparchive.core_web_vitals.technologies` AS original
SET
original.origins_with_good_cwv_2024 = updated.origins_with_good_cwv_2024,
original.pct_eligible_origins_with_good_cwv_2024 = updated.pct_eligible_origins_with_good_cwv_2024
FROM
`httparchive.core_web_vitals.technologies_2024b` AS updated
WHERE
original.date = updated.date AND
original.client = updated.client AND
original.app = updated.app AND
original.geo = updated.geo AND
original.rank = updated.rank
The next time we need to run this (March 2024) the SQL will already be pointing to CWV'24 with INP. I've also added a way to automatically append the results to the technologies
table using INSERT INTO
.
Prep the dataset for the CWV'24 swap, making INP the new responsiveness metric