m-lab / analysis

Detailed analysis for data collected from M-Lab.
Apache License 2.0

Preliminary intra-site canary metrics from kernel upgrade #12

Open gfr10598 opened 4 years ago

gfr10598 commented 4 years ago

This issue documents the work on BigQuery queries for canary assessments.

The primary concern is compensating for, or avoiding, client bias in order to increase sensitivity to changes in the platform, specifically differences between a machine running a new canary software stack and the other machines at the same site.

gfr10598 commented 4 years ago

This is the final spreadsheet showing the differences detected in NDT download speeds and in the speed/minRTT correlation on the gru03 machines. The mlab1 machine was identified as a possible canary machine; the canaries were deployed worldwide on a single, unknown machine per site. https://docs.google.com/spreadsheets/d/1pueCPywPId7mA8JVMLhBa5er4gtCcSGuWxMDG2rU1YU/

[Charts: GRU log_mean_speed; GRU Pearson correlation speed_rtt]

gfr10598 commented 4 years ago

This is the query used in the spreadsheet.

WITH
metro_stats AS (
SELECT * FROM `mlab-sandbox.gfr.metro_stats` 
),

date_agg AS (
SELECT metro, site, machine, protocol, complete, slow, error,
SUM(tests) AS tests,
ROUND(AVG(log_mean_speed),3) AS log_mean_speed,
ROUND(AVG(logMeanMinRTT),3) AS logMeanMinRTT,
ROUND(AVG(pearson),3) AS pearson,
ROUND(AVG(under_1),3) AS under_1,
ROUND(AVG(_1_3),3) AS _1_3,
ROUND(AVG(_3_10),3) AS _3_10,
ROUND(AVG(_10_30),3) AS _10_30,
ROUND(AVG(_30_100),3) AS _30_100,
ROUND(AVG(_100_300),3) AS _100_300,
ROUND(AVG(over_300),3) AS over_300
FROM metro_stats
--WHERE metro = "syd"
WHERE protocol = "JSON" AND complete IS true
GROUP BY metro, site, machine, protocol, complete, slow, error
ORDER BY protocol, complete DESC, slow, error, site, machine 
),

deviations AS (
SELECT metro, site, 
ROUND(SAFE_DIVIDE(STDDEV(log_mean_speed), AVG(log_mean_speed)),4) AS logMeanSpeedStdev,
ROUND(SAFE_DIVIDE(STDDEV(logMeanMinRTT), AVG(logMeanMinRTT)),4) AS logMeanMinRTTStdev,
ROUND(SAFE_DIVIDE(STDDEV(pearson), AVG(pearson)),4) AS pearsonStdev
FROM date_agg
WHERE complete = true AND error = false AND slow = false
GROUP BY metro, site, protocol, complete, slow, error
ORDER BY logMeanSpeedStdev DESC
)

#SELECT * FROM deviations 
SELECT * FROM metro_stats 
WHERE metro = "gru" AND site = "gru03"
AND protocol = "JSON" AND complete AND slow = false AND error = false
ORDER BY site, test_date, machine 
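
To single out a candidate canary machine, one option (a hypothetical sketch, reusing the date_agg CTE above rather than anything already in the repo) is to score each machine by its distance from the site mean, in units of the site standard deviation:

# Hypothetical CTE that could be appended to the WITH clause above.
machine_scores AS (
SELECT metro, site, machine,
ROUND(SAFE_DIVIDE(log_mean_speed - AVG(log_mean_speed) OVER site_window,
                  STDDEV(log_mean_speed) OVER site_window), 2) AS speed_z,
ROUND(SAFE_DIVIDE(pearson - AVG(pearson) OVER site_window,
                  STDDEV(pearson) OVER site_window), 2) AS pearson_z
FROM date_agg
WHERE complete AND NOT slow AND NOT error
WINDOW site_window AS (PARTITION BY metro, site)
)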
gfr10598 commented 4 years ago

This is a saved query that was probably used to generate the table that the spreadsheet uses. It in turn uses mlab-sandbox.gfr.client_weekly_stats and base_tables.ndt5.

https://console.cloud.google.com/bigquery?sq=581276032543:cb730dd999c54182a129c7d9ccbff01c

# Metro stats
# Order:
#  2020-06-06 ndt5 per machine client stats, last 10 days
#  2020-06-07 Client stats
#  2020-06-16 Metro stats

CREATE OR REPLACE TABLE `mlab-sandbox.gfr.metro_stats` 
PARTITION BY test_date
CLUSTER BY metro, site
AS

WITH
primary AS (
SELECT DATE(TIMESTAMP_SECONDS(log_time)) AS test_date,
TIMESTAMP_SECONDS(log_time) AS log_time,
* EXCEPT(log_time)
FROM `mlab-oti.base_tables.ndt5`
WHERE DATE(_PARTITIONTIME) >= DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY) # PARTITIONED on this.
),

# Ideally, this should be based on binomial distribution likelihood.
# However, for now I'm using a simpler criterion that is sub-optimal.
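# A hypothetical sketch of that binomial criterion: a well-behaved client running
# N = total_tests against M = machines chosen uniformly at random should see
# per-machine counts ~ Binomial(N, 1/M), so a chi-square style filter like
#   SUM(POW(tests - total_tests/machines, 2)) / (total_tests/machines) < threshold,
# computed per client across the machines in the metro, could replace the
# max/min spread test below.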
good_clients AS (
SELECT * FROM `mlab-sandbox.gfr.client_weekly_stats`
WHERE date >= DATE_SUB(CURRENT_DATE(), INTERVAL 8 DAY)
AND client NOT IN
        ("45.56.98.222", "35.192.37.249", "35.225.75.192", "23.228.128.99",
        "2600:3c03::f03c:91ff:fe33:819",  "2605:a601:f1ff:fffe::99")
# Exclude clients with more than twice as many tests as machines in the metro
--AND total_tests < 2*machines
# Good clients will have similar counts across all machines
AND max <= min + SQRT(machines)  -- up to 3 machines -> max = 1, 4 machines -> max = 2, ALL 15 machines -> spread < 4
--AND max <= 3
--AND test_machines = machines
--AND total_tests > 25
--AND zeros = 0 AND ones = 0 AND twos=0
--AND max <= min + 2
# If there are fewer tests than machines, we expect lots of singletons
--AND (total_tests > machines OR ones >= twos)
ORDER BY machines DESC, metro
),
--------------------------------------------------------------------------- 

downloads AS (
SELECT 
  test_date, log_time, 
  REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-(mlab[1-4])-.*") AS machine, 
  REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3}[0-9]{2}).*") AS site, 
  REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3})[0-9]{2}.*") AS metro, 
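  # TaskFilename embeds the server hostname, e.g. ...-mlab1-gru03-..., so the
  # regexps above extract machine (mlab1), site (gru03), and metro (gru).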
  ParseInfo, 
  result.S2C.ClientIP AS client, 
  result.S2C.UUID AS uuid, 
  result.Control.MessageProtocol AS protocol,
  result.S2C.MeanThroughputMbps AS mbps, 
  result.S2C.TCPInfo.Retransmits,  # empty/NULL
  result.S2C.TCPInfo.MinRTT,       # empty/NULL
  result.S2C.MinRTT AS appMinRTT,
  result.S2C.SumRTT, result.S2C.CountRTT,  # empty/NULL
  result.S2C.MaxRTT AS appMaxRTT,          # empty/NULL
  TIMESTAMP_DIFF(result.S2C.EndTime, result.S2C.StartTime, MICROSECOND)/1000000 AS test_duration,
  result.S2C.Error != "" AS error,
  result.S2C.MeanThroughputMbps <= 0.1 AS slow,
  TIMESTAMP_DIFF(result.S2C.EndTime, result.S2C.StartTime, MICROSECOND)/1000000 BETWEEN 9 AND 13 AS complete,
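  # NDT downloads nominally run for 10 seconds, so 9-13s brackets tests that ran to completion.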
FROM primary 
--WHERE ParseInfo.TaskFilename LIKE "%lga%"
),

-------------------------------------------------------------- 

good_downloads AS (
SELECT D.*
FROM downloads D JOIN good_clients G ON D.client = G.client AND D.metro = G.metro AND D.test_date = G.date
),

stats AS (
SELECT test_date, metro, site, machine, protocol, complete, slow, error, count(uuid) AS tests, 
ROUND(EXP(AVG(IF(mbps > 0, LN(mbps), NULL))),2) AS log_mean_speed, 
ROUND(SAFE_DIVIDE(COUNTIF(appMinRTT < 10000000), COUNT(uuid)),3) AS rttUnder10,
ROUND(APPROX_QUANTILES(appMinRTT, 101)[OFFSET(50)]/1000000,2) AS medianMinRTT,
ROUND(AVG(appMinRTT)/1000000,2) AS meanMinRTT,
ROUND(EXP(AVG(IF(appMinRTT > 0, LN(appMinRTT/1000000), 0))),2) AS logMeanMinRTT,
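# appMinRTT appears to be in nanoseconds: /1000000 converts to milliseconds, so
# rttUnder10 counts tests under 10ms and "far" below counts tests over 50ms.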
AVG(Retransmits) AS avgRetransmits,
# Pearson correlation between ln(1/minRTT) and ln(bandwidth).  Taking logs first produces much
# higher correlation (.5 vs .3), suggesting that the long tail of high speed / low RTT
# undermines the correlation without the LOG.
ROUND(CORR(IF(appMinRTT > 0, LN(1/appMinRTT), NULL), IF(mbps > 0, LN(mbps), NULL)), 3) AS pearson,
--ROUND(AVG(SAFE_DIVIDE(SumRTT,CountRTT))/1000000,2) AS meanAppAvgRTT,
ROUND(APPROX_QUANTILES(mbps, 101)[OFFSET(10)],2) AS q10,
ROUND(APPROX_QUANTILES(mbps, 101)[OFFSET(25)],2) AS q25,
ROUND(APPROX_QUANTILES(mbps, 101)[OFFSET(50)],2) AS q50,
ROUND(APPROX_QUANTILES(mbps, 101)[OFFSET(75)],2) AS q75,
ROUND(APPROX_QUANTILES(mbps, 101)[OFFSET(90)],2) AS q90,
ROUND(MAX(mbps),2) AS max,
ROUND(SAFE_DIVIDE(COUNTIF(mbps < 1), COUNT(uuid)),3) AS under_1,
ROUND(SAFE_DIVIDE(COUNTIF(mbps BETWEEN 1 AND 3), COUNT(uuid)),3) AS _1_3,
ROUND(SAFE_DIVIDE(COUNTIF(mbps BETWEEN 3 AND 10), COUNT(uuid)),3) AS _3_10,
ROUND(SAFE_DIVIDE(COUNTIF(mbps BETWEEN 10 AND 30), COUNT(uuid)),3) AS _10_30,
ROUND(SAFE_DIVIDE(COUNTIF(mbps BETWEEN 30 AND 100), COUNT(uuid)),3) AS _30_100,
ROUND(SAFE_DIVIDE(COUNTIF(mbps BETWEEN 100 AND 300), COUNT(uuid)),3) AS _100_300,
ROUND(SAFE_DIVIDE(COUNTIF(mbps > 300), COUNT(uuid)),3) AS over_300,
COUNTIF(appMinRTT > 50000000) AS far,
ROUND(EXP(AVG(IF(appMinRTT > 50000000 AND mbps > 0, LN(mbps), NULL))),3) AS logMeanFarMbps,
FROM good_downloads
GROUP BY metro, test_date, machine, site, complete, slow, error, protocol
)

SELECT * FROM stats
-- WHERE metro = "lga"
-- ORDER BY protocol, complete DESC, slow, error, test_date, site, machine 
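
For reference, log_mean_speed and logMeanMinRTT above are geometric means, computed as EXP(AVG(LN(x))). A minimal standalone illustration with hypothetical values:

# Geometric mean of 1, 10, 100 Mbps is (1*10*100)^(1/3) = 10, vs arithmetic mean 37.
SELECT ROUND(EXP(AVG(LN(mbps))), 2) AS log_mean_speed
FROM UNNEST([1.0, 10.0, 100.0]) AS mbps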
gfr10598 commented 4 years ago

This is the saved query that likely produced client_weekly_stats, which the metro_stats query uses. The Order: comments list the actual saved query names (personal gfr queries).

https://console.cloud.google.com/bigquery?sq=581276032543:e0c015127c3b4fa09f7d0f45914d8212

# All metros, 7 dates takes about 1 slot hour, produces 2M rows of good clients.
# CROSS JOIN produces about 150M rows.
# Order:
#  2020-06-06 ndt5 per machine client stats, last 10 days
#  2020-06-07 Client stats
#  2020-06-10 Multi-date

CREATE OR REPLACE TABLE `mlab-sandbox.gfr.client_weekly_stats`
PARTITION BY date
CLUSTER BY metro, client
AS

# This is the raw metro summary data
WITH metros AS (
SELECT *
FROM `mlab-sandbox.gfr.oti_metro_client_summary`
),

# flatten the per metro data, so that we have a row for each machine/client/date
flatten AS (
SELECT test_date, metro, client, CONCAT(site, ".", machine) AS machine, tests
FROM metros JOIN UNNEST(metros.machines)
GROUP BY test_date, metro, client, site, machine, tests
),

# extract complete list of machines per metro/date
machines AS (
SELECT test_date, metro, machine
FROM flatten
GROUP BY test_date, metro, machine
),

# extract complete list of clients per metro/date
clients AS (
SELECT test_date, metro, client
FROM flatten
WHERE client != ""
GROUP BY test_date, metro, client
),

# create a complete list of machine/client pairs per metro/date
# This is quite large - about 100M pairs worldwide for a one week window.
product AS (
SELECT machines.test_date, machines.metro, machines.machine, clients.client
FROM machines CROSS JOIN clients
WHERE machines.metro = clients.metro AND machines.test_date = clients.test_date
),

# Now join the machine/client pairs with the original flattened data.
# This produces a full complement of rows for each client/metro/date.
joined AS (
SELECT product.test_date, product.metro, product.machine, product.client, IF(flatten.tests IS NULL, 0, flatten.tests) AS tests
FROM product LEFT JOIN flatten ON  product.test_date = flatten.test_date AND product.metro = flatten.metro AND product.client = flatten.client AND product.machine = flatten.machine
),

---------------------------------------------------------

# Now aggregate over the past week, to produce a complete distribution of tests
# per client across all machines in each metro.
past_week AS (
SELECT
  test_date AS date, metro, machine, client,
  SUM(tests) OVER weekly_window AS weekly_tests,
  MIN(test_date) OVER weekly_window AS min_date,
FROM joined
GROUP BY date, metro, client, machine, tests
WINDOW weekly_window AS (
  PARTITION BY client, machine
  ORDER BY UNIX_DATE(test_date) RANGE BETWEEN 6 PRECEDING AND CURRENT ROW
)
),

# Now summarize the data for each client/metro
weekly_summary AS (
SELECT
  date, metro, client,
  COUNTIF(weekly_tests > 0) AS test_machines,
  COUNT(machine) AS machines,
  SUM(weekly_tests) AS total_tests,
  MIN(min_date) AS min_date,
  # These count the number of machines with 0, 1, 2, 3, or more tests.
  # They are useful for determining whether the client is statistically well behaved.
  MIN(weekly_tests) AS min,
  MAX(weekly_tests) AS max,
  COUNTIF(weekly_tests = 0) AS zeros,
  COUNTIF(weekly_tests = 1) AS ones,
  COUNTIF(weekly_tests = 2) AS twos,  
  COUNTIF(weekly_tests = 3) AS threes,
  COUNTIF(weekly_tests > 3) AS more,
FROM past_week
GROUP BY date, metro, client
),

good_clients AS (
SELECT * FROM weekly_summary
WHERE date >= DATE_SUB(CURRENT_DATE(), INTERVAL 8 DAY)
# Exclude clients with more than twice as many tests as machines in the metro
AND total_tests < 2*machines
# Good clients will have similar counts across all machines
AND max <= min + 2
# If there are fewer tests than machines, we expect lots of singletons
AND (total_tests > machines OR ones >= twos)
ORDER BY machines DESC, metro
)

SELECT *
FROM weekly_summary
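
The weekly_window above is a trailing 7-day RANGE frame keyed on UNIX_DATE, so it still sums correctly when a client skips days. A minimal standalone illustration with hypothetical dates and counts:

# The 2020-06-09 row sums only the rows within the prior 6 days: 3 + 1 = 4.
SELECT d, n,
  SUM(n) OVER (ORDER BY UNIX_DATE(d) RANGE BETWEEN 6 PRECEDING AND CURRENT ROW) AS last7
FROM UNNEST([
  STRUCT(DATE '2020-06-01' AS d, 2 AS n),
  STRUCT(DATE '2020-06-05' AS d, 3 AS n),
  STRUCT(DATE '2020-06-09' AS d, 1 AS n)])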
gfr10598 commented 4 years ago
# Update mlab-sandbox.gfr.ndt5_stats
INSERT INTO `mlab-sandbox.gfr.ndt5_stats`
SELECT
DATE(TIMESTAMP_SECONDS(log_time)) AS test_date, 
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3})[0-9]{2}.*") AS metro,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3}[0-9]{2}).*") AS site,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-(mlab[1-4])-.*") AS machine, 
_PARTITIONTIME AS pt, 
ParseInfo.TaskFilename AS archive,
COUNT(test_id) AS tests, 
CURRENT_TIMESTAMP() AS query_time
FROM `mlab-oti.base_tables.ndt5`
WHERE DATE(TIMESTAMP_SECONDS(log_time)) > DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
AND DATE(_PARTITIONTIME) > DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
GROUP BY test_date, pt, metro, site, machine, archive
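
A minimal read of the resulting table, as a hypothetical spot-check for machines with missing days:

SELECT site, machine, COUNT(DISTINCT test_date) AS days, SUM(tests) AS tests
FROM `mlab-sandbox.gfr.ndt5_stats`
WHERE test_date > DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
GROUP BY site, machine
ORDER BY days, site, machine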
gfr10598 commented 4 years ago

This is a scheduled query that also updates metro_stats https://pantheon.corp.google.com/bigquery?tc=us:5ef9f65a-0000-2fdb-b33b-94eb2c0367f0&authuser=2&project=mlab-sandbox

# Tests per client per machine per day, with metro summaries.
# Uses about 1 Slot Hour.
# Introduces dups if run more than once per day.
INSERT INTO `mlab-sandbox.gfr.metro_client_summary`

WITH hours_per_machine AS (
  SELECT
    DATE(log_time) AS test_date, 
    TIMESTAMP_TRUNC(log_time, HOUR) AS hour,
    COUNT(log_time) AS tests,
    REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3})[0-9]{2}.*") AS metro,
    REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3}[0-9]{2}).*") AS site,
    REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-(mlab[1-4])-.*") AS machine, 
  FROM `mlab-sandbox.raw_ndt.ndt5`
  WHERE DATE(log_time) >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
  GROUP BY test_date, hour, machine, site, metro
),

hours_per_day_per_machine AS (
  SELECT * EXCEPT(hour, tests),
    COUNT(hour) AS hours,
    SUM(tests) AS tests,
  FROM hours_per_machine
  GROUP BY test_date, machine, site, metro
),

good_machines_per_metro AS (
  SELECT * EXCEPT(machine, hours, site, tests),
    COUNT(machine) AS metro_machines,
    SUM(hours) AS metro_hours, 
    COUNTIF(hours = 24) AS good_machines,
    SUM(IF(hours = 24, hours, 0)) AS good_hours, 
    ARRAY_AGG(
      STRUCT(
        site, machine, tests, hours, 
        hours = 24 AS good
      ) ORDER BY site, machine) AS machines,
  FROM hours_per_day_per_machine
  GROUP BY test_date, metro
),
---------------------------------------------------------------------------
tests_per_client AS (
  SELECT
  DATE(log_time) AS test_date,
  log_time AS TestTime,
  REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-(mlab[1-4])-.*") AS machine, 
  REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3}[0-9]{2}).*") AS site,
  REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3})[0-9]{2}.*") AS metro,
  ParseInfo,
  IF(result.S2C IS NOT NULL, result.S2C.ClientIP, result.C2S.ClientIP) AS client,
  IF(result.S2C IS NOT NULL, result.S2C.UUID, result.C2S.UUID) AS uuid,
  IF(result.S2C IS NOT NULL, result.S2C.MeanThroughputMbps,  result.C2S.MeanThroughputMbps) AS mbps,
  IF(result.S2C IS NOT NULL,
    TIMESTAMP_DIFF(result.S2C.EndTime, result.S2C.StartTime, MICROSECOND),
    TIMESTAMP_DIFF(result.C2S.EndTime, result.C2S.StartTime, MICROSECOND)) AS test_duration
  FROM `mlab-sandbox.raw_ndt.ndt5`
  WHERE
     (result.S2C.Error = "" OR result.C2S.Error = "")
),

---------------------------------------------------------------------
# Count tests per machine, and join with hours per machine
machine_summary AS (
SELECT
  a.* EXCEPT(TestTime, ParseInfo, uuid, mbps, test_duration),
  --test_date, metro, site, machine, client,
  COUNT(DISTINCT uuid) AS tests,
  b.* EXCEPT(metro, site, machine, test_date, tests)
FROM tests_per_client a LEFT JOIN hours_per_day_per_machine b ON (a.metro = b.metro AND a.site = b.site AND a.machine = b.machine AND a.test_date = b.test_date)
GROUP BY metro, site, machine, client, test_date, hours
),

# This will be very expensive.
# This should create a lot of empty rows, for clients that appear in metro, but not on a machine.
with_hours AS (
  SELECT
    a.*, b.* EXCEPT(test_date, metro, machines)
  FROM good_machines_per_metro b LEFT JOIN machine_summary a ON (a.metro = b.metro AND a.test_date = b.test_date)
),

metro_summary AS (
SELECT CURRENT_DATE() AS update_time,
  test_date, metro, client,
  ARRAY_AGG(
    STRUCT(
      site, machine, tests
    ) ORDER BY site, machine) AS machines,
  metro_machines, metro_hours, good_machines, good_hours, 
FROM with_hours
GROUP BY metro, client,
test_date, good_hours, metro_hours, good_machines, metro_machines
)

--------------------------------------------------------------

SELECT *
FROM metro_summary
WHERE test_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 3 DAY)
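
Since the INSERT appends rows, running the query twice in one day introduces dups, as the header notes. A hypothetical guard, assuming nothing else writes the recent partitions, is to clear the window first:

# Run before the INSERT above to make the daily update idempotent.
DELETE FROM `mlab-sandbox.gfr.metro_client_summary`
WHERE test_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 3 DAY)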
gfr10598 commented 4 years ago

Here is the query that generates oti_metro_client_summary, saved as gfr personal query "2020-06-06 ndt5 per machine client stats, last 10 days"

https://console.cloud.google.com/bigquery?sq=581276032543:5e697d8ec4bb4e249927976828b25341

# Tests per client per machine per day, with metro summaries.
# Uses < 2 Slot Hours to process 16 days, producing about 12 million rows.
CREATE OR REPLACE TABLE `mlab-sandbox.gfr.oti_metro_client_summary` 
PARTITION BY test_date
CLUSTER BY metro, client
AS

WITH
primary AS (
SELECT DATE(TIMESTAMP_SECONDS(log_time)) AS test_date,
TIMESTAMP_SECONDS(log_time) AS log_time,
* EXCEPT(log_time)
FROM `mlab-oti.base_tables.ndt5`
WHERE DATE(_PARTITIONTIME) >= DATE_SUB(CURRENT_DATE(), INTERVAL 16 DAY) # PARTITIONED on this.
),

hours_per_machine AS ( 
SELECT 
  test_date, 
  TIMESTAMP_TRUNC(log_time, HOUR) AS hour, 
  COUNT(test_id) AS tests, 
  REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3})[0-9]{2}.*") AS metro,
  REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3}[0-9]{2}).*") AS site,
  REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-(mlab[1-4])-.*") AS machine,
FROM primary
# Without this, the query cost goes up dramatically.
WHERE test_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 16 DAY) 
GROUP BY test_date, hour, machine, site, metro ),

hours_per_day_per_machine AS (
SELECT 
  * EXCEPT(hour, tests),
  COUNT(hour) AS hours, SUM(tests) AS tests, 
FROM hours_per_machine 
GROUP BY test_date, machine, site, metro 
), 

good_machines_per_metro AS (
SELECT 
  * EXCEPT(machine, hours, site, tests), 
  COUNT(machine) AS metro_machines, SUM(hours) AS metro_hours, 
  COUNTIF(hours = 24) AS good_machines, SUM(IF(hours = 24, hours, 0)) AS good_hours, 
  ARRAY_AGG( STRUCT( site, machine, tests, hours, hours = 24 AS good ) ORDER BY site, machine) AS machines, 
FROM hours_per_day_per_machine 
GROUP BY test_date, metro ), 

--------------------------------------------------------------------------- 

tests_per_client AS (
SELECT 
  test_date, log_time, 
  REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-(mlab[1-4])-.*") AS machine, 
  REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3}[0-9]{2}).*") AS site, 
  REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3})[0-9]{2}.*") AS metro, 
  ParseInfo, 
  IF(result.S2C IS NOT NULL, result.S2C.ClientIP, result.C2S.ClientIP) AS client, 
  IF(result.S2C IS NOT NULL, result.S2C.UUID, result.C2S.UUID) AS uuid, 
  IF(result.S2C IS NOT NULL, result.S2C.MeanThroughputMbps, result.C2S.MeanThroughputMbps) AS mbps, 
  IF(result.S2C IS NOT NULL,  TIMESTAMP_DIFF(result.S2C.EndTime, result.S2C.StartTime, MICROSECOND), TIMESTAMP_DIFF(result.C2S.EndTime, result.C2S.StartTime, MICROSECOND)) AS test_duration 
FROM primary 
WHERE (result.S2C.Error = "" OR result.C2S.Error = "")
),

--------------------------------------------------------------------- 
# Count tests per machine, and join with hours per machine

machine_summary AS (
SELECT 
  a.* EXCEPT(log_time, ParseInfo, uuid, mbps, test_duration),
  --test_date, metro, site, machine, client, 
  COUNT(DISTINCT uuid) AS tests,
  b.* EXCEPT(metro, site, machine, test_date, tests) 
FROM tests_per_client a LEFT JOIN hours_per_day_per_machine b 
ON (a.metro = b.metro AND a.site = b.site AND a.machine = b.machine AND a.test_date = b.test_date)
GROUP BY metro, site, machine, client, test_date, hours 
), 

# This will be very expensive. 
# This should create a lot of empty rows, for clients that appear in metro, but not on a machine. 
with_hours AS ( 
SELECT a.*, b.* EXCEPT(test_date, metro, machines) 
FROM good_machines_per_metro b LEFT JOIN machine_summary a 
ON (a.metro = b.metro AND a.test_date = b.test_date) 
),

metro_summary AS (
SELECT 
  CURRENT_DATE() AS update_time, test_date, metro, client, 
  ARRAY_AGG( STRUCT( site, machine, tests ) ORDER BY site, machine) AS machines, 
  metro_machines, metro_hours, good_machines, good_hours, 
FROM with_hours 
GROUP BY metro, client, test_date, good_hours, metro_hours, good_machines, metro_machines
)

-------------------------------------------------------------- 
SELECT * FROM metro_summary 
WHERE test_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 16 DAY)
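
The machines column written above is an ARRAY of STRUCT(site, machine, tests); a minimal read that flattens it, mirroring the flatten CTE in the client_weekly_stats query:

SELECT test_date, metro, client, m.site, m.machine, m.tests
FROM `mlab-sandbox.gfr.oti_metro_client_summary`, UNNEST(machines) AS m
WHERE test_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY)
LIMIT 10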