CREATE OR REPLACE FUNCTION `bqutil.fn.chisquare`(x ARRAY<STRING>, y ARRAY<STRING>) AS (
(
WITH categorical AS (
SELECT independent_var , y[OFFSET(id)] as dependent_var
FROM UNNEST( x ) as independent_var WITH OFFSET id
),
contingency_table AS (
SELECT DISTINCT
independent_var,
dependent_var,
COUNT(*) OVER(PARTITION BY independent_var, dependent_var) as count,
COUNT(*) OVER(PARTITION BY independent_var) independent_total,
COUNT(*) OVER(PARTITION BY dependent_var) dependent_total,
COUNT(*) OVER() as total
FROM categorical AS t0
),
expected_table AS (
SELECT
independent_var,
dependent_var,
independent_total * dependent_total / total as count
FROM contingency_table
),
output AS (
SELECT
SUM(POW(contingency_table.count - expected_table.count, 2) / expected_table.count) as x,
SUM(POW(ABS(contingency_table.count - expected_table.count)- 0.5, 2) / expected_table.count) as x_corr,
(COUNT(DISTINCT contingency_table.independent_var) - 1)
* (COUNT(DISTINCT contingency_table.dependent_var) - 1) AS dof
FROM contingency_table
INNER JOIN expected_table
ON expected_table.independent_var = contingency_table.independent_var
AND expected_table.dependent_var = contingency_table.dependent_var
)
SELECT
IF ( dof = 1 ,
STRUCT (x_corr as x, dof, bqutil.fn.pvalue(x_corr, dof) AS p) ,
STRUCT (x, dof, bqutil.fn.pvalue(x, dof) AS p) )
FROM output
)
);
Hi Ian and Ross,
Many implementations of the chisquare test have a correction term (Yate's correction), for example the python function https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html . I added the correction term into a udf based on the chisquare procedure developed by IAN.