jrossthomson / bigquery-utils

Useful scripts, udfs, views, and other utilities for migration and data warehouse operations in BigQuery.
https://cloud.google.com/bigquery/
Apache License 2.0
0 stars 0 forks source link

chisquare correction term #29

Open boaguilar opened 2 years ago

boaguilar commented 2 years ago

Hi Ian and Ross,

Many implementations of the chisquare test have a correction term (Yate's correction), for example the python function https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html . I added the correction term into a udf based on the chisquare procedure developed by IAN.

CREATE OR REPLACE FUNCTION `bqutil.fn.chisquare`(x ARRAY<STRING>, y ARRAY<STRING>) AS (
(
    WITH categorical AS (
        SELECT independent_var , y[OFFSET(id)] as dependent_var
        FROM UNNEST( x )  as independent_var  WITH OFFSET id  
    ),
    contingency_table AS (
        SELECT DISTINCT
            independent_var,
            dependent_var,
            COUNT(*) OVER(PARTITION BY independent_var, dependent_var) as count,
            COUNT(*) OVER(PARTITION BY independent_var) independent_total,
            COUNT(*) OVER(PARTITION BY dependent_var) dependent_total,
            COUNT(*) OVER() as total
        FROM categorical AS t0
    ),
    expected_table AS (
        SELECT
            independent_var,
            dependent_var,
            independent_total * dependent_total / total as count
        FROM contingency_table
    ),
    output AS (
        SELECT
            SUM(POW(contingency_table.count - expected_table.count, 2) / expected_table.count) as x,
            SUM(POW(ABS(contingency_table.count - expected_table.count)- 0.5, 2) / expected_table.count) as x_corr,
            (COUNT(DISTINCT contingency_table.independent_var) - 1)
                * (COUNT(DISTINCT contingency_table.dependent_var) - 1) AS dof
        FROM contingency_table
        INNER JOIN expected_table
            ON expected_table.independent_var = contingency_table.independent_var
            AND expected_table.dependent_var = contingency_table.dependent_var
    )
    SELECT 
        IF ( dof = 1 , 
            STRUCT (x_corr as x, dof, bqutil.fn.pvalue(x_corr, dof) AS p) ,
            STRUCT (x, dof, bqutil.fn.pvalue(x, dof) AS p) )
     FROM output
)
);