vmware-archive / sql-to-dbsp-compiler

Compiler translating SQL view definitions into DBSP circuits (https://github.com/vmware/database-stream-processor)
Other
28 stars 5 forks source link

Panic unwrapping NULL value in `ST_POINT` #116

Closed ryzhyk closed 1 year ago

ryzhyk commented 1 year ago

The panic happens when running this SQL:

CREATE TABLE demographics (
    cc_num FLOAT64 NOT NULL,
    first STRING,
    gender STRING,
    street STRING,
    city STRING,
    state STRING,
    zip INTEGER,
    lat FLOAT64,
    long FLOAT64,
    city_pop INTEGER,
    job STRING,
    dob STRING
    --dob DATE
);

CREATE TABLE transactions (
    trans_date_trans_time TIMESTAMP NOT NULL,
    cc_num FLOAT64 NOT NULL,
    merchant STRING,
    category STRING,
    amt FLOAT64,
    trans_num STRING,
    unix_time INTEGER NOT NULL,
    merch_lat FLOAT64,
    merch_long FLOAT64,
    is_fraud INTEGER
);

CREATE VIEW features as
    SELECT
        -- DAYOFWEEK(trans_date_trans_time) AS d,
        -- TIMESTAMPDIFF(YEAR, trans_date_trans_time, CAST(dob as TIMESTAMP)) AS age,
        ST_DISTANCE(ST_POINT(long,lat), ST_POINT(merch_long,merch_lat)) AS distance,
        -- TIMESTAMPDIFF(MINUTE, trans_date_trans_time, last_txn_date) AS trans_diff,
        AVG(amt) OVER(
            PARTITION BY   CAST(cc_num AS NUMERIC)
            ORDER BY unix_time
            -- 1 week is 604800  seconds
            RANGE BETWEEN 604800  PRECEDING AND 1 PRECEDING) AS
        avg_spend_pw,
        AVG(amt) OVER(
            PARTITION BY  CAST(cc_num AS NUMERIC)
            ORDER BY unix_time
            -- 1 month(30 days) is 2592000 seconds
            RANGE BETWEEN 2592000 PRECEDING AND 1 PRECEDING) AS
        avg_spend_pm,
        COUNT(*) OVER(
            PARTITION BY  CAST(cc_num AS NUMERIC)
            ORDER BY unix_time
            -- 1 day is 86400  seconds
            RANGE BETWEEN 86400  PRECEDING AND 1 PRECEDING ) AS
        trans_freq_24,
        category,
        amt,
        state,
        job,
        unix_time,
        city_pop,
        merchant,
        is_fraud
    FROM (
        SELECT t1.*, t2.*
               -- , LAG(trans_date_trans_time, 1) OVER (PARTITION BY t1.cc_num  ORDER BY trans_date_trans_time ASC) AS last_txn_date
        FROM  transactions AS t1
        LEFT JOIN  demographics AS t2
        ON t1.cc_num = t2.cc_num);

Note that all the longitude/latitude fields are not labeled NOT NULL; however the generated code treats them as if they were:

Tuple10::new(t.2.clone(), t.3.clone(), t.4, t.6, t.9, t.15.clone(), t.19, t.20.clone(), st_distance_N_N(cast_to_geopointN_geopoint(GeoPoint::new(cast_to_d_dN(t.18), cast_to_d_dN(t.17))), cast_to_geopointN_geopoint(GeoPoint::new(cast_to_d_dN(t.8), cast_to_d_dN(t.7)))), cast_to_decimal_d(t.1))

where cast_to_d_dN simply unwraps its argument.

ryzhyk commented 1 year ago

PS. In fact, because of the LEFT JOIN, t2 columns can be NULL in the SELECT clause even if they were declared as NOT NULL.