fluree / core

Fluree releases and public bug reports
0 stars 0 forks source link

`selectDistinct` Breaks with `groupBy` #86

Open aaj3f opened 5 months ago

aaj3f commented 5 months ago

Description

When using selectDistinct with groupBy, the grouped items are not unique sets.

For example, without groupBy, this query returns unique values for ?p

    "where": {
        "@id": "?id",
        "@type": "?type",
        "?p": "?o"
    },
    "selectDistinct": [
        "?p"
    ]

But when using groupBy, this query returns 2+ duplicate values for ?p

    "where": {
        "@id": "?id",
        "@type": "?type",
        "?p": "?o"
    },
    "selectDistinct": [
        "?type",
        "?p"
    ],
    "groupBy": "?type"

Steps to Reproduce

Submit this /create txn

{
    "ledger": "fluree-jld/369435906933189",
    "insert": [
        {
            "@id": "http://flureesense.org/myDB/1",
            "@type": "integrated_transaction",
            "Beneficiary_GRID": 9000015269,
            "Ordering_GRID": 1000416653,
            "Beneficiary_Account_Level_Country_Code": "coun-10",
            "Beneficiary_BIC": "GOLDUS33007",
            "Beneficiary_Country_Code": "coun-35",
            "Date": "8/5/2014",
            "Ordering_Account_Level_Country_Code": "coun-5",
            "Ordering_BIC": "GOLDUS33007",
            "Ordering_Country_Code": "coun-5",
            "Time": "7:58:34 AM",
            "Transaction_Amount_Original": 1168.53,
            "Transaction_Amount_USD": 1168.53,
            "Transaction_Currency": "U.S. Dollar",
            "Transaction_ID": 454721451,
            "Ordering_Geography_ID": "coun-5",
            "Ordering_Industry_ID": "ind-1",
            "Ordering_Derived_Risk_Rating": "L",
            "Ordering_Fulltime_Employees": 26900,
            "Ordering_Bank_isCustomer_Flag": "Y",
            "Ordering_KYC_Risk_Rating": "L",
            "Ordering_Legal_Entity_Type": "Wholly or majority owned",
            "Ordering_Name": "Tdk Corporation",
            "Ordering_OFAC_Sanction_List_Flag": "N",
            "Ordering_Risk_Percentile": 0.265,
            "Ordering_Total_Risk_Score": 0.5,
            "Ordering_Website": "www.TdkCorporation.com",
            "Ordering_Industry_Name": "Electronics & Electric Goods",
            "Ordering_Industry_Risk_Rating": "L",
            "Ordering_Geography_Name": "Japan",
            "Ordering_Geography_Risk_Rating": "M",
            "Ordering_Account_Level_Geography_ID": "coun-5",
            "Ordering_Account_Level_Geography_Name": "Japan",
            "Ordering_Account_Level_Geography_Risk_Rating": "M",
            "Beneficiary_Geography_ID": "coun-35",
            "Beneficiary_Industry_ID": "ind-19",
            "Beneficiary_Derived_Risk_Rating": "M",
            "Beneficiary_Fulltime_Employees": 34900,
            "Beneficiary_Bank_isCustomer_Flag": "Y",
            "Beneficiary_KYC_Risk_Rating": "L",
            "Beneficiary_Legal_Entity_Type": "Wholly or majority owned",
            "Beneficiary_Name": "Citigroup",
            "Beneficiary_OFAC_Sanction_List_Flag": "N",
            "Beneficiary_Risk_Percentile": 0.642,
            "Beneficiary_Total_Risk_Score": 0.85,
            "Beneficiary_Website": "www.Citigroup.com",
            "Beneficiary_Industry_Name": "Industrial Equipment",
            "Beneficiary_Industry_Risk_Rating": "L",
            "Beneficiary_Geography_Name": "Poland",
            "Beneficiary_Geography_Risk_Rating": "M",
            "Beneficiary_Account_Level_Geography_ID": "coun-10",
            "Beneficiary_Account_Level_Geography_Name": "Jordan",
            "Beneficiary_Account_Level_Geography_Risk_Rating": "H"
        },
        {
            "@id": "http://flureesense.org/myDB/2",
            "@type": "integrated_transaction",
            "Beneficiary_GRID": 9000015269,
            "Ordering_GRID": 1000416653,
            "Beneficiary_Account_Level_Country_Code": "coun-60",
            "Beneficiary_BIC": "MIDLGB22FIN",
            "Beneficiary_Country_Code": "coun-35",
            "Date": "2/19/2014",
            "Ordering_Account_Level_Country_Code": "coun-5",
            "Ordering_BIC": "MIDLGB22FIN",
            "Ordering_Country_Code": "coun-5",
            "Time": "8:40:23 AM",
            "Transaction_Amount_Original": 1085.07,
            "Transaction_Amount_USD": 1085.07,
            "Transaction_Currency": "U.S. Dollar",
            "Transaction_ID": 454780609,
            "Ordering_Geography_ID": "coun-5",
            "Ordering_Industry_ID": "ind-1",
            "Ordering_Derived_Risk_Rating": "L",
            "Ordering_Fulltime_Employees": 26900,
            "Ordering_Bank_isCustomer_Flag": "Y",
            "Ordering_KYC_Risk_Rating": "L",
            "Ordering_Legal_Entity_Type": "Wholly or majority owned",
            "Ordering_Name": "Tdk Corporation",
            "Ordering_OFAC_Sanction_List_Flag": "N",
            "Ordering_Risk_Percentile": 0.265,
            "Ordering_Total_Risk_Score": 0.5,
            "Ordering_Website": "www.TdkCorporation.com",
            "Ordering_Industry_Name": "Electronics & Electric Goods",
            "Ordering_Industry_Risk_Rating": "L",
            "Ordering_Geography_Name": "Japan",
            "Ordering_Geography_Risk_Rating": "M",
            "Ordering_Account_Level_Geography_ID": "coun-5",
            "Ordering_Account_Level_Geography_Name": "Japan",
            "Ordering_Account_Level_Geography_Risk_Rating": "M",
            "Beneficiary_Geography_ID": "coun-35",
            "Beneficiary_Industry_ID": "ind-19",
            "Beneficiary_Derived_Risk_Rating": "M",
            "Beneficiary_Fulltime_Employees": 34900,
            "Beneficiary_Bank_isCustomer_Flag": "Y",
            "Beneficiary_KYC_Risk_Rating": "L",
            "Beneficiary_Legal_Entity_Type": "Wholly or majority owned",
            "Beneficiary_Name": "Citigroup",
            "Beneficiary_OFAC_Sanction_List_Flag": "N",
            "Beneficiary_Risk_Percentile": 0.642,
            "Beneficiary_Total_Risk_Score": 0.85,
            "Beneficiary_Website": "www.Citigroup.com",
            "Beneficiary_Industry_Name": "Industrial Equipment",
            "Beneficiary_Industry_Risk_Rating": "L",
            "Beneficiary_Geography_Name": "Poland",
            "Beneficiary_Geography_Risk_Rating": "M",
            "Beneficiary_Account_Level_Geography_ID": "coun-60",
            "Beneficiary_Account_Level_Geography_Name": "Algeria",
            "Beneficiary_Account_Level_Geography_Risk_Rating": "M"
        }
    ]
}

Issue this query

{
    "@context": {
        "f": "https://ns.flur.ee/ledger#",
        "owl": "http://www.w3.org/2002/07/owl#",
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
        "sh": "http://www.w3.org/ns/shacl#",
        "skos": "http://www.w3.org/2008/05/skos#",
        "xsd": "http://www.w3.org/2001/XMLSchema#",
        "ex": "http://example.org/",
        "schema": "http://schema.org/"
    },
    "from": "fluree-jld/369435906933189",
    "where": {
        "@id": "?id",
        "@type": "?type",
        "?p": "?o"
    },
    "selectDistinct": [
        "?type",
        "?p"
    ],
    "groupBy": "?type"
}

Notice duplicates like the following in the query results:

[
    [
        "integrated_transaction",
        [
            "@id",
            "@type",
            "Beneficiary_Industry_Name",
            ...
            "@id",
            "@type",
            "Beneficiary_Industry_Name",
            ...
        ]
    ]
]
zonotope commented 4 months ago

I don't think this is a bug but intended behavior. When group-by is used in a query, the select expression selects from the set of groups of results, not the individual results within each group. If two groups of solutions are different, then select-distinct will treat them as different and include both irrespective of the repetition within each group.

I think we'd need to add a new distinct aggregate function modifier to support this behavior because I think changing the behavior of select-distinct will break the model of how it should work in other places. We already support a similar behavior with count-distinct, so we'd need to generalize this to allow distinct to act as it's own aggregate function. Then, this query would look like this:

{
    "@context": {
        "f": "https://ns.flur.ee/ledger#",
        "owl": "http://www.w3.org/2002/07/owl#",
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
        "sh": "http://www.w3.org/ns/shacl#",
        "skos": "http://www.w3.org/2008/05/skos#",
        "xsd": "http://www.w3.org/2001/XMLSchema#",
        "ex": "http://example.org/",
        "schema": "http://schema.org/"
    },
    "from": "fluree-jld/369435906933189",
    "where": {
        "@id": "?id",
        "@type": "?type",
        "?p": "?o"
    },
    "select": [
        "?type",
        "(distinct ?p)"
    ],
    "groupBy": "?type"
}

The SPARQL spec describes a similar mechanism where DISTINCT is used as a modifier of the input of any aggregate function, including GROUPCONCAT, which is the "default" aggregation in FlureeQL if no other aggregation is specified for a group.