hpcc-systems / DataPatterns

HPCC Systems ECL bundle that provides some basic data profiling and research tools to an ECL programmer
3 stars 4 forks source link

Request: Additional info for low-cardinality fields #9

Closed dcamper closed 5 years ago

dcamper commented 6 years ago

For fields where cardinality <=64 values, show those values along with record counts.

GordonSmith commented 6 years ago

64 should probably be a default and be configurable? (Side note: 64 worked well in the past as it was "small" and would include US States)

dcamper commented 6 years ago

Agreed.

GordonSmith commented 6 years ago

FWIW here is my "baby" implementation:

flatToDataset(lf) := FUNCTIONMACRO
    RETURN DATASET(lf, RECORDOF(lf, LOOKUP), FLAT);
ENDMACRO;

lowCardinalityFields(inFile, maxCardinality = 64) := FUNCTIONMACRO
    LOADXML('<xml/>');
    #EXPORTXML(inFileFields, RECORDOF(inFile));
    #DECLARE(recLevel);
    #DECLARE(needsDelim);

    #SET(recLevel, 0)
    #FOR(inFileFields)
        #FOR(field)
            #IF(%{@isRecord}% = 1 OR %{@isDataset}% = 1)
                #SET(recLevel, %recLevel% + 1)
            #ELSEIF(%{@isEnd}% = 1)
                #SET(recLevel, %recLevel% - 1)
            #ELSEIF(%recLevel% = 0)
                #EXPAND(%'@name'% + '_table') := CHOOSEN(TABLE(inFile, {STRING value := (STRING)#EXPAND(%'@name'%), UNSIGNED INTEGER4 rowcount := COUNT(GROUP)}, #EXPAND(%'@name'%), FEW), maxCardinality + 1);
            #END
        #END
    #END

    ValueRowCount := RECORD
        STRING value;
        UNSIGNED INTEGER4 rowcount;
    END;

    FieldValues := RECORD
        STRING fieldID;
        UNSIGNED INTEGER4 valueCount;
        DATASET(ValueRowCount) values;
    END;

    retVal := DATASET([
        #SET(recLevel, 0)
        #SET(needsDelim, 0)
        #FOR(inFileFields)
            #FOR(field)
                #IF(%{@isRecord}% = 1 OR %{@isDataset}% = 1)
                    #SET(recLevel, %recLevel% + 1)
                #ELSEIF(%{@isEnd}% = 1)
                    #SET(recLevel, %recLevel% - 1)
                #ELSEIF(%recLevel% = 0)
                    #IF(%needsDelim% = 1) , #END                
                    {
                        %'@name'%,
                        COUNT(#EXPAND(%'@name'% + '_table')),
                        #EXPAND(%'@name'% + '_table')
                    }
                    #SET(needsDelim, 1)                    
                #END
            #END
        #END
    ], FieldValues);
    RETURN retVal(valueCount <= maxCardinality);
ENDMACRO;

inFile := flatToDataset('~progguide::exampledata::accounts');
y := lowCardinalityFields(inFile);
OUTPUT(y, NAMED('accounts'));
dcamper commented 5 years ago

Close via PR 6f69c3ed44c6cefba20261932089f31478ee8fe9