capitalone / DataProfiler

What's in your data? Extract schema, statistics and entities from datasets
https://capitalone.github.io/DataProfiler
Apache License 2.0
1.41k stars 157 forks source link

Column profiled as int but should be text/string #1130

Open carlsonp opened 4 months ago

carlsonp commented 4 months ago

General Information:

Describe the bug:

I have a parquet file column org_number that should be treated as text but is being profiled into an int.

Pandas info reports it as an object:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26943 entries, 0 to 26942
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
<snip>
 2   org_number         26943 non-null  object
<snip>

When I use Pandas describe(), it doesn't show any numeric statistics like min, max, stddev, etc. which is correct.

The output from the profiler:

{
            "column_name": "org_number",
            "data_type": "int",
            "categorical": false,
            "order": "random",
            "samples": "['01321', '07618', '08257', '02321', '09123']",
            "statistics": {
                "min": 1.0,
                "max": 105121.0,
                "mode": "[6781.24]",
                "median": 6573.749,
                "sum": 220034705.0,
                "mean": 8166.6743,
                "variance": 150256092.2856,
                "stddev": 12257.8992,
                "skewness": 5.3242,
                "kurtosis": 30.6063,
                "histogram": {
                    "bin_edges": "[  1.        , 363.48275862, ... , 104758.51724138, 105121.        ]",
                    "bin_counts": "[ 259.,  539.,  126., 1006., 2057., ... , 0., 0., 0., 0., 7.]"
                },
                "quantiles": {
                    "0": 3350.0226,
                    "1": 6573.749,
                    "2": 8726.115
                },
                "median_abs_deviation": 2195.6598,
                "num_zeros": 0,
                "num_negatives": 0,
                "times": {
                    "min": 0.0001,
                    "max": 0.0001,
                    "sum": 0.0001,
                    "variance": 0.0002,
                    "skewness": 0.0046,
                    "kurtosis": 0.0046,
                    "histogram_and_quantiles": 0.0042,
                    "num_zeros": 0.0002,
                    "num_negatives": 0.0001
                },
                "unique_count": 1367,
                "unique_ratio": 0.0507,
                "sample_size": 26943,
                "null_count": 0,
                "null_types": [],
                "null_types_index": {},
                "data_type_representation": {
                    "datetime": 0.0,
                    "int": 1.0,
                    "float": 1.0,
                    "string": 1.0
                }
            }
        },

To Reproduce:

The code I'm using:

data = dp.Data(filename)
profile_options = dp.ProfilerOptions()

df = pd.read_parquet(filename)
print(df.info())

profile_options.set({
    "structured_options.data_labeler.is_enabled": False,
    "unstructured_options.data_labeler.is_enabled": False,
    "structured_options.correlation.is_enabled": False,
    "structured_options.multiprocess.is_enabled": True,
    "structured_options.chi2_homogeneity.is_enabled": False,
    "structured_options.category.max_sample_size_to_check_stop_condition": 1,
    "structured_options.category.stop_condition_unique_value_ratio": 0.001,
    "structured_options.sampling_ratio": 1.0,
    "structured_options.null_replication_metrics.is_enabled": False
})

profile = dp.Profiler(data, options=profile_options)
human_readable_report = profile.report(report_options={"output_format":"pretty"})

with open("reportfile.json", "w") as outfile:
    outfile.write(json.dumps(human_readable_report, indent=4))

I can't provide the raw data but I can test things. The data is interesting in that it's almost integer, but many of the entries have 0's prepended as you can see in the samples.

Expected behavior:

I would expect the type to be string/text.

Screenshots:

Additional context:

carlsonp commented 4 months ago

And here's an even simpler reproducible example:

import json
import dataprofiler as dp
import pandas as pd

filename = "myfile.parquet"
df = pd.DataFrame(data={'org_number': ['0001', '0002', '0003']})
df.to_parquet(filename)

data = dp.Data(filename)
profile_options = dp.ProfilerOptions()

df = pd.read_parquet(filename)
print(df.info())
print(df['org_number'].describe())

profile_options.set({
    "structured_options.data_labeler.is_enabled": False,
    "unstructured_options.data_labeler.is_enabled": False,
    "structured_options.correlation.is_enabled": False,
    "structured_options.multiprocess.is_enabled": True,
    "structured_options.chi2_homogeneity.is_enabled": False,
    "structured_options.category.max_sample_size_to_check_stop_condition": 1,
    "structured_options.category.stop_condition_unique_value_ratio": 0.001,
    "structured_options.sampling_ratio": 1.0,
    "structured_options.null_replication_metrics.is_enabled": False
})

profile = dp.Profiler(data, options=profile_options)
human_readable_report = profile.report(report_options={"output_format":"pretty"})

print(json.dumps(human_readable_report, indent=4))

And the output:

{
    "global_stats": {
        "samples_used": 3,
        "column_count": 1,
        "row_count": 3,
        "row_has_null_ratio": 0.0,
        "row_is_null_ratio": 0.0,
        "unique_row_ratio": 1.0,
        "duplicate_row_count": 0,
        "file_type": "parquet",
        "encoding": null,
        "correlation_matrix": null,
        "chi2_matrix": null,
        "profile_schema": {
            "org_number": [
                0
            ]
        },
        "times": {
            "row_stats": 0.0005
        }
    },
    "data_stats": [
        {
            "column_name": "org_number",
            "data_type": "int",
            "categorical": false,
            "order": "ascending",
            "samples": "['0001', '0003', '0002']",
            "statistics": {
                "min": 1.0,
                "max": 3.0,
                "mode": "[1.001, 2.001, 2.999]",
                "median": 2.001,
                "sum": 6.0,
                "mean": 2.0,
                "variance": 1.0,
                "stddev": 1.0,
                "skewness": 0.0,
                "kurtosis": NaN,
                "histogram": {
                    "bin_edges": "[1.        , 1.66666667, 2.33333333, 3.        ]",
                    "bin_counts": "[1., 1., 1.]"
                },
                "quantiles": {
                    "0": 1.0015,
                    "1": 2.001,
                    "2": 2.9985
                },
                "median_abs_deviation": 0.998,
                "num_zeros": 0,
                "num_negatives": 0,
                "times": {
                    "min": 0.0,
                    "max": 0.0,
                    "sum": 0.0001,
                    "variance": 0.0001,
                    "skewness": 0.0003,
                    "kurtosis": 0.0002,
                    "histogram_and_quantiles": 0.0011,
                    "num_zeros": 0.0001,
                    "num_negatives": 0.0001
                },
                "unique_count": 3,
                "unique_ratio": 1.0,
                "sample_size": 3,
                "null_count": 0,
                "null_types": [],
                "null_types_index": {},
                "data_type_representation": {
                    "datetime": 0.0,
                    "int": 1.0,
                    "float": 1.0,
                    "string": 1.0
                }
            }
        }
    ]
}