pola-rs / polars

Dataframes powered by a multithreaded, vectorized query engine, written in Rust
https://docs.pola.rs
Other
29.2k stars 1.84k forks source link

Feature Request: add to `df.write_ndjson(json_columns: list[str])`, for columns to be decoded and written out as JSON #17054

Open DeflateAwning opened 2 months ago

DeflateAwning commented 2 months ago

Description

We store a column as geojson in a string in a dataframe. It would be really nice to be able to write this column out as json in the outputted JSON. Very niche use case, but all the alternative hacks are pretty awful.

cmdlineluser commented 2 months ago

Is the problem that you have something like:

df = pl.select(a=1,b=2,c=pl.lit('{"foo":[1, 2, 3],"bar":[4,5,6]}'))

df.select(pl.col.c.str.json_decode()).write_ndjson()

# '{"c":{"foo":[1,2,3],"bar":[4,5,6]}}\n'

But you need the data written without the outer column name label?

# {"foo":[1,2,3],"bar":[4,5,6]}\n'
DeflateAwning commented 2 months ago

Here are some examples that hopefully help better explain my issue with that.

The issue is that null values are outputed as "geometry":{"type":null,"coordinates":null,"features":null} instead of just null.

# Issue #17054
import polars as pl
import json
import tempfile

big_geojson_obj = {"type":"FeatureCollection","features":[{"id":"baddba6f1276e861263d05d9cbecff74","type":"Feature","properties":{"lineColor":"#ffa000","lineWidth":2,"fillColor":"#ffe082","fillOpacity":0.1},"geometry":{"coordinates":[[[-114.42286807,55.199035275],[-118.90384586,53.413681626],[-115.7853142,51.95024781],[-111.63559015,53.23660491],[-114.42286807,55.199035275]]],"type":"Polygon"}}]}

df = pl.DataFrame({
    "id": [1, 2, 3, 4],
    "name": ["Location1", "Location2", "LocationWithLongGeom", "LocationWithNullGeom"],
    "geometry": [
        '{"type":"Point","coordinates":[102.0,0.5]}',
        '{"type":"Point","coordinates":[103.0,1.0]}',
        json.dumps(big_geojson_obj),
        None
    ]
})

print("================ START BASIC WAY ================")
# Just output the column as a String (not what I want)
with tempfile.NamedTemporaryFile(suffix=".ndjson") as f:
    df.write_ndjson(f.name)
    f.seek(0)
    print(f.read().decode())
print("================ END BASIC WAY ================")

print("================ START DEMO GOAL ================")
# Obviously this way is very slow
for row in df.iter_rows(named=True):
    row_out = row.copy()
    row_out["geometry"] = json.loads(row_out["geometry"]) if row_out["geometry"] is not None else None
    print(json.dumps(row_out))  # fill write would happen here
print("================ END DEMO GOAL ================")

print("================ START SUGGESTION 1 ================")
# This is the previous suggestion.
# The issue is that null values are outputed as `"geometry":{"type":null,"coordinates":null,"features":null}` instead of just `null`.
df1 = df.with_columns(pl.col('geometry').str.json_decode())
with tempfile.NamedTemporaryFile(suffix=".ndjson") as f:
    df1.write_ndjson(f.name)
    f.seek(0)
    print(f.read().decode())
print("================ END SUGGESTION 1 ================")
================ START BASIC WAY ================
{"id":1,"name":"Location1","geometry":"{\"type\":\"Point\",\"coordinates\":[102.0,0.5]}"}
{"id":2,"name":"Location2","geometry":"{\"type\":\"Point\",\"coordinates\":[103.0,1.0]}"}
{"id":3,"name":"LocationWithLongGeom","geometry":"{\"type\": \"FeatureCollection\", \"features\": [{\"id\": \"baddba6f1276e861263d05d9cbecff74\", \"type\": \"Feature\", \"properties\": {\"lineColor\": \"#ffa000\", \"lineWidth\": 2, \"fillColor\": \"#ffe082\", \"fillOpacity\": 0.1}, \"geometry\": {\"coordinates\": [[[-114.42286807, 55.199035275], [-118.90384586, 53.413681626], [-115.7853142, 51.95024781], [-111.63559015, 53.23660491], [-114.42286807, 55.199035275]]], \"type\": \"Polygon\"}}]}"}
{"id":4,"name":"LocationWithNullGeom","geometry":null}

================ END BASIC WAY ================
================ START DEMO GOAL ================
{"id": 1, "name": "Location1", "geometry": {"type": "Point", "coordinates": [102.0, 0.5]}}
{"id": 2, "name": "Location2", "geometry": {"type": "Point", "coordinates": [103.0, 1.0]}}
{"id": 3, "name": "LocationWithLongGeom", "geometry": {"type": "FeatureCollection", "features": [{"id": "baddba6f1276e861263d05d9cbecff74", "type": "Feature", "properties": {"lineColor": "#ffa000", "lineWidth": 2, "fillColor": "#ffe082", "fillOpacity": 0.1}, "geometry": {"coordinates": [[[-114.42286807, 55.199035275], [-118.90384586, 53.413681626], [-115.7853142, 51.95024781], [-111.63559015, 53.23660491], [-114.42286807, 55.199035275]]], "type": "Polygon"}}]}}
{"id": 4, "name": "LocationWithNullGeom", "geometry": null}
================ END DEMO GOAL ================
================ START SUGGESTION 1 ================
{"id":1,"name":"Location1","geometry":{"type":"Point","coordinates":[102.0,0.5],"features":null}}
{"id":2,"name":"Location2","geometry":{"type":"Point","coordinates":[103.0,1.0],"features":null}}
{"id":3,"name":"LocationWithLongGeom","geometry":{"type":"FeatureCollection","coordinates":null,"features":[{"id":"baddba6f1276e861263d05d9cbecff74","type":"Feature","properties":{"lineColor":"#ffa000","lineWidth":2,"fillColor":"#ffe082","fillOpacity":0.1},"geometry":{"coordinates":[[[-114.42286807,55.199035275],[-118.90384586,53.413681626],[-115.7853142,51.95024781],[-111.63559015,53.23660491],[-114.42286807,55.199035275]]],"type":"Polygon"}}]}}
{"id":4,"name":"LocationWithNullGeom","geometry":{"type":null,"coordinates":null,"features":null}}

================ END SUGGESTION 1 ================
cjackal commented 2 months ago

The issue is that null values are outputed as "geometry":{"type":null,"coordinates":null,"features":null} instead of just null.

Well this is one of the oldest feature request in polars project, #3462. Your geometry column has dtype Struct[Utf8, Utf8] and currently polars Struct dtype has no validity buffer, so any null in Struct[Utf8,Utf8] is represented as {null, null}.

For an workaround one must venture outside polars (e.g. move to pyarrow and manually replace {null, null} with null), which is precisely what geojson spec requires...

My personal thinking is that #3462 should be marked P-high and 1.0 todo list, current implementation of Struct dtype in polars makes it hard to inter-op with other dataframe libraries due to its inconsistency with standard arrow spec.

cmdlineluser commented 2 months ago

This now produces just null on main thanks to https://github.com/pola-rs/polars/pull/17522

{"id":4,"name":"LocationWithNullGeom","geometry":null}
DeflateAwning commented 1 month ago

Seems as though this is not the case. Please see the following pytest test cases. Several asserts fail.


class Test_nulls_in_structs():
    df = pl.DataFrame({
        'group': [100, 100, 100, 100, 100],
        'id': [1, 2, 3, 4, 5],
        'null_case': ['normal', 'lat_null', 'lon_null', 'both_null', 'normal'],
        'lat': [1, None, 3, None, 4],
        'lon': [10, 20, None, None, 40],
    }).with_columns(
        lon_lat = pl.struct(lon=pl.col('lon'), lat=pl.col('lat'))
    )

    def test_converting_null_null_struct_to_null_value(self):
        """Test whether a {null,null} struct can be converted to a null value.
        Related to: https://github.com/pola-rs/polars/issues/17054
        """

        df1 = self.df
        df1 = df1.with_columns(
            lon_lat = pl.when(
                pl.col('lon_lat').struct.field('lon').is_null()
                & pl.col('lon_lat').struct.field('lat').is_null()
            ).then(None).otherwise(pl.col('lon_lat'))
        )

        # assert not df1.equals(self.df), "The df should have changed." # <- This assertion fails right now.
        df1_both_nulls_row_dicts = df1.filter(pl.col('null_case') == pl.lit('both_null')).to_dicts()
        assert len(df1_both_nulls_row_dicts) == 1 # Obvious.
        # assert df1_both_nulls_row_dicts[0]['lon_lat'] is None, "The lon_lat struct should be None, not a dict." # <- This assertion fails right now.

        df1_both_nulls_json = df1.filter(pl.col('null_case') == pl.lit('both_null')).write_ndjson()
        assert df1_both_nulls_json.strip() == '{"group":100,"id":4,"null_case":"both_null","lat":null,"lon":null,"lon_lat":null}'
cmdlineluser commented 1 month ago

Re-running the latest tests on main after:

>>> not df1.equals(df)
True
>>> df1_both_nulls_row_dicts[0]['lon_lat'] is None
True
>>> df1_both_nulls_json.strip() == '{"group":100,"id":4,"null_case":"both_null","lat":null,"lon":null,"lon_lat":null}'
True
DeflateAwning commented 2 weeks ago

Appears to be fixed in v1.5.0.