Open DeflateAwning opened 2 months ago
Is the problem that you have something like:
df = pl.select(a=1,b=2,c=pl.lit('{"foo":[1, 2, 3],"bar":[4,5,6]}'))
df.select(pl.col.c.str.json_decode()).write_ndjson()
# '{"c":{"foo":[1,2,3],"bar":[4,5,6]}}\n'
But you need the data written without the outer column name label?
# {"foo":[1,2,3],"bar":[4,5,6]}\n'
Here are some examples that hopefully help better explain my issue with that.
The issue is that null values are outputed as "geometry":{"type":null,"coordinates":null,"features":null}
instead of just null
.
# Issue #17054
import polars as pl
import json
import tempfile
big_geojson_obj = {"type":"FeatureCollection","features":[{"id":"baddba6f1276e861263d05d9cbecff74","type":"Feature","properties":{"lineColor":"#ffa000","lineWidth":2,"fillColor":"#ffe082","fillOpacity":0.1},"geometry":{"coordinates":[[[-114.42286807,55.199035275],[-118.90384586,53.413681626],[-115.7853142,51.95024781],[-111.63559015,53.23660491],[-114.42286807,55.199035275]]],"type":"Polygon"}}]}
df = pl.DataFrame({
"id": [1, 2, 3, 4],
"name": ["Location1", "Location2", "LocationWithLongGeom", "LocationWithNullGeom"],
"geometry": [
'{"type":"Point","coordinates":[102.0,0.5]}',
'{"type":"Point","coordinates":[103.0,1.0]}',
json.dumps(big_geojson_obj),
None
]
})
print("================ START BASIC WAY ================")
# Just output the column as a String (not what I want)
with tempfile.NamedTemporaryFile(suffix=".ndjson") as f:
df.write_ndjson(f.name)
f.seek(0)
print(f.read().decode())
print("================ END BASIC WAY ================")
print("================ START DEMO GOAL ================")
# Obviously this way is very slow
for row in df.iter_rows(named=True):
row_out = row.copy()
row_out["geometry"] = json.loads(row_out["geometry"]) if row_out["geometry"] is not None else None
print(json.dumps(row_out)) # fill write would happen here
print("================ END DEMO GOAL ================")
print("================ START SUGGESTION 1 ================")
# This is the previous suggestion.
# The issue is that null values are outputed as `"geometry":{"type":null,"coordinates":null,"features":null}` instead of just `null`.
df1 = df.with_columns(pl.col('geometry').str.json_decode())
with tempfile.NamedTemporaryFile(suffix=".ndjson") as f:
df1.write_ndjson(f.name)
f.seek(0)
print(f.read().decode())
print("================ END SUGGESTION 1 ================")
================ START BASIC WAY ================
{"id":1,"name":"Location1","geometry":"{\"type\":\"Point\",\"coordinates\":[102.0,0.5]}"}
{"id":2,"name":"Location2","geometry":"{\"type\":\"Point\",\"coordinates\":[103.0,1.0]}"}
{"id":3,"name":"LocationWithLongGeom","geometry":"{\"type\": \"FeatureCollection\", \"features\": [{\"id\": \"baddba6f1276e861263d05d9cbecff74\", \"type\": \"Feature\", \"properties\": {\"lineColor\": \"#ffa000\", \"lineWidth\": 2, \"fillColor\": \"#ffe082\", \"fillOpacity\": 0.1}, \"geometry\": {\"coordinates\": [[[-114.42286807, 55.199035275], [-118.90384586, 53.413681626], [-115.7853142, 51.95024781], [-111.63559015, 53.23660491], [-114.42286807, 55.199035275]]], \"type\": \"Polygon\"}}]}"}
{"id":4,"name":"LocationWithNullGeom","geometry":null}
================ END BASIC WAY ================
================ START DEMO GOAL ================
{"id": 1, "name": "Location1", "geometry": {"type": "Point", "coordinates": [102.0, 0.5]}}
{"id": 2, "name": "Location2", "geometry": {"type": "Point", "coordinates": [103.0, 1.0]}}
{"id": 3, "name": "LocationWithLongGeom", "geometry": {"type": "FeatureCollection", "features": [{"id": "baddba6f1276e861263d05d9cbecff74", "type": "Feature", "properties": {"lineColor": "#ffa000", "lineWidth": 2, "fillColor": "#ffe082", "fillOpacity": 0.1}, "geometry": {"coordinates": [[[-114.42286807, 55.199035275], [-118.90384586, 53.413681626], [-115.7853142, 51.95024781], [-111.63559015, 53.23660491], [-114.42286807, 55.199035275]]], "type": "Polygon"}}]}}
{"id": 4, "name": "LocationWithNullGeom", "geometry": null}
================ END DEMO GOAL ================
================ START SUGGESTION 1 ================
{"id":1,"name":"Location1","geometry":{"type":"Point","coordinates":[102.0,0.5],"features":null}}
{"id":2,"name":"Location2","geometry":{"type":"Point","coordinates":[103.0,1.0],"features":null}}
{"id":3,"name":"LocationWithLongGeom","geometry":{"type":"FeatureCollection","coordinates":null,"features":[{"id":"baddba6f1276e861263d05d9cbecff74","type":"Feature","properties":{"lineColor":"#ffa000","lineWidth":2,"fillColor":"#ffe082","fillOpacity":0.1},"geometry":{"coordinates":[[[-114.42286807,55.199035275],[-118.90384586,53.413681626],[-115.7853142,51.95024781],[-111.63559015,53.23660491],[-114.42286807,55.199035275]]],"type":"Polygon"}}]}}
{"id":4,"name":"LocationWithNullGeom","geometry":{"type":null,"coordinates":null,"features":null}}
================ END SUGGESTION 1 ================
The issue is that null values are outputed as
"geometry":{"type":null,"coordinates":null,"features":null}
instead of justnull
.
Well this is one of the oldest feature request in polars project, #3462. Your geometry
column has dtype Struct[Utf8, Utf8]
and currently polars Struct
dtype has no validity buffer, so any null
in Struct[Utf8,Utf8]
is represented as {null, null}
.
For an workaround one must venture outside polars
(e.g. move to pyarrow
and manually replace {null, null}
with null
), which is precisely what geojson
spec requires...
My personal thinking is that #3462 should be marked P-high and 1.0 todo list, current implementation of Struct
dtype in polars
makes it hard to inter-op with other dataframe libraries due to its inconsistency with standard arrow spec.
This now produces just null
on main thanks to https://github.com/pola-rs/polars/pull/17522
{"id":4,"name":"LocationWithNullGeom","geometry":null}
Seems as though this is not the case. Please see the following pytest test cases. Several asserts fail.
class Test_nulls_in_structs():
df = pl.DataFrame({
'group': [100, 100, 100, 100, 100],
'id': [1, 2, 3, 4, 5],
'null_case': ['normal', 'lat_null', 'lon_null', 'both_null', 'normal'],
'lat': [1, None, 3, None, 4],
'lon': [10, 20, None, None, 40],
}).with_columns(
lon_lat = pl.struct(lon=pl.col('lon'), lat=pl.col('lat'))
)
def test_converting_null_null_struct_to_null_value(self):
"""Test whether a {null,null} struct can be converted to a null value.
Related to: https://github.com/pola-rs/polars/issues/17054
"""
df1 = self.df
df1 = df1.with_columns(
lon_lat = pl.when(
pl.col('lon_lat').struct.field('lon').is_null()
& pl.col('lon_lat').struct.field('lat').is_null()
).then(None).otherwise(pl.col('lon_lat'))
)
# assert not df1.equals(self.df), "The df should have changed." # <- This assertion fails right now.
df1_both_nulls_row_dicts = df1.filter(pl.col('null_case') == pl.lit('both_null')).to_dicts()
assert len(df1_both_nulls_row_dicts) == 1 # Obvious.
# assert df1_both_nulls_row_dicts[0]['lon_lat'] is None, "The lon_lat struct should be None, not a dict." # <- This assertion fails right now.
df1_both_nulls_json = df1.filter(pl.col('null_case') == pl.lit('both_null')).write_ndjson()
assert df1_both_nulls_json.strip() == '{"group":100,"id":4,"null_case":"both_null","lat":null,"lon":null,"lon_lat":null}'
Re-running the latest tests on main after:
>>> not df1.equals(df)
True
>>> df1_both_nulls_row_dicts[0]['lon_lat'] is None
True
>>> df1_both_nulls_json.strip() == '{"group":100,"id":4,"null_case":"both_null","lat":null,"lon":null,"lon_lat":null}'
True
Appears to be fixed in v1.5.0.
Description
We store a column as geojson in a string in a dataframe. It would be really nice to be able to write this column out as json in the outputted JSON. Very niche use case, but all the alternative hacks are pretty awful.