Open cfcs opened 12 months ago
Hi @cfcs thanks for the issue. I think logical types are not applied, but it's doable.
If you would like to support me, you can prepare a test example (with this issue as the reference); it will save me time during testing.
If you want, you can include your specific case I can test against.
@Stretch sure, here is a set of encoding/decoding functions:
defmodule Enc do
  @moduledoc """
  Encoders that turn Elixir calendar values (or their ISO 8601 string form)
  into the plain integers carried by Avro's date/time logical types.
  """

  # Day zero for the `date` logical type.
  @epoch ~D[1970-01-01]

  @doc """
  Encodes a `date` value as the number of days between 1970-01-01 and `d`.

  Accepts a `Date` struct or a 10-byte ISO 8601 string such as `"2019-10-12"`.
  A 10-byte string that is not a valid date raises from `Date.from_iso8601!/1`.
  """
  def date(%Date{} = d), do: Date.diff(d, @epoch)

  def date(d) when is_binary(d) and byte_size(d) == 10 do
    d |> Date.from_iso8601!() |> date()
  end

  @doc """
  Encodes a `timestamp-millis` value: milliseconds since the Unix epoch.

  Accepts a `DateTime` or an ISO 8601 string. Strings are parsed with
  `DateTime.from_iso8601/1`; the returned offset is discarded and a string
  that fails to parse raises a `MatchError`.
  """
  def timestamp_millis(%DateTime{} = ts), do: DateTime.to_unix(ts, :millisecond)

  def timestamp_millis(ts) when is_binary(ts) do
    {:ok, parsed, _offset} = DateTime.from_iso8601(ts)
    timestamp_millis(parsed)
  end

  @doc """
  Encodes a `timestamp-micros` value: microseconds since the Unix epoch.

  Accepts the same inputs as `timestamp_millis/1`.
  """
  def timestamp_micros(%DateTime{} = ts), do: DateTime.to_unix(ts, :microsecond)

  def timestamp_micros(ts) when is_binary(ts) do
    {:ok, parsed, _offset} = DateTime.from_iso8601(ts)
    timestamp_micros(parsed)
  end

  @doc """
  Encodes a `time-millis` value (an Avro `int`): milliseconds after midnight.

  Accepts a `Time` struct or an ISO 8601 time string; sub-millisecond digits
  are dropped by the floor division.
  """
  def time_millis(%Time{} = t) do
    {secs, micros} = Time.to_seconds_after_midnight(t)
    secs * 1000 + Integer.floor_div(micros, 1000)
  end

  def time_millis(t) when is_binary(t) do
    t |> Time.from_iso8601!() |> time_millis()
  end

  @doc """
  Encodes a `time-micros` value (an Avro `long`): microseconds after midnight.

  Accepts a `Time` struct or an ISO 8601 time string.
  """
  def time_micros(%Time{} = t) do
    {secs, micros} = Time.to_seconds_after_midnight(t)
    secs * 1_000_000 + micros
  end

  def time_micros(t) when is_binary(t) do
    t |> Time.from_iso8601!() |> time_micros()
  end
end
defmodule Dec do
  @moduledoc """
  Decoders that turn the integers carried by Avro's date/time logical types
  back into Elixir `Date`, `DateTime` (UTC) and `Time` values.
  """

  # Day zero for the `date` logical type.
  @epoch ~D[1970-01-01]

  @doc "Decodes a `date`: adds `int` days to 1970-01-01 and returns a `Date`."
  def date(int), do: Date.add(@epoch, int)

  @doc "Decodes a `timestamp-millis` into a `DateTime`; raises on invalid input."
  def timestamp_millis(long), do: DateTime.from_unix!(long, :millisecond)

  @doc "Decodes a `timestamp-micros` into a `DateTime`; raises on invalid input."
  def timestamp_micros(long), do: DateTime.from_unix!(long, :microsecond)

  @doc """
  Decodes a `time-millis` (milliseconds after midnight) into a `Time`
  with millisecond precision.
  """
  def time_millis(long) do
    whole_seconds = Integer.floor_div(long, 1000)
    leftover_millis = Integer.mod(long, 1000)
    # `Time` stores the fraction in microseconds plus a precision (3 digits here).
    Time.from_seconds_after_midnight(whole_seconds, {leftover_millis * 1000, 3})
  end

  @doc """
  Decodes a `time-micros` (microseconds after midnight) into a `Time`
  with microsecond precision.
  """
  def time_micros(long) do
    whole_seconds = Integer.floor_div(long, 1_000_000)
    leftover_micros = Integer.mod(long, 1_000_000)
    Time.from_seconds_after_midnight(whole_seconds, {leftover_micros, 6})
  end
end
And corresponding tests:
# --- Fixtures ---------------------------------------------------------------
# One instant, as an ISO 8601 string and as a UTC DateTime sigil.
ts_iso = "2019-10-12T17:57:42.123456Z"
ts_sig = ~U[2019-10-12 17:57:42.123456Z]
~U[2019-10-12 17:57:42.123456Z] = ts_sig

# The same wall-clock reading at UTC+01:30, i.e. 90 minutes earlier in UTC.
ts_iso_tz = "2019-10-12T17:57:42.123456+0130" # 90 min before
ts_sig_tz = ~U[2019-10-12 16:27:42.123456Z]
~U[2019-10-12 16:27:42.123456Z] = ts_sig_tz

date_iso = "2019-10-12"
date_sig = ~D[2019-10-12]
~D[2019-10-12] = date_sig

time_iso = "17:57:42.123456"
time_sig = ~T[17:57:42.123456]
~T[17:57:42.123456] = time_sig

# --- Encoding ---------------------------------------------------------------
# Each match raises a MatchError if the encoder returns anything else.
for input <- [ts_sig, ts_iso], do: 1570903062123 = Enc.timestamp_millis(input)
1570897662123 = Enc.timestamp_millis(ts_iso_tz)

for input <- [ts_sig, ts_iso], do: 1570903062123456 = Enc.timestamp_micros(input)
1570897662123456 = Enc.timestamp_micros(ts_iso_tz)

for input <- [date_sig, date_iso], do: 18181 = Enc.date(input)

# These are "naive" intervals from midnight, not timezoned:
for input <- [time_sig, time_iso], do: 64662123 = Enc.time_millis(input)
for input <- [time_iso, time_sig], do: 64662123456 = Enc.time_micros(input)

# --- Round trips ------------------------------------------------------------
# Millisecond variants come back truncated to three fractional digits.
~D[2019-10-12] = date_iso |> Enc.date() |> Dec.date()

~U[2019-10-12 17:57:42.123Z] = ts_iso |> Enc.timestamp_millis() |> Dec.timestamp_millis()
~U[2019-10-12 17:57:42.123456Z] = ts_iso |> Enc.timestamp_micros() |> Dec.timestamp_micros()

~U[2019-10-12 16:27:42.123Z] = ts_iso_tz |> Enc.timestamp_millis() |> Dec.timestamp_millis()
~U[2019-10-12 16:27:42.123456Z] = ts_iso_tz |> Enc.timestamp_micros() |> Dec.timestamp_micros()

~T[17:57:42.123] = time_iso |> Enc.time_millis() |> Dec.time_millis()
~T[17:57:42.123456] = time_iso |> Enc.time_micros() |> Dec.time_micros()
dataclasses-avroschema has some examples. Here is an example schema generated using their tooling:
{
"type": "record",
"name": "TimeLogicalTypes",
"fields": [
{
"name": "date",
"type": {
"type": "int",
"logicalType": "date"
},
"default": 18181
},
{
"name": "timestamp_millis",
"type": {
"type": "long",
"logicalType": "timestamp-millis"
},
"default": 1570903062123
},
{
"name": "timestamp_micro",
"type": {
"type": "long",
"logicalType": "timestamp-micros"
},
"default": 1570903062123456
},
{
"name": "daily_time_millis",
"type": {
"type": "int",
"logicalType": "time-millis"
},
"default": 64662123
},
{
"name": "daily_time_micro",
"type": {
"type": "long",
"logicalType": "time-micros"
},
"default": 64662123456
}
],
"doc": "Time logical types"
}
The Python code for that:
# pip install dataclasses-avroschema
import datetime
import dataclasses
import typing
import json
from dataclasses_avroschema import AvroModel, TimeMicro, DateTimeMicro
# Three alternative fixtures for the same wall-clock reading. Each assignment
# overwrites the previous one, so only the last (explicit UTC) value is used
# by the rest of the script.
# UTC / naive:
a_datetime = datetime.datetime(2019, 10, 12, 17, 57, 42, 123456)
# timezoned timestamp (results in 15:57 instead of 17:57 because Avro doesn't preserve TZ info)
a_datetime = datetime.datetime.fromisoformat("2019-10-12T17:57:42.123456+02")
# UTC (this works)
a_datetime = datetime.datetime.fromisoformat("2019-10-12T17:57:42.123456+00")
@dataclasses.dataclass
class TimeLogicalTypes(AvroModel):
    "Time logical types"
    # NOTE(review): the docstring above appears to be emitted as the schema's
    # "doc" field, so its text is kept unchanged.
    #
    # Every field defaults to a value derived from the module-level
    # `a_datetime`; the trailing comment names the Avro logical type that the
    # field's annotation maps to.
    date: datetime.date = a_datetime.date()  # date
    timestamp_millis: datetime.datetime = a_datetime  # timestamp-millis
    timestamp_micro: DateTimeMicro = a_datetime  # timestamp-micros
    daily_time_millis: datetime.time = a_datetime.time()  # time-millis
    daily_time_micro: TimeMicro = a_datetime.time()  # time-micros
# Print the derived Avro schema:
# print(json.dumps(json.loads(TimeLogicalTypes.avro_schema()), indent=2))
# Build an instance from the field defaults, serialize it both ways, and
# decode the binary form back into a second instance.
t1 = TimeLogicalTypes()
enc_json = t1.serialize(serialization_type="avro-json")
enc = t1.serialize() # binary serialization
dec_t1 = TimeLogicalTypes.deserialize(enc) # deserialized t1
# NOTE(review): re_enc_json is computed but never printed or compared below.
re_enc_json = dec_t1.serialize(serialization_type="avro-json")
print('---- avro serialized json')
print(enc_json)
print('---- avro serialized binary')
print(enc.hex())
print('----- iso8861 json:')
# Compare the JSON rendering of the original and the round-tripped instance.
t1_json = t1.to_json()
dec_t1_json = dec_t1.to_json()
print(t1_json)
print(dec_t1_json)
# The round trip through the binary encoding must not change the JSON form.
assert dec_t1_json == t1_json
Sample output from that script (minus the schema above):
---- avro serialized json
b'{"date": 18181, "timestamp_millis": 1570903062123, "timestamp_micro": 1570903062123456,
"daily_time_millis": 64662123, "daily_time_micro": 64662123456}'
---- avro serialized binary
8a9c02d6d9f391b85b80cff4efcbaeca05d6a9d53d80cfcde2e103
----- iso8861 json:
{"date": "2019-10-12", "timestamp_millis": "2019-10-12T17:57:42+0000",
"timestamp_micro": "2019-10-12T17:57:42+0000",
"daily_time_millis": "17:57:42", "daily_time_micro": "17:57:42"}
{"date": "2019-10-12", "timestamp_millis": "2019-10-12T17:57:42+0000",
"timestamp_micro": "2019-10-12T17:57:42+0000",
"daily_time_millis": "17:57:42", "daily_time_micro": "17:57:42"}
It's perhaps worth noting that `~U` is the best we can do on the decoder side because Avro does not preserve timezone info, so it's not a bijection when dealing with timezoned ISO-8601 dates.
For completeness here are the logical types I did not implement and that might also be useful:
- `DateTime.diff/3` with `DateTime.diff(a, b, :millisecond)`, and can apply with `DateTime.add/3`
- Not sure when you would ever want to serialize non-timezoned values, but I guess they put it in the spec, and there is a native Elixir sigil `~N` for them, so: https://avro.apache.org/docs/1.11.1/specification/#local-timestamp-millisecond-precision and https://avro.apache.org/docs/1.11.1/specification/#local-timestamp-microsecond-precision
Thanks @cfcs, I appreciate your help. I'm a bit busy right now, but I will put that thing to work soon ™️
Hi, I have a schema like:
In order to pass a Date sigil into it it has to be converted to "the number of days since unix epoch":
I don't have the expertise to add handling of `logicalType` to the library, but I'm leaving this here in case someone else needs it. :-) ping @Stretch