OpenEnergyPlatform / oemetadata

Repository for the Open Energy Family metadata. Contains metadata templates, examples and schemas. For metadata conversion see https://github.com/OpenEnergyPlatform/omi
https://openenergyplatform.github.io/oemetadata/
MIT License
21 stars 3 forks source link

Upgrade script to migrate all metadata currently on oep to v1.5.1 #100

Closed wingechr closed 1 year ago

wingechr commented 1 year ago
import json
import re

from omi.dialects.oep import OEP_V_1_5_Dialect

# JSON serializations of every value that counts as "empty".
# Built once at import time instead of on every (recursive) call.
_EMPTY_JSON_REPRS = frozenset(
    json.dumps(empty)
    for empty in (None, "", "null", [None], [], [""], ["null"], [{}], {})
)


def remove_nulls(obj):
    """Recursively remove empty dict entries in place.

    Empty structures cause errors in validation, so every dict entry whose
    value is one of ``None``, ``""``, ``"null"``, ``[None]``, ``[]``,
    ``[""]``, ``["null"]``, ``[{}]`` or ``{}`` is deleted. Children are
    cleaned before their parent is checked, so a value that only becomes
    empty after cleaning (e.g. ``[{"a": None}]`` -> ``[{}]``) is removed too.

    Only dict entries are deleted: list elements are visited recursively but
    never removed from their list, so ``[{}, {"a": 1}]`` is kept as is.

    Args:
        obj: arbitrary JSON-like python object; dicts are modified in place,
            anything that is neither a list nor a dict is ignored.

    Returns:
        None
    """
    if isinstance(obj, list):
        # recurse into list elements (the elements themselves are kept)
        for item in obj:
            remove_nulls(item)
    elif isinstance(obj, dict):
        # iterate over a snapshot of the keys because entries get deleted
        for key in tuple(obj):
            value = obj[key]
            # clean the children first ...
            remove_nulls(value)
            # ... then drop the entry if what is left counts as empty.
            # Comparing the JSON text keeps 0 / False apart from None and "".
            if json.dumps(value) in _EMPTY_JSON_REPRS:
                del obj[key]

def fix_date(obj, key):
    """Fix the date string stored in ``obj[key]`` in place.

    Two repairs are applied:

    * A truncated time suffix (``T0``, ``T00`` or ``T00:``) after a date made
      of digits and dashes is stripped, e.g. ``"2020-01-01T00:"`` becomes
      ``"2020-01-01"``.
    * Known values that cannot be parsed as a datetime are moved into
      ``_{FIELD}``, e.g. ``"_start"`` instead of ``"start"``.

    A missing key or a non-string value is left untouched.

    Args:
        obj (dict): mapping that may contain ``key``; modified in place
        key (str): name of the date field

    Returns:
        None
    """

    KNOWN_SPECIAL_CASES = [
        "current",
        "updated regulary",
        "valid for the 1st day of the 2nd 8-day period",
        "2012 (weather year)",
        "v1.2",
    ]

    if key not in obj:
        return

    val = obj[key]
    if not isinstance(val, str):
        # nothing to repair here (re.match would raise TypeError)
        return

    # date followed by a cut-off time part: keep only the date
    truncated = re.match(r"^([0-9-]+)T(?:0|00|00:)$", val)
    if truncated:
        obj[key] = truncated.group(1)

    # cannot be parsed as datetime
    # => move into _{FIELD}, e.g. "_start" instead of "start"
    if val in KNOWN_SPECIAL_CASES:
        del obj[key]
        obj[f"_{key}"] = val

def unwrap_single_item_list(obj, key):
    """Replace a list with exactly 1 item by that item (in place).

    Nothing happens if ``key`` is missing or its value is not a list.

    Example:
        >>> obj = {"x": [{"y": 1}]}
        >>> unwrap_single_item_list(obj, "x")
        >>> obj
        {'x': {'y': 1}}

    Args:
        obj (dict): mapping that may contain ``key``; modified in place
        key (str): name of the field to unwrap

    Raises:
        AssertionError: if the value is a list that does not contain exactly
            one item (``obj`` is left unchanged in that case)
    """
    val = obj.get(key)
    if not isinstance(val, list):
        return

    # Explicit check instead of an `assert` statement: asserts are stripped
    # under `python -O`, which would silently drop all items but the first.
    if len(val) != 1:
        raise AssertionError(
            f"expected exactly 1 item in {key!r}, found {len(val)}"
        )

    obj[key] = val[0]

def wrap_dict_to_single_item_list(obj, key):
    """Turn a dict value into a one-element list holding that dict (in place).

    A missing key, or a value that is not a dict, is left untouched.

    Example:
        >>> obj = {"x": {"y": 1}}
        >>> wrap_dict_to_single_item_list(obj, "x")
        >>> obj
        {'x': [{'y': 1}]}

    """
    current = obj.get(key)
    if not isinstance(current, dict):
        # missing, already a list, or a scalar: nothing to wrap
        return
    obj[key] = [current]

def fix_metadata(metadata, table_name):
    """Repair a metadata document and pass it through the OEP v1.5 dialect.

    The input is never modified: it is first converted into a fresh python
    object, which is then repaired, serialized, parsed by the dialect and
    compiled again.

    Args:
        metadata (obj|str): metadata as python object or as json string
        table_name (str): name of the table, used as ``id`` when the
            metadata has no (non-empty) ``id`` of its own

    Returns:
        the metadata object compiled by the dialect
    """
    dialect = OEP_V_1_5_Dialect()

    # --- load: strings are parsed, objects take a round trip through json
    as_json = metadata if isinstance(metadata, str) else json.dumps(metadata)
    metadata = json.loads(as_json)

    # --- fix: top-level fields
    remove_nulls(metadata)
    if not metadata.get("id"):
        metadata["id"] = table_name
    for date_key in ("publicationDate", "referenceDate"):
        fix_date(metadata, date_key)
    wrap_dict_to_single_item_list(metadata, "sources")
    for list_key in ("spatial", "temporal"):
        unwrap_single_item_list(metadata, list_key)

    # --- fix: fields nested in "temporal"
    if "temporal" in metadata:
        temporal = metadata["temporal"]
        fix_date(temporal, "referenceDate")
        wrap_dict_to_single_item_list(temporal, "timeseries")
        for series in temporal.get("timeseries", []):
            fix_date(series, "start")
            fix_date(series, "end")

    # --- validate: parse the serialized document and compile it again
    serialized = json.dumps(metadata, ensure_ascii=False)
    parsed = dialect.parse(serialized)
    return dialect.compile(parsed)
wingechr commented 1 year ago

@ludee @jh-RLI

wingechr commented 1 year ago

importantly, it seems that the dialect parser just silently drops additional fields. that's probably not good

chrwm commented 1 year ago

importantly, it seems that the dialect parser just silently drops additional fields. that's probably not good

Do you mean fields whose values are null (see issue here) or other fields? Can you specify "additional"?

wingechr commented 1 year ago

Do you mean fields whose values are null (see issue here) or other fields? Can you specify "additional"?

fields that are not part of the metadata specification (user defined fields)

jh-RLI commented 1 year ago

I documented a solution to keep additional values without changing omi too much: https://github.com/OpenEnergyPlatform/omi (scroll down to the section "Additional Fields - not related to the OEMetadata specification"; I forgot to make it a linkable section :/)

wingechr commented 1 year ago

all metadata has been upgraded to 1.5.2 with the release of 0.12.2