dapper91 / pydantic-xml

python xml for humans
https://pydantic-xml.readthedocs.io
The Unlicense
141 stars 14 forks source link

Default namespace causes serializing list elements to fail #156

Closed mcleantom closed 5 months ago

mcleantom commented 6 months ago

When I add a default namespace to my class, it causes serializing list elements in a nested BaseXmlModel to fail.

An example of this is:

from pydantic_xml import BaseXmlModel, attr, element
from typing import Literal

class Foo(BaseXmlModel, tag="foo"):
    x: int = attr(default=1)

class Bar(BaseXmlModel, tag="bar"):
    y: int = attr(default=2)

class Bars(BaseXmlModel, tag="bars"):
    bars: list[Bar] = element(default_factory=list)

class Baz(
    BaseXmlModel,
    tag="route",
    nsmap={
        '': "http://www.cirm.org/RTZ/1/0",
        "xsi": "http://www.w3.org/2001/XMLSchema-instance"
    }
):
    version: Literal["1.0"] = attr()
    foo: Foo = element(default_factory=Foo)
    bars: Bars = element(default_factory=Bars)

class Qux(
    BaseXmlModel,
    tag="route",
    nsmap={
        "xsi": "http://www.w3.org/2001/XMLSchema-instance"
    }
):
    version: Literal["1.0"] = attr()
    foo: Foo = element(default_factory=Foo)
    bars: Bars = element(default_factory=Bars)

bar = Baz(
    version="1.0",
    foo=Foo(x=1),
    bars=Bars(bars=[Bar(y=1), Bar(y=2)])
)
print(bar.to_xml(pretty_print=True, skip_empty=True).decode("utf-8"))
print(Baz.from_xml(bar.to_xml(skip_empty=True)))

bar2 = Qux(
    version="1.0",
    foo=Foo(x=1),
    bars=Bars(bars=[Bar(y=1), Bar(y=2)])
)
print(bar2.to_xml(pretty_print=True, skip_empty=True).decode("utf-8"))
print(Qux.from_xml(bar2.to_xml(skip_empty=True)))

This outputs:

<route xmlns="http://www.cirm.org/RTZ/1/0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="1.0">
  <foo x="1"/>
  <bars>
    <bar y="1"/>
    <bar y="2"/>
  </bars>
</route>

version='1.0' foo=Foo(x=1) bars=Bars(bars=[])

For Baz

and for Qux it outputs:

<route xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="1.0">
  <foo x="1"/>
  <bars>
    <bar y="1"/>
    <bar y="2"/>
  </bars>
</route>

version='1.0' foo=Foo(x=1) bars=Bars(bars=[Bar(y=1), Bar(y=2)])

So in the second version, it has correctly serialized the Bars class to be Bars(bars=[Bar(y=1), Bar(y=2)]) whereas in the first version, it has incorrectly serialized it to be Bars(bars=[]). The only difference between the two classes is the nsmap parameter.

How can I add the default namespace parameter, and get Baz to serialize the string correctly?

Thanks

mcleantom commented 6 months ago

I think the error is coming from the ModelProxySerializer class in the deserialize function on the line:

if (sub_element := element.pop_element(self._element_name, self._search_mode)) is not None:

Where self._element_name is bar and this line returns None, however if you modify it to:

>>> sub_element = element.pop_element('{http://www.cirm.org/RTZ/1/0}bar', self._search_mode)
>>> sub_element
<pydantic_xml.element.native.lxml.XmlElement object at 0x7f8eb5ca6ca0>

You get an object back and the rest of the code works:

>>> sourcemap[loc] = sub_element.get_sourceline()
>>> is_element_nill(sub_element)
False
>>> self._model.__xml_serializer__.deserialize(
...                     sub_element, context=context, sourcemap=sourcemap, loc=loc,
...                 )
Bar(y=10)

The value of self._element_name is being set in the initializer of ModelProxySerializer on the line:

self._element_name = QName.from_alias(tag=name, ns=ns, nsmap=nsmap).uri

And the nsmap is not being passed into the Bar class:

>>> nsmap
ChainMap({})
>>> name
'bar'

Still trying to work out why this is, and a fix for it

mcleantom commented 6 months ago

Still trying to work it out but a cleaner unit test for the bug is:

def test_deeply_nested_model_list():

    class TestSubSubModel(BaseXmlModel, tag="subsubmodel"):
        attr1: int = attr()
        attr2: int = attr()

    class TestSubModel(BaseXmlModel, tag="submodel"):
        subsubmodels: list[TestSubSubModel] = element(default_factory=list)

    class TestModel(
        BaseXmlModel,
        tag="model",
        nsmap={"": "http://test1.org"},
    ):
        submodels: TestSubModel = element()

    xml = """
    <model xmlns="http://test1.org">
        <submodel>
            <subsubmodel attr1="1" attr2="2"/>
            <subsubmodel attr1="3" attr2="4"/>
        </submodel>
    </model>
    """

    expected_obj = TestModel(
        submodels=TestSubModel(
            subsubmodels=[
                TestSubSubModel(attr1=1, attr2=2),
                TestSubSubModel(attr1=3, attr2=4),
            ]
        )
    )

    actual_obj = TestModel.from_xml(xml)

    assert actual_obj == expected_obj
mcleantom commented 6 months ago

This bug is happening because the ModelProxySerializer class is using the first initialized instance of the TestSubModel serializer on the line:

return self._model.__xml_serializer__.deserialize(
    sub_element, context=context, sourcemap=sourcemap, loc=loc,
)

And this version of the serializer doesnt have the new namespaces from the parent context.

I can fix this bug for this test if I modify the ModelProxySerializer class to be:

class ModelProxySerializer(BaseModelSerializer):
    @classmethod
    def from_core_schema(cls, schema: pcs.ModelSchema, ctx: Serializer.Context) -> 'ModelProxySerializer':
        model_cls = schema['cls']
        assert issubclass(model_cls, pxml.BaseXmlModel), "unexpected model type"

        name = ctx.entity_path or model_cls.__xml_tag__ or ctx.field_alias or ctx.field_name or model_cls.__name__
        ns = select_ns(ctx.entity_ns, model_cls.__xml_ns__, ctx.parent_ns)
        nsmap = merge_nsmaps(ctx.entity_nsmap, model_cls.__xml_nsmap__, ctx.parent_nsmap)
        search_mode = ctx.search_mode
        computed = ctx.field_computed
        nillable = ctx.nillable
        serializer = ModelSerializer.from_core_schema(schema, ctx)
        return cls(model_cls, name, ns, nsmap, search_mode, computed, nillable, serializer)

    def __init__(
            self,
            model: Type['pxml.BaseXmlModel'],
            name: str,
            ns: Optional[str],
            nsmap: Optional[NsMap],
            search_mode: SearchMode,
            computed: bool,
            nillable: bool,
            serializer: ModelSerializer,
    ):
        self._model = model
        self._element_name = QName.from_alias(tag=name, ns=ns, nsmap=nsmap).uri
        self._nsmap = nsmap
        self._search_mode = search_mode
        self._computed = computed
        self._nillable = nillable
        self._serializer = serializer

    @property
    def model(self) -> Type['pxml.BaseXmlModel']:
        return self._model

    @property
    def model_serializer(self) -> Optional[BaseModelSerializer]:
        return self._model.__xml_serializer__

    @property
    def element_name(self) -> str:
        return self._element_name

    @property
    def nsmap(self) -> Optional[NsMap]:
        return self._nsmap

    def serialize(
            self,
            element: XmlElementWriter,
            value: 'pxml.BaseXmlModel',
            encoded: Dict[str, Any],
            *,
            skip_empty: bool = False,
    ) -> Optional[XmlElementWriter]:
        assert self._model.__xml_serializer__ is not None, f"model {self._model.__name__} is partially initialized"

        if self._nillable and value is None:
            sub_element = element.make_element(self._element_name, nsmap=self._nsmap)
            make_element_nill(sub_element)
            element.append_element(sub_element)
            return sub_element

        if value is None:
            return None

        sub_element = element.make_element(self._element_name, nsmap=self._nsmap)
        self._model.__xml_serializer__.serialize(sub_element, value, encoded, skip_empty=skip_empty)
        if skip_empty and sub_element.is_empty():
            return None
        else:
            element.append_element(sub_element)
            return sub_element

    def deserialize(
            self,
            element: Optional[XmlElementReader],
            *,
            context: Optional[Dict[str, Any]],
            sourcemap: Dict[Location, int],
            loc: Location,
    ) -> Optional['pxml.BaseXmlModel']:
        assert self._model.__xml_serializer__ is not None, f"model {self._model.__name__} is partially initialized"

        if self._computed:
            return None

        if element is None:
            return None

        if (sub_element := element.pop_element(self._element_name, self._search_mode)) is not None:
            sourcemap[loc] = sub_element.get_sourceline()
            if is_element_nill(sub_element):
                return None
            else:
                return self._serializer.deserialize(
                    sub_element, context=context, sourcemap=sourcemap, loc=loc,
                )
        else:
            return None

Where in from_core_schema I have built a new ModelSerializer from scratch which uses the new ctx which includes the parent namespaces. I then use this new serialization object in the return statement in the deserialize function.

However, this breaks a bunch of other unit tests.

mcleantom commented 6 months ago

A few of the tests can be fixed by either making a ModelSerializer or a RootModelSerializer like so:

if schema['schema']['type'] == 'model-fields':
    serializer = ModelSerializer.from_core_schema(schema, ctx)
else:
    serializer = RootModelSerializer.from_core_schema(schema, ctx)

However, this breaks unit tests with self-referencing models such as test_misc.py -> test_self_ref_models.

Without any changes to the code, a self referencing model within another model with a namespace map also breaks:

def test_self_ref_models_with_namespace_map_on_root():
    class TestSubModel(BaseXmlModel, tag='submodel'):
        attr1: int = attr()
        element1: float = element()

        model1: Optional['TestSubModel'] = element(tag='model1', default=None)
        models1: Optional[List['TestSubModel']] = element(tag='item1', default=None)
        models2: Optional[Tuple['TestSubModel', 'TestSubModel']] = element(tag='item2', default=None)

    class TestModel(BaseXmlModel, tag="model", nsmap={'': 'https://wwww.test.org'}):
        submodel: TestSubModel = element(tag='submodel')

    obj = TestModel(
        submodel=TestSubModel(
            attr1=1,
            element1=1.1,
            model1=TestSubModel(
                attr1=2,
                element1=2.2,
            ),
            models1=[
                TestSubModel(
                    attr1=3,
                    element1=3.3,
                ),
                TestSubModel(
                    attr1=4,
                    element1=4.4,
                ),
            ],
            models2=[
                TestSubModel(
                    attr1=5,
                    element1=5.5,
                ),
                TestSubModel(
                    attr1=6,
                    element1=6.6,
                ),
            ],
        )
    )

    returned_obj = TestModel.from_xml(obj.to_xml(skip_empty=True))
    assert returned_obj == obj
mcleantom commented 6 months ago

This bug can also be fixed by passing the namespace all the way down the deserialize path:

def deserialize(
        self,
        element: Optional[XmlElementReader],
        *,
        context: Optional[Dict[str, Any]],
        sourcemap: Dict[Location, int],
        loc: Location,
        nsmap: NsMap  # <-- New parameter
) -> Optional['pxml.BaseXmlModel']:

And adding a new method to BaseModelSerializer:

@staticmethod
def get_element_name(tag: str, nsmap: NsMap) -> str:
    return QName.from_alias(tag=tag, ns=None, nsmap=nsmap).uri

And then replacing ModelProxySerializer.deserialize to be:

    def deserialize(
            self,
            element: Optional[XmlElementReader],
            *,
            context: Optional[Dict[str, Any]],
            sourcemap: Dict[Location, int],
            loc: Location,
            nsmap: NsMap
    ) -> Optional['pxml.BaseXmlModel']:
        assert self._model.__xml_serializer__ is not None, f"model {self._model.__name__} is partially initialized"

        if self._computed:
            return None

        if element is None:
            return None

        if (sub_element := element.pop_element(self.get_element_name(self._tag, nsmap), self._search_mode)) is not None:
            sourcemap[loc] = sub_element.get_sourceline()
            if is_element_nill(sub_element):
                return None
            else:
                return self._model.__xml_serializer__.deserialize(
                    sub_element, context=context, sourcemap=sourcemap, loc=loc, nsmap=nsmap
                )
        else:
            return None

But again, this breaks other unit tests, so maybe not the solution?

mcleantom commented 6 months ago

Actually, doing this passes all tests including my new tests:

    def deserialize(
            self,
            element: Optional[XmlElementReader],
            *,
            context: Optional[Dict[str, Any]],
            sourcemap: Dict[Location, int],
            loc: Location,
            nsmap: NsMap
    ) -> Optional['pxml.BaseXmlModel']:
        assert self._model.__xml_serializer__ is not None, f"model {self._model.__name__} is partially initialized"

        if self._computed:
            return None

        if element is None:
            return None

        sub_element = element.pop_element(self._element_name, self._search_mode)

        if sub_element is None:
            sub_element = element.pop_element(self.get_element_name(self._tag, nsmap), self._search_mode)

        if sub_element is not None:
            sourcemap[loc] = sub_element.get_sourceline()
            if is_element_nill(sub_element):
                return None
            else:
                return self._model.__xml_serializer__.deserialize(
                    sub_element, context=context, sourcemap=sourcemap, loc=loc, nsmap=nsmap
                )
        else:
            return None

Not sure if it is a good solution. @dapper91 thoughts?#

edit: Actually, doesnt pass my test for self referencing models with a namespace on the root element

dapper91 commented 5 months ago

@mcleantom Hi.

Sub-models don't inherit the namespace map from the parent one. So in your example Bars sub-model doesn't inherit the namespace map (therefore the http://www.cirm.org/RTZ/1/0 namespace) from Baz. If you define it explicitly you example should work correctly:

>>> from pydantic_xml import BaseXmlModel, attr, element
>>> from typing import Literal
>>>
>>> NSMAP = {
...     '': "http://www.cirm.org/RTZ/1/0",
...     'xsi': "http://www.w3.org/2001/XMLSchema-instance",
... }
>>>
>>>
>>> class Foo(BaseXmlModel, tag="foo"):
...     x: int = attr(default=1)
...
>>>
>>> class Bar(BaseXmlModel, tag="bar"):
...     y: int = attr(default=2)
...
>>>
>>> class Bars(BaseXmlModel, tag="bars", nsmap=NSMAP):
...     bars: list[Bar] = element(default_factory=list)
...
>>>
>>> class Baz(BaseXmlModel, tag="route", nsmap=NSMAP):
...     version: Literal["1.0"] = attr()
...     foo: Foo = element(default_factory=Foo)
...     bars: Bars = element(default_factory=Bars)
...
>>> bar = Baz(
...     version="1.0",
...     foo=Foo(x=1),
...     bars=Bars(bars=[Bar(y=1), Bar(y=2)])
... )
>>> print(bar.to_xml(pretty_print=True, skip_empty=True).decode("utf-8"))
<route xmlns="http://www.cirm.org/RTZ/1/0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="1.0">
  <foo x="1"/>
  <bars>
    <bar y="1"/>
    <bar y="2"/>
  </bars>
</route>

>>> print(Baz.from_xml(bar.to_xml(skip_empty=True)))
version='1.0' foo=Foo(x=1) bars=Bars(bars=[Bar(y=1), Bar(y=2)])