dapper91 / pydantic-xml

python xml for humans
https://pydantic-xml.readthedocs.io
The Unlicense
155 stars 16 forks source link

Setting `nsmap` breaks XML parsing with nested class #91

Closed lmmx closed 1 year ago

lmmx commented 1 year ago

I tried to parse an XML document with a schema URL and got a failure, which I think only occurs when there's a namespace set on the XML model.

I read the tests and followed the exact layout used there for nested submodels in case it was my mistake (since this is my first time using this library), but it seems to only occur when a namespace schema URL is in the top model.

It doesn't seem to be due to this namespace map being inherited, because setting an empty nsmap on the first submodel (to 'undo' the inherited namespace) doesn't change the error either.

The following test case demonstrates the bug, adapted from this test

from pytest import mark

from pydantic_xml import BaseXmlModel, RootXmlModel, element

@mark.parametrize("schema_url", ["http://www.sitemaps.org/schemas/sitemap/0.9"])
@mark.parametrize("use_ns", [False, True])
def test_nested_root_submodel_element_extraction(use_ns, schema_url):
    """Parse a ``<urlset>`` of nested ``<url>`` sub-models, with and without a namespace.

    Parameters
    ----------
    use_ns:
        When true, the document declares ``schema_url`` as its default
        namespace and ``UrlSet`` is given the matching ``nsmap``.
    schema_url:
        Namespace URI used for the default namespace when ``use_ns`` is true.

    The ``use_ns=True`` case reproduces the reported ``ValidationError``
    ("Field required" for ``loc`` and ``lastmod``) raised by ``from_xml``.
    """
    # Default-namespace mapping for the top-level model only.
    nsmap = {"": schema_url} if use_ns else {}

    class Loc(RootXmlModel, tag="loc"):
        root: int

    class LastMod(RootXmlModel, tag="lastmod"):
        root: int

    # NOTE: the explicit empty nsmap is deliberate -- it is the configuration
    # under report.  Per the maintainer's reply, a sub-model does not inherit
    # the parent's nsmap, so with use_ns=True `loc`/`lastmod` are looked up
    # without a namespace.  Passing the same nsmap as UrlSet is the fix.
    class Url(BaseXmlModel, nsmap={}):
        loc: Loc
        lastmod: LastMod

    class UrlSet(
        BaseXmlModel,
        tag="urlset",
        nsmap=nsmap,
    ):
        url: list[Url] = element()

    ns = f' xmlns="{schema_url}"' if use_ns else ""
    xml = f"""
    <urlset{ns}>
        <url>
            <loc>1</loc>
            <lastmod>10</lastmod>
        </url>
        <url>
            <loc>2</loc>
            <lastmod>20</lastmod>
        </url>
    </urlset>
    """

    actual_obj = UrlSet.from_xml(xml)
    expected_obj = UrlSet(
        url=[
            Url(loc=Loc(1), lastmod=LastMod(10)),
            Url(loc=Loc(2), lastmod=LastMod(20)),
        ]
    )

    # Previously expected_obj was built but never compared, so the passing
    # case checked nothing beyond "from_xml did not raise".
    assert actual_obj == expected_obj

Result

(Abbreviated for clarity)

E       pydantic_core._pydantic_core.ValidationError: 2 validation errors for Url                                                                                                                                                                                                                                                                 
E       loc                                                                                                                                                                                                                                                                                                                                       
E         Field required [type=missing, input_value={}, input_type=dict]                                        
E           For further information visit https://errors.pydantic.dev/2.1/v/missing                             
E       lastmod                                                                                                 
E         Field required [type=missing, input_value={}, input_type=dict]                                        
E           For further information visit https://errors.pydantic.dev/2.1/v/missing                             
Click to show entire pytest output Result: ```sh (magicscrape) louis 🚶 ~/dev/testing/pydantic_xml $ pytest submodels_ns.py ``` ```py [615/1944] ========================================================================== test session starts ========================================================================== platform linux -- Python 3.10.12, pytest-7.4.0, pluggy-1.2.0 rootdir: /home/louis/dev/testing/pydantic_xml plugins: anyio-3.7.1 collected 2 items submodels_ns.py .F [100%] =============================================================================== FAILURES ================================================================================ ____________________________________ test_nested_root_submodel_element_extraction[True-http://www.sitemaps.org/schemas/sitemap/0.9] _____________________________________ use_ns = True, schema_url = 'http://www.sitemaps.org/schemas/sitemap/0.9' @mark.parametrize("schema_url", ["http://www.sitemaps.org/schemas/sitemap/0.9"]) @mark.parametrize("use_ns", [False, True]) def test_nested_root_submodel_element_extraction(use_ns, schema_url): if use_ns: NSMAP = {"": schema_url} else: NSMAP = {} class Loc(RootXmlModel, tag="loc"): root: int class LastMod(RootXmlModel, tag="lastmod"): root: int class Url(BaseXmlModel, nsmap={}): loc: Loc lastmod: LastMod class UrlSet( BaseXmlModel, tag="urlset", nsmap=NSMAP, ): url: list[Url] = element() ns = f' xmlns="{schema_url}"' if use_ns else "" xml = f""" 1 10 2 20 """ > actual_obj = UrlSet.from_xml(xml) submodels_ns.py:45: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ ../../../miniconda3/envs/magicscrape/lib/python3.10/site-packages/pydantic_xml/model.py:385: in from_xml return cls.from_xml_tree(etree.fromstring(source), context=context) ../../../miniconda3/envs/magicscrape/lib/python3.10/site-packages/pydantic_xml/model.py:361: in from_xml_tree 
cls.__xml_serializer__.deserialize( ../../../miniconda3/envs/magicscrape/lib/python3.10/site-packages/pydantic_xml/serializers/factories/model.py:164: in deserialize result = { ../../../miniconda3/envs/magicscrape/lib/python3.10/site-packages/pydantic_xml/serializers/factories/model.py:167: in if (field_value := field_serializer.deserialize(element, context=context)) ../../../miniconda3/envs/magicscrape/lib/python3.10/site-packages/pydantic_xml/serializers/factories/homogeneous.py:60: in deserialize while (value := self._inner_serializer.deserialize(element, context=context)) is not None: ../../../miniconda3/envs/magicscrape/lib/python3.10/site-packages/pydantic_xml/serializers/factories/model.py:365: in deserialize return self._model.__xml_serializer__.deserialize( _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ self = element = def deserialize( self, element: Optional[XmlElementReader], *, context: Optional[Dict[str, Any]], ) -> Optional["pxml.BaseXmlModel"]: if element is None: return None result = { self._fields_validation_aliases.get(field_name, field_name): field_value for field_name, field_serializer in self._field_serializers.items() if (field_value := field_serializer.deserialize(element, context=context)) is not None } # import traceback as tb # breakpoint() > return self._model.model_validate(result, strict=False, context=context) E pydantic_core._pydantic_core.ValidationError: 2 validation errors for Url E loc E Field required [type=missing, input_value={}, input_type=dict] E For further information visit https://errors.pydantic.dev/2.1/v/missing E lastmod E Field required [type=missing, input_value={}, input_type=dict] E For further information visit https://errors.pydantic.dev/2.1/v/missing ../../../miniconda3/envs/magicscrape/lib/python3.10/site-packages/pydantic_xml/serializers/factories/model.py:172: ValidationError 
======================================================================== short test summary info ======================================================================== FAILED submodels_ns.py::test_nested_root_submodel_element_extraction[True-http://www.sitemaps.org/schemas/sitemap/0.9] - pydantic_core._pydantic_core.ValidationError: 2 validation errors for Url ====================================================================== 1 failed, 1 passed in 0.44s ====================================================================== ```

I also verified that it isn't a regression due to the move to Pydantic 2, the same result occurs with v1 syntax:

Click to show Pydantic v1 version ```py from pytest import mark from pydantic_xml import BaseXmlModel, element @mark.parametrize("schema_url", ["http://www.sitemaps.org/schemas/sitemap/0.9"]) @mark.parametrize("use_ns", [False, True]) def test_nested_root_submodel_element_extraction(use_ns, schema_url): if use_ns: NSMAP = {"": schema_url} else: NSMAP = {} class Loc(BaseXmlModel, tag="loc"): __root__: int class LastMod(BaseXmlModel, tag="lastmod"): __root__: int class Url(BaseXmlModel, nsmap={}): loc: Loc lastmod: LastMod class UrlSet( BaseXmlModel, tag="urlset", nsmap=NSMAP, ): url: list[Url] = element() ns = f' xmlns="{schema_url}"' if use_ns else "" xml = f""" 1 10 2 20 """ actual_obj = UrlSet.from_xml(xml) expected_obj = UrlSet( url=[ Url(loc=Loc(__root__=1), lastmod=LastMod(__root__=10)), Url(loc=Loc(__root__=2), lastmod=LastMod(__root__=20)), ] ) ``` Result: ```sh (pydanticv1xml) louis 🚶 ~/dev/testing/pydantic_xml $ python -m pytest submodels_ns_v1.py ``` ```py [99/1944] ========================================================================== test session starts ========================================================================== platform linux -- Python 3.10.12, pytest-7.4.0, pluggy-1.2.0 rootdir: /home/louis/dev/testing/pydantic_xml collected 2 items submodels_ns_v1.py .F [100%] =============================================================================== FAILURES ================================================================================ ____________________________________ test_nested_root_submodel_element_extraction[True-http://www.sitemaps.org/schemas/sitemap/0.9] _____________________________________ use_ns = True, schema_url = 'http://www.sitemaps.org/schemas/sitemap/0.9' @mark.parametrize("schema_url", ["http://www.sitemaps.org/schemas/sitemap/0.9"]) @mark.parametrize("use_ns", [False, True]) def test_nested_root_submodel_element_extraction(use_ns, schema_url): if use_ns: NSMAP = {"": schema_url} else: NSMAP = {} class Loc(BaseXmlModel, 
tag="loc"): __root__: int class LastMod(BaseXmlModel, tag="lastmod"): __root__: int class Url(BaseXmlModel, nsmap={}): loc: Loc lastmod: LastMod class UrlSet( BaseXmlModel, tag="urlset", nsmap=NSMAP, ): url: list[Url] = element() ns = f' xmlns="{schema_url}"' if use_ns else "" xml = f""" 1 10 2 20 """ > actual_obj = UrlSet.from_xml(xml) submodels_ns_v1.py:45: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ ../../../miniconda3/envs/pydanticv1xml/lib/python3.10/site-packages/pydantic_xml/model.py:302: in from_xml return cls.from_xml_tree(etree.fromstring(source)) ../../../miniconda3/envs/pydanticv1xml/lib/python3.10/site-packages/pydantic_xml/model.py:286: in from_xml_tree obj = typing.cast(ModelT, cls.__xml_serializer__.deserialize(XmlElement.from_native(root))) ../../../miniconda3/envs/pydanticv1xml/lib/python3.10/site-packages/pydantic_xml/serializers/factories/model.py:76: in deserialize result = { ../../../miniconda3/envs/pydanticv1xml/lib/python3.10/site-packages/pydantic_xml/serializers/factories/model.py:79: in if (field_value := field_serializer.deserialize(element)) is not None ../../../miniconda3/envs/pydanticv1xml/lib/python3.10/site-packages/pydantic_xml/serializers/factories/homogeneous.py:75: in deserialize while (value := self._inner_serializer.deserialize(element)) is not None: ../../../miniconda3/envs/pydanticv1xml/lib/python3.10/site-packages/pydantic_xml/serializers/factories/model.py:156: in deserialize return super().deserialize(sub_element) ../../../miniconda3/envs/pydanticv1xml/lib/python3.10/site-packages/pydantic_xml/serializers/factories/model.py:116: in deserialize return self._model.__xml_serializer__.deserialize(element) ../../../miniconda3/envs/pydanticv1xml/lib/python3.10/site-packages/pydantic_xml/serializers/factories/model.py:86: in deserialize return self._model.parse_obj(obj) pydantic/main.py:526: in 
pydantic.main.BaseModel.parse_obj ??? _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ > ??? E pydantic.error_wrappers.ValidationError: 2 validation errors for Url E loc E field required (type=value_error.missing) E lastmod E field required (type=value_error.missing) pydantic/main.py:341: ValidationError ======================================================================== short test summary info ======================================================================== FAILED submodels_ns_v1.py::test_nested_root_submodel_element_extraction[True-http://www.sitemaps.org/schemas/sitemap/0.9] - pydantic.error_wrappers.ValidationError: 2 validation errors for Url ====================================================================== 1 failed, 1 passed in 0.15s ====================================================================== ```

From the traceback, I set breakpoints in the model.py and serializers/factories/model.py modules and inspected execution with breakpoint() and traceback.print_stack(). This showed that the validation error occurs because the Url validator (i.e. the submodel) is first passed a correct source, the <url>...</url> substring, which becomes a valid result dict and passes the model_validate call; on a 2nd pass, however, it receives an incorrect source equal to the entire XML string, which produces an empty result dict that fails the 2nd model_validate call.

I.e. the ValidationError is arising from trying to interpret the entire XML string as the submodel, which is obviously going to fail.

I don't know if this internal view really helps debug it or not but it might point to the root cause, and rule out a failure to parse the submodel, but a failure to parse only that submodel.

Click to show the real XML that was simplified for this example Initial demo script: ```py from typing import Optional from pydantic import ValidationError from pydantic_xml import BaseXmlModel, RootXmlModel, attr, element class Loc(RootXmlModel, tag="loc"): root: str class LastMod(RootXmlModel, tag="lastmod"): root: str class Url(BaseXmlModel): loc: Loc lastmod: LastMod class UrlSet( BaseXmlModel, tag="urlset", nsmap={"": "http://www.sitemaps.org/schemas/sitemap/0.9"}, ): url: list[Url] = element() urlfree = b'' assert UrlSet(url=[]).to_xml() == urlfree # but reverse direction fails? # ufx = UrlSet.from_xml(urlfree) # assert ufx == UrlSet([]) ns = b"http://www.sitemaps.org/schemas/sitemap/0.9" ux = b"https://pyfound.blogspot.com/2023/08/announcing-our-new-pypi-safety-security.html2023-08-04T16:32:28Z" usx = b'' + ux + b"" stub = b"""""" + usx u_base = "https://pyfound.blogspot.com" u_loc = f"{u_base}/2023/08/announcing-our-new-pypi-safety-security.html" u_mod = "2023-08-04T16:32:28Z" gen_u = Url(loc=u_loc, lastmod=u_mod) gen_ux = gen_u.to_xml() # assert gen_ux == ux, "Url generation is not accurate" regen_ux = Url.from_xml(gen_ux) assert regen_ux == gen_u, "Url generation is not symmetric" gen_us = UrlSet(url=[gen_u]) gen_usx = gen_us.to_xml() assert gen_usx == usx, "UrlSet generation is not accurate" try: regen_usx = UrlSet.from_xml(gen_usx) except ValidationError as ve: print("Could not regenerate UrlSet from gen_usx") print(ve) else: assert regen_usx == gen_us, "UrlSet generation is not symmetric" assert stub[38:] == gen_usx ``` It gave the same error as in the reproducible example above ```py 2 validation errors for Url loc Field required [type=missing, input_value={}, input_type=dict] For further information visit https://errors.pydantic.dev/2.1/v/missing lastmod Field required [type=missing, input_value={}, input_type=dict] For further information visit https://errors.pydantic.dev/2.1/v/missing ```
dapper91 commented 1 year ago

@lmmx Hi.

Your XML document defines a default namespace, so all sub-elements (including loc and lastmod) belong to that namespace. When you define a sub-model, its nsmap parameter is not inherited from the parent model.

So your model

 # Sub-model as originally reported: its nsmap is empty, so it does not carry
 # the default namespace declared on UrlSet.
 class Url(BaseXmlModel, nsmap={}):
    loc: Loc
    lastmod: LastMod

doesn't inherit the default namespace from UrlSet, which means it tries to find loc and lastmod without a namespace.

Try redefining the Url model like this:

# Suggested fix: give the sub-model the same nsmap as the parent model so that
# loc and lastmod are looked up in the document's default namespace.
class Url(BaseXmlModel, nsmap=NSMAP):
        loc: Loc
        lastmod: LastMod
lmmx commented 1 year ago

Ahh thank you v much. That edit fixed all but 1 of my demo test cases.

from typing import Optional

from pydantic import ValidationError

from pydantic_xml import BaseXmlModel, RootXmlModel, attr, element

# Default ("" prefix) namespace shared by the Url and UrlSet models below.
NSMAP = {"": "http://www.sitemaps.org/schemas/sitemap/0.9"}

# Root model bound to the <loc> tag; its single value is a string.
class Loc(RootXmlModel, tag="loc"):
    root: str

# Root model bound to the <lastmod> tag; its single value is a string.
class LastMod(RootXmlModel, tag="lastmod"):
    root: str

# <url> entry holding one Loc and one LastMod.  It is given the same NSMAP as
# UrlSet explicitly, because a sub-model does not inherit the parent's nsmap.
class Url(BaseXmlModel, tag="url", nsmap=NSMAP):
    loc: Loc
    lastmod: LastMod

# Top-level <urlset> document: a list of Url entries, defaulting to empty.
class UrlSet(BaseXmlModel, tag="urlset", nsmap=NSMAP):
    urls: list[Url] = element(default=[])

# --- Empty document: serialisation and parse/serialise round-trip ---
urlfree = b'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"/>'
assert urlfree == UrlSet(urls=[]).to_xml()
assert urlfree == UrlSet.from_xml(urlfree).to_xml()

# --- Expected byte strings for a one-entry sitemap ---
ns = b"http://www.sitemaps.org/schemas/sitemap/0.9"
# Bare <url> element, with no xmlns attribute of its own.
ux = b"<url><loc>https://pyfound.blogspot.com/2023/08/announcing-our-new-pypi-safety-security.html</loc><lastmod>2023-08-04T16:32:28Z</lastmod></url>"
# The same <url> wrapped in a <urlset> that declares the default namespace.
usx = b'<urlset xmlns="' + ns + b'">' + ux + b"</urlset>"
# Full document: XML declaration followed by the <urlset> element.
stub = b"""<?xml version="1.0" encoding="UTF-8"?>""" + usx

# --- Field values used to build the models ---
u_base = "https://pyfound.blogspot.com"
u_loc = f"{u_base}/2023/08/announcing-our-new-pypi-safety-security.html"
u_mod = "2023-08-04T16:32:28Z"

# --- Single Url: build, serialise, parse back ---
gen_u = Url(loc=u_loc, lastmod=u_mod)
gen_ux = gen_u.to_xml()
# Left disabled by the author: the standalone Url output reportedly carries
# the namespace attribute, so it does not equal the bare `ux` bytes.
# assert gen_ux == ux, "Url generation is not accurate"

regen_ux = Url.from_xml(gen_ux)
assert regen_ux == gen_u, "Url generation is not symmetric"

# --- UrlSet containing that Url: build, serialise, parse back ---
gen_us = UrlSet(urls=[gen_u])
gen_usx = gen_us.to_xml()
assert gen_usx == usx, "UrlSet generation is not accurate"

regen_usx = UrlSet.from_xml(gen_usx)
assert regen_usx == gen_us, "UrlSet generation is not symmetric"
# 38 is the length of the XML declaration prefix in `stub`, so the slice is
# the <urlset> element alone.
assert stub[38:] == gen_usx

The one it didn't solve feels a bit counterintuitive: I guess you can't generate the <url> tag on its own, without the namespace attribute [the one I left commented out above]:

# assert gen_ux == ux, "Url generation is not accurate"

I guess it makes sense in the context. Thanks again for the tip