from lxml import etree
from pydantic_xml import BaseXmlModel, computed_element, element
from pydantic_xml.element.native import ElementT as Element
class Root(BaseXmlModel, tag="root", arbitrary_types_allowed=True):
    """Model of the ``<root>`` document that keeps its ``<a>`` child as a raw element."""

    # The ``<a>`` child, held as a raw element rather than a parsed value.
    a_raw: Element = element("a")

    @computed_element
    def parse_raw_a(self) -> str:
        """Return the text of ``<a>`` followed by the tail of each direct child.

        The result is emitted as a ``<parse_raw_a>`` element when the model
        is serialized.
        """
        text_in_a = self.a_raw.text
        # Iterate the element directly instead of calling ``getchildren()``:
        # that method is deprecated in lxml and no longer exists on
        # xml.etree.ElementTree elements since Python 3.9.
        for child in self.a_raw:
            # str() is kept deliberately: a child whose tail is missing
            # contributes the literal "None", which makes a dropped tail
            # visible in the result.
            text_in_a += str(child.tail)
        return text_in_a
def main():
    """Print the text of ``<a>`` before and after a round trip through ``Root``.

    The same "text plus child tails" string is computed from the parsed
    tree and from the raw element stored on the model, so the two printed
    values can be compared.
    """

    def text_with_tails(node):
        # Text of ``node`` followed by str() of each direct child's tail.
        combined = node.text
        for child in node:
            combined += str(child.tail)
        return combined

    xml = """\
<root>
<a>start<b/>end</a>
</root>
"""
    tree = etree.fromstring(xml)

    # Before transformation: read straight from the parsed tree.
    print("Text in a:", text_with_tails(tree.find("a")))

    model = Root.from_xml_tree(tree)
    serialized = model.to_xml(pretty_print=True, encoding="UTF-8")
    print(serialized.decode("utf-8"))

    # After transformation: read from the raw element held by the model.
    print("Text in a after deserializaton:", text_with_tails(model.a_raw))
# Run the reproduction only when this file is executed as a script, so that
# importing it does not trigger the prints.
if __name__ == "__main__":
    main()
Output:
Text in a: startend
<root>
<a>start<b/></a>
<parse_raw_a>startNone</parse_raw_a>
</root>
Text in a after deserializaton: startNone
(In the output XML and after deserialization, the concatenated string is "startNone" instead of "startend". This happens because `b.tail == None`.)
I need to parse an XML element that contains text mixed with other elements:
I tried to use a raw-element typed field as described here: https://pydantic-xml.readthedocs.io/en/latest/pages/data-binding/raw.html, but it turned out that pydantic-xml drops some details. In the example above, the element `<b>` has the tail "end", but after the transformation the tail is lost.

Output:
(In the output XML and after deserialization, the concatenated string is "startNone" instead of "startend". This happens because `b.tail == None`.)

Environment