dkpro / dkpro-cassis

UIMA CAS processing library written in Python
https://pypi.org/project/dkpro-cassis/
Apache License 2.0
84 stars 22 forks source link

Unable to parse empty arrays #221

Closed reckart closed 2 years ago

reckart commented 2 years ago

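Describe the bug: Cassis fails on an empty array (a `uima.cas.FSArray` with `elements=[]`) that has been directly added to a view. As the traceback below shows, the error is raised while the CAS is being serialized to XMI (`to_xmi`).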

To Reproduce: I'll provide a unit test.

Expected behavior: It should just work, i.e. an empty array should be handled without raising an error.

Error message

_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
../cassis/cas.py:556: in to_xmi
    serializer.serialize(sink, self, pretty_print=pretty_print)
../cassis/xmi.py:412: in serialize
    self._serialize_feature_structure(cas, root, fs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <cassis.xmi.CasXmiSerializer object at 0x10c341040>
cas = <cassis.cas.Cas object at 0x10c2e2c40>
root = <Element {http://www.omg.org/XMI}XMI at 0x10c1c2a00>
fs = uima_cas_FSArray(xmiID=21, elements=[], type=Type(name='uima.cas.FSArray', supertype=Type(name='uima.cas.ArrayBase', s...nherited_features={}), description=None, elementType=None, multipleReferencesAllowed=True, _has_reserved_name=False)}))

    def _serialize_feature_structure(self, cas: Cas, root: etree.Element, fs: FeatureStructure):
        ts = cas.typesystem

        type_name = fs.type.name
        if "." not in type_name:
            type_name = f"uima.noNamespace.{type_name}"

        # The type name is a Java package, e.g. `org.myproj.Foo`.
        parts = type_name.split(".")

        # The CAS type namespace is converted to an XML namespace URI by the following rule:
        # replace all dots with slashes, prepend http:///, and append .ecore.
        url = "http:///" + "/".join(parts[:-1]) + ".ecore"

        # The cas prefix is the last component of the CAS namespace, which is the second to last
        # element of the type (the last part is the type name without package name), e.g. `myproj`
        raw_prefix = parts[-2]
        typename = parts[-1]

        # If the url has not been seen yet, compute the namespace and add it
        if url not in self._urls_to_prefixes:
            # If the prefix already exists, but maps to a different url, then add it with
            # a number at the end, e.g. `type0`

            new_prefix = raw_prefix
            if raw_prefix in self._nsmap:
                suffix = self._duplicate_namespaces[raw_prefix]
                self._duplicate_namespaces[raw_prefix] += 1
                new_prefix = raw_prefix + str(suffix)

            self._nsmap[new_prefix] = url
            self._urls_to_prefixes[url] = new_prefix

        prefix = self._urls_to_prefixes[url]

        name = etree.QName(self._nsmap[prefix], typename)
        elem = etree.SubElement(root, name)

        # Serialize common attributes
        elem.attrib["{http://www.omg.org/XMI}id"] = str(fs.xmiID)

        # Case where arrays are rendered as separate elements (not inline) for use with multipleReferencesAllowed = True
        if ts.is_primitive_array(fs.type.name) or fs.type.name == "uima.cas.FSArray" and fs.elements:
            if ts.is_instance_of(fs.type.name, "uima.cas.StringArray"):
                # String arrays need to be serialized to a series of child elements, as strings can
                # contain whitespaces. Consider e.g. the array ['likes cats, 'likes dogs']. If we would
                # serialize it as an attribute, it would look like
                #
                # <my:fs elements="likes cats likes dogs" />
                #
                # which looses the information about the whitespace. Instead, we serialize it to
                #
                # <my:fs>
                #   <elements>likes cats</elements>
                #   <elements>likes dogs</elements>
                # </my:fs>
                for e in fs.elements:
                    child = etree.SubElement(elem, "elements")
                    child.text = e
            elif fs.type.name == "uima.cas.FSArray":
                elements = " ".join(str(e.xmiID) for e in fs.elements)
                elem.attrib["elements"] = elements
            else:
                elem.attrib["elements"] = self._serialize_primitive_array(fs.type.name, fs.elements)
            return

        # Serialize feature attributes
        t = fs.type
        for feature in t.all_features:
            if feature.name in CasXmiSerializer._COMMON_FIELD_NAMES:
                continue

            feature_name = feature.name

            # Strip the underscore we added for reserved names
            if feature._has_reserved_name:
                feature_name = feature.name[:-1]

            # Skip over 'None' features
            value = fs[feature.name]
            if value is None:
                continue

            # Map back from offsets in Unicode codepoints to UIMA UTF-16 based offsets
            if (
                ts.is_instance_of(fs.type.name, TYPE_NAME_ANNOTATION)
                and feature_name == FEATURE_BASE_NAME_BEGIN
                or feature_name == FEATURE_BASE_NAME_END
            ):
                sofa: Sofa = fs.sofa
                value = sofa._offset_converter.cassis_to_uima(value)

            if ts.is_instance_of(feature.rangeType, TYPE_NAME_STRING_ARRAY) and not feature.multipleReferencesAllowed:
                if value.elements is not None:  # Compare to none to not skip if elements is empty!
                    for e in value.elements:
                        child = etree.SubElement(elem, feature_name)
                        child.text = e
            elif ts.is_primitive_array(feature.rangeType) and not feature.multipleReferencesAllowed:
                if value.elements is not None:  # Compare to none to not skip if elements is empty!
                    elem.attrib[feature_name] = self._serialize_primitive_array(feature.rangeType.name, value.elements)
            elif feature.rangeType.name == TYPE_NAME_FS_ARRAY and not feature.multipleReferencesAllowed:
                if value.elements is not None:  # Compare to none to not skip if elements is empty!
                    elem.attrib[feature_name] = " ".join(str(e.xmiID) for e in value.elements)
            elif feature_name == FEATURE_BASE_NAME_SOFA:
                elem.attrib[feature_name] = str(value.xmiID)
            elif feature.rangeType.name == TYPE_NAME_BOOLEAN:
                elem.attrib[feature_name] = "true" if value else "false"
            elif feature.rangeType.name in {TYPE_NAME_DOUBLE, TYPE_NAME_FLOAT}:
                elem.attrib[feature_name] = self._serialize_float_value(value)
            elif ts.is_primitive(feature.rangeType):
                elem.attrib[feature_name] = str(value)
            else:
                # We need to encode non-primitive features as a reference
>               elem.attrib[feature_name] = str(value.xmiID)
E               AttributeError: 'list' object has no attribute 'xmiID'

../cassis/xmi.py:544: AttributeError

Please complete the following information: