python / cpython

The Python programming language
https://www.python.org
Other
63.49k stars 30.41k forks source link

xml.dom.minidom convenience methods for getting elements #111332

Open R0NUT opened 1 year ago

R0NUT commented 1 year ago

Feature or enhancement

Proposal:

The TODO list for xml.dom.minidom includes adding convenience methods for getting elements. I've created the following methods, modelled on the existing JavaScript DOM functions: getElementById, getElementByName, getElementsByClassName, closest, querySelector, and querySelectorAll.

To implement this, the following helper functions should be added at the top level of the module.

def _get_element_by_id_helper(parent, id, rc):
    for node in parent.childNodes:
        if node.nodeType == Node.ELEMENT_NODE and \
            (node.getAttribute('Id') == id):
            rc.append(node)
        _get_element_by_id_helper(node, id, rc)
    return rc

def _get_element_by_name_helper(parent, name, rc):
    for node in parent.childNodes:
        if node.nodeType == Node.ELEMENT_NODE and \
            (node.getAttribute('Name') == name):
            rc.append(node)
        _get_element_by_name_helper(node, name, rc)
    return rc

def _get_elements_by_className_helper(parent, className, rc):
    for node in parent.childNodes:
        if node.nodeType == Node.ELEMENT_NODE and \
            (node.getAttribute('Class') == className):
            rc.append(node)
        _get_elements_by_className_helper(node, className, rc)
    return rc

def _closest_helper(node, selectors:list, rc):
    initialLen = len(selectors)
    if node.nodeType == Node.ELEMENT_NODE and initialLen:
        parent = node.parentNode
        if parent.nodeType == Node.ELEMENT_NODE:
            if _element_selector_match(parent, selectors[0]):
                if len(selectors) == 1:
                    rc.append(node)
                selectors.pop(0)
            else:
                _closest_helper(parent, selectors, rc)
        if not initialLen == len(selectors) and len(selectors):
            if _element_selector_match(node, selectors[0]):
                if len(selectors) == 1:
                    rc.append(node)
                selectors.pop(0)
    return rc

def _query_selector_helper(parent, selectors:list, rc, **kwargs):
    for node in parent.childNodes:
        if node.nodeType == Node.ELEMENT_NODE and \
            len(selectors) and (not len(rc) or kwargs.get('all')):
            if _element_selector_match(node, selectors[0]):
                if len(selectors) == 1:
                    rc.append(node)
                else:
                    _query_selector_helper(node, selectors[1:], rc, **kwargs)
            else:
                _query_selector_helper(node, selectors, rc, **kwargs)
    return rc

def _element_selector_match(e:'Element', s:dict) -> bool:
    m = True
    if s.get('TagName'):
        if not s['TagName'] == e.tagName:
            m = False
    if s.get('Attributes') and m:
        for a in s['Attributes']:
            if not e.hasAttribute(a['Name']) or not m:
                m = False
            else:
                if a.get('Value'):
                    aV = str(a['Value'])
                    pV = str(e.getAttribute(a['Name']))
                    if a.get('CaseSensitivity'):
                        if a['CaseSensitivity'] == 'i':
                            aV = aV.lower()
                            pV = pV.lower()
                    if a.get('Operator'):
                        if a['Operator'] == '~':
                            if aV not in pV.split():
                                m = False
                        if a['Operator'] == '|':
                            if not aV == pV and not aV+'-' == pV[0: len(aV)+1]:
                                m = False
                        if a['Operator'] == '^':
                            if not aV == pV[0: len(aV)]:
                                m = False
                        if a['Operator'] == '$':
                            if not aV == pV[-len(aV):]:
                                m = False
                        if a['Operator'] == '*':
                            if aV not in pV:
                                m = False
                    else:
                        if not aV == pV:
                            m = False
    return m

def _parse_selector(selector:str, rc:list) -> list:
    """Returns list of selector dictionaries.

    Dictionaries may contain keys: 
        "TagName", and "Attributes"
    "TagName" has a string value.
    "Attributes" is a list of dictionaries that may contain:
        "Name", "Operator", "Value", and "CaseSensitivity"
    """
    def addCharacter(d:dict, key:str, character:str):
        if not d.get(key):
            d[key] = character
        else:
            d[key] += character
    # 1:TagName 2:AttributeName 3:AttributeOperator 4:AttributeValue 5:AttributeParenthesis 6:AttributeQuote 7:AttributeCaseSensitivity
    level = 1
    operatorModifiers = ['~','|','^','*']
    rc.append({})
    for c in selector:
        if c.isspace() and level < 2:
            level = 1
            if len(rc[-1].keys()):
                rc.append({})
        else:
            if c == '[' and level < 2:
                if not rc[-1].get('Attributes'):
                    rc[-1]['Attributes'] = []
                rc[-1]['Attributes'].append({})
                level = 2
            elif (c.isspace() or c in operatorModifiers) and level == 2:
                if rc[-1]['Attributes'][-1].get('Name'):
                    level = 3
            elif c == '=' and level < 4:
                level = 4
            elif c.isspace() and level == 4:
                if rc[-1]['Attributes'][-1].get('Value'):
                    level = 7
            elif c == '"' and level == 4:
                level = 5
            elif c == "'" and level == 4:
                level = 6
            elif c == '"' and level == 5:
                level = 7
            elif c == "'" and level == 6:
                level = 7
            elif c == ']' and (level > 1 and not (level == 5 or level == 6)):
                level = 1

            if level == 1 and not (c.isspace() or c == ']'):
                addCharacter(rc[-1], 'TagName', c)
            elif level == 2 and not (c.isspace() or c == '['):
                addCharacter(rc[-1]['Attributes'][-1], 'Name', c)
            elif level == 3 and c in operatorModifiers:
                addCharacter(rc[-1]['Attributes'][-1], 'Operator', c)
            elif level == 4 and not (c.isspace() or c == '='):
                addCharacter(rc[-1]['Attributes'][-1], 'Value', c)
            elif level == 5 and not c == '"':
                addCharacter(rc[-1]['Attributes'][-1], 'Value', c)
            elif level == 6 and not c == "'":
                addCharacter(rc[-1]['Attributes'][-1], 'Value', c)
            elif level == 7 and not (c.isspace() or c == '"' or c == "'"):
                addCharacter(rc[-1]['Attributes'][-1], 'CaseSensitivity', c)

    for r in rc:
        if r.get('Attributes'):
            for j,a in enumerate(r['Attributes']):
                if not len(a):
                    a.pop(j)
            if not len(r['Attributes']):
                del r['Attributes']

        if rc[-1].get('Attributes'):
            if not rc[-1]['Attributes'][-1]:
                rc[-1]
        if not rc[-1].keys():
            rc.pop(-1)
    return rc

These functions should be added to the NodeGet class.

    def getElementById(self, id:str) -> NodeList['Element']:
        """Return the descendant elements whose id equals *id*.

        The search covers every descendant of this node, not only its
        direct children.  The result is a NodeList; a well-formed document
        is expected to hold at most one element per id.
        """
        matches = NodeList()
        return _get_element_by_id_helper(self, id, matches)

    def getElementByName(self, name:str) -> NodeList['Element']:
        """Return the descendant elements whose name equals *name*.

        The search covers every descendant of this node, not only its
        direct children.  The result is a NodeList; a well-formed document
        is expected to hold at most one element per name.
        """
        matches = NodeList()
        return _get_element_by_name_helper(self, name, matches)

    def getElementsByClassName(self, className:str) -> NodeList['Element']:
        """Return the descendant elements that have the given class name.

        The search covers every descendant of this node, not only its
        direct children.  All matches are returned in a NodeList.
        """
        matches = NodeList()
        return _get_elements_by_className_helper(self, className, matches)

    def closest(self, query:str) -> NodeList['Element']:
        """Return the element on the ancestor path that satisfies *query*.

        The search is not limited to the direct parent; it continues up the
        ancestor path.  The result is delivered in a NodeList.
        """
        selectors = _parse_selector(query, list())
        matches = NodeList()
        return _closest_helper(self, selectors, matches)

    def querySelector(self, query:str) -> NodeList['Element']:
        """Return the first descendant element that satisfies *query*.

        The search covers every descendant of this node, not only its
        direct children.  The result is delivered in a NodeList.
        """
        selectors = _parse_selector(query, list())
        matches = NodeList()
        return _query_selector_helper(self, selectors, matches)

    def querySelectorAll(self, query:str) -> NodeList['Element']:
        """Return every descendant element that satisfies *query*.

        The search covers every descendant of this node, not only its
        direct children.  All matches are returned in a NodeList.
        """
        selectors = _parse_selector(query, list())
        matches = NodeList()
        return _query_selector_helper(self, selectors, matches, all=True)

Has this already been discussed elsewhere?

I have already discussed this feature proposal on Discourse

Links to previous discussion of this feature:

https://discuss.python.org/t/xml-dom-minidom-adding-convenience-methods-for-getting-elements/37140/1

R0NUT commented 1 year ago

@AlexWaygood What do I need to do to gain some traction with this? Have patience?

AlexWaygood commented 1 year ago

@AlexWaygood What do I need to do to gain some traction with this? Have patience?

I'm not an xml expert, so I can't comment much on whether the change is desirable. But it's much more likely to be accepted if you make a PR for the change! CPython is an open-source project that depends on the contributions of volunteers giving up their free time.

If you're new to CPython's workflow, we recommend having a skim through the devguide

hyperstown commented 9 months ago

That looks awesome! @R0NUT, please open a PR; I can't wait for this to be added. Also, I noticed that querySelector, getElementById, and the other single-element getters return a NodeList, but I think it would be more appropriate for them to return a Node. EDIT: closest also returns the element one step before the target.