lestrrat-go / libxml2

Interface to libxml2, with DOM interface
MIT License
230 stars 55 forks source link

How to remove a node? #79

Closed asyncins closed 3 years ago

asyncins commented 3 years ago

When I want to remove a node:

parent, err := node.ParentNode()
if err != nil {
    return err
}
// RemoveChild reports failure through its own error result; hand that to the
// caller instead of discarding it and unconditionally returning nil.
return parent.RemoveChild(node)

1 - Done this way, memory usage keeps rising until the process runs out of memory (OOM); observed usage: 20M, 150M, 800M, 2G, 7G.

2 - This happens inside a long-running loop such as `for i := 0; i < 10000; i++ {}`, for example in a consumer reading from RabbitMQ or Kafka.

3 - If I call node.Free(), the removal does not take effect, but memory usage doesn't rise.

How can I write code to remove a node???

Please help me, thanks.

lestrrat commented 3 years ago

Please provide a complete test/program. My current guess is, (1) there's a memory leak in my RemoveChild call, (2) you are reusing the root document node (which "owns" the node), and the memory region is somehow still being held, or (3) something else

asyncins commented 3 years ago
package main

import (
    "fmt"
    "time"

    "github.com/lestrrat-go/libxml2"
    "github.com/lestrrat-go/libxml2/types"
)

// removeNode unlinks node from its parent via RemoveChild. When the parent
// lookup yields an error or a nil parent, it prints the error value and leaves
// the tree untouched. The result of RemoveChild is not inspected.
func removeNode(node types.Node) {
    parent, err := node.ParentNode()
    if err != nil || parent == nil {
        fmt.Print(err)
        return
    }
    parent.RemoveChild(node)
}

// parse runs one million rounds of: sleep 10ms, print the round number, parse
// content as HTML, and walk the document handing every node named meta, style,
// script or noise to removeNode.
func parse(content string) {
    for round := 0; round < 1000000; round++ {
        time.Sleep(10 * time.Millisecond)
        fmt.Println("current is: ", round)

        doc, err := libxml2.ParseHTMLString(content)
        if err != nil {
            fmt.Print(err)
        }
        // Deferred inside the loop body: these calls are queued up and only
        // run once parse itself returns, not at the end of each round.
        defer doc.Free()

        doc.Walk(func(n types.Node) error {
            switch n.NodeName() {
            case "meta", "style", "script", "noise":
                removeNode(n)
            }
            return nil
        })
    }
}

// main holds a fixed sample HTML page in a raw string literal and passes it
// to parse. The page's <head> contains a <title>, three <meta> tags and a
// <style> block; its <body> contains a <div> with text and a trailing inline
// <script>.
func main() {
    content := `
        <!doctype html>
        <html>
        <head>
            <title>Example Domain</title>
            <meta charset="utf-8" />
            <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
            <meta name="viewport" content="width=device-width, initial-scale=1" />
            <style type="text/css">
            body {
                background-color: #f0f0f2;
                margin: 0;
                padding: 0;
                font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;

            }
            div {
                width: 600px;
                margin: 5em auto;
                padding: 2em;
                background-color: #fdfdff;
                border-radius: 0.5em;
                box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
            }
            a:link, a:visited {
                color: #38488f;
                text-decoration: none;
            }
            @media (max-width: 700px) {
                div {
                    margin: 0 auto;
                    width: auto;
                }
            }
            </style>    
        </head>

        <body>
        <div>
            <h1>Example Domain</h1>
            <p>This domain is for use in illustrative examples in documents. You may use this
            domain in literature without prior coordination or asking for permission.</p>
            <p><a href="https://www.iana.org/domains/example">More information...</a></p>
            <em>noise</em>
        </div>
        <script type="text/javascript">
            if(typeof(sina)!="object"){var sina={}}
            sina.$=function(i){if(!i){return null}
            return document.getElementById(i)};var sinaFlash=function(V,x,X,Z,v,z,i,c,I,l,o){var w=this;if(!document.createElement||!document.getElementById){return}
            w.id=x?x:'';var O=function(I,i){for(var l=0;l<I.length;l++){if(I[l]==i){return l}}
            return-1},C='8.0.42.0';if(O(['eladies.sina.com.cn','ent.sina.com.cn'],document.domain)>-1){w.ver=C}else{w.ver=v?v:C}
            w.ver=w.ver.replace(/\./g,',');w.__classid="clsid:D27CDB6E-AE6D-11cf-96B8-444553540000";w.__codebase="http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version="+w.ver;w.width=X;w.height=Z;w.movie=V;w.src=w.movie;w.bgcolor=z?z:'';w.quality=c?c:"high";w.__pluginspage="http://www.macromedia.com/go/getflashplayer";w.__type="application/x-shockwave-flash";w.useExpressInstall=(typeof(i)=="boolean")?i:false;w.xir=I?I:window.location;w.redirectUrl=l?l:window.location;w.detectKey=(typeof(o)=="boolean")?o:true;w.escapeIs=false;w.__objAttrs={};w.__params={};w.__embedAttrs={};w.__flashVars=[];w.__flashVarsStr="";w.__forSetAttribute("id",w.id);w.__objAttrs["classid"]=w.__classid;w.__forSetAttribute("codebase",w.__codebase);w.__forSetAttribute("width",w.width);w.__forSetAttribute("height",w.height);w.__forSetAttribute("movie",w.movie);w.__forSetAttribute("quality",w.quality);w.__forSetAttribute("pluginspage",w.__pluginspage);w.__forSetAttribute("type",w.__type);w.__forSetAttribute("bgcolor",w.bgcolor)}
            sinaFlash.prototype={getFlashHtml:function(){var I=this;
            if(/\((iPhone|iPad|iPod)/i.test(navigator.userAgent) && I.width>=930 && I.height<100 && /^http\:\/\/d\d\./i.test(I.movie)){return '';}//iOS不投放通栏flash广告
            var i='<object ';for(var l in I.__objAttrs){i+=l+'="'+I.__objAttrs[l]+'"'+' '}
            i+='>\n';for(var l in I.__params){i+='  <param name="'+l+'" value="'+I.__params[l]+'" \/>\n'}
            if(I.__flashVarsStr!=""){i+='   <param name="flashvars" value="'+I.__flashVarsStr+'" \/>\n'}
            i+='    <embed ';for(var l in I.__embedAttrs){i+=l+'="'+I.__embedAttrs[l]+'"'+' '}
            i+='><\/embed>\n<\/object>';return i},__forSetAttribute:function(I,i){var l=this;if(typeof(I)=="undefined"||I==''||typeof(i)=="undefined"||i==''){return}
            I=I.toLowerCase();switch(I){case "classid":break;case "pluginspage":l.__embedAttrs[I]=i;break;case "onafterupdate":case "onbeforeupdate":case "onblur":case "oncellchange":case "onclick":case "ondblClick":case "ondrag":case "ondragend":case "ondragenter":case "ondragleave":case "ondragover":case "ondrop":case "onfinish":case "onfocus":case "onhelp":case "onmousedown":case "onmouseup":case "onmouseover":case "onmousemove":case "onmouseout":case "onkeypress":case "onkeydown":case "onkeyup":case "onload":case "onlosecapture":case "onpropertychange":case "onreadystatechange":case "onrowsdelete":case "onrowenter":case "onrowexit":case "onrowsinserted":case "onstart":case "onscroll":case "onbeforeeditfocus":case "onactivate":case "onbeforedeactivate":case "ondeactivate":case "codebase":l.__objAttrs[I]=i;break;case "src":case "movie":l.__embedAttrs["src"]=i;l.__params["movie"]=i;break;case "width":case "height":case "align":case "vspace":case "hspace":case "title":case "class":case "name":case "id":case "accesskey":case "tabindex":case "type":l.__objAttrs[I]=l.__embedAttrs[I]=i;break;default:l.__params[I]=l.__embedAttrs[I]=i}},__forGetAttribute:function(i){var I=this;i=i.toLowerCase();if(typeof I.__objAttrs[i]!="undefined"){return I.__objAttrs[i]}else if(typeof I.__params[i]!="undefined"){return I.__params[i]}else if(typeof I.__embedAttrs[i]!="undefined"){return I.__embedAttrs[i]}else{return null}},setAttribute:function(I,i){this.__forSetAttribute(I,i)},getAttribute:function(i){return this.__forGetAttribute(i)},addVariable:function(I,i){var l=this;if(l.escapeIs){I=escape(I);i=escape(i)}
            if(l.__flashVarsStr==""){l.__flashVarsStr=I+"="+i}else{l.__flashVarsStr+="&"+I+"="+i}
            l.__embedAttrs["FlashVars"]=l.__flashVarsStr},getVariable:function(I){var o=this,i=o.__flashVarsStr;if(o.escapeIs){I=escape(I)}
            var l=new RegExp(I+"=([^\\&]*)(\\&?)","i").exec(i);if(o.escapeIs){return unescape(RegExp.$1)}
            return RegExp.$1},addParam:function(I,i){this.__forSetAttribute(I,i)},getParam:function(i){return this.__forGetAttribute(i)},write:function(i){var I=this;if(typeof i=="string"){document.getElementById(i).innerHTML=I.getFlashHtml()}else if(typeof i=="object"){i.innerHTML=I.getFlashHtml()}}};
            </script>
        </body>
        </html>
    `
    parse(content)
}

With an example like this, more and more memory is used.

It seems to have something to do with not only the remove node, but also the document object.

@lestrrat

lestrrat commented 3 years ago

This will not free any of the memory used by the doc variable, because you're setting a defer doc.Free() inside a for loop. Remember, defer only fires upon exiting the current function.

Again, I am not ruling out a bug in my code, but just putting an explicit doc.Free() after doc.Walk() should solve a lot, if not all, of your problems.

(See how giving me the entire code told me instantly about possible problems? Please always do this for any issues you file to any repository)

asyncins commented 3 years ago

The above example is not appropriate; the defer should be outside the for loop.

After changing the parse function as shown below, you can still see what I mean by more and more memory being used:

// No removal: memory usage stays stable. Great!

// parse is the baseline variant: each of the one million rounds sleeps 10ms,
// prints the round number, parses content as HTML, walks the document without
// modifying it (the removeNode call is commented out), and then frees the
// document before the next round starts.
func parse(content string) {
    for round := 0; round < 1000000; round++ {
        time.Sleep(10 * time.Millisecond)
        fmt.Println("current is: ", round)

        doc, err := libxml2.ParseHTMLString(content)
        if err != nil {
            fmt.Print(err)
        }

        doc.Walk(func(n types.Node) error {
            switch n.NodeName() {
            case "meta", "style", "script", "noise":
                // removeNode(n)
            }
            return nil
        })

        doc.Free()
    }
}

The other parse

// With node removal: more and more memory is used.

// parse is the removing variant: each of the one million rounds sleeps 10ms,
// prints the round number, parses content as HTML, hands every node named
// meta, style, script or noise to removeNode while walking, and then frees the
// document. The detached nodes themselves are never freed here.
func parse(content string) {
    for round := 0; round < 1000000; round++ {
        time.Sleep(10 * time.Millisecond)
        fmt.Println("current is: ", round)

        doc, err := libxml2.ParseHTMLString(content)
        if err != nil {
            fmt.Print(err)
        }

        doc.Walk(func(n types.Node) error {
            switch n.NodeName() {
            case "meta", "style", "script", "noise":
                removeNode(n)
            }
            return nil
        })

        doc.Free()
    }
}

After the program has run for about five minutes, memory usage is about 6M without node removal and about 300M with node removal.

@lestrrat

lestrrat commented 3 years ago

Did you try putting node.Free() in removeNode()?

asyncins commented 3 years ago

Did you try putting node.Free() in removeNode()

// removeNode unlinks node from its parent via RemoveChild and then frees the
// node straight away. When the parent lookup yields an error or a nil parent,
// it prints the error value and does neither.
func removeNode(node types.Node) {
    parent, err := node.ParentNode()
    if err != nil || parent == nil {
        fmt.Print(err)
        return
    }
    parent.RemoveChild(node)
    // Immediate free; this is the variant already tried and reported as
    // leaving the tags in place.
    node.Free()
}

If I use node.Free(), the remove operation does not take effect: the style/script/meta tags still exist and have not been removed.

If I do not use node.Free(), the style/script/meta tags are removed.

lestrrat commented 3 years ago

Then, this issue is not about memory usage. Please create a minimal test case that reproduces your "has not been removed" problem.

asyncins commented 3 years ago

Then, this issue is not about memory usage. Please create a minimal test case that reproduces your "has not been removed" problem.

package main

import (
    "fmt"
    "time"

    "github.com/lestrrat-go/libxml2"
    "github.com/lestrrat-go/libxml2/types"
)

// removeNode unlinks node from its parent via RemoveChild. When the parent
// lookup yields an error or a nil parent, it prints the error value instead.
// The node.Free() call is kept as a commented-out toggle.
func removeNode(node types.Node) {
    parent, err := node.ParentNode()
    if err != nil || parent == nil {
        fmt.Print(err)
        return
    }
    parent.RemoveChild(node)
    // node.Free()
}

// parse runs a single round: sleep 10ms, print the round number, parse content
// as HTML, print the name of every node reached by Walk while counting them,
// free the document, and print the total. The removal pass is present only as
// a commented-out toggle.
func parse(content string) {
    for round := 0; round < 1; round++ {
        time.Sleep(10 * time.Millisecond)
        fmt.Println("current is: ", round)

        doc, err := libxml2.ParseHTMLString(content)
        if err != nil {
            fmt.Print(err)
        }

        // Removal pass, disabled in this listing.
        // doc.Walk(func(n types.Node) error {
        //  nodeName := n.NodeName()
        //  if nodeName == "meta" || nodeName == "style" || nodeName == "script" || nodeName == "noise" {
        //      removeNode(n)
        //  }
        //  return nil
        // })

        var nodeCount int
        doc.Walk(func(n types.Node) error {
            fmt.Print(n.NodeName(), " ")
            nodeCount++
            return nil
        })

        doc.Free()
        fmt.Println("nodeCount: ", nodeCount)
    }
}

// main holds a fixed sample HTML page in a raw string literal and passes it
// to parse. The page's <head> contains a <title>, three <meta> tags and a
// <style> block; its <body> contains a <div> with text and a trailing inline
// <script>.
func main() {
    content := `
        <!doctype html>
        <html>
        <head>
            <title>Example Domain</title>
            <meta charset="utf-8" />
            <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
            <meta name="viewport" content="width=device-width, initial-scale=1" />
            <style type="text/css">
            body {
                background-color: #f0f0f2;
                margin: 0;
                padding: 0;
                font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;

            }
            div {
                width: 600px;
                margin: 5em auto;
                padding: 2em;
                background-color: #fdfdff;
                border-radius: 0.5em;
                box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
            }
            a:link, a:visited {
                color: #38488f;
                text-decoration: none;
            }
            @media (max-width: 700px) {
                div {
                    margin: 0 auto;
                    width: auto;
                }
            }
            </style>    
        </head>

        <body>
        <div>
            <h1>Example Domain</h1>
            <p>This domain is for use in illustrative examples in documents. You may use this
            domain in literature without prior coordination or asking for permission.</p>
            <p><a href="https://www.iana.org/domains/example">More information...</a></p>
            <em>noise</em>
        </div>
        <script type="text/javascript">
            if(typeof(sina)!="object"){var sina={}}
            sina.$=function(i){if(!i){return null}
            return document.getElementById(i)};var sinaFlash=function(V,x,X,Z,v,z,i,c,I,l,o){var w=this;if(!document.createElement||!document.getElementById){return}
            w.id=x?x:'';var O=function(I,i){for(var l=0;l<I.length;l++){if(I[l]==i){return l}}
            return-1},C='8.0.42.0';if(O(['eladies.sina.com.cn','ent.sina.com.cn'],document.domain)>-1){w.ver=C}else{w.ver=v?v:C}
            w.ver=w.ver.replace(/\./g,',');w.__classid="clsid:D27CDB6E-AE6D-11cf-96B8-444553540000";w.__codebase="http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version="+w.ver;w.width=X;w.height=Z;w.movie=V;w.src=w.movie;w.bgcolor=z?z:'';w.quality=c?c:"high";w.__pluginspage="http://www.macromedia.com/go/getflashplayer";w.__type="application/x-shockwave-flash";w.useExpressInstall=(typeof(i)=="boolean")?i:false;w.xir=I?I:window.location;w.redirectUrl=l?l:window.location;w.detectKey=(typeof(o)=="boolean")?o:true;w.escapeIs=false;w.__objAttrs={};w.__params={};w.__embedAttrs={};w.__flashVars=[];w.__flashVarsStr="";w.__forSetAttribute("id",w.id);w.__objAttrs["classid"]=w.__classid;w.__forSetAttribute("codebase",w.__codebase);w.__forSetAttribute("width",w.width);w.__forSetAttribute("height",w.height);w.__forSetAttribute("movie",w.movie);w.__forSetAttribute("quality",w.quality);w.__forSetAttribute("pluginspage",w.__pluginspage);w.__forSetAttribute("type",w.__type);w.__forSetAttribute("bgcolor",w.bgcolor)}
            sinaFlash.prototype={getFlashHtml:function(){var I=this;
            if(/\((iPhone|iPad|iPod)/i.test(navigator.userAgent) && I.width>=930 && I.height<100 && /^http\:\/\/d\d\./i.test(I.movie)){return '';}//iOS不投放通栏flash广告
            var i='<object ';for(var l in I.__objAttrs){i+=l+'="'+I.__objAttrs[l]+'"'+' '}
            i+='>\n';for(var l in I.__params){i+='  <param name="'+l+'" value="'+I.__params[l]+'" \/>\n'}
            if(I.__flashVarsStr!=""){i+='   <param name="flashvars" value="'+I.__flashVarsStr+'" \/>\n'}
            i+='    <embed ';for(var l in I.__embedAttrs){i+=l+'="'+I.__embedAttrs[l]+'"'+' '}
            i+='><\/embed>\n<\/object>';return i},__forSetAttribute:function(I,i){var l=this;if(typeof(I)=="undefined"||I==''||typeof(i)=="undefined"||i==''){return}
            I=I.toLowerCase();switch(I){case "classid":break;case "pluginspage":l.__embedAttrs[I]=i;break;case "onafterupdate":case "onbeforeupdate":case "onblur":case "oncellchange":case "onclick":case "ondblClick":case "ondrag":case "ondragend":case "ondragenter":case "ondragleave":case "ondragover":case "ondrop":case "onfinish":case "onfocus":case "onhelp":case "onmousedown":case "onmouseup":case "onmouseover":case "onmousemove":case "onmouseout":case "onkeypress":case "onkeydown":case "onkeyup":case "onload":case "onlosecapture":case "onpropertychange":case "onreadystatechange":case "onrowsdelete":case "onrowenter":case "onrowexit":case "onrowsinserted":case "onstart":case "onscroll":case "onbeforeeditfocus":case "onactivate":case "onbeforedeactivate":case "ondeactivate":case "codebase":l.__objAttrs[I]=i;break;case "src":case "movie":l.__embedAttrs["src"]=i;l.__params["movie"]=i;break;case "width":case "height":case "align":case "vspace":case "hspace":case "title":case "class":case "name":case "id":case "accesskey":case "tabindex":case "type":l.__objAttrs[I]=l.__embedAttrs[I]=i;break;default:l.__params[I]=l.__embedAttrs[I]=i}},__forGetAttribute:function(i){var I=this;i=i.toLowerCase();if(typeof I.__objAttrs[i]!="undefined"){return I.__objAttrs[i]}else if(typeof I.__params[i]!="undefined"){return I.__params[i]}else if(typeof I.__embedAttrs[i]!="undefined"){return I.__embedAttrs[i]}else{return null}},setAttribute:function(I,i){this.__forSetAttribute(I,i)},getAttribute:function(i){return this.__forGetAttribute(i)},addVariable:function(I,i){var l=this;if(l.escapeIs){I=escape(I);i=escape(i)}
            if(l.__flashVarsStr==""){l.__flashVarsStr=I+"="+i}else{l.__flashVarsStr+="&"+I+"="+i}
            l.__embedAttrs["FlashVars"]=l.__flashVarsStr},getVariable:function(I){var o=this,i=o.__flashVarsStr;if(o.escapeIs){I=escape(I)}
            var l=new RegExp(I+"=([^\\&]*)(\\&?)","i").exec(i);if(o.escapeIs){return unescape(RegExp.$1)}
            return RegExp.$1},addParam:function(I,i){this.__forSetAttribute(I,i)},getParam:function(i){return this.__forGetAttribute(i)},write:function(i){var I=this;if(typeof i=="string"){document.getElementById(i).innerHTML=I.getFlashHtml()}else if(typeof i=="object"){i.innerHTML=I.getFlashHtml()}}};
            </script>
        </body>
        </html>
    `
    parse(content)
}

======================

Result

method a

html head title #text meta meta meta style #cdata-section body #text div #text h1 #text #text p #text #text p a #text #text em #text #text #text script #cdata-section

nodeCount:  29

method b

html head title #text body #text div #text h1 #text #text p #text #text p a #text #text em #text #text #text 

nodeCount:  22

method c

html head title #text meta meta style #cdata-section body #text div #text h1 #text #text p #text #text p a #text #text em #text #text #text script #cdata-section

nodeCount:  28

a) Without removeNode() and without node.Free(), nodeCount is 29. b) With removeNode() but without node.Free(), nodeCount is 22. c) With removeNode() and with node.Free(), nodeCount is 28.

Result c indicates that the node removal failed.

lestrrat commented 3 years ago

Thank you. Checking now....

lestrrat commented 3 years ago

Well, I don't have too much time on me right now, but from what I can tell, if you can wait to free the nodes until after Walk(), it should work.

# (pseudocode)

var toRemove []types.Node
doc.Walk(func(n types.Node) error {
   ....
   if ... {
     removeNode(n)
     toRemove = append(toRemove, n)
   }
})

for _, n := range toRemove {
   n.Free()
}

I haven't checked, but I think doing stuff that changes the tree structure from within Walk() causes problems

HTH

asyncins commented 3 years ago

Well, I don't have too much time on me right now, but from what I can tell, if you can wait to free the nodes until after Walk(), it should work.

It works. Nice!

Thank you! You helped me solve the problem, thank you!!!

A working demo is here:

// removeNode detaches node from its parent.
//
// Any error from looking up the parent or from RemoveChild itself is printed;
// the function has no return value, so printing is the only way it can report
// a failure. A node without a parent is left alone silently, since there is
// nothing to detach it from.
//
// removeNode does not free node; the caller remains responsible for calling
// node.Free() once it is done with the detached node.
func removeNode(node types.Node) {
    parent, err := node.ParentNode()
    if err != nil {
        fmt.Print(err)
        return
    }
    if parent == nil {
        // No parent and no error: printing err here would only emit "<nil>".
        return
    }
    // RemoveChild returns an error too; surface it rather than dropping it.
    if err := parent.RemoveChild(node); err != nil {
        fmt.Print(err)
    }
}

// parse repeatedly parses content as HTML, strips unwanted nodes, and prints
// what is left.
//
// Each of the 100000 rounds:
//  1. sleeps 10ms and prints the round number;
//  2. parses content, skipping the round if parsing fails;
//  3. walks the document, detaching every node named meta, style, script or
//     noise and remembering it;
//  4. frees the remembered nodes only after that walk has finished;
//  5. walks the document again, printing each node name and counting nodes;
//  6. frees the document and prints the count.
//
// Errors are printed rather than returned because parse has no result value.
func parse(content string) {
    for i := 0; i < 100000; i++ {
        time.Sleep(time.Millisecond * 10)
        fmt.Println("current is: ", i)

        doc, err := libxml2.ParseHTMLString(content)
        if err != nil {
            // doc must not be used when err is non-nil; walking or freeing it
            // would dereference an invalid document.
            fmt.Print(err)
            continue
        }

        var toRemove []types.Node
        err = doc.Walk(func(n types.Node) error {
            switch n.NodeName() {
            case "meta", "style", "script", "noise":
                removeNode(n)
                toRemove = append(toRemove, n)
            }
            return nil
        })
        if err != nil {
            fmt.Print(err)
        }

        // Free the detached nodes only now that the walk is over: freeing them
        // from inside the callback made the removal appear not to happen.
        for _, n := range toRemove {
            n.Free()
        }

        nodeCount := 0
        err = doc.Walk(func(n types.Node) error {
            fmt.Print(n.NodeName(), " ")
            nodeCount++
            return nil
        })
        if err != nil {
            fmt.Print(err)
        }

        // Explicit call rather than defer: a defer inside this loop would not
        // run until parse returns.
        doc.Free()
        fmt.Println("nodeCount: ", nodeCount)
    }
}

Memory usage stays between 5M and 6M, nice!