Closed asyncins closed 3 years ago
Please provide a complete test/program. My current guess is, (1) there's a memory leak in my RemoveChild
call, (2) you are reusing the root document node (which "owns" the node), and the memory region is somehow still being held, or (3) something else
package main
import (
"fmt"
"time"
"github.com/lestrrat-go/libxml2"
"github.com/lestrrat-go/libxml2/types"
)
// removeNode detaches node from its parent.
//
// Failures are reported on standard output rather than returned, so the
// function keeps its original signature. The node itself is not freed here.
func removeNode(node types.Node) {
	parent, err := node.ParentNode()
	if err != nil {
		fmt.Println("removeNode: looking up parent:", err)
		return
	}
	if parent == nil {
		// No parent and no error: there is nothing to detach the node from.
		return
	}
	// RemoveChild reports failure through its return value; do not drop it.
	if err := parent.RemoveChild(node); err != nil {
		fmt.Println("removeNode: removing child:", err)
	}
}
// parse repeatedly parses content as HTML and, while walking each document,
// hands every meta, style, script and "noise" node to removeNode.
//
// NOTE: doc.Free is deferred inside the loop. Deferred calls only run when
// parse returns, so none of the parsed documents is released between
// iterations.
func parse(content string) {
	const rounds = 1000000
	for round := 0; round < rounds; round++ {
		time.Sleep(10 * time.Millisecond)
		fmt.Println("current is: ", round)

		doc, parseErr := libxml2.ParseHTMLString(content)
		if parseErr != nil {
			fmt.Print(parseErr)
		}
		defer doc.Free()

		doc.Walk(func(n types.Node) error {
			switch n.NodeName() {
			case "meta", "style", "script", "noise":
				removeNode(n)
			}
			return nil
		})
	}
}
// main builds a fixed HTML sample (head with meta/style tags, a small body and
// a large inline script) and passes it to parse.
func main() {
content := `
<!doctype html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8" />
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<style type="text/css">
body {
background-color: #f0f0f2;
margin: 0;
padding: 0;
font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
}
div {
width: 600px;
margin: 5em auto;
padding: 2em;
background-color: #fdfdff;
border-radius: 0.5em;
box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
}
a:link, a:visited {
color: #38488f;
text-decoration: none;
}
@media (max-width: 700px) {
div {
margin: 0 auto;
width: auto;
}
}
</style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples in documents. You may use this
domain in literature without prior coordination or asking for permission.</p>
<p><a href="https://www.iana.org/domains/example">More information...</a></p>
<em>noise</em>
</div>
<script type="text/javascript">
if(typeof(sina)!="object"){var sina={}}
sina.$=function(i){if(!i){return null}
return document.getElementById(i)};var sinaFlash=function(V,x,X,Z,v,z,i,c,I,l,o){var w=this;if(!document.createElement||!document.getElementById){return}
w.id=x?x:'';var O=function(I,i){for(var l=0;l<I.length;l++){if(I[l]==i){return l}}
return-1},C='8.0.42.0';if(O(['eladies.sina.com.cn','ent.sina.com.cn'],document.domain)>-1){w.ver=C}else{w.ver=v?v:C}
w.ver=w.ver.replace(/\./g,',');w.__classid="clsid:D27CDB6E-AE6D-11cf-96B8-444553540000";w.__codebase="http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version="+w.ver;w.width=X;w.height=Z;w.movie=V;w.src=w.movie;w.bgcolor=z?z:'';w.quality=c?c:"high";w.__pluginspage="http://www.macromedia.com/go/getflashplayer";w.__type="application/x-shockwave-flash";w.useExpressInstall=(typeof(i)=="boolean")?i:false;w.xir=I?I:window.location;w.redirectUrl=l?l:window.location;w.detectKey=(typeof(o)=="boolean")?o:true;w.escapeIs=false;w.__objAttrs={};w.__params={};w.__embedAttrs={};w.__flashVars=[];w.__flashVarsStr="";w.__forSetAttribute("id",w.id);w.__objAttrs["classid"]=w.__classid;w.__forSetAttribute("codebase",w.__codebase);w.__forSetAttribute("width",w.width);w.__forSetAttribute("height",w.height);w.__forSetAttribute("movie",w.movie);w.__forSetAttribute("quality",w.quality);w.__forSetAttribute("pluginspage",w.__pluginspage);w.__forSetAttribute("type",w.__type);w.__forSetAttribute("bgcolor",w.bgcolor)}
sinaFlash.prototype={getFlashHtml:function(){var I=this;
if(/\((iPhone|iPad|iPod)/i.test(navigator.userAgent) && I.width>=930 && I.height<100 && /^http\:\/\/d\d\./i.test(I.movie)){return '';}//iOS不投放通栏flash广告
var i='<object ';for(var l in I.__objAttrs){i+=l+'="'+I.__objAttrs[l]+'"'+' '}
i+='>\n';for(var l in I.__params){i+=' <param name="'+l+'" value="'+I.__params[l]+'" \/>\n'}
if(I.__flashVarsStr!=""){i+=' <param name="flashvars" value="'+I.__flashVarsStr+'" \/>\n'}
i+=' <embed ';for(var l in I.__embedAttrs){i+=l+'="'+I.__embedAttrs[l]+'"'+' '}
i+='><\/embed>\n<\/object>';return i},__forSetAttribute:function(I,i){var l=this;if(typeof(I)=="undefined"||I==''||typeof(i)=="undefined"||i==''){return}
I=I.toLowerCase();switch(I){case "classid":break;case "pluginspage":l.__embedAttrs[I]=i;break;case "onafterupdate":case "onbeforeupdate":case "onblur":case "oncellchange":case "onclick":case "ondblClick":case "ondrag":case "ondragend":case "ondragenter":case "ondragleave":case "ondragover":case "ondrop":case "onfinish":case "onfocus":case "onhelp":case "onmousedown":case "onmouseup":case "onmouseover":case "onmousemove":case "onmouseout":case "onkeypress":case "onkeydown":case "onkeyup":case "onload":case "onlosecapture":case "onpropertychange":case "onreadystatechange":case "onrowsdelete":case "onrowenter":case "onrowexit":case "onrowsinserted":case "onstart":case "onscroll":case "onbeforeeditfocus":case "onactivate":case "onbeforedeactivate":case "ondeactivate":case "codebase":l.__objAttrs[I]=i;break;case "src":case "movie":l.__embedAttrs["src"]=i;l.__params["movie"]=i;break;case "width":case "height":case "align":case "vspace":case "hspace":case "title":case "class":case "name":case "id":case "accesskey":case "tabindex":case "type":l.__objAttrs[I]=l.__embedAttrs[I]=i;break;default:l.__params[I]=l.__embedAttrs[I]=i}},__forGetAttribute:function(i){var I=this;i=i.toLowerCase();if(typeof I.__objAttrs[i]!="undefined"){return I.__objAttrs[i]}else if(typeof I.__params[i]!="undefined"){return I.__params[i]}else if(typeof I.__embedAttrs[i]!="undefined"){return I.__embedAttrs[i]}else{return null}},setAttribute:function(I,i){this.__forSetAttribute(I,i)},getAttribute:function(i){return this.__forGetAttribute(i)},addVariable:function(I,i){var l=this;if(l.escapeIs){I=escape(I);i=escape(i)}
if(l.__flashVarsStr==""){l.__flashVarsStr=I+"="+i}else{l.__flashVarsStr+="&"+I+"="+i}
l.__embedAttrs["FlashVars"]=l.__flashVarsStr},getVariable:function(I){var o=this,i=o.__flashVarsStr;if(o.escapeIs){I=escape(I)}
var l=new RegExp(I+"=([^\\&]*)(\\&?)","i").exec(i);if(o.escapeIs){return unescape(RegExp.$1)}
return RegExp.$1},addParam:function(I,i){this.__forSetAttribute(I,i)},getParam:function(i){return this.__forGetAttribute(i)},write:function(i){var I=this;if(typeof i=="string"){document.getElementById(i).innerHTML=I.getFlashHtml()}else if(typeof i=="object"){i.innerHTML=I.getFlashHtml()}}};
</script>
</body>
</html>
`
// Run the parse loop against the sample document.
parse(content)
}
With an example like this, more and more memory is used.
It seems to be related not only to removing nodes, but also to the document object.
@lestrrat
This will not free any of the memory used by the doc
variable, because you're setting a defer doc.Free()
inside a for loop.
Remember, defer
only fires upon exiting the current function.
Again, I am not ruling out a bug in my code, but just putting an explicit doc.Free()
after doc.Walk()
should solve a lot, if not all, of your problems.
(See how giving me the entire code told me instantly about possible problems? Please always do this for any issues you file to any repository)
The above example was not appropriate; the defer
should be outside the for
loop.
With the function parse
changed as below, you can see what I mean by more and more memory being used
:
// parse is the baseline variant: every iteration parses content, walks the
// document without removing anything, and frees the document straight away.
// Reported observation for this variant: memory usage stays balanced.
func parse(content string) {
	const rounds = 1000000
	for round := 0; round < rounds; round++ {
		time.Sleep(10 * time.Millisecond)
		fmt.Println("current is: ", round)

		doc, parseErr := libxml2.ParseHTMLString(content)
		if parseErr != nil {
			fmt.Print(parseErr)
		}

		doc.Walk(func(n types.Node) error {
			switch n.NodeName() {
			case "meta", "style", "script", "noise":
				// Removal is deliberately disabled in this variant:
				// removeNode(n)
			}
			return nil
		})
		doc.Free()
	}
}
The other parse
// parse is the removal variant: every iteration parses content, hands each
// meta, style, script and "noise" node to removeNode during the walk, and
// then frees the document. The detached nodes themselves are never freed.
// Reported observation for this variant: memory usage keeps growing.
func parse(content string) {
	const rounds = 1000000
	for round := 0; round < rounds; round++ {
		time.Sleep(10 * time.Millisecond)
		fmt.Println("current is: ", round)

		doc, parseErr := libxml2.ParseHTMLString(content)
		if parseErr != nil {
			fmt.Print(parseErr)
		}

		doc.Walk(func(n types.Node) error {
			switch n.NodeName() {
			case "meta", "style", "script", "noise":
				removeNode(n)
			}
			return nil
		})
		doc.Free()
	}
}
The program ran for about five minutes. Memory usage in "no remove node"
mode was about 6M; memory usage in "remove node"
mode was about 300M.
@lestrrat
Did you try putting node.Free()
in removeNode()
?
Did you try putting
node.Free()
in removeNode()?
// removeNode detaches node from its parent and frees it immediately
// afterwards. If the parent lookup fails or yields no parent, the error value
// is printed instead and nothing is removed or freed.
func removeNode(node types.Node) {
	parent, lookupErr := node.ParentNode()
	if lookupErr != nil || parent == nil {
		fmt.Print(lookupErr)
		return
	}
	parent.RemoveChild(node)
	node.Free() // freeing right after detaching: this has been tried before
}
If node.Free() is used,
the remove operation does not take effect - the style/script/meta
tags still exist and have not been removed.
If node.Free() is not used,
the style/script/meta
tags are removed.
Then, this issue is not about memory usage. Please create a minimal test case that reproduces your "has not been removed" problem.
Then, this issue is not about memory usage. Please create a minimal test case that reproduces your "has not been removed" problem.
package main
import (
"fmt"
"time"
"github.com/lestrrat-go/libxml2"
"github.com/lestrrat-go/libxml2/types"
)
// removeNode detaches node from its parent. If the parent lookup fails or
// yields no parent, the error value is printed instead.
func removeNode(node types.Node) {
	parent, lookupErr := node.ParentNode()
	if lookupErr != nil || parent == nil {
		fmt.Print(lookupErr)
		return
	}
	parent.RemoveChild(node)
	// Toggle for the experiment: uncomment to free the node right after
	// detaching it.
	// node.Free()
}
// parse parses content once, prints the name of every node visited by Walk,
// and finally prints how many nodes were visited.
func parse(content string) {
	for round := 0; round < 1; round++ {
		time.Sleep(10 * time.Millisecond)
		fmt.Println("current is: ", round)

		doc, parseErr := libxml2.ParseHTMLString(content)
		if parseErr != nil {
			fmt.Print(parseErr)
		}

		// Removal pass, disabled here; enable it to compare node listings:
		// doc.Walk(func(n types.Node) error {
		// 	nodeName := n.NodeName()
		// 	if nodeName == "meta" || nodeName == "style" || nodeName == "script" || nodeName == "noise" {
		// 		removeNode(n)
		// 	}
		// 	return nil
		// })

		visited := 0
		doc.Walk(func(n types.Node) error {
			fmt.Print(n.NodeName(), " ")
			visited++
			return nil
		})
		doc.Free()
		fmt.Println("nodeCount: ", visited)
	}
}
// main builds a fixed HTML sample (head with meta/style tags, a small body and
// a large inline script) and passes it to parse.
func main() {
content := `
<!doctype html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8" />
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<style type="text/css">
body {
background-color: #f0f0f2;
margin: 0;
padding: 0;
font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
}
div {
width: 600px;
margin: 5em auto;
padding: 2em;
background-color: #fdfdff;
border-radius: 0.5em;
box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
}
a:link, a:visited {
color: #38488f;
text-decoration: none;
}
@media (max-width: 700px) {
div {
margin: 0 auto;
width: auto;
}
}
</style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples in documents. You may use this
domain in literature without prior coordination or asking for permission.</p>
<p><a href="https://www.iana.org/domains/example">More information...</a></p>
<em>noise</em>
</div>
<script type="text/javascript">
if(typeof(sina)!="object"){var sina={}}
sina.$=function(i){if(!i){return null}
return document.getElementById(i)};var sinaFlash=function(V,x,X,Z,v,z,i,c,I,l,o){var w=this;if(!document.createElement||!document.getElementById){return}
w.id=x?x:'';var O=function(I,i){for(var l=0;l<I.length;l++){if(I[l]==i){return l}}
return-1},C='8.0.42.0';if(O(['eladies.sina.com.cn','ent.sina.com.cn'],document.domain)>-1){w.ver=C}else{w.ver=v?v:C}
w.ver=w.ver.replace(/\./g,',');w.__classid="clsid:D27CDB6E-AE6D-11cf-96B8-444553540000";w.__codebase="http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version="+w.ver;w.width=X;w.height=Z;w.movie=V;w.src=w.movie;w.bgcolor=z?z:'';w.quality=c?c:"high";w.__pluginspage="http://www.macromedia.com/go/getflashplayer";w.__type="application/x-shockwave-flash";w.useExpressInstall=(typeof(i)=="boolean")?i:false;w.xir=I?I:window.location;w.redirectUrl=l?l:window.location;w.detectKey=(typeof(o)=="boolean")?o:true;w.escapeIs=false;w.__objAttrs={};w.__params={};w.__embedAttrs={};w.__flashVars=[];w.__flashVarsStr="";w.__forSetAttribute("id",w.id);w.__objAttrs["classid"]=w.__classid;w.__forSetAttribute("codebase",w.__codebase);w.__forSetAttribute("width",w.width);w.__forSetAttribute("height",w.height);w.__forSetAttribute("movie",w.movie);w.__forSetAttribute("quality",w.quality);w.__forSetAttribute("pluginspage",w.__pluginspage);w.__forSetAttribute("type",w.__type);w.__forSetAttribute("bgcolor",w.bgcolor)}
sinaFlash.prototype={getFlashHtml:function(){var I=this;
if(/\((iPhone|iPad|iPod)/i.test(navigator.userAgent) && I.width>=930 && I.height<100 && /^http\:\/\/d\d\./i.test(I.movie)){return '';}//iOS不投放通栏flash广告
var i='<object ';for(var l in I.__objAttrs){i+=l+'="'+I.__objAttrs[l]+'"'+' '}
i+='>\n';for(var l in I.__params){i+=' <param name="'+l+'" value="'+I.__params[l]+'" \/>\n'}
if(I.__flashVarsStr!=""){i+=' <param name="flashvars" value="'+I.__flashVarsStr+'" \/>\n'}
i+=' <embed ';for(var l in I.__embedAttrs){i+=l+'="'+I.__embedAttrs[l]+'"'+' '}
i+='><\/embed>\n<\/object>';return i},__forSetAttribute:function(I,i){var l=this;if(typeof(I)=="undefined"||I==''||typeof(i)=="undefined"||i==''){return}
I=I.toLowerCase();switch(I){case "classid":break;case "pluginspage":l.__embedAttrs[I]=i;break;case "onafterupdate":case "onbeforeupdate":case "onblur":case "oncellchange":case "onclick":case "ondblClick":case "ondrag":case "ondragend":case "ondragenter":case "ondragleave":case "ondragover":case "ondrop":case "onfinish":case "onfocus":case "onhelp":case "onmousedown":case "onmouseup":case "onmouseover":case "onmousemove":case "onmouseout":case "onkeypress":case "onkeydown":case "onkeyup":case "onload":case "onlosecapture":case "onpropertychange":case "onreadystatechange":case "onrowsdelete":case "onrowenter":case "onrowexit":case "onrowsinserted":case "onstart":case "onscroll":case "onbeforeeditfocus":case "onactivate":case "onbeforedeactivate":case "ondeactivate":case "codebase":l.__objAttrs[I]=i;break;case "src":case "movie":l.__embedAttrs["src"]=i;l.__params["movie"]=i;break;case "width":case "height":case "align":case "vspace":case "hspace":case "title":case "class":case "name":case "id":case "accesskey":case "tabindex":case "type":l.__objAttrs[I]=l.__embedAttrs[I]=i;break;default:l.__params[I]=l.__embedAttrs[I]=i}},__forGetAttribute:function(i){var I=this;i=i.toLowerCase();if(typeof I.__objAttrs[i]!="undefined"){return I.__objAttrs[i]}else if(typeof I.__params[i]!="undefined"){return I.__params[i]}else if(typeof I.__embedAttrs[i]!="undefined"){return I.__embedAttrs[i]}else{return null}},setAttribute:function(I,i){this.__forSetAttribute(I,i)},getAttribute:function(i){return this.__forGetAttribute(i)},addVariable:function(I,i){var l=this;if(l.escapeIs){I=escape(I);i=escape(i)}
if(l.__flashVarsStr==""){l.__flashVarsStr=I+"="+i}else{l.__flashVarsStr+="&"+I+"="+i}
l.__embedAttrs["FlashVars"]=l.__flashVarsStr},getVariable:function(I){var o=this,i=o.__flashVarsStr;if(o.escapeIs){I=escape(I)}
var l=new RegExp(I+"=([^\\&]*)(\\&?)","i").exec(i);if(o.escapeIs){return unescape(RegExp.$1)}
return RegExp.$1},addParam:function(I,i){this.__forSetAttribute(I,i)},getParam:function(i){return this.__forGetAttribute(i)},write:function(i){var I=this;if(typeof i=="string"){document.getElementById(i).innerHTML=I.getFlashHtml()}else if(typeof i=="object"){i.innerHTML=I.getFlashHtml()}}};
</script>
</body>
</html>
`
// Run the parse loop against the sample document.
parse(content)
}
======================
method a
html head title #text meta meta meta style #cdata-section body #text div #text h1 #text #text p #text #text p a #text #text em #text #text #text script #cdata-section
nodeCount: 29
method b
html head title #text body #text div #text h1 #text #text p #text #text p a #text #text em #text #text #text
nodeCount: 22
method c
html head title #text meta meta style #cdata-section body #text div #text h1 #text #text p #text #text p a #text #text em #text #text #text script #cdata-section
nodeCount: 28
a) no removeNode()
and no node.Free()
nodeCount is 29.
b) use removeNode()
,but no node.Free()
,nodeCount is 22.
c) use removeNode()
,and use node.Free()
,nodeCount is 28.
c indicates that the node remove failed.
Thank you. Checking now....
Well, I don't have too much time on me right now, but from what I can tell, if you can wait to free the nodes until after Walk()
, it should work.
# (pseudocode)
var toRemove []types.Node
doc.Walk(func(n types.Node) error {
....
if ... {
removeNode(n)
toRemove = append(toRemove, n)
}
})
for _, n := range toRemove {
n.Free()
}
I haven't checked, but I think doing stuff that changes the tree structure from within Walk()
causes problems
HTH
Well, I don't have too much time on me right now, but from what I can tell, if you can wait to free the nodes until after
Walk()
, it should work.
It works. Nice!
Thank you! You helped me solve the problem, thank you!!!
A working demo is here:
// removeNode detaches node from its parent.
//
// Failures are reported on standard output rather than returned, so the
// function keeps its original signature. The node itself is not freed here;
// the caller decides when to free it.
func removeNode(node types.Node) {
	parent, err := node.ParentNode()
	if err != nil {
		fmt.Println("removeNode: looking up parent:", err)
		return
	}
	if parent == nil {
		// No parent and no error: there is nothing to detach the node from.
		return
	}
	// RemoveChild reports failure through its return value; do not drop it.
	if err := parent.RemoveChild(node); err != nil {
		fmt.Println("removeNode: removing child:", err)
	}
}
// parse repeatedly parses content as HTML, strips every meta, style, script
// and "noise" node, and prints the names and count of the nodes that remain.
//
// Removed nodes are collected during the walk and freed only after the walk
// has finished, and the document is freed at the end of each iteration, so
// nothing is carried over from one iteration to the next.
func parse(content string) {
	for i := 0; i < 100000; i++ {
		time.Sleep(10 * time.Millisecond)
		fmt.Println("current is: ", i)

		doc, err := libxml2.ParseHTMLString(content)
		if err != nil {
			// Without a document there is nothing to walk or free.
			fmt.Println(err)
			continue
		}

		// First pass: detach unwanted nodes, remembering them for later.
		var toRemove []types.Node
		if err := doc.Walk(func(n types.Node) error {
			switch n.NodeName() {
			case "meta", "style", "script", "noise":
				removeNode(n)
				toRemove = append(toRemove, n)
			}
			return nil
		}); err != nil {
			fmt.Println(err)
		}

		// Free the detached nodes only once the walk is over.
		// NOTE(review): removeNode cannot signal failure, so a node that could
		// not be detached is still freed here - confirm this is acceptable.
		for _, n := range toRemove {
			n.Free()
		}

		// Second pass: list what is left in the document.
		nodeCount := 0
		if err := doc.Walk(func(n types.Node) error {
			fmt.Print(n.NodeName(), " ")
			nodeCount++
			return nil
		}); err != nil {
			fmt.Println(err)
		}

		doc.Free()
		fmt.Println("nodeCount: ", nodeCount)
	}
}
The memory between 5M and 6M, nice!
When I want to remove a node:
1 - Done this way, memory continues to rise until OOM (memory usage: 20M, 150M, 800M, 2G, 7G).
2 - This happens in a loop such as
for i:=0;i<10000;i++{}
, for example when consuming from RabbitMQ or Kafka.
3 - If
node.Free()
is used, the removal does not take effect, but memory usage doesn't rise.
How can I write code to remove a node?
Please help me, thanks.