Html Agility Pack (HAP) is a free and open-source HTML parser written in C# to read/write DOM and supports plain XPATH or XSLT. It is a .NET code library that allows you to parse "out of the web" HTML files.
When I run this C# console app code (using the HtmlAgilityPack), memory usage starts to build up over time in what appears to be a memory leak, but I can't track it down, despite doing forced Garbage Collection, reusing objects, setting things to null, and clearing out information.
Where is the flaw in my code here? Or is this an issue with HtmlAgilityPack?
Things just grind to a halt after 10 minutes of this loop running. And this is on a system with 128 GB of RAM.
Doing a snapshot compare of the memory in Visual Studio over a short period of time shows the following has massive growth in object count diffs:
HtmlAgilityPack.HtmlNode +292504
HtmlAgilityPack.HtmlNodeCollection +276430
List +276430
HtmlAgilityPack.HtmlAttributeCollection +276248
List +276248
Dictionary<String, HtmlAgilityPack.HtmlAttribute> +276248
HtmlAgilityPack.HtmlAttribute +150445
HtmlAgilityPack.HtmlTextNode +121968
internal class Program
{
private static int setNum = 16020;
static HtmlDocument doc = null;
static HtmlWeb web = new HtmlWeb();
static void Main(string[] args)
{
ScrapeTypeYearName(setNum);
Console.WriteLine("Press Enter to exit...");
Console.ReadLine(); // Wait for the user to press Enter
}
public static void ScrapeTypeYearName(int setNum)
{
string setUrl = "https://www.tcdb.com/Checklist.cfm/sid/" + setNum;
doc = web.Load(setUrl);
// Process breadcrumb information
string category = "";
string year = "";
string company = "";
string setName = "";
var breadcrumb = doc.DocumentNode.SelectSingleNode("//div[@class='d-none d-md-block']/nav[@aria-label='breadcrumb']");
if (breadcrumb != null)
{
var breadcrumbItems = breadcrumb.SelectNodes("./ol/li/a");
if (breadcrumbItems != null && breadcrumbItems.Count >= 4)
{
category = breadcrumbItems[1].InnerText.Trim();
year = breadcrumbItems[3].InnerText.Trim();
Console.WriteLine("Category: " + category);
Console.WriteLine("Year: " + year);
var companyNode = breadcrumb.SelectSingleNode("following::h1[@class='site']");
var setNode = breadcrumb.SelectSingleNode("following::h3[@class='site']");
if (companyNode != null)
{
company = companyNode.InnerText.Trim();
company = company.Replace(year, "").Trim();
Console.WriteLine("Company: " + company);
}
if (setNode != null)
{
setName = setNode.InnerText.Trim();
Console.WriteLine("Set: " + setName);
}
}
}
// Do stuff here with the
// Clear the main document to free up memory
doc?.DocumentNode.RemoveAll();
setNum += 1;
if (setNum < 99999)
{
Console.WriteLine($"");
Console.WriteLine($"On set #: {setNum}");
// Force garbage collection before the next recursive call
GC.Collect();
GC.WaitForPendingFinalizers();
ScrapeTypeYearName(setNum);
}
}
}
When I run this C# console app code (using the HtmlAgilityPack), memory usage starts to build up over time in what appears to be a memory leak, but I can't track it down, despite doing forced Garbage Collection, reusing objects, setting things to null, and clearing out information.
Where is the flaw in my code here? Or is this an issue with HtmlAgilityPack?
Things just grind to a halt after 10 minutes of this loop running. And this is on a system with 128 GB of RAM.
Doing a snapshot compare of the memory in Visual Studio over a short period of time shows the following has massive growth in object count diffs:
HtmlAgilityPack.HtmlNode +292504 HtmlAgilityPack.HtmlNodeCollection +276430 List +276430
HtmlAgilityPack.HtmlAttributeCollection +276248
List +276248
Dictionary<String, HtmlAgilityPack.HtmlAttribute> +276248
HtmlAgilityPack.HtmlAttribute +150445
HtmlAgilityPack.HtmlTextNode +121968