openzim / python-scraperlib

Collection of Python code to re-use across Python-based scrapers
GNU General Public License v3.0
18 stars 16 forks source link

Memory leak #81

Closed rgaudin closed 2 years ago

rgaudin commented 2 years ago

This is a documentation issue to record that we've compared pylibzim and scraperlib RAM usage to ensure there is no scraperlib memory leak in basic usage.

Scraperlib abstracts a lot of things so there are possibilities for bugs but at least the core of writing content to ZIM is safe.

pylibzim

import uuid
import sys

import libzim.writer
from libzim.writer import Hint

# Counts how many times libzim asked an item for its path (leak-test instrumentation).
path_called = 0

class UserItem(libzim.writer.Item):
    """Minimal writer item: in-memory HTML content stored under a random path."""

    # avoid per-instance __dict__ — keeps the RAM baseline small for the leak test
    __slots__ = ["content", "path"]

    def __init__(self, content):
        # random hex path, effectively collision-free for this benchmark
        self.path = uuid.uuid4().hex
        self.content = content

    def get_path(self):
        # bump the module-level counter so the test can verify call frequency
        global path_called
        path_called += 1
        return self.path

    def get_mimetype(self):
        return "text/html"

    def get_title(self):
        return ""

    def get_hints(self):
        # compression disabled so memory measurements reflect raw content handling
        return {Hint.COMPRESS: False}

    def get_contentprovider(self):
        return libzim.writer.StringProvider(self.content)

def main_no_xml():
    """Write 100k uncompressed items through raw pylibzim to measure RAM usage."""
    creator = (
        libzim.writer.Creator(filename="test-leak.zim")
        .config_indexing(False, "eng")
        .config_verbose(True)
        .config_clustersize(100)
    )
    with creator:
        for index in range(100_000):
            # payload grows with the index: entry i carries ~5*(i//10) filler bytes
            payload = f"{index}|" + "Oupsy" * (index // 10)
            creator.add_item(UserItem(payload))

    print("Done.")

if __name__ == "__main__":
    # main_no_xml() returns None, so sys.exit() reports success (status 0)
    sys.exit(main_no_xml())

pylibzim


scraperlib

import uuid
import sys

from zimscraperlib.zim.creator import Creator

def main_no_xml():
    """Write 100k uncompressed items via scraperlib's Creator to compare RAM usage."""
    creator = (
        Creator(filename="test-leak-scraperlib.zim")
        .config_indexing(False, "eng")
        .config_verbose(True)
        .config_clustersize(100)
    )
    with creator:
        for index in range(100_000):
            # payload grows with the index, mirroring the raw pylibzim benchmark
            payload = f"{index}|" + "Oupsy" * (index // 10)
            creator.add_item_for(
                path=uuid.uuid4().hex,
                title="",
                content=payload,
                should_compress=False,
            )
    print("Done.")

if __name__ == "__main__":
    # main_no_xml() returns None, so sys.exit() reports success (status 0)
    sys.exit(main_no_xml())

scraperlib