openzim / python-scraperlib

Collection of Python code to re-use across Python-based scrapers
GNU General Public License v3.0
18 stars 16 forks source link

Memory leak #81

Closed rgaudin closed 2 years ago

rgaudin commented 2 years ago

This is a documentation issue to record that we've compared pylibzim and scraperlib RAM usage to ensure there is no scraperlib memory leak in basic usage.

Scraperlib abstracts a lot of things so there are possibilities for bugs but at least the core of writing content to ZIM is safe.

pylibzim

import uuid
import sys

import libzim.writer
from libzim.writer import Hint

# Counts how many times libzim asked an item for its path (leak-test instrumentation).
path_called = 0

class UserItem(libzim.writer.Item):
    """Minimal writer item: in-memory HTML content stored under a random path."""

    # avoid per-instance __dict__ — keeps the RAM baseline small for the leak test
    __slots__ = ["content", "path"]

    def __init__(self, content):
        # random hex path, effectively collision-free for this benchmark
        self.path = uuid.uuid4().hex
        self.content = content

    def get_path(self):
        # bump the module-level counter so the test can verify call frequency
        global path_called
        path_called += 1
        return self.path

    def get_mimetype(self):
        return "text/html"

    def get_title(self):
        return ""

    def get_hints(self):
        # compression disabled so memory measurements reflect raw content handling
        return {Hint.COMPRESS: False}

    def get_contentprovider(self):
        return libzim.writer.StringProvider(self.content)

def main_no_xml():
    """Write 100k uncompressed items through raw pylibzim to measure RAM usage."""
    creator = (
        libzim.writer.Creator(filename="test-leak.zim")
        .config_indexing(False, "eng")
        .config_verbose(True)
        .config_clustersize(100)
    )
    with creator:
        for index in range(100_000):
            # payload grows with the index: entry i carries ~5*(i//10) filler bytes
            payload = f"{index}|" + "Oupsy" * (index // 10)
            creator.add_item(UserItem(payload))

    print("Done.")

if __name__ == "__main__":
    # main_no_xml() returns None, so sys.exit() reports success (status 0)
    sys.exit(main_no_xml())

pylibzim


scraperlib

import uuid
import sys

from zimscraperlib.zim.creator import Creator

def main_no_xml():
    """Write 100k uncompressed items via scraperlib's Creator to compare RAM usage."""
    creator = (
        Creator(filename="test-leak-scraperlib.zim")
        .config_indexing(False, "eng")
        .config_verbose(True)
        .config_clustersize(100)
    )
    with creator:
        for index in range(100_000):
            # payload grows with the index, mirroring the raw pylibzim benchmark
            payload = f"{index}|" + "Oupsy" * (index // 10)
            creator.add_item_for(
                path=uuid.uuid4().hex,
                title="",
                content=payload,
                should_compress=False,
            )
    print("Done.")

if __name__ == "__main__":
    # main_no_xml() returns None, so sys.exit() reports success (status 0)
    sys.exit(main_no_xml())

scraperlib