This is a documentation issue recording that we compared pylibzim and scraperlib RAM usage to ensure there is no scraperlib memory leak in basic usage.
Scraperlib abstracts a lot of things, so there is room for bugs, but at least the core path of writing content to a ZIM file is safe.
pylibzim
import uuid
import sys
import libzim.writer
from libzim.writer import Hint
path_called = 0
class UserItem(libzim.writer.Item):
__slots__ = ["content", "path"]
def __init__(self, content):
self.content = content
self.path = uuid.uuid4().hex
def get_path(self):
global path_called
path_called += 1
return self.path
def get_title(self):
return ""
def get_mimetype(self):
return "text/html"
def get_hints(self):
return {Hint.COMPRESS: False}
def get_contentprovider(self):
return libzim.writer.StringProvider(self.content)
def main_no_xml():
creator = (
libzim.writer.Creator(
filename="test-leak.zim",
)
.config_indexing(False, "eng")
.config_verbose(True)
.config_clustersize(100)
)
with creator:
# for i in range(500_000):
for i in range(100_000):
content = f"{i}|" + "Oupsy" * (i // 10)
creator.add_item(UserItem(content))
print("Done.")
if __name__ == "__main__":
sys.exit(main_no_xml())
scraperlib
import uuid
import sys
from zimscraperlib.zim.creator import Creator
def main_no_xml():
creator = (
Creator(
filename="test-leak-scraperlib.zim",
)
.config_indexing(False, "eng")
.config_verbose(True)
.config_clustersize(100)
)
with creator:
for i in range(100_000):
content = f"{i}|" + "Oupsy" * (i // 10)
creator.add_item_for(
path=uuid.uuid4().hex, title="", content=content, should_compress=False
)
print("Done.")
if __name__ == "__main__":
sys.exit(main_no_xml())
This is a documentation issue recording that we compared pylibzim and scraperlib RAM usage to ensure there is no scraperlib memory leak in basic usage.
Scraperlib abstracts a lot of things, so there is room for bugs, but at least the core path of writing content to a ZIM file is safe.
pylibzim
scraperlib