Closed theo-m closed 3 years ago
# benchmark_filter.py
import logging
import sys
import time

from datasets import load_dataset, set_caching_enabled

if __name__ == "__main__":
    # Turn dataset caching off and enable debug-level logging before any work.
    set_caching_enabled(False)
    logging.basicConfig(level=logging.DEBUG)

    corpus = load_dataset("bookcorpus")

    # The timer starts after the dataset is loaded, so only the filter step
    # (including argument parsing inside the try) is measured.
    started = time.time()
    try:
        train_split = corpus["train"]
        # Number of worker processes is the first command-line argument.
        worker_count = int(sys.argv[1])
        train_split.filter(lambda x: len(x["text"]) < 64, num_proc=worker_count)
    except Exception as e:
        # Any failure is reported but does not stop the elapsed time from
        # being printed below.
        print(f"cancelled: {e}")
    duration = time.time() - started
    print(duration)
Running `python benchmark_filter.py 1` (20min+) is faster than `python benchmark_filter.py 2` (2hrs+).
python benchmark_filter.py 1
python benchmark_filter.py 2
dupe of #1992
Running
python benchmark_filter.py 1
(20min+) is faster than python benchmark_filter.py 2
(2hrs+)