Closed cdown closed 6 years ago
It seems the per-worker workload is far too small to amortize the multiprocessing overhead (pickling and IPC):
Before:
In [3]: %timeit -n 100 list(srt.parse(srt_data))
12.3 ms ± 68.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
After:
In [15]: %timeit -n 100 list(srt.parse(srt_data))
32.7 ms ± 93 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
srt develop % gd
diff --git srt.py srt.py
index 2d620ae..764f80a 100755
--- srt.py
+++ srt.py
@@ -7,6 +7,7 @@ import functools
import re
from datetime import timedelta
import logging
+import multiprocessing
log = logging.getLogger(__name__)
@@ -317,20 +318,29 @@ def parse(srt):
expected_start = 0
- for match in SRT_REGEX.finditer(srt):
- actual_start = match.start()
- _raise_if_not_contiguous(srt, expected_start, actual_start)
+ # _sre.SRE_Match objects are not serialisable by pickle
+ match_iter = (match.groups() for match in SRT_REGEX.finditer(srt))
+
+ pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
+ subs = pool.imap(_parse_single, match_iter, chunksize=10)
+ pool.close()
+ return subs
- raw_index, raw_start, raw_end, proprietary, content = match.groups()
- yield Subtitle(
- index=int(raw_index), start=srt_timestamp_to_timedelta(raw_start),
- end=srt_timestamp_to_timedelta(raw_end),
- content=content.replace('\r\n', '\n'), proprietary=proprietary,
- )
- expected_start = match.end()
+def _parse_single(match):
+ r'''
+ Given a regex match of an SRT block, convert it to a :py:class:`Subtitle`.
- _raise_if_not_contiguous(srt, expected_start, len(srt))
+ :param re.MatchObject match: A regex match of an SRT block
+ :returns: The subtitle, the start of match, and the end of match
+ :rtype: (:py:class:`Subtitle`, int, int)
+ '''
+ raw_index, raw_start, raw_end, proprietary, content = match
+ return Subtitle(
+ index=int(raw_index), start=srt_timestamp_to_timedelta(raw_start),
+ end=srt_timestamp_to_timedelta(raw_end),
+ content=content.replace('\r\n', '\n'), proprietary=proprietary,
+ )
def _raise_if_not_contiguous(srt, expected_start, actual_start):
Better with chunksize=200, but still not worth it:
In [3]: %timeit -n 100 list(srt.parse(srt_data))
26.3 ms ± 49.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Not doing this, based on the benchmark results above.
This suggests we could probably gain a lot by doing all of the per-iteration work (timestamp parsing and Subtitle construction) in threads instead, which avoids the serialization cost.