Closed ArneBachmann closed 6 years ago
When looking just at positive samples, I see a speedup of 120-200% (up to factor 2). But when looking at the (many more) negative samples from the test suite (which are really micro-benchmarks on mini datasets), I see slowdowns of similar magnitude, but also a wider range of performance losses (factor 1.5x-10x).
This has nothing to say about real-life speeds though, which are already quite OK, plus --only
and --except
are probably not the most-often used options (?).
In a real larger repository I could see only performance losses, probably due to the creation of more complex data structures. Here's the profiling code:
import random, subprocess

# Micro-benchmark driver: runs `sos diff` 100 times with randomly generated
# --only/--except glob patterns and collects the relative speed figures that
# the instrumented code prints ("======= <factor>" for a win of the new
# strategy, "------ <factor>" for a loss).
poss = []  # speedup factors (new strategy faster)
negs = []  # slowdown factors (new strategy slower)
for i in range(100):
    folder = ""
    folder2 = ""
    # Build random folder paths of 1-3 components, each component being
    # 2-4 random uppercase letters. Note: "%s" % x * n formats first, then
    # repeats the formatted string (left-to-right for same-precedence % and *).
    for p in range(random.randint(3, 9) // 3):
        folder += ("/" if p > 0 else "") + "%s" % chr(random.randint(65, 85)) * random.randint(1, 2) + chr(random.randint(65, 85)) * random.randint(1, 2)
    for p in range(random.randint(1, 3)):
        folder2 += ("/" if p > 0 else "") + "%s" % chr(random.randint(65, 85)) * random.randint(1, 2) + chr(random.randint(65, 85)) * random.randint(1, 2)
    # Random glob file patterns like "?AB*".
    file = "?%s*" % chr(random.randint(65, 85)) * random.randint(1, 2) + chr(random.randint(65, 85)) * random.randint(1, 2)
    file2 = "?%s*" % chr(random.randint(65, 85)) * random.randint(1, 2) + chr(random.randint(65, 85)) * random.randint(1, 2)
    # print('sos diff --only "%s/%s" --except "%s/%s"' % (folder, file, folder2, file2))
    # shell=True is needed here because the command line is assembled as a
    # single quoted string; patterns contain only [A-U?*/] so no injection risk.
    out = subprocess.Popen('sos diff %s%s' % (('--only "%s/%s" ' % (folder, file)) if random.randint(1, 2) == 1 else "", ('--except "%s/%s"' % (folder2, file2)) if random.randint(1, 2) == 1 else ""), bufsize = 1, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT).communicate()[0].decode("utf-8")
    if "=======" in out:
        poss += [float(o.split(" ")[1]) for o in out.replace("\r", "\n").split("\n") if o.startswith("======")]
        print("+%f" % poss[-1])
    elif "-----" in out:
        negs += [float(o.split(" ")[1]) for o in out.replace("\r", "\n").split("\n") if o.startswith("------")]
        print("-%f" % negs[-1])
if poss:
    print("Pos num/med/mean")
    print(len(poss))
    print(sorted(poss)[len(poss) // 2])  # median (upper median for even counts)
    print(sum(poss) / len(poss))
if negs:
    print("Neg num/med/mean")
    print(len(negs))
    print(sorted(negs)[len(negs) // 2])  # BUGFIX: was len(poss) // 2 — indexed negs with the size of poss
    print(sum(negs) / len(negs))
and here is the supposedly faster fnmatch.filter
code:
# Group the known file paths and the --only/--except patterns by their parent
# folder, then apply fnmatch.filter once per folder instead of fnmatch.fnmatch
# once per path/pattern pair (the intended speedup).
byFolder:Dict[str,List[str]] = collections.defaultdict(list)      # folder -> file names present
onlyByFolder:Dict[str,List[str]] = collections.defaultdict(list)  # folder -> --only file patterns
dontByFolder:Dict[str,List[str]] = collections.defaultdict(list)  # folder -> --except file patterns
for path, pinfo in ((_path, _pinfo) for _path, _pinfo in _.paths.items() if _pinfo is not None):  # skip deleted entries
    byFolder[path[:path.rindex(SLASH)]].append(path[path.rindex(SLASH) + 1:])
for pattern in considerOnly ?? []: onlyByFolder[pattern[:pattern.rindex(SLASH)]].append(pattern[pattern.rindex(SLASH) + 1:])
for pattern in dontConsider ?? []: dontByFolder[pattern[:pattern.rindex(SLASH)]].append(pattern[pattern.rindex(SLASH) + 1:])
for folder, paths in byFolder.items():
    # BUGFIX: seed the union with the empty set — seeding with set(paths) made
    # every path match regardless of the patterns. BUGFIX: test the user inputs
    # (considerOnly/dontConsider) for None — the defaultdicts above are never None.
    pos:Set[str] = set.union(set(), *[fnmatch.filter(paths, pattern) for pattern in onlyByFolder.get(folder, [])]) if considerOnly is not None else set(paths)  # no --only: keep everything (as a set, matching the annotation)
    neg:Set[str] = set.union(set(), *[fnmatch.filter(paths, pattern) for pattern in dontByFolder.get(folder, [])]) if dontConsider is not None else set()  # no --except: exclude nothing
    knownPathz[folder].extend(list(pos - neg))
Some optimizations:
Now we consistently see speedups in real life and only a minority of slowdowns in the test suite.
Full profiling code:
# Benchmark both strategies back to back on the same data, print the relative
# speed factor, and assert that they produce identical results.
_old:float = time.time()  # start timing the OLD strategy (per-path fnmatch.fnmatch)
for path, pinfo in _.paths.items():  # TODO this approach sometimes leaves keys with empty lists?
    # Keep path only if it has contents, matches some --only pattern (if any),
    # and matches no --except pattern; folder part must match exactly, file part by glob.
    if pinfo.size is not None\
        and (considerOnly is None or any(path[:path.rindex(SLASH)] == pattern[:pattern.rindex(SLASH)] and fnmatch.fnmatch(path[path.rindex(SLASH) + 1:], pattern[pattern.rindex(SLASH) + 1:]) for pattern in considerOnly))\
        and (dontConsider is None or not any(path[:path.rindex(SLASH)] == pattern[:pattern.rindex(SLASH)] and fnmatch.fnmatch(path[path.rindex(SLASH) + 1:], pattern[pattern.rindex(SLASH) + 1:]) for pattern in dontConsider)):
        knownPaths[os.path.dirname(path)].append(os.path.basename(path))  # TODO #249 reimplement using fnmatch.filter and set operations for all files per path for speed
_new:float = time.time()
_old = _new - _old  # compute old strategy
# NEW strategy: group paths and patterns by folder, then fnmatch.filter per folder.
byFolder:Dict[str,List[str]] = collections.defaultdict(list)      # folder -> file names present
onlyByFolder:Dict[str,List[str]] = collections.defaultdict(list)  # folder -> --only file patterns
dontByFolder:Dict[str,List[str]] = collections.defaultdict(list)  # folder -> --except file patterns
for path, pinfo in _.paths.items():
    # NOTE(review): old loop filters on pinfo.size is not None, this one on
    # pinfo is not None — confirm these agree, otherwise the assert below fails.
    if pinfo is None: continue  # quicker than generator expression above
    byFolder[path[:path.rindex(SLASH)]].append(path[path.rindex(SLASH) + 1:])
for pattern in considerOnly ?? []: onlyByFolder[pattern[:pattern.rindex(SLASH)]].append(pattern[pattern.rindex(SLASH) + 1:])
for pattern in dontConsider ?? []: dontByFolder[pattern[:pattern.rindex(SLASH)]].append(pattern[pattern.rindex(SLASH) + 1:])
for folder, paths in byFolder.items():
    # Seed unions with set() so that with no matching pattern nothing matches;
    # with no --only at all, everything matches; with no --except, nothing is excluded.
    pos:Set[str] = set.union(set(), *[fnmatch.filter(paths, pattern) for pattern in onlyByFolder.get(folder, [])]) if considerOnly is not None else set(paths)
    neg:Set[str] = set.union(set(), *[fnmatch.filter(paths, pattern) for pattern in dontByFolder.get(folder, [])]) if dontConsider is not None else set()
    knownPathz[folder] = list(pos - neg)
_new = time.time() - _new  # elapsed time of the new strategy
if _new < _old: print("%s %.2f" % ("#" * 80, _old * 100. / _new))  # was faster!
else: print("%s %.2f" % ("-" * 80, _new * 100. / _old))
# Both strategies must yield the same folder -> files mapping (order-insensitive).
assert {k: sorted(v) for k, v in knownPaths.items()} == {k: sorted(v) for k, v in knownPathz.items()}
Now usually 2x faster than before.
Went from 5 to 13 LOC and has more complex data structures now... If we don't see a big performance gain, then we should keep it as is. Will post the code and test results here.