If I run `import sklearn` (or `from sklearn import manifold`) before opening the searcher with `searcher = ix.searcher()`, I get a MemoryError:
#!python
File ".../sklearn_whoosh_memory_error.py", line 58, in open_index
wh_searcher = ix.searcher()
File "...\lib\site-packages\whoosh\index.py", line 318, in searcher
return Searcher(self.reader(), fromindex=self, **kwargs)
File "...\lib\site-packages\whoosh\index.py", line 548, in reader
info.generation, reuse=reuse)
File "...\lib\site-packages\whoosh\index.py", line 535, in _reader
readers = [segreader(segment) for segment in segments]
File "...\lib\site-packages\whoosh\index.py", line 535, in <listcomp>
readers = [segreader(segment) for segment in segments]
File "...\lib\site-packages\whoosh\index.py", line 524, in segreader
generation=generation)
File "...\lib\site-packages\whoosh\reading.py", line 620, in __init__
self._terms = self._codec.terms_reader(self._storage, segment)
File "...\lib\site-packages\whoosh\codec\whoosh3.py", line 122, in terms_reader
postfile = segment.open_file(storage, self.POSTS_EXT)
File "...\lib\site-packages\whoosh\codec\base.py", line 556, in open_file
return storage.open_file(fname, **kwargs)
File "...\lib\site-packages\whoosh\filedb\filestore.py", line 333, in open_file
return self.a.open_file(name, *args, **kwargs)
File "...\lib\site-packages\whoosh\filedb\compound.py", line 121, in open_file
f = BufferFile(buf, name=name)
File "...\lib\site-packages\whoosh\filedb\structfile.py", line 357, in __init__
self.file = BytesIO(buf)
MemoryError
The solution is to import sklearn after the whoosh searcher is closed: searcher.close().
Note, I have more than 3 GB of available memory, while the peak working memory set of the whole script is about 1.5 GB. At that, my indexdir contains segments of total size 33 GB, the largest segment is 7.5 GB.
Here is example of my code:
#!python
def show_memory(memory_tuple):
    """Print the current and peak working-set sizes, in whole megabytes.

    `memory_tuple` is the value returned by psutil's Process.memory_info();
    on Windows it exposes `wset` (working set) and `peak_wset` (peak
    working set), both in bytes.
    """
    mb = 1024 * 1024
    wset_mb = memory_tuple.wset // mb
    peak_wset_mb = memory_tuple.peak_wset // mb
    print('wset = {} MB ♦ peak_wset = {} MB'.format(wset_mb, peak_wset_mb))
def show_event(event):
    """Print `event` padded with dashes to 45 columns, then ': ' (no newline).

    Keeps the cursor on the same line so show_memory() can append the
    numbers immediately after the label.
    """
    label = event.ljust(45, '-')
    print(label + ':', end=' ')
# Instrumentation: print this process's memory usage after each import so
# the growth can be attributed to a specific module.  Import ORDER is the
# point of this repro — do not reorder these statements.
import os
import psutil
process = psutil.Process(os.getpid())
show_event('Start')
show_memory(process.memory_info())
from whoosh.index import open_dir
show_event('whoosh.index')
show_memory(process.memory_info())
from whoosh.qparser import MultifieldParser, PhrasePlugin, SequencePlugin
show_event('whoosh.qparser')
show_memory(process.memory_info())
# NOTE(review): importing sklearn BEFORE ix.searcher() is what triggers the
# reported MemoryError; moving these two imports after searcher.close()
# (see the comment at the end of main()) avoids it.
from sklearn import manifold
show_event('from sklearn import manifold')
show_memory(process.memory_info())
from sklearn.decomposition import PCA
show_event('from sklearn.decomposition import PCA')
show_memory(process.memory_info())
def open_index():
    """Open the existing whoosh index and bind the module-level searcher
    and parser used by search_whoosh_index()."""
    global wh_parser
    global wh_searcher
    ix = open_dir(r"\\a\b\c")
    # This searcher creation is where the reported MemoryError surfaces.
    wh_searcher = ix.searcher()
    wh_parser = MultifieldParser(["q", "w", "e"], schema=ix.schema)
    # Swap the phrase plugin for the sequence plugin on the parser.
    wh_parser.remove_plugin_class(PhrasePlugin)
    wh_parser.add_plugin(SequencePlugin)
def close_searcher():
    """Close the module-level whoosh searcher opened by open_index()."""
    wh_searcher.close()
def search_whoosh_index(search_query):
    """Parse `search_query` with the module-level parser and run it against
    the open searcher; returns at most one unscored, unsorted hit."""
    parsed_query = wh_parser.parse(search_query)
    return wh_searcher.search(parsed_query, limit=1, scored=False,
                              sortedby=None)
def main():
    """Drive the repro: open the index, run one query, close the searcher,
    printing memory usage after each step."""
    open_index()
    show_event('open_index()')
    show_memory(process.memory_info())
    search_whoosh_index('some query terms')
    show_event('search_whoosh_index')
    show_memory(process.memory_info())
    close_searcher()
    show_event('close_searcher')
    show_memory(process.memory_info())
    # Workaround: importing sklearn HERE — after searcher.close() — instead
    # of at the top of the file avoids the MemoryError:
    # from sklearn import manifold
    # from sklearn.decomposition import PCA
if __name__ == '__main__':
    try:
        main()
        show_event('Success')
        show_memory(process.memory_info())
    # Catch only MemoryError: the original `except Exception` labelled ANY
    # failure as 'Memory error' and silently discarded its traceback, which
    # would hide unrelated bugs in the repro.  Other exceptions now
    # propagate normally with their traceback.
    except MemoryError:
        show_event('Memory error')
        show_memory(process.memory_info())
Original report by mx2048 (Bitbucket: mx2048, GitHub: mx2048).
If I run `import sklearn` or `from sklearn import manifold` before opening the searcher with `searcher = ix.searcher()`, I get a MemoryError. The solution is to `import sklearn` only after the whoosh searcher is closed with `searcher.close()`. Note, I have more than 3 GB of available memory, while the peak working memory set of the whole script is about 1.5 GB. At that, my `indexdir` contains segments of total size 33 GB; the largest segment is 7.5 GB. Here is an example of my code:
Here is the output on success:
I don't know if the whoosh project is abandoned or not. Anyway, I hope the solution to this issue will help somebody.