whoosh-community / whoosh

Whoosh is a fast, featureful full-text indexing and searching library implemented in pure Python.
Other
244 stars 37 forks source link

Whoosh raises MemoryError if importing sklearn modules #485

Open fortable1999 opened 6 years ago

fortable1999 commented 6 years ago

Original report by mx2048 (Bitbucket: mx2048, GitHub: mx2048).


If I run `import sklearn` or `from sklearn import manifold` before opening the searcher with `searcher = ix.searcher()`, I get a MemoryError:

#!python

  File ".../sklearn_whoosh_memory_error.py", line 58, in open_index
    wh_searcher = ix.searcher()
  File "...\lib\site-packages\whoosh\index.py", line 318, in searcher
    return Searcher(self.reader(), fromindex=self, **kwargs)
  File "...\lib\site-packages\whoosh\index.py", line 548, in reader
    info.generation, reuse=reuse)
  File "...\lib\site-packages\whoosh\index.py", line 535, in _reader
    readers = [segreader(segment) for segment in segments]
  File "...\lib\site-packages\whoosh\index.py", line 535, in <listcomp>
    readers = [segreader(segment) for segment in segments]
  File "...\lib\site-packages\whoosh\index.py", line 524, in segreader
    generation=generation)
  File "...\lib\site-packages\whoosh\reading.py", line 620, in __init__
    self._terms = self._codec.terms_reader(self._storage, segment)
  File "...\lib\site-packages\whoosh\codec\whoosh3.py", line 122, in terms_reader
    postfile = segment.open_file(storage, self.POSTS_EXT)
  File "...\lib\site-packages\whoosh\codec\base.py", line 556, in open_file
    return storage.open_file(fname, **kwargs)
  File "...\lib\site-packages\whoosh\filedb\filestore.py", line 333, in open_file
    return self.a.open_file(name, *args, **kwargs)
  File "...\lib\site-packages\whoosh\filedb\compound.py", line 121, in open_file
    f = BufferFile(buf, name=name)
  File "...\lib\site-packages\whoosh\filedb\structfile.py", line 357, in __init__
    self.file = BytesIO(buf)
MemoryError

The workaround is to import sklearn only after the whoosh searcher has been closed with searcher.close().

Note that I have more than 3 GB of available memory, while the peak working set of the whole script is about 1.5 GB. However, my indexdir contains segments with a total size of 33 GB; the largest segment is 7.5 GB.

Here is an example of my code:

#!python

def show_memory(memory_tuple):
    """Print the current and peak working set sizes in whole megabytes.

    ``memory_tuple`` must expose ``wset`` (Windows working set) and
    ``peak_wset`` (Windows peak working set) as byte counts. Each value is
    floor-divided by 1024 * 1024 before being printed.
    """
    bytes_per_mb = 1024 * 1024

    current_mb = memory_tuple.wset // bytes_per_mb
    peak_mb = memory_tuple.peak_wset // bytes_per_mb

    print('wset = {} MB ♦ peak_wset = {} MB'.format(current_mb, peak_mb))

def show_event(event):
    """Print ``event`` padded with dashes to 45 columns, then ``': '``.

    No newline is written, so the next ``print`` continues on the same line.
    """
    padded = event.ljust(45, '-')
    print(padded, ':', sep='', end=' ')

# os supplies the current PID; psutil supplies the per-process memory counters.
import os
import psutil

# Handle on this process; process.memory_info() is sampled after every step
# below so the cost of each import shows up as its own line of output.
process = psutil.Process(os.getpid())
show_event('Start')
show_memory(process.memory_info())

# The imports are deliberately interleaved with memory readouts instead of
# being grouped at the top of the file: their order is what is being tested.
from whoosh.index import open_dir
show_event('whoosh.index')
show_memory(process.memory_info())

from whoosh.qparser import MultifieldParser, PhrasePlugin, SequencePlugin
show_event('whoosh.qparser')
show_memory(process.memory_info())

# sklearn is imported here, i.e. before main() opens the whoosh searcher.
# This is the ordering reported to end in MemoryError; the commented-out
# imports at the end of main() show the ordering that avoids it.
from sklearn import manifold
show_event('from sklearn import manifold')
show_memory(process.memory_info())

from sklearn.decomposition import PCA
show_event('from sklearn.decomposition import PCA')
show_memory(process.memory_info())

def open_index():
    """Open the existing whoosh index and publish a parser and a searcher.

    Sets the module-level globals ``wh_parser`` (a ``MultifieldParser`` over
    the fields "q", "w" and "e", with ``PhrasePlugin`` removed and
    ``SequencePlugin`` added) and ``wh_searcher`` (a searcher on the index).
    """
    global wh_parser, wh_searcher

    index = open_dir(r"\\a\b\c")
    searched_fields = ["q", "w", "e"]

    wh_parser = MultifieldParser(searched_fields, schema=index.schema)
    # The searcher is opened before the plugins are swapped, as originally.
    wh_searcher = index.searcher()
    wh_parser.remove_plugin_class(PhrasePlugin)
    wh_parser.add_plugin(SequencePlugin)

def close_searcher():
    """Close the module-level ``wh_searcher`` set up by ``open_index()``."""
    wh_searcher.close()

def search_whoosh_index(search_query):
    """Parse ``search_query`` with ``wh_parser`` and run it on ``wh_searcher``.

    The search is unscored and unsorted, and is limited to a single hit.
    Returns whatever ``wh_searcher.search`` returns.
    """
    parsed = wh_parser.parse(search_query)
    return wh_searcher.search(parsed, limit=1, scored=False, sortedby=None)

def main():
    """Open the index, run one query, close the searcher, reporting memory.

    After each of the three steps the step label and the current process
    memory figures are printed on one line.
    """

    def report(label):
        # Event label first, then the memory figures on the same line.
        show_event(label)
        show_memory(process.memory_info())

    open_index()
    report('open_index()')

    # The result is kept bound (although unused) so that it stays alive
    # until main() returns, exactly as before.
    results = search_whoosh_index('some query terms')
    report('search_whoosh_index')

    close_searcher()
    report('close_searcher')

    #Import here to resolve the issue
    # from sklearn import manifold
    # from sklearn.decomposition import PCA

if __name__ == '__main__':

    # To get the full traceback instead of the one-line summary, call main()
    # directly without the try/except.
    try:
        main()

    except MemoryError:
        # Only the failure under investigation is summarised here. Any other
        # exception (for example a missing index directory) now propagates
        # with its traceback instead of being mislabelled as a memory error.
        show_event('Memory error')
        show_memory(process.memory_info())

    else:
        show_event('Success')
        show_memory(process.memory_info())

Here is the output on success:

#!python

Start----------------------------------------: wset = 12 MB ♦ peak_wset = 12 MB
whoosh.index---------------------------------: wset = 16 MB ♦ peak_wset = 16 MB
whoosh.qparser-------------------------------: wset = 18 MB ♦ peak_wset = 18 MB
from sklearn import manifold-----------------: wset = 63 MB ♦ peak_wset = 63 MB
from sklearn.decomposition import PCA--------: wset = 63 MB ♦ peak_wset = 63 MB
open_index()---------------------------------: wset = 1543 MB ♦ peak_wset = 1543 MB
Hit:  498
search_whoosh_index--------------------------: wset = 1545 MB ♦ peak_wset = 1545 MB
close_searcher-------------------------------: wset = 806 MB ♦ peak_wset = 1545 MB
Success--------------------------------------: wset = 806 MB ♦ peak_wset = 1545 MB

I don't know if the whoosh project is abandoned or not. Anyway, I hope the solution to this issue will help somebody.

nijel commented 5 years ago

@mx2048 Can you please test if https://github.com/whoosh-community/whoosh/pull/522 helps in your case?