Open rain1024 opened 7 years ago
Cấu hình hệ thống:
Sử dụng 2 cách:
Vấn đề: Process được khoảng 100MB text thì full RAM
Bên dưới là code sử dụng Pool.map còn code sử dụng Queue thì đã xóa rồi.
import multiprocessing
from gensim.models import Phrases
from gensim.models.phrases import Phraser
import pickle
from underthesea import word_sent
import time
from multiprocessing import Queue, Pool
# Load the whole corpus into memory as a list of stripped lines.
# NOTE(review): for a ~100MB+ corpus this list is the first source of the
# reported RAM pressure; a lazy generator would avoid it, but the Pool.map
# call below consumes the whole iterable anyway, so the list is kept.
sentences = []
# Explicit encoding so the read does not depend on the platform locale.
with open('corpus.txt', 'r', encoding='utf-8') as fs:
    for count, sentence in enumerate(fs):
        # Lightweight progress indicator every 10k lines.
        if count % 10000 == 0:
            print(count)
        sentences.append(sentence.strip())
def run(sent):
    """Word-segment one sentence and append it to the output file owned
    by the current worker process.

    Relies on ``files_fs`` being inherited from the parent process via
    fork; each worker writes only to the handle keyed by its own
    process name, so workers never share a file handle for writing.
    """
    worker_name = multiprocessing.current_process().name
    segmented = "".join(word_sent(sent, format="text"))
    files_fs[worker_name].write("%s\n" % segmented)
if __name__ == '__main__':
    num_process = 20

    # One output file per pool worker, keyed by the worker's process name
    # so run() can select its own handle after fork.
    files_fs = {}
    for i in range(1, num_process + 1):
        file_name = './corpus/ForkPoolWorker-%s' % i
        files_fs['ForkPoolWorker-%s' % i] = open(file_name, 'w')

    p = Pool(num_process)
    try:
        # BUGFIX: the original `print(p.map(run, sentences))` (which also
        # carried a stray trailing backtick — a syntax error) materialized
        # a result list with one None per sentence, contributing to the
        # reported RAM blow-up.  imap_unordered streams results in chunks
        # and discards the useless return values instead.
        for _ in p.imap_unordered(run, sentences, chunksize=1000):
            pass
    finally:
        # Shut the pool down cleanly and flush/close every output file.
        p.close()
        p.join()
        for fh in files_fs.values():
            fh.close()
Please support multithreading and multiprocessing in underthesea.