ZK-Zhou opened 1 year ago
Switched to multiprocessing + chunk-by-chunk processing:

import json
import numpy as np
from multiprocessing import Pool, cpu_count
import tqdm
from chatglm_tokenizer.tokenization_chatglm import ChatGLMTokenizer

tokenizer = ChatGLMTokenizer(vocab_file='../chatglm_tokenizer/tokenizer.model')

def process_line(line):
    # each line of 563w_baidubaike.json is one JSON document
    line = json.loads(line)
    text = ''
    try:
        text += line['title'] + ':' + line['summary']
    except:
        # some entries have no title/summary
        pass
    for per in line['sections']:
        text += per['title'] + ':' + per['content'] + '。'
    text_id = tokenizer.encode(text, add_special_tokens=False)
    text_id.append(tokenizer.special_tokens['<eos>'])
    # drop documents that are too short; the main loop skips None results
    if len(text_id) > 5:
        return text_id
    return None

def chunked_file_reader(file, chunk_size=10000):
    # yield the file in chunks of chunk_size lines so the whole JSON never sits in memory at once
    while True:
        lines = [file.readline() for _ in range(chunk_size)]
        if not lines or not lines[0]:
            break
        yield lines

if __name__ == "__main__":
    with open('563w_baidubaike.json', 'r') as f, open('baidubaike_563w.bin', 'wb') as out_f:
        doc_ids = []
        for chunk in tqdm.tqdm(chunked_file_reader(f)):
            # tokenize each chunk of lines in parallel across 20 worker processes
            with Pool(20) as p:
                results = p.map(process_line, chunk)
            doc_ids.extend([item for sublist in results if sublist is not None for item in sublist])
        arr = np.array(doc_ids, dtype=np.uint16)
        out_f.write(arr.tobytes())
Thanks!
If memory is the problem, chunking + multiprocessing alone won't help, because doc_ids will still blow up. I split the output into multiple .bin files.
How exactly do you split it?
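For reference, one way to do the splitting described above is to flush the accumulated token ids to a new .bin shard whenever the buffer passes a size threshold, instead of keeping everything in doc_ids. A minimal sketch, reusing process_line and chunked_file_reader from the snippet above; the shard size, file naming, and pool size here are illustrative assumptions, not the commenter's exact code:

```python
import numpy as np
import tqdm
from multiprocessing import Pool

# Assumed shard size: flush to a new file roughly every 50M tokens (~100 MB of uint16 data).
MAX_IDS_PER_SHARD = 50_000_000

def flush_shard(ids, shard_idx):
    """Write the buffered token ids as one raw-uint16 .bin shard."""
    arr = np.array(ids, dtype=np.uint16)
    with open(f'baidubaike_563w_{shard_idx:03d}.bin', 'wb') as out_f:
        out_f.write(arr.tobytes())

if __name__ == "__main__":
    doc_ids, shard_idx = [], 0
    with open('563w_baidubaike.json', 'r') as f, Pool(20) as p:
        for chunk in tqdm.tqdm(chunked_file_reader(f)):
            results = p.map(process_line, chunk)
            for sublist in results:
                if sublist is not None:
                    doc_ids.extend(sublist)
            # once the buffer is large enough, write it out as its own .bin and start over
            if len(doc_ids) >= MAX_IDS_PER_SHARD:
                flush_shard(doc_ids, shard_idx)
                shard_idx += 1
                doc_ids = []
    if doc_ids:
        flush_shard(doc_ids, shard_idx)  # write whatever is left over
```

Each shard is just raw uint16 tokens, so the shards can later be read individually with np.fromfile(path, dtype=np.uint16) or simply concatenated byte-for-byte.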
Has anyone managed to process this successfully? How much memory does it actually take to process the data? Thanks!
Traceback (most recent call last):
  File "C:\Users\zhou\Desktop\baby_llm\data_process.py", line 145, in <module>
    process_baidu()
  File "C:\Users\zhou\Desktop\baby_llm\data_process.py", line 126, in process_baidu
    doc_ids += text_id
MemoryError
How can this be resolved? Has anyone run into it?
Sorry to bother you, many thanks.
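On the MemoryError above: the crash happens at doc_ids += text_id, i.e. the token ids of the whole corpus are accumulated in one Python list before anything is written out. Since the .bin output is just raw uint16 bytes, one alternative is to append each batch to the open output file as it is produced, so doc_ids never holds more than one batch. A single-process sketch, assuming the same file paths and the same special_tokens['<eos>'] usage as the snippets above; the flush interval is an arbitrary assumption:

```python
import json
import numpy as np
from chatglm_tokenizer.tokenization_chatglm import ChatGLMTokenizer

tokenizer = ChatGLMTokenizer(vocab_file='../chatglm_tokenizer/tokenizer.model')
FLUSH_EVERY = 10000  # assumed: number of documents to buffer before each write

def process_baidu_streaming():
    doc_ids = []
    with open('563w_baidubaike.json', 'r', encoding='utf-8') as f, \
         open('baidubaike_563w.bin', 'wb') as out_f:
        for i, raw in enumerate(f):
            line = json.loads(raw)
            text = ''
            try:
                text += line['title'] + ':' + line['summary']
            except (KeyError, TypeError):
                pass  # some entries lack title/summary
            for per in line['sections']:
                text += per['title'] + ':' + per['content'] + '。'
            text_id = tokenizer.encode(text, add_special_tokens=False)
            text_id.append(tokenizer.special_tokens['<eos>'])
            if len(text_id) > 5:
                doc_ids += text_id
            # periodically append the buffer to the open file so doc_ids stays small
            if (i + 1) % FLUSH_EVERY == 0 and doc_ids:
                out_f.write(np.array(doc_ids, dtype=np.uint16).tobytes())
                doc_ids = []
        if doc_ids:
            out_f.write(np.array(doc_ids, dtype=np.uint16).tobytes())

if __name__ == "__main__":
    process_baidu_streaming()
```

The resulting single .bin is byte-identical to what the original accumulate-then-write version would produce, but peak memory is bounded by the flush interval rather than by the corpus size.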