Open vmtmxmf5 opened 3 years ago
속도 개선 버전 (실험 중)
import re
import json
def LargeJsonToPandas(path: str,
                      out_path: str = '/content/drive/MyDrive/Data/test.json',
                      flush_every: int = 2000) -> None:
    """Rewrite a one-field-per-line JSON file as one record per line.

    The input is streamed line by line, so it never has to fit in memory.
    Only lines containing one of the known field keys are kept: a
    ``"src_lang":`` line opens a record with ``{``, a ``"domain":`` line
    closes it with ``}`` and a newline, and every other recognised field
    line is copied in between.  All other lines are skipped.

    Args:
        path: UTF-8 encoded input file.
        out_path: File the records are appended to.  The default is the
            location this function has always written to.
        flush_every: Number of completed records buffered between writes.

    Raises:
        ValueError: If ``flush_every`` is smaller than 1.
    """
    if flush_every < 1:
        raise ValueError('flush_every must be at least 1')

    # Compiled once; the same pattern is tested against every input line.
    field_pat = re.compile(
        '"src_lang":|"src_text_raw":|"src_text":|"tgt_lang":|'
        '"tgt_text_raw":|"tgt_text":|"origin":|"domain":'
    )
    buffered = []  # text fragments not yet written to out_path
    pending = 0    # completed records currently held in `buffered`

    # The output is opened once (append mode, explicit UTF-8) instead of
    # being reopened for every flush.
    with open(path, 'r', encoding='utf-8') as src, \
            open(out_path, 'a', encoding='utf-8') as dst:
        for line in src:
            if field_pat.search(line) is None:
                continue
            fragment = line.strip()
            if '"src_lang":' in line:
                buffered.append('{')
                buffered.append(fragment)
            elif '"domain":' in line:
                buffered.append(fragment)
                buffered.append('}\n')
                pending += 1
                # Flush only on a record boundary, so a flush can never
                # swallow a field line that belongs to an open record.
                if pending >= flush_every:
                    dst.write(''.join(buffered))
                    buffered.clear()
                    pending = 0
            else:
                buffered.append(fragment)
        # Write whatever is left; without this the records after the last
        # full batch would be lost.
        if buffered:
            dst.write(''.join(buffered))
# Run the line-by-line converter defined above on this input file.
LargeJsonToPandas('/content/final_4.json')
import re
import json
def LargeJsonToPandas(path: str,
                      out_path: str = '/content/drive/MyDrive/Data/test.json',
                      flush_every: int = 2000) -> None:
    """Stream a one-field-per-line JSON file into one-record-per-line text.

    Each input line is checked for one of the known field keys.  A
    ``"src_lang":`` line starts a record (``{`` is emitted before it), a
    ``"domain":`` line ends it (``}`` and a newline are emitted after it),
    and the remaining recognised field lines are copied in between.  Lines
    without a known key are ignored.

    Args:
        path: UTF-8 encoded input file.
        out_path: File the records are appended to.  The default is the
            location this function has always written to.
        flush_every: Number of completed records buffered between writes.

    Raises:
        ValueError: If ``flush_every`` is smaller than 1.
    """
    if flush_every < 1:
        raise ValueError('flush_every must be at least 1')

    known_keys = re.compile(
        '"src_lang":|"src_text_raw":|"src_text":|"tgt_lang":|'
        '"tgt_text_raw":|"tgt_text":|"origin":|"domain":'
    )
    chunks = []      # fragments waiting to be written
    completed = 0    # records closed since the last write

    # One output handle for the whole run, opened as UTF-8 so the result
    # does not depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as reader, \
            open(out_path, 'a', encoding='utf-8') as writer:
        for line in reader:
            if known_keys.search(line) is None:
                continue
            text = line.strip()
            if '"src_lang":' in line:
                chunks.extend(('{', text))
            elif '"domain":' in line:
                chunks.extend((text, '}\n'))
                completed += 1
                # Check the batch size only when a record has just been
                # closed, not on every input line.
                if completed >= flush_every:
                    writer.write(''.join(chunks))
                    chunks = []
                    completed = 0
            else:
                chunks.append(text)
        # End of input triggers the final write.  Comparing a line to ']'
        # is unreliable because lines read from a file keep their newline.
        if chunks:
            writer.write(''.join(chunks))
# Run the line-by-line converter defined above on this input file.
LargeJsonToPandas('/content/final_4.json')
# Load the whole JSON document, then write it back out in fixed-size
# slices, one numbered file per slice, under ./finalchunk/.
with open('final.json', encoding='utf-8') as src:
    data = json.load(src)

chunkSize = 10000
# part_no counts the slices (0, 1, 2, ...); start is the slice's first index.
for part_no, start in enumerate(range(0, len(data), chunkSize)):
    part = data[start:start + chunkSize]
    part_path = './finalchunk/' + 'final' + str(part_no) + '.json'
    with open(part_path, 'w') as sink:
        json.dump(part, sink)
# Write `data` out in 10000-item slices, starting at item 1866; each file is
# named after the index of the first item it holds.
# NOTE(review): 1866 looks like a resume offset from an earlier partial run;
# confirm before reusing.
chunkSize = 10000
# Stop at len(data), not len(data) + 1: an offset equal to len(data) selects
# an empty slice and would leave behind a file containing only "[]".
for i in range(1866, len(data), chunkSize):
    chunk_path = './finalchunk/' + 'final' + str(i) + '.json'
    # ensure_ascii=False writes non-ASCII text verbatim, so the file is
    # opened explicitly as UTF-8.
    with open(chunk_path, 'w', encoding='utf-8') as f:
        json.dump(data[i:i + chunkSize], f, ensure_ascii=False)