vmtmxmf5 / Pytorch-

pytorch로 머신러닝~딥러닝 구현
3 stars 0 forks source link

Json 병합과 분리 #20

Open vmtmxmf5 opened 2 years ago

vmtmxmf5 commented 2 years ago
def MergeJson(
    cleansing_folder='C:/Users/CPB06GameN/글을쓰자/PyTorch-master/연습폴더/cleansing_json',
    final_file='C:/ff/test/AIhub_total.json',
):
    '''
    Append JSON-array chunk files onto an existing merged JSON-array file.

    ``final_file`` must already exist and contain a JSON array; it is meant to
    be a copy of the first chunk in ``cleansing_folder``. The remaining chunks
    are read from ``final_1.json`` .. ``final_{n-1}.json`` inside
    ``cleansing_folder``, where ``n`` is the number of entries in that folder,
    and their elements are spliced in before the closing ``]`` of
    ``final_file``. The merge happens in place.

    Args:
        cleansing_folder: Folder holding the ``final_{i}.json`` chunk files.
        final_file: Path of the merged file that is extended in place.

    Raises:
        ValueError: If ``final_file`` does not end with a JSON array.
    '''
    import os
    import io

    def _array_end(handle):
        # Return (offset of the closing ']', True if the array has no items).
        # Walks backwards so trailing whitespace/newlines are tolerated without
        # loading the (potentially huge) merged file into memory.
        offset = handle.seek(0, io.SEEK_END)
        closing = None
        while offset > 0:
            offset -= 1
            handle.seek(offset)
            byte = handle.read(1)
            if byte.isspace():
                continue
            if closing is None:
                if byte != b']':
                    raise ValueError('{} does not end with a JSON array'.format(final_file))
                closing = offset
                continue
            return closing, byte == b'['
        raise ValueError('{} does not contain a JSON array'.format(final_file))

    chunk_total = len(os.listdir(cleansing_folder))
    with open(final_file, 'rb+') as merged:
        for index in range(1, chunk_total):
            chunk_path = cleansing_folder + '/final_{}.json'.format(index)
            with open(chunk_path, 'rb') as chunk:
                payload = chunk.read().strip()
            # An empty chunk ("[]" or blank file) would otherwise leave a
            # dangling ", ]" in the merged array.
            if not payload[1:-1].strip():
                continue
            closing, is_empty = _array_end(merged)
            merged.seek(closing)
            # Overwrite the old ']' with a separator plus the chunk's items;
            # payload[1:] still carries the chunk's own closing ']'.
            merged.write((b'' if is_empty else b', ') + payload[1:])
            # Drop any stale bytes (e.g. old trailing whitespace) past the new end.
            merged.truncate()

import json

# Load the whole JSON array from final.json into memory.
with open('final.json', encoding='utf-8') as f:
    data = json.load(f)

# Write the array back out in slices of at most chunkSize items.
chunkSize = 10000
for i in range(0, len(data), chunkSize):
    # i // chunkSize numbers the output files 0, 1, 2, ... in order.
    with open('./finalchunk/' + 'final' + str(i // chunkSize) + '.json', 'w') as outfile:
        json.dump(data[i:i + chunkSize], outfile)

import json

# Expects `data` to already hold the list loaded from the source JSON file.
chunkSize = 10000
# Slicing starts at index 1866. The upper bound is len(data), not len(data)+1,
# so a step that lands exactly on the end does not emit an empty "[]" file.
for i in range(1866, len(data), chunkSize):
    # Each output file is named after the starting index of its slice.
    with open('./finalchunk/' + 'final' + str(i) + '.json', 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable instead of \u escapes.
        json.dump(data[i:i + chunkSize], f, ensure_ascii=False)


- 경로 Converter

```python
import re

def PathConverter(path):
    '''
    Split a file path into its file name and a '/'-joined folder path.

    Windows paths should be passed as a raw string (r"...") so that the
    backslashes are not interpreted as escape sequences. Both backslash and
    forward-slash separators are accepted.

    Args:
        path: File path using '\\' and/or '/' as separators.

    Returns:
        A ``(file_name, folder_path)`` tuple. ``folder_path`` uses '/' as its
        separator and is '' when ``path`` has no folder component.
    '''
    # Normalize first so the last separator is found regardless of its style.
    normalized = path.replace('\\', '/')
    folder, _, file_name = normalized.rpartition('/')
    return file_name, folder
# Example: split a Windows path into its file name and a '/'-joined folder path.
AIhub_file, folder_path = PathConverter(r"C:\ff\test\AIhub_total.json")
import re

def LargeJsonToPandas(folder_path, original_file_name, revised_file_name):
    """Filter a line-formatted JSON file down to selected fields, one object per line.

    Reads ``folder_path/original_file_name`` line by line and keeps only the
    lines that contain one of the wanted keys. A line containing
    ``"src_lang":`` opens a new object (a ``{`` is written before it) and a
    line containing ``"domain":`` closes it (``}`` and a newline are written
    after it). Kept lines are written stripped of surrounding whitespace.

    The result is appended to ``folder_path/revised_file_name``; existing
    content in that file is preserved.

    Args:
        folder_path: Folder containing the input file and receiving the output.
        original_file_name: Name of the input JSON file.
        revised_file_name: Name of the output file to append to.
    """
    wanted_keys = (
        '"src_lang":', '"src_text_raw":', '"src_text":', '"tgt_lang":',
        '"tgt_text_raw":', '"tgt_text":', '"origin":', '"domain":',
    )
    source_path = folder_path + '/' + original_file_name
    target_path = folder_path + '/' + f'{revised_file_name}'

    with open(source_path, 'r', encoding='utf-8') as f:
        # Open the output once instead of once per matching line.
        with open(target_path, 'a', encoding='utf-8') as g:
            for line in f:
                # The keys are plain literals, so substring tests are
                # equivalent to the regex alternation and cheaper.
                if not any(key in line for key in wanted_keys):
                    continue
                fragment = line.strip()
                if '"src_lang":' in line:
                    g.write('{')
                    g.write(fragment)
                elif '"domain":' in line:
                    g.write(fragment)
                    g.write('}\n')
                else:
                    g.write(fragment)
vmtmxmf5 commented 2 years ago

속도 개선 버전 (실험 중)

import re
import json

def LargeJsonToPandas(path, out_path='/content/drive/MyDrive/Data/test.json', batch_size=2000):
    """Filter a line-formatted JSON file down to selected fields, one object per line.

    Reads ``path`` line by line and keeps only the lines that contain one of
    the wanted keys. A line containing ``"src_lang":`` opens a new object (a
    ``{`` is placed before it) and a line containing ``"domain":`` closes it
    (``}`` and a newline are placed after it). Kept lines are stripped of
    surrounding whitespace.

    Output is buffered in memory and appended to ``out_path`` every
    ``batch_size`` completed objects; whatever is still buffered when the
    input ends is written as well.

    Args:
        path: Input JSON file, read as UTF-8.
        out_path: File to append the filtered objects to, written as UTF-8.
        batch_size: Number of completed objects to buffer between writes.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    wanted_keys = (
        '"src_lang":', '"src_text_raw":', '"src_text":', '"tgt_lang":',
        '"tgt_text_raw":', '"tgt_text":', '"origin":', '"domain":',
    )
    pending = []  # text fragments not yet written to out_path
    cnt = 0       # number of completed objects seen so far

    with open(path, 'r', encoding='utf-8') as f:
        with open(out_path, 'a', encoding='utf-8') as g:
            for line in f:
                if not any(key in line for key in wanted_keys):
                    continue
                fragment = line.strip()
                if '"src_lang":' in line:
                    pending.append('{' + fragment)
                elif '"domain":' in line:
                    pending.append(fragment + '}\n')
                    cnt += 1
                    # Flush only right after an object is completed, so no
                    # field line is ever skipped in favour of a flush.
                    if cnt % batch_size == 0:
                        g.write(''.join(pending))
                        pending.clear()
                else:
                    pending.append(fragment)
            # Write the final, possibly partial, batch.
            if pending:
                g.write(''.join(pending))

# Run the converter on one input file.
LargeJsonToPandas('/content/final_4.json')
vmtmxmf5 commented 2 years ago
import re
import json

def LargeJsonToPandas(path, out_path='/content/drive/MyDrive/Data/test.json', batch_size=2000):
    """Filter a line-formatted JSON file down to selected fields, one object per line.

    Reads ``path`` line by line and keeps only the lines that contain one of
    the wanted keys. A line containing ``"src_lang":`` starts a new object (a
    ``{`` is placed before it) and a line containing ``"domain":`` finishes it
    (``}`` and a newline are placed after it). Kept lines are stripped of
    surrounding whitespace.

    Finished objects are collected in memory and appended to ``out_path`` once
    ``batch_size`` of them have accumulated. Everything still held in memory
    when the input ends is written too, so the result does not depend on how
    the input's last line is terminated.

    Args:
        path: Input JSON file, read as UTF-8.
        out_path: File to append the filtered objects to, written as UTF-8.
        batch_size: Number of finished objects to collect between writes.
    """
    wanted_keys = (
        '"src_lang":', '"src_text_raw":', '"src_text":', '"tgt_lang":',
        '"tgt_text_raw":', '"tgt_text":', '"origin":', '"domain":',
    )
    current = []   # fragments of the object being assembled
    finished = []  # completed objects waiting to be written

    with open(path, 'r', encoding='utf-8') as f:
        # Open the output once rather than on every input line.
        with open(out_path, 'a', encoding='utf-8') as g:
            for line in f:
                if not any(key in line for key in wanted_keys):
                    continue
                fragment = line.strip()
                if '"src_lang":' in line:
                    current.append('{')
                    current.append(fragment)
                elif '"domain":' in line:
                    current.append(fragment)
                    current.append('}\n')
                    finished.append(''.join(current))
                    current = []
                    if len(finished) >= batch_size:
                        g.writelines(finished)
                        finished = []
                else:
                    current.append(fragment)
            # Flush after the loop instead of waiting for a literal ']' line,
            # which never compares equal when the line ends with a newline.
            g.writelines(finished)
            if current:
                g.write(''.join(current))

# Run the converter on one input file.
LargeJsonToPandas('/content/final_4.json')