import os
import re
import json
# Root directory holding the MRC dataset dumps (wikipedia_documents.json etc.).
data_path = "./data/aistage-mrc"
def preprocess(text):
    """Normalize a passage: collapse whitespace and strip disallowed characters.

    Keeps Latin letters, digits, Hangul syllables and jamo, Japanese kana,
    CJK ideographs, common punctuation/quote marks, and whitespace; every
    other character is removed.
    """
    # Real newlines and literal backslash-n sequences both become spaces.
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\\n", " ", text)
    # Collapse any whitespace run into a single space.
    text = re.sub(r"\s+", " ", text)
    # '#' acts as a separator (space) rather than being silently dropped.
    text = re.sub(r"#", " ", text)
    # Remove every character outside the allowed set.
    # BUG FIX: the hyphen is now escaped (\-). The original wrote `\"-“”`,
    # which made `-` a range operator from '"' (U+0022) to '“' (U+201C),
    # unintentionally keeping symbols such as ';', '@', '{', '}'.
    text = re.sub(
        r"[^a-zA-Z0-9가-힣ㄱ-ㅎㅏ-ㅣぁ-ゔァ-ヴー々〆〤一-龥<>()\s\.\?!》《≪≫\'<>〈〉:‘’%,『』「」<>・\"\-“”∧]",
        "",
        text,
    )
    return text
def run_preprocess(data_dict):
    """Clean a QA example's context while keeping `answer_start` aligned.

    The context is preprocessed in two halves, split at the answer start, so
    the number of characters removed *before* the answer can be subtracted
    from the original offset. Mutates `data_dict` in place and returns it.
    """
    start_ids = data_dict["answers"]["answer_start"][0]
    before = data_dict["context"][:start_ids]
    after = data_dict["context"][start_ids:]
    process_before = preprocess(before)
    process_after = preprocess(after)
    # Characters removed ahead of the answer shift its start position left.
    ids_move = len(before) - len(process_before)
    data_dict["context"] = process_before + process_after
    data_dict["answers"]["answer_start"][0] = start_ids - ids_move
    return data_dict
def run_preprocess_to_wiki(data_dict):
    """Clean a wiki document's "text" field in place and return the dict."""
    data_dict["text"] = preprocess(data_dict["text"])
    return data_dict
# Clean every wiki document and write the result next to the original dump.
with open(os.path.join(data_path, "wikipedia_documents.json"), "r", encoding="utf-8") as f:
    wiki = json.load(f)

# BUG FIX: iterate the actual keys instead of `range(len(wiki))` + `str(ids)`,
# which raised KeyError whenever the keys were not exactly "0".."len-1".
new_wiki = {key: run_preprocess_to_wiki(doc) for key, doc in wiki.items()}

with open(os.path.join(data_path, "preprocess_wiki.json"), "w", encoding="utf-8") as make_file:
    json.dump(new_wiki, make_file, indent="\t", ensure_ascii=False)
# Duplicated wiki docs
# Find duplicated wiki documents: bucket documents by text length as a cheap
# first-pass filter, then compare the full texts within each bucket.
with open(os.path.join(data_path, "preprocess_wiki.json"), "r", encoding="utf-8") as f:
    wiki = json.load(f)

from collections import defaultdict

length_limit = 0  # only consider documents longer than this many characters

# Map text length -> list of wiki keys whose text has that length.
# (Replaces the original Counter + repeated list.index() scan, and removes the
# call to `tqdm`, which was never imported and raised NameError.)
buckets = defaultdict(list)
for key, doc in wiki.items():
    text_len = len(doc["text"])
    if text_len > length_limit:
        buckets[text_len].append(key)

dup_index = []  # keys of documents whose text duplicates an earlier document
for keys in buckets.values():
    if len(keys) < 2:
        continue
    for i in range(len(keys)):
        for j in range(i + 1, len(keys)):
            # BUG FIX: the original guard was `if j not in dup_index`, which
            # compared an int position against the stored string keys and
            # therefore never fired, so duplicate keys could be appended
            # multiple times.
            if keys[j] not in dup_index and wiki[keys[i]]["text"] == wiki[keys[j]]["text"]:
                dup_index.append(keys[j])

print(len(dup_index), len(wiki))
# Wiki Processing
# Duplicated wiki docs
# 다 날려버리겠습니다 ㅎㅎ  (translation: "I'll delete them all, haha" — the duplicates found above)