
[Data] The wiki docs contain a huge number of duplicated documents! #8

Closed jinmang2 closed 2 years ago

jinmang2 commented 2 years ago

Wiki Processing

import os
import re
import json

data_path = "./data/aistage-mrc"

def preprocess(text):
    # Replace raw and escaped newlines with spaces, then collapse whitespace runs.
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\\n", " ", text)
    text = re.sub(r"\s+", " ", text)
    # Drop '#' markers.
    text = re.sub(r"#", " ", text)
    # Keep only Latin/Korean/Japanese/CJK characters, digits, whitespace,
    # and a whitelist of punctuation; strip everything else.
    text = re.sub(r"[^a-zA-Z0-9가-힣ㄱ-ㅎㅏ-ㅣぁ-ゔァ-ヴー々〆〤一-龥<>()\s\.\?!》《≪≫\'<>〈〉:‘’%,『』「」<>・\"-“”∧]", "", text)
    return text

def run_preprocess(data_dict):
    # Clean the context while keeping the answer span aligned: preprocess the
    # text before and after answer_start separately, then shift answer_start
    # by the number of characters removed from the prefix.
    start_ids = data_dict["answers"]["answer_start"][0]
    before = data_dict["context"][:start_ids]
    after = data_dict["context"][start_ids:]
    process_before = preprocess(before)
    process_after = preprocess(after)
    process_data = process_before + process_after
    ids_move = len(before) - len(process_before)
    data_dict["context"] = process_data
    data_dict["answers"]["answer_start"][0] = start_ids - ids_move
    return data_dict

def run_preprocess_to_wiki(data_dict):
    # Wiki documents carry no answer span, so only the text needs cleaning.
    data_dict["text"] = preprocess(data_dict["text"])
    return data_dict

with open(os.path.join(data_path, "wikipedia_documents.json"), "r", encoding="utf-8") as f:
    wiki = json.load(f)

# Keys in wikipedia_documents.json are stringified integers "0" .. "N-1".
new_wiki = dict()
for ids in range(len(wiki)):
    new_wiki[str(ids)] = run_preprocess_to_wiki(wiki[str(ids)])

with open(os.path.join(data_path, "preprocess_wiki.json"), "w", encoding="utf-8") as make_file:
    json.dump(new_wiki, make_file, indent="\t", ensure_ascii=False)
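
A quick sanity check (not part of the original script) can confirm that run_preprocess keeps answer_start aligned after cleaning. The example dict below is made up and assumes the usual SQuAD-style schema where answers also carries a "text" field:

# Hypothetical example, purely to check the offset bookkeeping in run_preprocess.
example = {
    "context": "거북선은  이순신이\n만든 배이다.",
    "answers": {"answer_start": [6], "text": ["이순신"]},
}

cleaned = run_preprocess(example)
start = cleaned["answers"]["answer_start"][0]
answer = cleaned["answers"]["text"][0]
# After cleaning, the answer must still be recoverable at the shifted offset.
assert cleaned["context"][start:start + len(answer)] == answer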

Duplicated wiki docs

with open(os.path.join(data_path, "preprocess_wiki.json"), "r", encoding="utf-8") as f:
    wiki = json.load(f)

from collections import Counter
from tqdm import tqdm

# Identical documents necessarily have the same text length, so only lengths
# that occur more than once need a full-text comparison.
length_limit = 0
lengths = [len(i['text']) for i in wiki.values()]
ls = sorted([i for i in lengths if i > length_limit], reverse=True)
ls = Counter(ls)

dup_index = []
check_list = [(length, c) for length, c in dict(ls).items() if c > 1]
for length, c in tqdm(check_list):
    # Collect the ids of all documents whose text has this length.
    indexes = []
    i = 0
    while c != len(indexes):
        ind = lengths.index(length, i)
        if not indexes or indexes[-1] != ind:
            indexes.append(str(ind))
        i = ind + 1
    # Compare every pair; keep the first occurrence and mark the rest as duplicates.
    for i in range(len(indexes)):
        for j in range(i + 1, len(indexes)):
            if wiki[indexes[i]]['text'] == wiki[indexes[j]]['text']:
                if indexes[j] not in dup_index:
                    dup_index.append(indexes[j])

print(len(dup_index), len(wiki))
100%|████████████████████████████████████████████████████████████████████████████| 2161/2161 [00:01<00:00, 1102.60it/s]
(5464, 60613)

I'll just drop them all haha (a rough clean-up sketch below).
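
A minimal sketch of that clean-up, assuming dup_index from the script above holds the string ids of the redundant documents; the output file name preprocess_wiki_dedup.json is just a placeholder:

# Keep only documents whose id was not marked as a duplicate.
dup_ids = set(dup_index)
dedup_wiki = {k: v for k, v in wiki.items() if k not in dup_ids}
print(len(wiki), "->", len(dedup_wiki))

with open(os.path.join(data_path, "preprocess_wiki_dedup.json"), "w", encoding="utf-8") as f:
    json.dump(dedup_wiki, f, indent="\t", ensure_ascii=False)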

jinmang2 commented 2 years ago

The mention above was incorrect.