Open ratsgo opened 4 years ago
분할 코드
import random
# Input corpus and the three output splits.
SOURCE_PATH = "/Users/david/Downloads/original_data.txt"
TRAIN_PATH = "/Users/david/Downloads/ner/train.txt"
VALID_PATH = "/Users/david/Downloads/ner/valid.txt"
TEST_PATH = "/Users/david/Downloads/ner/test.txt"

# U+241E (SYMBOL FOR RECORD SEPARATOR) is written between the two sentence
# fields of every output line.
FIELD_SEPARATOR = "\u241E"


def group_documents(lines):
    """Group consecutive lines starting with "##" into documents.

    Args:
        lines: Iterable of raw lines, each normally ending in a newline.

    Returns:
        A list of strings; each is the concatenation of one run of
        consecutive "##" lines.
    """
    documents = []
    current = []
    for line in lines:
        if line.startswith("##"):
            current.append(line)
        elif current:
            # Any other line ends the current record. Nothing is emitted when
            # no "##" lines were collected, so repeated separator lines do
            # not create empty documents.
            documents.append("".join(current))
            current = []
    if current:
        # Keep the last record even when the input does not end with a
        # separator line.
        documents.append("".join(current))
    return documents


def format_document(document):
    """Turn one grouped document into a single output line.

    The second and third lines of the document are used: a trailing "." is
    appended to each when missing, every "## " occurrence is removed, and
    the two results are joined with FIELD_SEPARATOR.

    Args:
        document: One string produced by group_documents.

    Returns:
        The formatted line, without a trailing newline.

    Raises:
        ValueError: If the document has fewer than three lines.
    """
    sentences = document.strip().split("\n")
    if len(sentences) < 3:
        raise ValueError(
            "expected at least 3 '##' lines per document, got "
            f"{len(sentences)}: {document!r}"
        )
    fields = []
    for sentence in sentences[1:3]:
        if not sentence.endswith("."):
            sentence += "."
        fields.append(sentence.replace("## ", ""))
    return FIELD_SEPARATOR.join(fields)


def split_indices(num_of_total_data, seed=7, train_ratio=0.8, valid_ratio=0.1):
    """Randomly partition range(num_of_total_data) into three index sets.

    Train indices are sampled first, then validation indices are sampled
    from the remaining indices (kept in ascending order); whatever is left
    is the test set. Sets are used so later membership tests are O(1).

    Args:
        num_of_total_data: Number of items to split.
        seed: Seed for the random number generator.
        train_ratio: Fraction of items assigned to the training set.
        valid_ratio: Fraction of items assigned to the validation set.

    Returns:
        A (train, valid, test) tuple of disjoint sets of indices.
    """
    rng = random.Random(seed)
    num_of_train_data = int(num_of_total_data * train_ratio)
    num_of_valid_data = int(num_of_total_data * valid_ratio)
    train = set(rng.sample(range(num_of_total_data), num_of_train_data))
    remaining = [idx for idx in range(num_of_total_data) if idx not in train]
    valid = set(rng.sample(remaining, num_of_valid_data))
    test = {idx for idx in remaining if idx not in valid}
    return train, valid, test


def write_corpus(path, documents):
    """Write each document on its own line to path, encoded as UTF-8."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(document + "\n" for document in documents)


def main():
    """Read the raw corpus, format it, and write the train/valid/test files."""
    with open(SOURCE_PATH, "r", encoding="utf-8") as source:
        raw_corpus = source.readlines()

    corpus = [format_document(doc) for doc in group_documents(raw_corpus)]
    _, valid_idxes, test_idxes = split_indices(len(corpus))

    train_corpus, valid_corpus, test_corpus = [], [], []
    for idx, document in enumerate(corpus):
        if idx in valid_idxes:
            valid_corpus.append(document)
        elif idx in test_idxes:
            test_corpus.append(document)
        else:
            train_corpus.append(document)

    write_corpus(TRAIN_PATH, train_corpus)
    write_corpus(VALID_PATH, valid_corpus)
    write_corpus(TEST_PATH, test_corpus)


if __name__ == "__main__":
    main()
개요
개체명 인식 관련 데이터를 train, valid, test로 분리한다