Open hxy-62 opened 6 months ago
I wrote a simple script to do this:
import os
import shutil
# Name fragment identifying the extracted files to process;
# get_all_files() skips every file whose name does not carry it.
FILE_NAME_PREFIX = "wiki_"
# Root directory of the extracted results. get_all_files() lists its
# sub-directories and collects the matching files found inside each one.
START_DIR = "/home/ubuntu/datasets/text"
# Directory that receives one "<doc id>.txt" file per document.
# NOTE: main() deletes this directory and recreates it on every run.
DOCS_SAVE_PATH = "/home/ubuntu/datasets/utf8_wikipedia_data"
def get_all_files(start_dir=None, file_name_prefix=None):
    """Collect the paths of the extracted files that should be processed.

    The search is exactly one level deep: every sub-directory of
    ``start_dir`` is listed, and each file in it whose name starts with
    ``file_name_prefix`` is kept.  Hidden files (leading ``.``), nested
    directories and non-directory entries directly under ``start_dir`` are
    skipped.

    Args:
        start_dir: Directory holding the extraction output.  Defaults to
            the module-level ``START_DIR``.
        file_name_prefix: Prefix a file name must start with.  Defaults to
            the module-level ``FILE_NAME_PREFIX``.

    Returns:
        A list of file paths ordered by directory name, then file name.

    Raises:
        OSError: If ``start_dir`` cannot be listed.
    """
    # Resolve the defaults lazily so the module constants are only needed
    # when the caller does not supply explicit values.
    if start_dir is None:
        start_dir = START_DIR
    if file_name_prefix is None:
        file_name_prefix = FILE_NAME_PREFIX

    all_files = []
    # os.listdir() returns entries in arbitrary order; sort for stable runs.
    for dir_name in sorted(os.listdir(start_dir)):
        dir_path = os.path.join(start_dir, dir_name)
        if not os.path.isdir(dir_path):
            # A stray file next to the sub-directories cannot be listed.
            continue
        for file_name in sorted(os.listdir(dir_path)):
            if file_name.startswith("."):
                continue
            # A real prefix test: a substring test would also accept names
            # such as "notes_wiki_00".
            if not file_name.startswith(file_name_prefix):
                continue
            file_path = os.path.join(dir_path, file_name)
            if os.path.isfile(file_path):
                all_files.append(file_path)
    return all_files
RECORD_END_MARKER = "</doc>"
# Every record opens with a header line such as:
#   <doc id="12" url="..." title="...">
_RECORD_START_MARKER = "<doc "
_DOC_ID_ATTR = 'id="'


def _parse_doc_id(header_line):
    """Return the value of the ``id="..."`` attribute of a record header."""
    _, found, rest = header_line.partition(_DOC_ID_ATTR)
    doc_id, closed, _ = rest.partition('"')
    if not found or not closed or not doc_id:
        raise ValueError(
            "Cannot find a document id in header: {!r}".format(header_line)
        )
    return doc_id


def _write_record(save_dir, doc_id, doc_lines):
    """Write one record to ``<save_dir>/<doc_id>.txt``; return its text length."""
    # Drop the first non-empty line of a multi-line record (presumably the
    # title repeated by the extractor -- confirm against the input format).
    # A record consisting of a single line is kept as-is.
    body_lines = doc_lines[1:] if len(doc_lines) > 1 else doc_lines
    txt_to_write = "\n".join(body_lines)
    save_path = os.path.join(save_dir, doc_id + ".txt")
    with open(save_path, "w", encoding="utf-8") as writer:
        writer.write(txt_to_write)
    return len(txt_to_write)


def extract_records_from_docs(doc_path, save_dir=None):
    """Split one extracted file into one text file per ``<doc>`` record.

    The file is streamed line by line.  A record starts at a header line
    beginning with ``<doc `` and ends at the first following line that
    contains ``RECORD_END_MARKER``.  Lines are stripped and empty lines are
    discarded.  Anything found between records (for example blank lines) is
    ignored.  A final record that is missing its end marker is still
    written, using the lines read so far.

    Args:
        doc_path: Path of the extracted file to split.
        save_dir: Existing directory that receives the ``<id>.txt`` files.
            Defaults to the module-level ``DOCS_SAVE_PATH``.

    Returns:
        The length, in characters, of the longest text written (0 when the
        file contains no record).

    Raises:
        ValueError: If a record header carries no ``id="..."`` attribute.
        OSError: If the input cannot be read or an output cannot be written.
    """
    if save_dir is None:
        save_dir = DOCS_SAVE_PATH

    max_txt_len = 0
    doc_id = None  # None while we are between two records
    doc_lines = []
    # Explicit UTF-8 so the result does not depend on the machine's locale.
    with open(doc_path, "r", encoding="utf-8") as reader:
        for raw_line in reader:
            line = raw_line.strip()
            if doc_id is None:
                # Outside a record only a header line is meaningful.
                if line.startswith(_RECORD_START_MARKER):
                    doc_id = _parse_doc_id(line)
                    doc_lines = []
                continue
            if RECORD_END_MARKER in raw_line:
                written = _write_record(save_dir, doc_id, doc_lines)
                max_txt_len = max(max_txt_len, written)
                doc_id = None
                continue
            if line:
                doc_lines.append(line)

    # The input ended inside a record: keep what was read instead of
    # failing or silently losing the document.
    if doc_id is not None:
        written = _write_record(save_dir, doc_id, doc_lines)
        max_txt_len = max(max_txt_len, written)
    return max_txt_len
def main():
    """Rebuild the output directory and split every extracted file.

    Any existing DOCS_SAVE_PATH is removed first, so each run starts from
    an empty directory.  The longest text length is printed per input file
    and once more for the whole run.
    """
    # Start from a clean output directory
    if os.path.exists(DOCS_SAVE_PATH):
        shutil.rmtree(DOCS_SAVE_PATH)
    os.makedirs(DOCS_SAVE_PATH, exist_ok=True)

    # Process each input file, tracking the longest text seen so far
    source_paths = get_all_files()
    longest_overall = 0
    for source_path in source_paths:
        longest_in_file = extract_records_from_docs(source_path)
        print("Got max len of ", longest_in_file, "for file", source_path)
        if longest_in_file > longest_overall:
            longest_overall = longest_in_file
    print("Got maximum txt length of", longest_overall)


if __name__ == "__main__":
    main()
By default, each output txt file contains multiple documents. I want each txt file to contain exactly one document. How can I do that?