Closed abaheti95 closed 7 months ago
No worries. I wrote a very short script which does the job
# We will split the ACL bib file into multiple files such that each is < 50MB
import sys
import os
from time import time
import logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
def save_in_txt(list_of_strings, save_file, no_strip=False, encoding="utf-8"):
    """Write each string in *list_of_strings* to *save_file*, one per line.

    Args:
        list_of_strings: Iterable of strings to write.
        save_file: Path of the file to create or overwrite.
        no_strip: When False (the default) surrounding whitespace is stripped
            from every string before writing; when True strings are written
            unchanged.
        encoding: Text encoding used for the output file. Set explicitly so
            non-ASCII characters are written the same way on every platform
            instead of depending on the locale default.

    A newline is always appended after each string, so with ``no_strip=True``
    a string that already ends in a newline produces an extra blank line.
    """
    with open(save_file, "w", encoding=encoding) as writer:
        for line in list_of_strings:
            if not no_strip:
                line = line.strip()
            writer.write(f"{line}\n")
def load_from_txt(txt_file, encoding="utf-8"):
    """Read *txt_file* and return its lines as a list of stripped strings.

    Args:
        txt_file: Path of the text file to read.
        encoding: Text encoding used to decode the file. Set explicitly so
            non-ASCII characters are decoded the same way on every platform
            instead of depending on the locale default.

    Returns:
        A list with one string per line of the file. Leading and trailing
        whitespace (including the newline) is removed from every line, so
        blank lines come back as empty strings.
    """
    with open(txt_file, "r", encoding=encoding) as reader:
        return [line.strip() for line in reader]
acl_bib_file = "anthology.bib"

# Load the bib file
start_time = time()
bib_data = load_from_txt(acl_bib_file)
logging.info(f"Loaded {len(bib_data)} lines from {acl_bib_file} in {time()-start_time:.2f} seconds")

# The lines were stripped on load, so a line that is exactly "}" is treated as
# the end of a BibTeX entry. Files are only ever cut right after such a line,
# which keeps every entry whole.
total_entries = sum(1 for line in bib_data if line == "}")
# total_entries = 92402 (value noted by the original author)
logging.info(f"Found {total_entries} entries in {acl_bib_file}")

# The split is by entry count rather than by measured byte size: 70K entries
# per file is the author's chosen cap for staying under the 50MB target.
entries_per_file = 70000

file_num = 1
prev_start_index = 0
cur_entries = 0
for i, line in enumerate(bib_data):
    if line != "}":
        continue
    cur_entries += 1
    if cur_entries == entries_per_file:
        save_in_txt(bib_data[prev_start_index:i+1], f"anthology_{file_num}.bib")
        logging.info(f"Saved {prev_start_index} to {i} in anthology_{file_num}.bib")
        file_num += 1
        prev_start_index = i+1
        cur_entries = 0

# Flush whatever is left after the last full chunk. Doing this after the loop
# (instead of testing for the last index inside it) means the tail is written
# even when the file ends with blank lines rather than a closing "}". A tail
# made up solely of blank lines is skipped so no empty split file is created.
if any(bib_data[prev_start_index:]):
    save_in_txt(bib_data[prev_start_index:], f"anthology_{file_num}.bib")
    logging.info(f"Saved {prev_start_index} to {len(bib_data)-1} in anthology_{file_num}.bib")
Hi @abaheti95, due to a tight schedule I couldn't finish it, but I'm preparing an auto-release that updates the splits weekly (or monthly), though I cannot give any ETA currently.