devrimcavusoglu / acl-bib-overleaf

Split bib files for anthology bibliography for overleaf
MIT License
7 stars 2 forks source link

Is it possible to share the script to split the most updated acl bib? #1

Closed abaheti95 closed 7 months ago

devrimcavusoglu commented 7 months ago

Hi @abaheti95, due to a tight schedule I couldn't finish it, but I'm preparing an auto-release that updates the splits weekly (or monthly), though I can't give an ETA at the moment.

abaheti95 commented 7 months ago

No worries. I wrote a very short script that does the job:

# We will split the ACL bib file into multiple files such that each is < 50MB

import sys
import os
from time import time
import logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

def save_in_txt(list_of_strings, save_file, no_strip=False, encoding="utf-8"):
    """Write each item of ``list_of_strings`` to ``save_file``, one per line.

    Args:
        list_of_strings: Iterable of lines to write.
        save_file: Path of the file to create (an existing file is overwritten).
        no_strip: When False (the default), leading/trailing whitespace is
            stripped from every line before writing. When True, lines are
            written as given.
        encoding: Text encoding of the output file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding.

    A newline is appended after every item, including when ``no_strip`` is True.
    """
    with open(save_file, "w", encoding=encoding) as writer:
        if no_strip:
            writer.writelines(f"{line}\n" for line in list_of_strings)
        else:
            writer.writelines(f"{line.strip()}\n" for line in list_of_strings)

def load_from_txt(txt_file, encoding="utf-8"):
    """Read ``txt_file`` and return its lines as a list of stripped strings.

    Args:
        txt_file: Path of the text file to read.
        encoding: Text encoding of the file. Defaults to UTF-8 so decoding
            does not depend on the platform's locale encoding.

    Returns:
        A list with one entry per line of the file, each with leading and
        trailing whitespace (including the newline) removed. Blank lines are
        kept as empty strings.
    """
    with open(txt_file, "r", encoding=encoding) as reader:
        return [line.strip() for line in reader]

acl_bib_file = "anthology.bib"
# Load the bib file (every line comes back whitespace-stripped)
start_time = time()
bib_data = load_from_txt(acl_bib_file)
logging.info(f"Loaded {len(bib_data)} lines from {acl_bib_file} in {time()-start_time:.2f} seconds")

# A line that is exactly "}" is treated as the end of a bib entry.
total_entries = sum(1 for line in bib_data if line == "}")
logging.info(f"Found {total_entries} entries in {acl_bib_file}")

# The split is driven by entry count, not by measured file size:
# each output file receives at most `entries_per_file` entries.
# NOTE(review): 70K was presumably chosen to keep each file under 50MB - confirm.
entries_per_file = 70000
file_num = 1
prev_start_index = 0
cur_entries = 0
for i, line in enumerate(bib_data):
    if line != "}":
        continue
    cur_entries += 1
    if cur_entries == entries_per_file:
        # Chunk is full: write everything since the previous cut, inclusive
        # of this closing brace, and start counting a new chunk.
        save_in_txt(bib_data[prev_start_index:i+1], f"anthology_{file_num}.bib")
        logging.info(f"Saved {prev_start_index} to {i} in anthology_{file_num}.bib")
        file_num += 1
        prev_start_index = i + 1
        cur_entries = 0

# Flush whatever is left after the last full chunk. Doing this after the loop
# (rather than only when the final line is "}") keeps the last partial chunk
# from being dropped when the file ends with blank or other trailing lines.
# A remainder consisting solely of empty lines is skipped.
remaining_lines = bib_data[prev_start_index:]
if any(remaining_lines):
    save_in_txt(remaining_lines, f"anthology_{file_num}.bib")
    logging.info(f"Saved {prev_start_index} to {len(bib_data)-1} in anthology_{file_num}.bib")