id in SQL database to use with get_text_by_id

Generally, I don't think these issue threads are meant for this kind of help, but I just wrote some code that I'd be happy to share with you:

import os

import gutenbergpy
import gutenbergpy.textget
from gutenbergpy.gutenbergcache import GutenbergCache, GutenbergCacheTypes

# Build the local Gutenberg metadata cache on first run only.
# Will take about 5 mins to build the cache.
# The Mongo DB version will break because it hasn't been updated properly for Python3.
# The download/unpack/parse stages are enabled because this branch only runs
# when no cache exists yet, so there is nothing on disk to reuse; with all of
# them disabled the call would not produce a cache. `refresh` is left False.
# NOTE(review): flag semantics taken from the gutenbergpy README -- confirm
# against the installed version.
if not GutenbergCache.exists():
    GutenbergCache.create(refresh=False, download=True, unpack=True, parse=True)

# SQL selecting (book id, title) rows for books whose language is 'en' and
# whose subject name contains "fantasy fiction".
query = """
SELECT books.id, titles.name
FROM books
INNER JOIN book_subjects bs ON books.id = bs.bookid
INNER JOIN subjects ON bs.subjectid = subjects.id
INNER JOIN languages ON languages.id = books.languageid
INNER JOIN titles ON books.id = titles.bookid
WHERE subjects.name LIKE '%fantasy fiction%'
AND languages.name = 'en'
"""

# Run the query against the cache and pull every matching row into memory.
cache = GutenbergCache.get_cache()
books = cache.native_query(query).fetchall()

# Save each book as a file in the given directory.
# Do not overwrite the book if the file already exists
# The error parsing doesn't work as expected, but I didn't debug that fully
output_dir = 'raw_text/gutenberg_samples'

# Create the target directory up front; without it every open() below fails
# with FileNotFoundError on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

for book_id, title in books:
    # Replace path separators in the title so it is usable as a file name.
    # This is done outside the f-string because a backslash inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    safe_title = title.replace('/', '_').replace('\\', '_')
    filename = f"{book_id} - {safe_title}.txt"
    filepath = os.path.join(output_dir, filename)

    # Skip books that were already saved by a previous run.
    if os.path.exists(filepath):
        print(f"File already exists, skipping: {filepath}")
        continue

    try:
        # Fetch the book text by ID and drop the Project Gutenberg headers.
        book_raw_text = gutenbergpy.textget.get_text_by_id(book_id)
        book_text = gutenbergpy.textget.strip_headers(book_raw_text)

        # Binary mode: the stripped text is written exactly as returned
        # (presumably bytes -- confirm against gutenbergpy.textget).
        with open(filepath, 'wb') as file:
            file.write(book_text)
    except Exception as e:
        # Best-effort batch download: report the failure and move on to the
        # next book instead of aborting the whole run.
        print(f"Failed to download {title} (ID: {book_id}): {e}")
    else:
        print(f"Downloaded and saved: {filepath}")

raduangelescu / gutenbergpy

id in SQL database to use with get_text_by_id #16