AlJohri / docx2pdf

MIT License
523 stars 97 forks

Ability to use multiple word instances to process multiple documents concurrently #103

Open MarcellRoos opened 6 months ago

MarcellRoos commented 6 months ago

I've been trying to speed up the conversion process by implementing some basic multithreading, but it seems like the single instance of Word is my bottleneck. Is there a workaround for this?
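
For context, this is roughly what I tried, in simplified form (the "docs/" path is just a placeholder): each thread calls docx2pdf's convert on one file, but the calls all appear to funnel through the same Word instance, so the conversions effectively run one after another.

# Simplified sketch of the threading attempt; "docs/" is a placeholder directory
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

from docx2pdf import convert

docx_files = list(Path("docs/").glob("*.docx"))

with ThreadPoolExecutor(max_workers=4) as executor:
    # each task converts a single document via docx2pdf
    list(executor.map(lambda p: convert(str(p)), docx_files))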

mrchengshunlong commented 4 months ago

I ran into the same problem and hope someone can discuss it here. I use ProcessPoolExecutor to run conversions in parallel, and to make sure Word's COM threads are shut down cleanly I added "pythoncom.CoInitialize()", "doc.Close(0)", "word.Quit()", and "pythoncom.CoUninitialize()". However, CPU usage is still only 20-30%: usually just 2-3 processes are converting at any given time, while the remaining processes are created but sit waiting. I wonder if this is due to a conversion ceiling in MS Office itself. Here is my code:

# Only set up multiple processes to convert DOCX to PDF
from pathlib import Path
from tqdm.auto import tqdm
import win32com.client
import pythoncom
import time
import os
import concurrent.futures

def convert(docx_filepath):
    pythoncom.CoInitialize()  # initialize COM for this worker process
    word = win32com.client.DispatchEx('word.application')  # dedicated Word instance for this process
    wdFormatPDF = 17  # Word's built-in constant for PDF export

    docx_filepath = Path(docx_filepath).resolve()
    pdf_filepath = docx_filepath.with_suffix('.pdf')
    doc = word.Documents.Open(str(docx_filepath))
    try:
        doc.ExportAsFixedFormat(str(pdf_filepath), wdFormatPDF, False, 0)
    except Exception as e:
        print(f"Error converting {docx_filepath}: {e}")
    finally:
        doc.Close(0)
        word.Quit()
        pythoncom.CoUninitialize()

if __name__ == "__main__":
    start_time = time.time()
    directory = "rusult/"  # replace_with_your_directory_path
    docx_files = [os.path.join(root, f)
                  for root, dirs, files in os.walk(directory)
                  for f in files if f.endswith('.docx')]

    max_workers = min(32, len(docx_files))

    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        with tqdm(total=len(docx_files), desc="Converting DOCX to PDF") as pbar:
            futures = {executor.submit(convert, docx_file): docx_file for docx_file in docx_files}
            for future in concurrent.futures.as_completed(futures):
                pbar.update(1)

    end_time = time.time()
    print(f"Full processing time: {end_time - start_time:.2f} seconds")

To avoid wasting time starting and quitting a 'word.Application' instance for every single file, I later moved a for loop into the convert function so that each process reuses one Word instance for a whole batch of files, but the difference in total time is not great.

# Also set to resize batches and the number of process pools to optimize performance
from pathlib import Path
from tqdm.auto import tqdm
import win32com.client
import pythoncom
import time
import os
import concurrent.futures

def convert_batch(docx_filepaths):
    pythoncom.CoInitialize()  # initialize COM for this worker process
    word = win32com.client.DispatchEx('word.application')  # one Word instance reused for the whole batch
    wdFormatPDF = 17  # Word's built-in constant for PDF export

    for docx_filepath in docx_filepaths:
        docx_filepath = Path(docx_filepath).resolve()
        pdf_filepath = docx_filepath.with_suffix('.pdf')
        doc = word.Documents.Open(str(docx_filepath))
        try:
            doc.ExportAsFixedFormat(str(pdf_filepath), wdFormatPDF, False, 0)
        except Exception as e:
            print(f"Error converting {docx_filepath}: {e}")
        finally:
            doc.Close(0)

    word.Quit()
    pythoncom.CoUninitialize()

if __name__ == "__main__":
    start_time = time.time()
    directory = "result/"  # replace_with_your_directory_path
    docx_files = [os.path.join(root, f)
                  for root, dirs, files in os.walk(directory)
                  for f in files if f.endswith('.docx')]

    num_batches = min(32, len(docx_files))
    batch_size = len(docx_files) // num_batches + (1 if len(docx_files) % num_batches != 0 else 0)
    batches = [docx_files[i:i + batch_size] for i in range(0, len(docx_files), batch_size)]

    with concurrent.futures.ProcessPoolExecutor(max_workers=num_batches) as executor:
        with tqdm(total=len(docx_files), desc="Converting DOCX to PDF") as pbar:
            futures = {executor.submit(convert_batch, batch): batch for batch in batches}
            for future in concurrent.futures.as_completed(futures):
                pbar.update(len(futures[future]))

    end_time = time.time()
    print(f"Full processing time: {end_time - start_time:.2f} second")