Open MarcellRoos opened 6 months ago
I also encountered the same problem and hope someone can discuss it with me. I use ProcessPoolExecutor to run conversions concurrently. To make sure Word's processes are closed properly, I added "pythoncom.CoInitialize()", "doc.Close(0)", "word.Quit()", and "pythoncom.CoUninitialize()". However, CPU usage is still only 20-30%: usually just 2-3 processes are actually running at the same time, while the rest of the created processes sit waiting. I wonder if this is due to a conversion ceiling in MS Office itself. Here is my code:
# Only set up multiple processes to convert DOCX to PDF
from pathlib import Path
from tqdm.auto import tqdm
import win32com.client
import pythoncom
import time
import os
import concurrent.futures
def convert(docx_filepath):
    """Convert one .docx file to a .pdf saved alongside it via Word COM automation.

    Each call starts (and tears down) its own Word instance, so the function is
    safe to invoke from separate worker processes.

    Args:
        docx_filepath: Path (str or PathLike) of the .docx file to convert.
    """
    # Every worker process that touches COM must initialize it for its thread.
    pythoncom.CoInitialize()
    try:
        # DispatchEx forces a brand-new Word process instead of reusing a shared one.
        word = win32com.client.DispatchEx('word.application')
        try:
            wdFormatPDF = 17  # WdExportFormat constant: export as PDF
            docx_filepath = Path(docx_filepath).resolve()
            pdf_filepath = docx_filepath.with_suffix('.pdf')
            doc = word.Documents.Open(str(docx_filepath))
            try:
                doc.ExportAsFixedFormat(str(pdf_filepath), wdFormatPDF, False, 0)
            except Exception as e:
                print(f"Error converting {docx_filepath}: {e}")
            finally:
                doc.Close(0)  # 0 == wdDoNotSaveChanges
        finally:
            # Quit even if Documents.Open raised, so no orphan WINWORD.EXE is leaked.
            word.Quit()
    finally:
        pythoncom.CoUninitialize()
if __name__ == "__main__":
    start_time = time.time()
    directory = "result/"  # replace_with_your_directory_path
    # Recursively collect every .docx under the target directory.
    docx_files = [os.path.join(root, f)
                  for root, dirs, files in os.walk(directory)
                  for f in files if f.endswith('.docx')]
    # Cap the pool at 32 workers; `or 1` guards against an empty file list,
    # since ProcessPoolExecutor rejects max_workers=0.
    max_workers = min(32, len(docx_files)) or 1
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        with tqdm(total=len(docx_files), desc="Converting DOCX to PDF") as pbar:
            futures = {executor.submit(convert, docx_file): docx_file
                       for docx_file in docx_files}
            for future in concurrent.futures.as_completed(futures):
                try:
                    # Surface worker exceptions instead of silently dropping them.
                    future.result()
                except Exception as e:
                    print(f"Worker failed for {futures[future]}: {e}")
                pbar.update(1)
    end_time = time.time()
    print(f"Full processing time: {end_time - start_time:.2f} seconds")
To avoid wasting time starting and stopping a `word.Application` instance for every single file, I later moved the loop inside the worker function (`convert_batch` below) so each process reuses one Word instance, but the total time difference is not great.
# Also set to resize batches and the number of process pools to optimize performance
import concurrent.futures
import os
import time
from pathlib import Path

import pythoncom
import win32com.client
from tqdm.auto import tqdm
def convert_batch(docx_filepaths):
    """Convert a batch of .docx files to PDFs, sharing one Word instance.

    Amortizes Word startup/teardown cost over the whole batch; intended to be
    run inside a worker process.

    Args:
        docx_filepaths: Iterable of .docx file paths (str or PathLike).
    """
    # COM must be initialized per worker thread/process.
    pythoncom.CoInitialize()
    try:
        # Fresh, private Word process for this batch.
        word = win32com.client.DispatchEx('word.application')
        try:
            wdFormatPDF = 17  # WdExportFormat constant: export as PDF
            for docx_filepath in docx_filepaths:
                docx_filepath = Path(docx_filepath).resolve()
                pdf_filepath = docx_filepath.with_suffix('.pdf')
                doc = None
                try:
                    # Open inside the try: a corrupt/locked file should not
                    # abort the remaining files in the batch.
                    doc = word.Documents.Open(str(docx_filepath))
                    doc.ExportAsFixedFormat(str(pdf_filepath), wdFormatPDF, False, 0)
                except Exception as e:
                    print(f"Error converting {docx_filepath}: {e}")
                finally:
                    if doc is not None:
                        doc.Close(0)  # 0 == wdDoNotSaveChanges
        finally:
            # Always shut Word down, even if the loop raised.
            word.Quit()
    finally:
        pythoncom.CoUninitialize()
if __name__ == "__main__":
    start_time = time.time()
    directory = "result/"  # replace_with_your_directory_path
    # Recursively collect every .docx under the target directory.
    docx_files = [os.path.join(root, f)
                  for root, dirs, files in os.walk(directory)
                  for f in files if f.endswith('.docx')]
    # At most 32 worker processes; `or 1` guards against an empty file list
    # (min(32, 0) == 0 would otherwise divide by zero below).
    num_batches = min(32, len(docx_files)) or 1
    # Ceiling division so every file lands in exactly one batch; max(1, ...)
    # keeps the slice step valid when there are no files at all.
    batch_size = max(1, (len(docx_files) + num_batches - 1) // num_batches)
    batches = [docx_files[i:i + batch_size]
               for i in range(0, len(docx_files), batch_size)]
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_batches) as executor:
        with tqdm(total=len(docx_files), desc="Converting DOCX to PDF") as pbar:
            futures = {executor.submit(convert_batch, batch): batch for batch in batches}
            for future in concurrent.futures.as_completed(futures):
                try:
                    # Surface worker exceptions instead of silently dropping them.
                    future.result()
                except Exception as e:
                    print(f"Batch of {len(futures[future])} files failed: {e}")
                pbar.update(len(futures[future]))
    end_time = time.time()
    print(f"Full processing time: {end_time - start_time:.2f} seconds")
I've been trying to speed up the conversion process by implementing some basic multithreading but it seems like the single instance of word is my bottleneck. Is there a workaround for this?