from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
# convert: converts a PDF and returns its text content as a string.
# convertMultiple: converts all PDFs in directory pdfDir and saves the resulting .txt files to txtDir.
def convertMultiple(pdfDir, txtDir):
    """Convert every PDF in ``pdfDir`` to a UTF-8 text file in ``txtDir``.

    Each PDF is passed to ``convert`` and the returned text is appended to
    ``<txtDir>/<pdf name without extension>.txt``.

    Args:
        pdfDir: Directory to scan for PDF files. An empty string means the
            current working directory.
        txtDir: Directory that receives the ``.txt`` files. It is created
            (including missing parents) when it does not exist.
    """
    # Create the output folder if it is missing; an existing one is left alone.
    os.makedirs(txtDir, exist_ok=True)

    # No input directory given: fall back to the current working directory.
    if pdfDir == "":
        pdfDir = os.getcwd()

    # Walk every entry of the input directory and convert the PDFs.
    for pdf in os.listdir(pdfDir):
        stem, extension = os.path.splitext(pdf)
        # Only handle PDFs. Unlike split(".")[-1], splitext does not mistake an
        # extension-less file literally named "pdf" for a PDF; the comparison
        # ignores case so mixed-case extensions such as ".Pdf" are accepted.
        if extension.lower() != ".pdf":
            continue

        # os.path.join adds a separator only when needed, so pdfDir works with
        # or without a trailing separator and on any platform.
        pdfFilename = os.path.join(pdfDir, pdf)
        text = convert(pdfFilename)  # text content of the PDF as a string

        textFilename = os.path.join(txtDir, stem + ".txt")
        # Append mode is kept from the original: an existing .txt is extended,
        # not overwritten. The context manager closes the file even if the
        # write fails.
        with open(textFilename, "a", encoding="utf-8") as f:
            f.write(text)
# Usage: python PDFTOTXT.py -p D:\works\Python\jiaoxue\评级报告\ -t D:\works\Python\jiaoxue\pingji2
# Usage: python PDFTOTXT.py -p D:\works\Python\pdf\pdfdir\ -t D:\works\Python\pdf\txtdir
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.converter import TextConverter from pdfminer.layout import LAParams from pdfminer.pdfpage import PDFPage import os import sys, getopt
# convert: converts a PDF and returns its text content as a string.
# Adapted from https://www.binpress.com/tutorial/manipulating-pdfs-with-python/167
from pdfminer.pdfparser import PDFParser
def convert(fname, pages=None): if not pages: pagenums = set() else: pagenums = set(pages)
# convertMultiple: converts all PDFs in directory pdfDir and saves the resulting .txt files to txtDir.
def convertMultiple(pdfDir, txtDir):
# Check whether the output folder exists: if it does, do nothing; otherwise create it.
# Command-line options:
# i : info
# p : pdfDir
# t : txtDir
def PDFTOTXT(argv): try:
# opts holds the options that must be taken from argv; args holds the remaining arguments in argv that are not needed.
# Run the command-line entry point only when this file is executed as a
# script. The guard must compare the module's __name__ with "__main__";
# a bare `name == "main"` raises NameError because `name` is undefined.
if __name__ == "__main__":
    # Drop argv[0] (the script path) and pass only the actual arguments.
    PDFTOTXT(sys.argv[1:])