fydy / elaw

:100: 个人博客站,记录一些事。
https://git.io/JWxoK
25 stars 8 forks source link

将DOC、RTF格式文件批量转为TXT格式文件 #54

Open fydy opened 4 years ago

fydy commented 4 years ago

import sys

# Command-line arguments:
#   1. glob pattern selecting the source files (e.g. D:/2019/*.doc)
#   2. directory that receives the converted files
# Fail with a usage message instead of a bare IndexError when one is missing.
if len(sys.argv) < 3:
    raise SystemExit(
        'usage: python %s <source glob pattern> <output directory>' % sys.argv[0]
    )

dataset_loca = sys.argv[1]
# The trailing separator lets later code build output paths by plain string
# concatenation.
dataset_loca3 = sys.argv[2] + '/'

class MSOffice2txt():
    """Convert Word documents to plain-text files by automating Microsoft Word.

    The class drives a ``Word.Application`` COM object through pywin32, so it
    needs Windows with Word and pywin32 installed. Call ``close()`` when
    finished so the Word process is shut down.

    The pywin32 modules are imported inside the methods that use them, which
    keeps the class importable (and its pure-Python paths usable) on machines
    where pywin32 is not available.
    """

    # File-format code passed to Document.SaveAs; 2 is wdFormatText in Word's
    # WdSaveFormat enumeration (plain text).
    WD_FORMAT_TEXT = 2

    def __init__(self, fileType=['doc', 'ppt']):
        """Initialise COM and start Word when 'doc' is requested.

        Args:
            fileType: list of format names to prepare for. Only 'doc' has an
                effect; other entries are ignored. The default list is only
                read, never mutated.

        Raises:
            TypeError: if ``fileType`` is not a list.
        """
        # Validate before touching COM so bad input leaves nothing initialised.
        # (Returning a string from __init__ would itself raise TypeError.)
        if not isinstance(fileType, list):
            raise TypeError('Error, please check the fileType, it must be list[]')

        import pythoncom

        self.docCom = None
        self.pptCom = None
        pythoncom.CoInitialize()
        # Membership test rather than a loop: a repeated 'doc' entry must not
        # launch a second Word instance and orphan the first one.
        if 'doc' in fileType:
            self.docCom = self.docApplicationOpen()

    # The constructor was originally spelled ``init`` (so Python never ran
    # it); the old name stays available for any caller that invoked it by hand.
    init = __init__

    def close(self):
        """Quit the Word instance held by this object, if there is one."""
        self.docApplicationClose(self.docCom)
        # Drop the dead handle so a second close() is a no-op and a later
        # translate() starts a fresh Word instance.
        self.docCom = None

    def docApplicationOpen(self):
        """Start Word through COM, hide its window and return the COM object."""
        import win32com.client
        import win32con
        import win32gui

        docCom = win32com.client.Dispatch('Word.Application')
        docCom.Visible = 1
        docCom.DisplayAlerts = 0
        # Look the window up by its title and hide it.
        docHwnd = win32gui.FindWindow(None, 'Microsoft Word')
        if docHwnd:
            win32gui.ShowWindow(docHwnd, win32con.SW_HIDE)
        return docCom

    def docApplicationClose(self, docCom):
        """Quit the given Word COM object; ``None`` is accepted and ignored."""
        if docCom is not None:
            docCom.Quit()

    def doc2Txt(self, docCom, docFile, txtFile):
        """Open ``docFile`` read-only in Word and save it as text to ``txtFile``.

        Args:
            docCom: Word application COM object.
            docFile: path of the document to open.
            txtFile: path of the text file to write.
        """
        doc = docCom.Documents.Open(FileName=docFile, ReadOnly=1)
        try:
            doc.SaveAs(txtFile, self.WD_FORMAT_TEXT)
        finally:
            # Close even when saving fails so the document is not left open
            # inside the Word instance.
            doc.Close()

    def translate(self, filename, txtFilename):
        """Convert ``filename`` to text if its name ends in 'doc' or 'docx'.

        The suffix comparison ignores case. Word is started lazily when no
        instance is open yet.

        Returns:
            True when the file was handed to Word for conversion, False when
            the name does not have a supported suffix.
        """
        if not filename.lower().endswith(('doc', 'docx')):
            return False
        if self.docCom is None:
            self.docCom = self.docApplicationOpen()
        self.doc2Txt(self.docCom, filename, txtFilename)
        return True

# Batch driver: copy every file matched by the source pattern into the output
# directory under a ".doc" name, then have Word save it as ".txt" next to it.
import os
import shutil
from glob import glob

files = glob(dataset_loca)
count = 0  # number of files handed to the converter
msoffice = MSOffice2txt()
try:
    for file_name in files:
        # Base name without directory or final extension. basename/splitext
        # work at any directory depth and keep dotted names such as
        # "report.v2" distinct, so two inputs cannot overwrite each other.
        tmpp = os.path.splitext(os.path.basename(file_name))[0]
        tmpp = tmpp.replace(' ', '')
        # The copy always gets a ".doc" suffix, whatever the source extension
        # was, so that translate() accepts it.
        doc_copy = dataset_loca3 + tmpp + '.doc'
        shutil.copyfile(file_name, doc_copy)
        count = count + 1
        msoffice.translate(doc_copy, dataset_loca3 + tmpp + '.txt')
finally:
    # Always shut Word down, even if a conversion fails part-way through.
    msoffice.close()



- 运行时需要传入两个参数:第一个为待转换的 DOC、RTF 源文件的绝对路径(支持通配符,例如 `D:/2019/*.doc`);第二个为转换后 TXT 格式文件存放目录的绝对路径。
转换示例:
` python LDA_RTF_TXT.py D:/2019/*.doc D:/2020 `
将D:/2019/*.doc路径下的所有doc格式文件批量转换为TXT文件存放至D:/2020路径下。