class MSOffice2txt():
    """Hold the COM application handles used to convert Office files to text.

    This fragment contains only the constructor and ``close``. Both call
    ``docApplicationOpen`` / ``docApplicationClose``, which are not defined
    in this fragment and must be provided by the rest of the class.
    """

    def __init__(self, fileType=None):
        """Prepare COM and optionally start Word.

        Args:
            fileType: list of document kinds to prepare. Only ``'doc'`` has
                an effect (it starts Word); other entries are ignored.
                Defaults to ``['doc', 'ppt']``.

        Raises:
            TypeError: if ``fileType`` is not a list.
        """
        self.docCom = None
        self.pptCom = None
        if fileType is None:
            fileType = ['doc', 'ppt']
        # Validate before touching COM so that a bad argument has no side
        # effects. A constructor cannot return an error string (Python
        # requires __init__ to return None), so raise instead.
        if not isinstance(fileType, list):
            raise TypeError('Error, please check the fileType, it must be list[]')
        pythoncom.CoInitialize()
        # Start Word at most once, even if 'doc' is listed several times.
        if 'doc' in fileType:
            self.docCom = self.docApplicationOpen()

    # Backward-compatible alias: the initializer used to be spelled ``init``,
    # which meant it never ran when the class was instantiated.
    init = __init__

    def close(self):
        """Quit the Word application held by this instance, if any."""
        try:
            self.docApplicationClose(self.docCom)
        finally:
            # Drop the handle so nothing reuses an application that has quit.
            self.docCom = None
Windows 下将目录下所有文件夹下的文件拷贝到一个目录中:搜索
"."
安装 Python 及 win32com 模块:
pip install pypiwin32
创建
LDA_RTF_TXT.py
# Command-line inputs:
#   argv[1] -> pattern that is later passed to glob() to select source files
#   argv[2] -> output location; '/' is appended so it can be used directly
#              as a path prefix
dataset_loca = sys.argv[1]
dataset_loca3 = '{}/'.format(sys.argv[2])
class MSOffice2txt():
    """Convert Word documents to plain text by driving Word through COM.

    One Word application instance is started (eagerly in the constructor or
    lazily by ``translate``) and reused for every conversion until ``close``
    is called.
    """

    # Filename suffixes that translate() hands to Word (compared lower-cased).
    _WORD_SUFFIXES = ('doc', 'docx')

    def __init__(self, fileType=None):
        """Prepare COM and optionally start Word.

        Args:
            fileType: list of document kinds to prepare. Only ``'doc'`` has
                an effect (it starts Word); other entries are ignored.
                Defaults to ``['doc', 'ppt']``.

        Raises:
            TypeError: if ``fileType`` is not a list.
        """
        self.docCom = None
        self.pptCom = None
        if fileType is None:
            fileType = ['doc', 'ppt']
        # Validate before touching COM so that a bad argument has no side
        # effects. A constructor cannot return an error string (Python
        # requires __init__ to return None), so raise instead.
        if not isinstance(fileType, list):
            raise TypeError('Error, please check the fileType, it must be list[]')
        pythoncom.CoInitialize()
        # Start Word at most once, even if 'doc' is listed several times.
        if 'doc' in fileType:
            self.docCom = self.docApplicationOpen()

    # Backward-compatible alias: the initializer used to be spelled ``init``,
    # which meant it never ran when the class was instantiated.
    init = __init__

    def close(self):
        """Quit the Word application held by this instance, if any."""
        try:
            self.docApplicationClose(self.docCom)
        finally:
            # Clear the handle so a later translate() starts a fresh Word
            # instead of calling into an application that has already quit.
            self.docCom = None

    def docApplicationOpen(self):
        """Start Word via COM and return the application object.

        Alerts are disabled, then the top-level window titled
        'Microsoft Word' is looked up and hidden.
        """
        docCom = win32com.client.Dispatch('Word.Application')
        docCom.Visible = 1
        docCom.DisplayAlerts = 0
        docHwnd = win32gui.FindWindow(None, 'Microsoft Word')
        win32gui.ShowWindow(docHwnd, win32con.SW_HIDE)
        return docCom

    def docApplicationClose(self, docCom):
        """Quit the given Word application; a ``None`` handle is ignored."""
        if docCom is not None:
            docCom.Quit()

    def doc2Txt(self, docCom, docFile, txtFile):
        """Open ``docFile`` read-only in Word and save it as ``txtFile``.

        Args:
            docCom: Word application object to use.
            docFile: path of the document to open.
            txtFile: path of the text file to write.
        """
        doc = docCom.Documents.Open(FileName=docFile, ReadOnly=1)
        try:
            # 2 is Word's WdSaveFormat value for plain text (wdFormatText).
            doc.SaveAs(txtFile, 2)
        finally:
            # Always release the document, even when saving fails, so it
            # does not stay open inside the shared Word instance.
            doc.Close()

    def translate(self, filename, txtFilename):
        """Convert ``filename`` to text if it looks like a Word document.

        Args:
            filename: path of the source file.
            txtFilename: path of the text file to write.

        Returns:
            True if the name ends in doc/docx (case-insensitive) and the
            conversion was run, False if the file was skipped.
        """
        if not filename.lower().endswith(self._WORD_SUFFIXES):
            return False
        if self.docCom is None:
            self.docCom = self.docApplicationOpen()
        self.doc2Txt(self.docCom, filename, txtFilename)
        return True
import os  # imported here so the basename handling below is self-contained

# Batch driver: copy every file matched by the glob pattern into the output
# location under a .doc name, then have Word save a .txt version next to it.
files = glob(dataset_loca)
oldfile = []
count = 0
msoffice = MSOffice2txt()
try:
    for file_name in files:
        # Build a bare output stem: drop the directory part (at any depth,
        # with either path separator), keep the text before the first '.',
        # and remove spaces.
        tmpp = os.path.basename(file_name)
        tmpp = tmpp.split('.', 1)[0]
        tmpp = tmpp.replace(' ', '')
        # The copy is given a .doc suffix because translate() only accepts
        # names ending in doc/docx.
        doc_copy = dataset_loca3 + tmpp + '.doc'
        shutil.copyfile(file_name, doc_copy)
        count = count + 1
        msoffice.translate(doc_copy, dataset_loca3 + tmpp + '.txt')
finally:
    # Quit Word even if a copy or conversion fails, so no Word process is
    # left running in the background.
    msoffice.close()