Closed enahwe closed 4 years ago
hi @enahwe patches are welcome!
Honestly, the small class below, which drives Tika from the command line, is more than enough to meet our needs. You just have to instantiate it with the JAVA_HOME path and the path to the Tika jar.
#!/bin/python
# -*- coding: utf-8 -*-
# Class to extract metadata and text from different file types (such as PPT, XLS, and PDF)
# Developed by Philippe ROSSIGNOL
#####################
# TikaWrapper class #
#####################
class TikaWrapper:
    """Extract metadata and text from documents by running the Tika jar.

    Each extraction starts ``<java_home>/bin/java -jar <tikalib_path> ...`` as
    a child process and returns what that process wrote to stdout (and,
    optionally, stderr).

    Attributes:
        java_home: Directory that contains ``bin/java``.
        tika_lib_path: Path to the Tika jar passed to ``java -jar``.
        tikalib_path: Alias of ``tika_lib_path``, named after the constructor
            parameter.
    """

    java_home = None
    tikalib_path = None
    tika_lib_path = None

    # Command templates. They are split into arguments BEFORE the placeholders
    # are substituted, so a substituted value always stays one argument.
    _cmdExtractMetadata = "${JAVA_HOME}/bin/java -jar ${TIKALIB_PATH} --metadata ${FILE_PATH}"
    _cmdExtractText = "${JAVA_HOME}/bin/java -jar ${TIKALIB_PATH} --encoding=${ENCODING} --text ${FILE_PATH}"

    def __init__(self, java_home, tikalib_path):
        """Store the Java home directory and the Tika jar path.

        Args:
            java_home: Directory that contains ``bin/java``.
            tikalib_path: Path to the Tika jar.
        """
        self.java_home = java_home
        self.tika_lib_path = tikalib_path
        # Keep the name declared on the class in sync; previously it stayed
        # None on every instance because only tika_lib_path was assigned.
        self.tikalib_path = tikalib_path

    def extractMetadata(self, filePath, encoding="UTF-8", returnTuple=False):
        """Extract metadata from a document.

        Args:
            filePath: The document file path.
            encoding: Encoding used to decode the process output
                (default ``"UTF-8"``).
            returnTuple: If True, return ``(output, error)`` instead of the
                output alone (default False).

        Returns:
            The decoded stdout, or ``(stdout, stderr)`` when ``returnTuple``
            is True.

        Examples:
            metadata = extractMetadata(filePath="MyDocument.docx")
            metadata, error = extractMetadata(filePath="MyDocument.docx", encoding="UTF-8", returnTuple=True)
        """
        cmd = self._getCmd(self._cmdExtractMetadata, filePath, encoding)
        out, err = self._execute(cmd, encoding)
        if returnTuple:
            return out, err
        return out

    def extractText(self, filePath, encoding="UTF-8", returnTuple=False):
        """Extract text from a document.

        Args:
            filePath: The document file path.
            encoding: Encoding passed to Tika via ``--encoding`` and used to
                decode the process output (default ``"UTF-8"``).
            returnTuple: If True, return ``(output, error)`` instead of the
                output alone (default False).

        Returns:
            The decoded stdout, or ``(stdout, stderr)`` when ``returnTuple``
            is True.

        Examples:
            text = extractText(filePath="MyDocument.docx")
            text, error = extractText(filePath="MyDocument.docx", encoding="UTF-8", returnTuple=True)
        """
        cmd = self._getCmd(self._cmdExtractText, filePath, encoding)
        out, err = self._execute(cmd, encoding)
        # Honour returnTuple exactly like extractMetadata does; this method
        # used to return the tuple unconditionally.
        if returnTuple:
            return out, err
        return out

    # ===========
    # = PRIVATE =
    # ===========

    def _getCmd(self, cmdModel, filePath, encoding):
        """Build the argument list for a command template.

        The template is tokenised first and the placeholders are filled in per
        token, so paths containing spaces or shell metacharacters remain a
        single, literal argument.

        Args:
            cmdModel: Template containing ``${...}`` placeholders.
            filePath: Value substituted for ``${FILE_PATH}``.
            encoding: Value substituted for ``${ENCODING}``.

        Returns:
            A list of command-line arguments, executable first.
        """
        import shlex

        substitutions = (
            ("${JAVA_HOME}", self.java_home),
            ("${TIKALIB_PATH}", self.tika_lib_path),
            ("${ENCODING}", encoding),
            ("${FILE_PATH}", filePath),
        )
        args = []
        for token in shlex.split(cmdModel):
            for placeholder, value in substitutions:
                token = token.replace(placeholder, value)
            args.append(token)
        return args

    def _execute(self, cmd, encoding):
        """Run a command and return its decoded ``(stdout, stderr)``.

        Args:
            cmd: Argument list (preferred) or a command string, which is split
                into arguments without involving a shell.
            encoding: Encoding used to decode both output streams.

        Returns:
            ``(stdout, stderr)`` as strings. If the process cannot be started
            (for example the java executable does not exist), stdout is empty
            and stderr holds the operating-system error message.
        """
        import shlex
        import subprocess

        if isinstance(cmd, str):
            cmd = shlex.split(cmd)
        try:
            # No shell: the arguments reach the executable verbatim, so a file
            # name can never be interpreted as shell syntax.
            process = subprocess.Popen(
                cmd,
                shell=False,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except OSError as exc:
            # With a shell, a missing executable showed up as text on stderr;
            # keep reporting start-up failures through the error value.
            return "", str(exc)
        out, err = process.communicate()
        return out.decode(encoding=encoding), err.decode(encoding=encoding)
thanks for sharing @enahwe
What a shame: this "wrapper" was developed for a single account (e.g. one laptop, one user), not for multiple accounts! When it is used from several accounts, there is a permission issue (concurrent access) on the file /tmp/tika.log. What a shame :-( Please think of the whole team, not just a single developer...