Closed kenwoodjw closed 7 years ago
I solve it .
@blackjws I have the same problem. Do you mind telling me your solution? Thanks.
Edit: thanks @jyu01. Passing java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier'
to the StanfordSegmenter
constructor was right.
My solution: After segmenter = StanfordSegmenter(...)
One more line of code is needed: segmenter.default_config('zh')
This will set: self._java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier'
import nltk segmenter = StanfordSegmenter( java_class='edu.stanford.nlp.ie.crf.CRFClassifier', path_to_jar='/home/kenwood/stanford/segmenter/stanford-segmenter.jar', path_to_slf4j='/home/kenwood/stanford/segmenter/slf4j-api.jar', path_to_sihan_corpora_dict='/home/kenwood/stanford/segmenter/data', path_to_model='/home/kenwood/stanford/segmenter/data/pku.gz', path_to_dict='/home/kenwood/stanford/segmenter/data/dict-chris6.ser.gz' )
from nltk.tokenize.stanford_segmenter import StanfordSegmenter from nltk.tag import StanfordNERTagger
import nltk
seg = StanfordSegmenter( path_to_jar="D:/NLTK/stanford-segmenter.jar", path_to_slf4j="D:/NLTK/slf4j-api.jar", path_to_dict="D:/NLTK/dict-chris6.ser.gz")
seg.default_config('zh') sent = u'这是斯坦福中文分词器测试' print(seg.segment(sent))
File "C:\Users\Suny\Anaconda3\lib\site-packages\nltk\tokenize\stanford_segmenter.py", line 120, in default_config
"variables STANFORD_MODELS and
LookupError: Could not find 'dict-chris6.ser.gz' (tried using env. variables STANFORD_MODELS and
@sunygithub Hey, I'm having the same problem (LookupError: Could not find 'dict-chris6.ser.gz', after typing _segmenter.defaultconfig('zh')). Have you fixed it?
I have personally run into the same problem and I have looked into the code of stanford_segmenter.py of nltk version 3.2.4
TLDR: Use the below code.
seg = StanfordSegmenter(
path_to_jar="D:/NLTK/stanford-segmenter.jar",
path_to_slf4j="D:/NLTK/slf4j-api.jar",
java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier',
path_to_sihan_corpora_dict = 'D:/NLTK/data',
path_to_model = 'D:/NLTK/data/pku.gz',
path_to_dict="D:/NLTK/data/dict-chris6.ser.gz",
)
# without seg.default_config('zh')
Below is the code extract that is relevant to this specific error:
def __init__(self,
             path_to_jar=None, path_to_slf4j=None,
             java_class=None,
             path_to_model=None,
             path_to_dict=None,
             path_to_sihan_corpora_dict=None,
             sihan_post_processing='false',
             keep_whitespaces='false',
             encoding='UTF-8', options=None,
             verbose=False, java_options='-mx2g'):
    """Initialize the Stanford Word Segmenter interface.

    :param path_to_jar: path to ``stanford-segmenter.jar``; if ``None``,
        searched via the STANFORD_SEGMENTER env. variable.
    :param path_to_slf4j: path to ``slf4j-api.jar``; if ``None``, searched
        via the SLF4J / STANFORD_SEGMENTER env. variables.
    :param java_class: fully-qualified Java class to run, e.g.
        ``'edu.stanford.nlp.ie.crf.CRFClassifier'`` for Chinese.
    :param path_to_model: path to the segmenter model (e.g. ``pku.gz``).
    :param path_to_dict: path to the dictionary (e.g. ``dict-chris6.ser.gz``).
    :param path_to_sihan_corpora_dict: Chinese-only Sihan corpora data dir.
    :param sihan_post_processing: ``'true'``/``'false'`` string passed to Java.
    :param keep_whitespaces: ``'true'``/``'false'`` string passed to Java.
    :param encoding: character encoding used for I/O with the Java process.
    :param options: extra ``key=value`` options serialized for the Java CLI.
    :param verbose: verbosity flag forwarded to the jar search.
    :param java_options: JVM options (default gives the JVM 2 GB of heap).
    """
    stanford_segmenter = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_SEGMENTER',),
        searchpath=(), url=_stanford_url,
        verbose=verbose)
    slf4j = find_jar(
        self._SLF4J, path_to_slf4j,
        env_vars=('SLF4J', 'STANFORD_SEGMENTER',),
        searchpath=(), url=_stanford_url,
        verbose=verbose)

    # This is passed to java as the -cp option, the segmenter needs slf4j.
    # Idiom fix: 'x is not None' instead of 'not x is None'.
    self._stanford_jar = os.pathsep.join(
        jar for jar in (stanford_segmenter, slf4j) if jar is not None)

    self._java_class = java_class
    self._model = path_to_model
    self._sihan_corpora_dict = path_to_sihan_corpora_dict
    self._sihan_post_processing = sihan_post_processing
    self._keep_whitespaces = keep_whitespaces
    self._dict = path_to_dict
    self._encoding = encoding
    self.java_options = java_options

    # Serialize extra options as 'key="value"' pairs for the Java CLI.
    options = {} if options is None else options
    self._options_cmd = ','.join(
        '{0}={1}'.format(key, json.dumps(val)) for key, val in options.items())
def default_config(self, lang):
    """Attempt to initialize the Stanford Word Segmenter for *lang*
    using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables.

    :param lang: language code, ``'ar'`` (Arabic) or ``'zh'`` (Chinese).
    :raises LookupError: if *lang* is unsupported, or if a required model,
        dictionary, or data directory cannot be located via the environment
        variables (note: user-supplied constructor paths are NOT consulted).
    """
    search_path = ()
    if os.environ.get('STANFORD_SEGMENTER'):
        search_path = {os.path.join(os.environ.get('STANFORD_SEGMENTER'), 'data')}

    # Reset Chinese-specific state; only the 'zh' branch repopulates it.
    self._dict = None
    self._sihan_corpora_dict = None
    self._sihan_post_processing = 'false'

    if lang == 'ar':
        self._java_class = 'edu.stanford.nlp.international.arabic.process.ArabicSegmenter'
        model = 'arabic-segmenter-atb+bn+arztrain.ser.gz'

    elif lang == 'zh':
        self._java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier'
        model = 'pku.gz'
        self._sihan_post_processing = 'true'

        path_to_dict = 'dict-chris6.ser.gz'
        try:
            self._dict = find_file(path_to_dict, searchpath=search_path,
                                   url=_stanford_url, verbose=False,
                                   env_vars=('STANFORD_MODELS',))
        except LookupError:
            raise LookupError("Could not find '%s' (tried using env. "
                              "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % path_to_dict)

        sihan_dir = './data/'
        try:
            path_to_sihan_dir = find_dir(sihan_dir,
                                         url=_stanford_url, verbose=False,
                                         env_vars=('STANFORD_SEGMENTER',))
            self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
        except LookupError:
            raise LookupError("Could not find '%s' (tried using the "
                              "STANFORD_SEGMENTER environment variable)" % sihan_dir)

    else:
        # BUG FIX: the original used "Unsupported language '%'" which is an
        # invalid printf-style format — '%' alone raises ValueError instead
        # of interpolating lang. '%s' formats the language code correctly.
        raise LookupError("Unsupported language '%s'" % lang)

    try:
        self._model = find_file(model, searchpath=search_path,
                                url=_stanford_url, verbose=False,
                                env_vars=('STANFORD_MODELS', 'STANFORD_SEGMENTER',))
    except LookupError:
        raise LookupError("Could not find '%s' (tried using env. "
                          "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model)
Specifically, what seg.default_config does is "initializing Stanford Word Segmenter for the specified language using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables".
If you look into default_config, path_to_dict is set to:
path_to_dict = 'dict-chris6.ser.gz'
search_path = ()
if os.environ.get('STANFORD_SEGMENTER'):
search_path = {os.path.join(os.environ.get('STANFORD_SEGMENTER'), 'data')}
self._dict = find_file(path_to_dict, searchpath=search_path,
url=_stanford_url, verbose=False,
env_vars=('STANFORD_MODELS',))
As you can see from the code, the search is performed only under the paths given by the environment variables, not under the paths specified by the user. Therefore, instead of calling seg.default_config, following the example from @sunygithub, you may want to use only:
seg = StanfordSegmenter(
path_to_jar="D:/NLTK/stanford-segmenter.jar",
path_to_slf4j="D:/NLTK/slf4j-api.jar",
java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier',
path_to_sihan_corpora_dict = 'D:/NLTK/data',
path_to_model = 'D:/NLTK/data/pku.gz',
path_to_dict="D:/NLTK/data/dict-chris6.ser.gz",
)
# without seg.default_config('zh')
Since the major purpose of calling seg.default_config('zh') after the StanfordSegmenter constructor is to set self._java_class to 'edu.stanford.nlp.ie.crf.CRFClassifier', you may as well set it directly via the constructor.
Note that I am using stanford-segmenter-2017-06-09 and it does not come with slf4j-api.jar; you may point path_to_slf4j at any other existing file instead and it should still work fine (e.g. even "D:/NLTK/stanford-segmenter.jar" is fine). According to https://github.com/nltk/nltk/issues/1652, this is an already-resolved issue.
Hope this helps.
Easiest hack is to copy the following line at the bottom of your .bashrc file
export STANFORD_SEGMENTER="path/to/your/stanford-segmenter/folder"
This is my code:
# coding: utf-8
from nltk.tokenize.stanford_segmenter import StanfordSegmenter from nltk.tag import StanfordNERTagger import nltk
segmenter = StanfordSegmenter( path_to_jar='/home/kenwood/stanford/segmenter/stanford-segmenter.jar', path_to_slf4j='/home/kenwood/stanford/segmenter/slf4j-api.jar', path_to_sihan_corpora_dict='/home/kenwood/stanford/segmenter/data', path_to_model='/home/kenwood/stanford/segmenter/data/pku.gz', path_to_dict='/home/kenwood/stanford/segmenter/data/dict-chris6.ser.gz' )
chi_tagger = StanfordNERTagger(path_to_jar='/home/kenwood/stanford/ner/stanford-ner.jar',model_filename='/home/kenwood/stanford/ner/classifiers/chinese.misc.distsim.crf.ser.gz')
sentence = "这是斯坦福中文分词器测试" result = segmenter.segment(sentence) print (result)
The error message : Traceback (most recent call last): File "/home/kenwood/PycharmProjects/wordcut/toolpackage/ntlk_module.py", line 17, in
result = segmenter.segment(sentence)
File "/usr/lib/python3.6/site-packages/nltk/tokenize/stanford_segmenter.py", line 164, in segment
return self.segment_sents([tokens])
File "/usr/lib/python3.6/site-packages/nltk/tokenize/stanford_segmenter.py", line 192, in segment_sents
stdout = self._execute(cmd)
File "/usr/lib/python3.6/site-packages/nltk/tokenize/stanford_segmenter.py", line 211, in _execute
stdout, _stderr = java(cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE)
File "/usr/lib/python3.6/site-packages/nltk/internals.py", line 129, in java
p = subprocess.Popen(cmd, stdin=stdin, stdout=stdout, stderr=stderr)
File "/usr/lib/python3.6/subprocess.py", line 707, in init
restore_signals, start_new_session)
File "/usr/lib/python3.6/subprocess.py", line 1260, in _execute_child
restore_signals, start_new_session, preexec_fn)
TypeError: expected str, bytes or os.PathLike object, not NoneType
Someone can help me? thank you!