uvacw / inca

24 stars 6 forks source link

use pre-trained classifier to enrich documents #443

Closed damian0604 closed 6 years ago

damian0604 commented 6 years ago

Use a CLF file containing a pre-trained classifier to add new keys to documents, based on the content of an existing field.

Test case:


p = myinca.processing.pretrained('nu','text',path_to_model='/home/damian/Downloads/PasiveAggressive_text.pkl',new_key='topic',save=False)
next(p)

Use save=True instead if you want to store the result in the database.

Still to do: add documentation on how to actually create the .clf file.

mariekevh commented 6 years ago

Error:

In [5]: a = next(p)

StopIteration Traceback (most recent call last)
in ()
----> 1 a = next(p)

StopIteration:

In [6]: p = myinca.processing.pretrained('nu', 'text', path_to_model='/home/marieke/news_events/susan_PAclassifier/Pa
   ...: ssiveAggressive_text.pkl', new_key='topic', save=True)

In [7]: a = next(p)
INFO:INCA:assuming input is a query_string
INFO:INCA:force=False, ignoring documents where the result key exists (and has non-NULL value)
---------------------------------------------------------------------------
| 0/20 [00:00
[... part of the output is missing here in the original post ...]
     48 return json.dumps(data, default=self.default, ensure_ascii=False)
     49 except (ValueError, TypeError) as e:

/usr/lib/python3.5/json/__init__.py in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
    236 separators=separators, default=default, sort_keys=sort_keys,
--> 237 **kw).encode(obj)
    238

/usr/lib/python3.5/json/encoder.py in encode(self, o)
    197 # equivalent to the PySequence_Fast that ''.join() would do.
--> 198 chunks = self.iterencode(o, _one_shot=True)
    199 if not isinstance(chunks, (list, tuple)):

/usr/lib/python3.5/json/encoder.py in iterencode(self, o, _one_shot)
    255 self.skipkeys, _one_shot)
--> 256 return _iterencode(o, 0)
    257

/usr/local/lib/python3.5/dist-packages/elasticsearch/serializer.py in default(self, data)
     33 return str(data)
---> 34 raise TypeError("Unable to serialize %r (type: %s)" % (data, type(data)))
     35

TypeError: Unable to serialize 3 (type: )

During handling of the above exception, another exception occurred:

SerializationError Traceback (most recent call last)
in ()
----> 1 a = next(p)

~/inca/inca/inca/__main__.py in endpoint(*args, **kwargs)
    251 if inspect.isgeneratorfunction(method):
    252 def endpoint(*args, **kwargs):
--> 253 for i in method(*args, **kwargs):
    254 yield i
    255 else:

~/inca/inca/inca/core/processor_class.py in runwrap(self, docs_or_query, field, new_key, save, force, action, *args, **kwargs)
     66 yield self.run(doc, field, new_key, save, force, *args, **kwargs)
     67 elif save==True: # do not yield documents if saving to database anyway
---> 68 _ = self.run(doc, field, new_key, save, force, *args, **kwargs)
     69
     70 elif action == 'delay':

~/inca/inca/inca/core/processor_class.py in run(self, document, field, new_key, save, force, *args, **kwargs)
    160 #print('ABOUT TO SAVE')
    161 #print(document)
--> 162 update_document(document, force=force)
    163 # 6. emit dotkey-field
    164 if masked:

~/inca/inca/inca/core/database.py in update_document(document, force, retry, max_retries)
    109 doc_type='doc',
    110 id=document['_id'],
--> 111 body={'doc':document['_source']}
    112 )
    113 elif exists and force:

/usr/local/lib/python3.5/dist-packages/elasticsearch/client/utils.py in _wrapped(*args, **kwargs)
     71 if p in kwargs:
     72 params[p] = kwargs.pop(p)
---> 73 return func(*args, params=params, **kwargs)
     74 return _wrapped
     75 return _wrapper

/usr/local/lib/python3.5/dist-packages/elasticsearch/client/__init__.py in update(self, index, doc_type, id, body, params)
    523 raise ValueError("Empty value passed for a required argument.")
    524 return self.transport.perform_request('POST', _make_path(index,
--> 525 doc_type, id, '_update'), params=params, body=body)
    526
    527 @query_params('_source', '_source_exclude', '_source_include',

/usr/local/lib/python3.5/dist-packages/elasticsearch/transport.py in perform_request(self, method, url, params, body)
    276 """
    277 if body is not None:
--> 278 body = self.serializer.dumps(body)
    279
    280 # some clients or environments don't support sending GET with body

/usr/local/lib/python3.5/dist-packages/elasticsearch/serializer.py in dumps(self, data)
     48 return json.dumps(data, default=self.default, ensure_ascii=False)
     49 except (ValueError, TypeError) as e:
---> 50 raise SerializationError(data, e)
     51
     52 DEFAULT_SERIALIZERS = {

SerializationError: