ncbo / ncbo_annotator

Automatically process a piece of text, annotate it with relevant ontology concepts, and return the annotations.
http://bioportal.bioontology.org/annotator

Maximum size limit for NCBO annotator service? #2

Closed leej35 closed 7 years ago

leej35 commented 7 years ago

When I submit a very long text query to the NCBO Annotator service using Python 3.5 with urllib3, I get this error:

exceptions.MaxRetryError: HTTPConnectionPool(host='data.bioontology.org', port=80): Max retries exceeded with url: /annotator?text=Prevention+and+Early+Detection+of+ ... (Caused by ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')))

When I submit a short text to annotate, it is processed without any problem, but when the query text reaches about 20 KB, I get the error above.

Presumably there is a maximum query length allowed by the annotator service. If so, could you tell me exactly what it is?

For your information, I have attached my Python code below.

Thanks, Jeongmin

import json
import urllib3
import urllib
import traceback
import sys
import re
import glob
from time import sleep

# user parameters
TEXT_DIR = '../data/text/'
JSON_DIR = '../data/json/'

apikey=''
REST_URL = "http://data.bioontology.org"

ontology_list = 'ICD9CM,LOINC,NDFRT,RXNORM,SNOMEDCT'
tui_list = 'T017,T029,T023'
options = '&longest_only=true&exclude_numbers=false&whole_word_only=true&exclude_synonyms=false'
param = '&ontologies=' + ontology_list + '&semantic_types=' + tui_list + options

def get_json(text):
    # create request_url
    request_url = REST_URL + "/annotator?text=" + text.replace(' ','+') + param + "&apikey=" + apikey
    # get data as json type
    http = urllib3.PoolManager() 
    r = http.request('POST', request_url, headers={'Authorization': 'apikey token=' + apikey})
    print('request_url: '+request_url)
    print('http status: '+ str(r.status))
    data_json = json.loads(r.data.decode('utf-8'))
    return data_json

def main():
    for filename in glob.glob(TEXT_DIR+'*.txt'):
        # for each file load file 
        text = ''
        lines = open(filename,"r").read().splitlines()
        for l in lines:
            text = text + l.rstrip()
        # remove special characters
        text = re.sub('[^A-Za-z0-9]+', ' ', text)
        # get json
        data = get_json(text)
        # save to json file
        filename_nodir = filename.split('/')[-1].split('.')[0]
        json_fn = '' + filename_nodir + '.json'
        # print(json_fn)
        with open(JSON_DIR+json_fn, 'w') as outfile:
            json.dump(data, outfile)

if __name__ == "__main__":
    main()
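
A note on the code above: even though the request is sent with POST, the full document text is appended to the URL as a query string, so for a ~20 KB input the server most likely rejects the over-long request line (hence the connection reset) rather than enforcing a documented query-size limit. Below is a minimal sketch of the same call with the text moved into a form-encoded POST body via urllib3's fields argument; get_json_post is a hypothetical name, and it reuses the apikey, REST_URL, ontology_list and tui_list defined in the script above.

import json
import urllib3

def get_json_post(text):
    # sketch only: same parameters as above, but the text travels in the
    # form-encoded POST body instead of the URL, so URL length no longer matters
    http = urllib3.PoolManager()
    fields = {
        'text': text,
        'ontologies': ontology_list,
        'semantic_types': tui_list,
        'longest_only': 'true',
        'whole_word_only': 'true',
        'apikey': apikey,
    }
    r = http.request('POST', REST_URL + '/annotator',
                     fields=fields,
                     encode_multipart=False,  # send application/x-www-form-urlencoded
                     headers={'Authorization': 'apikey token=' + apikey})
    return json.loads(r.data.decode('utf-8'))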
leej35 commented 7 years ago

I solved this issue: my error was using the 'POST' method incorrectly (the text was being passed in the URL rather than in the request body). You can use the following code when you have text files in a specific folder (INPUT_DIR) and want to annotate them and export each one as a JSON file.

import traceback
import sys
import re
import glob
import json
import urllib
import urllib2
from time import gmtime, strftime
from time import sleep

INPUT_DIR = '../data/text/'
OUTPUT_DIR = '../data/json/'

API_KEY= ''
annotatorUrl = 'http://data.bioontology.org/annotator' 

ontology_list = 'ICD9CM,LOINC'
tui_list = 'T017,T029,T023,T030'

def get_json(text):
    params = {
        'text':text, 
        'longest_only':'true',
        'whole_word_only':'true',
        'stop_words':'',
        'ontologies':ontology_list,   
        'ontologiesToKeepInResult':'',   
        'isVirtualOntologyId':'true', 
        'semantic_types':tui_list,
        'apikey':API_KEY
    }
    headers = {'Authorization': 'apikey token=' + API_KEY}
    data = urllib.urlencode(params)
    request = urllib2.Request(annotatorUrl, data, headers)
    # request.add_header('Content-type','text/xml')
    response = urllib2.urlopen(request)
    data_json = json.loads(response.read().decode('utf-8'))
    # print 'http status: '+ str(response.getcode())
    return data_json

def main():
    for filename in glob.glob(INPUT_DIR+'*.txt'):
        # for each file load file 
        text = ''
        lines = open(filename,"r").read().splitlines()
        for l in lines:
            text = text + l.rstrip()
        # remove special characters
        text = re.sub('[^A-Za-z0-9]+', ' ', text)
        # build the output filename first so it is also available
        # in the except block if the request fails
        filename_nodir = filename.split('/')[-1].split('.')[0]
        json_fn = filename_nodir + '.json'
        try:
            # get json
            data = get_json(text)
            # save to json file
            with open(OUTPUT_DIR+json_fn, 'w') as outfile:
                json.dump(data, outfile)
            print strftime("%Y-%m-%d %H:%M:%S") + ' SUCCESS ' + filename_nodir
        except:
            print strftime("%Y-%m-%d %H:%M:%S") + ' FAIL ' + filename_nodir
            raise

if __name__ == "__main__":
    main()
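
The solution above is written for Python 2 (urllib2 and print statements), whereas the original question used Python 3.5. A rough Python 3 equivalent of get_json, offered as a sketch only and keeping the same parameters and module-level names (API_KEY, annotatorUrl, ontology_list, tui_list), could look like this:

import json
import urllib.parse
import urllib.request

def get_json(text):
    # same parameters as in the Python 2 version above,
    # form-encoded into the POST body
    params = {
        'text': text,
        'longest_only': 'true',
        'whole_word_only': 'true',
        'ontologies': ontology_list,
        'semantic_types': tui_list,
        'apikey': API_KEY,
    }
    headers = {'Authorization': 'apikey token=' + API_KEY}
    data = urllib.parse.urlencode(params).encode('utf-8')
    request = urllib.request.Request(annotatorUrl, data, headers)
    with urllib.request.urlopen(request) as response:
        return json.loads(response.read().decode('utf-8'))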