ncbo / ncbo_annotator

Automatically process a piece of text, annotate it with relevant ontology concepts, and return the annotations.
http://bioportal.bioontology.org/annotator

Maximum size limit for NCBO annotator service? #2

Closed leej35 closed 7 years ago

leej35 commented 7 years ago

When I submit a very long text query to the NCBO Annotator service using Python 3.5 with urllib3, I get this error:

exceptions.MaxRetryError: HTTPConnectionPool(host='data.bioontology.org', port=80): Max retries exceeded with url: /annotator?text=Prevention+and+Early+Detection+of+ ... (Caused by ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')))

When I submit a short text to annotate, it is processed without any problem, but when the query text reaches about 20 KB, I get the error above.

Presumably there is a maximum query length allowed by the annotator service. If so, could you tell me exactly what it is?

For your information, I have attached my Python code below.

Thanks, Jeongmin

import json
import urllib3
import urllib
import traceback
import sys
import re
import glob
from time import sleep

# user parameters
TEXT_DIR = '../data/text/'
JSON_DIR = '../data/json/'

apikey=''
REST_URL = "http://data.bioontology.org"

ontology_list = 'ICD9CM,LOINC,NDFRT,RXNORM,SNOMEDCT'
tui_list = 'T017,T029,T023'
options = '&longest_only=true&exclude_numbers=false&whole_word_only=true&exclude_synonyms=false'
param = '&ontologies=' + ontology_list + '&semantic_types=' + tui_list + options

def get_json(text):
    # create request_url
    request_url = REST_URL + "/annotator?text=" + text.replace(' ','+') + param + "&apikey=" + apikey
    # get data as json type
    http = urllib3.PoolManager() 
    r = http.request('POST', request_url, headers={'Authorization': 'apikey token=' + apikey})
    print('request_url: '+request_url)
    print('http status: '+ str(r.status))
    data_json = json.loads(r.data.decode('utf-8'))
    return data_json

def main():
    for filename in glob.glob(TEXT_DIR+'*.txt'):
        # for each file load file 
        text = ''
        lines = open(filename,"r").read().splitlines()
        for l in lines:
            text = text + l.rstrip()
        # remove special characters
        text = re.sub('[^A-Za-z0-9]+', ' ', text)
        # get json
        data = get_json(text)
        # save to json file
        filename_nodir = filename.split('/')[-1].split('.')[0]
        json_fn = '' + filename_nodir + '.json'
        # print(json_fn)
        with open(JSON_DIR+json_fn, 'w') as outfile:
            json.dump(data, outfile)

if __name__ == "__main__":
    main()
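
A note on the code above: even though the request is sent with POST, the full document text is appended to the URL as a query string, so for a ~20 KB input the server most likely rejects the over-long request line (hence the connection reset) rather than enforcing a documented query-size limit. Below is a minimal sketch of the same call with the text moved into a form-encoded POST body via urllib3's fields argument; get_json_post is a hypothetical name, and it reuses the apikey, REST_URL, ontology_list and tui_list defined in the script above.

import json
import urllib3

def get_json_post(text):
    # sketch only: same parameters as above, but the text travels in the
    # form-encoded POST body instead of the URL, so URL length no longer matters
    http = urllib3.PoolManager()
    fields = {
        'text': text,
        'ontologies': ontology_list,
        'semantic_types': tui_list,
        'longest_only': 'true',
        'whole_word_only': 'true',
        'apikey': apikey,
    }
    r = http.request('POST', REST_URL + '/annotator',
                     fields=fields,
                     encode_multipart=False,  # send application/x-www-form-urlencoded
                     headers={'Authorization': 'apikey token=' + apikey})
    return json.loads(r.data.decode('utf-8'))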
leej35 commented 7 years ago

I solved this issue: my error was using the 'POST' method incorrectly (the text was being passed in the URL rather than in the request body). You can use the following code when you have text files in a specific folder (INPUT_DIR) and want to annotate them and export each one as a JSON file.

import traceback
import sys
import re
import glob
import json
import urllib
import urllib2
from time import gmtime, strftime
from time import sleep

INPUT_DIR = '../data/text/'
OUTPUT_DIR = '../data/json/'

API_KEY= ''
annotatorUrl = 'http://data.bioontology.org/annotator' 

ontology_list = 'ICD9CM,LOINC'
tui_list = 'T017,T029,T023,T030'

def get_json(text):
    params = {
        'text':text, 
        'longest_only':'true',
        'whole_word_only':'true',
        'stop_words':'',
        'ontologies':ontology_list,   
        'ontologiesToKeepInResult':'',   
        'isVirtualOntologyId':'true', 
        'semantic_types':tui_list,
        'apikey':API_KEY
    }
    headers = {'Authorization': 'apikey token=' + API_KEY}
    data = urllib.urlencode(params)
    request = urllib2.Request(annotatorUrl, data, headers)
    # request.add_header('Content-type','text/xml')
    response = urllib2.urlopen(request)
    data_json = json.loads(response.read().decode('utf-8'))
    # print 'http status: '+ str(response.getcode())
    return data_json

def main():
    for filename in glob.glob(INPUT_DIR+'*.txt'):
        # for each file load file 
        text = ''
        lines = open(filename,"r").read().splitlines()
        for l in lines:
            text = text + l.rstrip()
        # remove special characters
        text = re.sub('[^A-Za-z0-9]+', ' ', text)
        # build the output filename first so it is also available
        # in the except block if the request fails
        filename_nodir = filename.split('/')[-1].split('.')[0]
        json_fn = filename_nodir + '.json'
        try:
            # get json
            data = get_json(text)
            # save to json file
            with open(OUTPUT_DIR+json_fn, 'w') as outfile:
                json.dump(data, outfile)
            print strftime("%Y-%m-%d %H:%M:%S") + ' SUCCESS ' + filename_nodir
        except:
            print strftime("%Y-%m-%d %H:%M:%S") + ' FAIL ' + filename_nodir
            raise

if __name__ == "__main__":
    main()
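
The solution above is written for Python 2 (urllib2 and print statements), whereas the original question used Python 3.5. A rough Python 3 equivalent of get_json, offered as a sketch only and keeping the same parameters and module-level names (API_KEY, annotatorUrl, ontology_list, tui_list), could look like this:

import json
import urllib.parse
import urllib.request

def get_json(text):
    # same parameters as in the Python 2 version above,
    # form-encoded into the POST body
    params = {
        'text': text,
        'longest_only': 'true',
        'whole_word_only': 'true',
        'ontologies': ontology_list,
        'semantic_types': tui_list,
        'apikey': API_KEY,
    }
    headers = {'Authorization': 'apikey token=' + API_KEY}
    data = urllib.parse.urlencode(params).encode('utf-8')
    request = urllib.request.Request(annotatorUrl, data, headers)
    with urllib.request.urlopen(request) as response:
        return json.loads(response.read().decode('utf-8'))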