calpoly-csai / api

Official API for the NIMBUS Voice Assistant accessible via HTTP REST protocol.
https://nimbus.api.calpolycsai.com/
GNU General Public License v3.0
9 stars 4 forks source link

hitting 512 MB memory quota on heroku after just 2 API calls #101

Open mfekadu opened 4 years ago

mfekadu commented 4 years ago

Bug Description

The Heroku Standard 1x Dyno has a 512 MB memory quota and we have hit the limit after just

  1. deployment
  2. 2 API calls to the /ask endpoint

Screenshots

screenshots

![image](https://user-images.githubusercontent.com/11131250/75659644-2dc75080-5c1f-11ea-93ab-f30732961b0d.png) image image

Proposed solutions

1. switch to GCP and choose an appropriately sized virtual machine

2. scale the heroku dyno to standard-2x or something else

This would get expensive quickly...

3. identify memory usage & performance improvements for our code

maybe avoid the use of pandas in QA.py and use the built-in `csv` module instead

only `read_csv` is imported from pandas

https://github.com/calpoly-csai/api/blob/48336b06f95e3676c5322d6760c34968b2d145d5/QA.py#L10

reference @zpdeng's usage of the `csv` module

https://github.com/calpoly-csai/api/blob/48336b06f95e3676c5322d6760c34968b2d145d5/database_wrapper.py#L426-L433

it may be a good idea to actually test whether pandas is truly hogging memory

maybe spacy?

maybe nltk?

maybe SQLAlchemy?

maybe Flask?

maybe gcloud?

maybe another_package_expected_to_be_large?

consider this output of python3 -m memory_profiler flask_api.py

memory profile

``` {'entity': 'Dr. Khosmood', 'tag': 'PROF', 'normalized entity': 'Khosmood', 'input question': "What is Dr. Khosmood's phone number?", 'normalized question': "What is [PROF]'s phone number?", 'question class': "What is [PROF]'s phone number?"} Filename: /Users/mfekadu/GitHub/api/QA.py Line # Mem usage Increment Line Contents ================================================ 61 218.5 MiB 218.5 MiB @profile 62 def answer(self, extracted_vars): 63 218.5 MiB 0.0 MiB db_data = self._get_data_from_db(extracted_vars) 64 218.5 MiB 0.0 MiB return self._format_answer(extracted_vars, db_data) Filename: /Users/mfekadu/GitHub/api/nimbus.py Line # Mem usage Increment Line Contents ================================================ 22 215.0 MiB 215.0 MiB @profile 23 def answer_question(self, question): 24 218.5 MiB 3.5 MiB ans_dict = NIMBUS_NLP.predict_question(question) 25 218.5 MiB 0.0 MiB print(ans_dict) 26 218.5 MiB 0.0 MiB try: 27 218.5 MiB 0.0 MiB qa = self.qa_dict[ans_dict["question class"]] 28 except KeyError: 29 return "I'm sorry, I don't understand. Please try another question." 30 else: 31 218.5 MiB 0.0 MiB answer = qa.answer(ans_dict) 32 218.5 MiB 0.0 MiB if answer is None: 33 return("I'm sorry, I understand your question but was unable to find an answer. " 34 "Please try another question.") 35 else: 36 218.5 MiB 0.0 MiB return answer 127.0.0.1 - - [02/Mar/2020 01:28:41] "POST /ask HTTP/1.1" 200 - ```

git diff

```diff diff --git a/Pipfile b/Pipfile index 6069200..0f8e7ef 100644 --- a/Pipfile +++ b/Pipfile @@ -22,7 +22,7 @@ pytest = "==5.3.4" pyre-check = "==0.0.41" ## like the Unix `make` but better invoke = "==1.4.1" - +memory-profiler = "*" [packages] # REST API diff --git a/Pipfile.lock b/Pipfile.lock index 6575e89..e55e8ff 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "fb30d39142d3cc83d8909d9f4f4648a60ac33d4ec3a5a94d8dac7b90ef727a24" + "sha256": "c1f663e58339a2e67ba7d26a44722711969699a9998316437dcaef26cbbe8b80" }, "pipfile-spec": 6, "requires": { @@ -826,6 +826,13 @@ ], "version": "==0.6.1" }, + "memory-profiler": { + "hashes": [ + "sha256:23b196f91ea9ac9996e30bfab1e82fecc30a4a1d24870e81d1e81625f786a2c3" + ], + "index": "pypi", + "version": "==0.57.0" + }, "more-itertools": { "hashes": [ "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c", diff --git a/QA.py b/QA.py index 250b065..06ecf0b 100644 --- a/QA.py +++ b/QA.py @@ -9,6 +9,8 @@ from Entity.Sections import Sections from database_wrapper import NimbusMySQLAlchemy from pandas import read_csv +from memory_profiler import profile + Extracted_Vars = Dict[str, Any] DB_Data = Dict[str, Any] DB_Query = Callable[[Extracted_Vars], DB_Data] @@ -33,6 +35,7 @@ class QA: A class for wrapping functions used to answer a question. 
""" + @profile def __init__(self, q_format, db_query, format_answer): """ Args: @@ -55,6 +58,7 @@ class QA: def _format_answer(self, extracted_vars, db_data): return self.format_answer(extracted_vars, db_data) + @profile def answer(self, extracted_vars): db_data = self._get_data_from_db(extracted_vars) return self._format_answer(extracted_vars, db_data) @@ -66,6 +70,7 @@ class QA: return hash(self.q_format) +@profile def create_qa_mapping(qa_list): """ Creates a dictionary whose values are QA objects and keys are the question @@ -186,6 +191,7 @@ def yes_no(a_format, pred=None): return functools.partial(_yes_no, a_format, pred) +@profile def generate_fact_QA(csv): df = read_csv(csv) text_in_brackets = r'\[[^\[\]]*\]' diff --git a/flask_api.py b/flask_api.py index d6478c5..34fe17b 100755 --- a/flask_api.py +++ b/flask_api.py @@ -18,6 +18,8 @@ from modules.validators import WakeWordValidator, WakeWordValidatorError from nimbus import Nimbus +from memory_profiler import profile + BAD_REQUEST = 400 SUCCESS = 200 @@ -44,6 +46,7 @@ def generate_session_token() -> str: return "SOME_NEW_TOKEN" +@profile @app.route('/ask', methods=['POST']) def handle_question(): """ diff --git a/nimbus.py b/nimbus.py index 7d6bdfc..729cf62 100644 --- a/nimbus.py +++ b/nimbus.py @@ -9,14 +9,17 @@ from werkzeug.exceptions import BadRequestKeyError from QA import create_qa_mapping, generate_fact_QA from nimbus_nlp.NIMBUS_NLP import NIMBUS_NLP +from memory_profiler import profile class Nimbus: + @profile def __init__(self): self.qa_dict = create_qa_mapping( generate_fact_QA("q_a_pairs.csv") ) + @profile def answer_question(self, question): ans_dict = NIMBUS_NLP.predict_question(question) print(ans_dict) (END) ```

mfekadu commented 4 years ago

Additional Problems

For a moment, when the answers were incorrect, I hypothesized that the wrong DATABASE_NAME was set in the environment variables, but dev_feb_28_2020 is correct as of right now.

$ heroku run --app calpoly-csai-nimbus "env | grep DATABASE_NAME"
Running env | grep DATABASE_NAME on ⬢ calpoly-csai-nimbus... up, run.7815 (Standard-1X)
DATABASE_NAME=dev_feb_28_2020
$
mfekadu commented 4 years ago

a broader memory profile

POST data

# endpoint ``` http://0.0.0.0:8080/ask ``` # request ```JSON { "question": "Who is the contact for Color Coded?" } ``` # response ```JSON { "answer": "Color Coded's advisor is Foaad Khosmood.", "session": "SOME_NEW_TOKEN" } ```

memory profile logs just after POST

``` [nltk_data] Downloading package stopwords to [nltk_data] /Users/mfekadu/nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to /Users/mfekadu/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package averaged_perceptron_tagger to [nltk_data] /Users/mfekadu/nltk_data... [nltk_data] Package averaged_perceptron_tagger is already up-to- [nltk_data] date! Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 19 153.5 MiB 153.5 MiB @profile 20 def __init__(self): 21 154.6 MiB 1.1 MiB nltk.download('stopwords') 22 154.6 MiB 0.0 MiB nltk.download('punkt') 23 154.7 MiB 0.0 MiB nltk.download('averaged_perceptron_tagger') 24 154.7 MiB 0.0 MiB self.classifier = None 25 281.3 MiB 126.7 MiB self.nlp = spacy.load('en_core_web_sm') 26 281.3 MiB 0.0 MiB self.WH_WORDS = {'WDT', 'WP', 'WP$', 'WRB'} 27 281.3 MiB 0.0 MiB self.overall_features = {} /Users/mfekadu/.local/share/virtualenvs/api-3RYm-VjB/lib/python3.6/site-packages/sklearn/base.py:251: UserWarning: Trying to unpickle estimator KNeighborsClassifier from version 0.21.3 when using version 0.20.2. This might lead to breaking code or invalid results. Use at your own risk. 
UserWarning) Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/save_and_load_model.py Line # Mem usage Increment Line Contents ================================================ 35 281.3 MiB 281.3 MiB @profile 36 def load_latest_model(): 37 # https://stackoverflow.com/a/39327156 38 281.3 MiB 0.0 MiB train_path = PROJECT_DIR + '/models/classification/*' 39 281.3 MiB 0.0 MiB list_of_files = glob.glob(train_path) 40 281.3 MiB 0.0 MiB latest_file = max(list_of_files, key=os.path.getctime) 41 284.6 MiB 3.3 MiB return joblib.load(latest_file) Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 44 281.3 MiB 281.3 MiB @profile 45 def load_latest_classifier(self): 46 284.6 MiB 3.3 MiB self.classifier = load_latest_model() 47 284.6 MiB 0.0 MiB with open(PROJECT_DIR+ '/models/features/overall_features.json', 'r') as fp: 48 284.6 MiB 0.0 MiB self.overall_features = json.load(fp) Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 130 284.6 MiB 284.6 MiB @profile 131 def extract_main_verb(self, question): 132 286.0 MiB 1.4 MiB doc = self.nlp(question) 133 286.1 MiB 0.1 MiB sents = list(doc.sents) 134 286.1 MiB 0.0 MiB if len(sents) == 0: 135 raise ValueError("Empty question") 136 137 286.1 MiB 0.0 MiB return sents[0].root Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 139 286.4 MiB 286.4 MiB @profile 140 def get_lemmas(self, words): 141 286.5 MiB 0.0 MiB return [self.nlp(word)[0].lemma_ for word in words] Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 143 323.8 MiB 323.8 MiB @profile 144 def is_wh_word(self, pos): 145 323.8 MiB 0.0 MiB return pos in 
self.WH_WORDS Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 143 323.8 MiB 323.8 MiB @profile 144 def is_wh_word(self, pos): 145 323.8 MiB 0.0 MiB return pos in self.WH_WORDS Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 143 323.8 MiB 323.8 MiB @profile 144 def is_wh_word(self, pos): 145 323.8 MiB 0.0 MiB return pos in self.WH_WORDS Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 143 323.8 MiB 323.8 MiB @profile 144 def is_wh_word(self, pos): 145 323.8 MiB 0.0 MiB return pos in self.WH_WORDS Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 143 323.8 MiB 323.8 MiB @profile 144 def is_wh_word(self, pos): 145 323.8 MiB 0.0 MiB return pos in self.WH_WORDS Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 50 284.6 MiB 284.6 MiB @profile 51 def get_question_features(self, question): 52 # print("using new algorithm") 53 """ 54 Method to extract features from each individual question. 
55 """ 56 284.6 MiB 0.0 MiB features = {} 57 58 # Extract the main verb from the question before additional processing 59 286.1 MiB 1.5 MiB main_verb = str(self.extract_main_verb(question)) 60 61 # ADD ALL VARIABLES TO THE FEATURE DICT WITH A WEIGHT OF 90 62 286.1 MiB 0.0 MiB matches = re.findall(r'(\[(.*?)\])', question) 63 286.1 MiB 0.0 MiB for match in matches: 64 286.1 MiB 0.0 MiB question = question.replace(match[0], '') 65 286.1 MiB 0.0 MiB features[match[0]] = 90 66 67 286.1 MiB 0.0 MiB question = re.sub('[^a-zA-Z0-9]', ' ', question) 68 69 # PRE-PROCESSING: TOKENIZE SENTENCE, AND LOWER AND STEM EACH WORD 70 286.4 MiB 0.3 MiB words = nltk.word_tokenize(question) 71 286.4 MiB 0.0 MiB words = [word.lower() for word in words if '[' and ']' not in word] 72 73 286.5 MiB 0.1 MiB filtered_words = self.get_lemmas(words) 74 75 # ADD THE LEMMATIZED MAIN VERB TO THE FEATURE SET WITH A WEIGHT OF 60 76 286.5 MiB 0.0 MiB stemmed_main_verb = self.nlp(main_verb)[0] 77 286.5 MiB 0.0 MiB features[stemmed_main_verb.text] = 60 78 79 # TAG WORDS' PART OF SPEECH, AND ADD ALL WH WORDS TO FEATURE DICT 80 # WITH WEIGHT 60 81 323.8 MiB 37.3 MiB words_pos = nltk.pos_tag(filtered_words) 82 323.8 MiB 0.0 MiB for word_pos in words_pos: 83 323.8 MiB 0.0 MiB if self.is_wh_word(word_pos[1]): 84 323.8 MiB 0.0 MiB features[word_pos[0]] = 60 85 86 # ADD FIRST WORD AND NON-STOP WORDS TO FEATURE DICT 87 filtered_words = [ 88 323.9 MiB 0.0 MiB word for word in filtered_words if word not in nltk.corpus.stopwords.words('english')] 89 323.9 MiB 0.0 MiB for word in filtered_words: 90 # ADD EACH WORD NOT ALREADY PRESENT IN FEATURE SET WITH WEIGHT OF 30 91 323.9 MiB 0.0 MiB if word not in features: 92 323.9 MiB 0.0 MiB features[word] = 30 93 94 323.9 MiB 0.0 MiB return features Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 139 324.0 MiB 324.0 MiB @profile 140 def get_lemmas(self, words): 
141 324.4 MiB 0.2 MiB return [self.nlp(word)[0].lemma_ for word in words] Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 143 324.4 MiB 324.4 MiB @profile 144 def is_wh_word(self, pos): 145 324.4 MiB 0.0 MiB return pos in self.WH_WORDS Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 143 324.4 MiB 324.4 MiB @profile 144 def is_wh_word(self, pos): 145 324.4 MiB 0.0 MiB return pos in self.WH_WORDS Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 143 324.4 MiB 324.4 MiB @profile 144 def is_wh_word(self, pos): 145 324.4 MiB 0.0 MiB return pos in self.WH_WORDS Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 143 324.4 MiB 324.4 MiB @profile 144 def is_wh_word(self, pos): 145 324.4 MiB 0.0 MiB return pos in self.WH_WORDS Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 143 324.4 MiB 324.4 MiB @profile 144 def is_wh_word(self, pos): 145 324.4 MiB 0.0 MiB return pos in self.WH_WORDS Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 188 324.0 MiB 324.0 MiB @profile 189 def filterWHTags(self, question): 190 # ADD ALL VARIABLES TO THE FEATURE DICT WITH A WEIGHT OF 90 191 324.0 MiB 0.0 MiB matches = re.findall(r'(\[(.*?)\])', question) 192 324.0 MiB 0.0 MiB for match in matches: 193 324.0 MiB 0.0 MiB question = question.replace(match[0], '') 194 195 324.0 MiB 0.0 MiB question = re.sub('[^a-zA-Z0-9]', ' ', question) 196 197 
# PRE-PROCESSING: TOKENIZE SENTENCE, AND LOWER AND STEM EACH WORD 198 324.0 MiB 0.0 MiB words = nltk.word_tokenize(question) 199 324.0 MiB 0.0 MiB words = [word.lower() for word in words if '[' and ']' not in word] 200 201 324.4 MiB 0.4 MiB filtered_words = self.get_lemmas(words) 202 203 324.4 MiB 0.0 MiB question_tags = nltk.pos_tag(filtered_words) 204 question_tags = [ 205 324.4 MiB 0.0 MiB tag for tag in question_tags if self.is_wh_word(tag[1])] 206 324.4 MiB 0.0 MiB return question_tags Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 139 324.4 MiB 324.4 MiB @profile 140 def get_lemmas(self, words): 141 324.5 MiB 0.0 MiB return [self.nlp(word)[0].lemma_ for word in words] Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 143 324.5 MiB 324.5 MiB @profile 144 def is_wh_word(self, pos): 145 324.5 MiB 0.0 MiB return pos in self.WH_WORDS Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 143 324.5 MiB 324.5 MiB @profile 144 def is_wh_word(self, pos): 145 324.5 MiB 0.0 MiB return pos in self.WH_WORDS Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 143 324.5 MiB 324.5 MiB @profile 144 def is_wh_word(self, pos): 145 324.5 MiB 0.0 MiB return pos in self.WH_WORDS Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 143 324.5 MiB 324.5 MiB @profile 144 def is_wh_word(self, pos): 145 324.5 MiB 0.0 MiB return pos in self.WH_WORDS Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line 
Contents ================================================ 143 324.5 MiB 324.5 MiB @profile 144 def is_wh_word(self, pos): 145 324.5 MiB 0.0 MiB return pos in self.WH_WORDS Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 143 324.5 MiB 324.5 MiB @profile 144 def is_wh_word(self, pos): 145 324.5 MiB 0.0 MiB return pos in self.WH_WORDS Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 188 324.4 MiB 324.4 MiB @profile 189 def filterWHTags(self, question): 190 # ADD ALL VARIABLES TO THE FEATURE DICT WITH A WEIGHT OF 90 191 324.4 MiB 0.0 MiB matches = re.findall(r'(\[(.*?)\])', question) 192 324.4 MiB 0.0 MiB for match in matches: 193 324.4 MiB 0.0 MiB question = question.replace(match[0], '') 194 195 324.4 MiB 0.0 MiB question = re.sub('[^a-zA-Z0-9]', ' ', question) 196 197 # PRE-PROCESSING: TOKENIZE SENTENCE, AND LOWER AND STEM EACH WORD 198 324.4 MiB 0.0 MiB words = nltk.word_tokenize(question) 199 324.4 MiB 0.0 MiB words = [word.lower() for word in words if '[' and ']' not in word] 200 201 324.5 MiB 0.1 MiB filtered_words = self.get_lemmas(words) 202 203 324.5 MiB 0.0 MiB question_tags = nltk.pos_tag(filtered_words) 204 question_tags = [ 205 324.5 MiB 0.0 MiB tag for tag in question_tags if self.is_wh_word(tag[1])] 206 324.5 MiB 0.0 MiB return question_tags Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 208 324.0 MiB 324.0 MiB @profile 209 def validate_WH(self, test_question, predicted_question): 210 """ 211 Assumes that only 1 WH word exists 212 Returns True if the WH word in the test question equals the 213 WH word in the predicted question 214 """ 215 216 324.4 MiB 0.4 MiB test_tags = self.filterWHTags(test_question) 217 324.5 MiB 0.1 
MiB predicted_tags = self.filterWHTags(predicted_question) 218 219 # Uncomment these lines below to see 220 # print("Test") 221 # print(test_tags) 222 # print() 223 224 # print("Predicted") 225 # print(predicted_tags) 226 # print() 227 228 # Compares all WH words in the tags array and returns False if one doesn't match 229 324.5 MiB 0.0 MiB min_tag_len = min(len(test_tags), len(predicted_tags)) 230 324.5 MiB 0.0 MiB wh_match = True 231 324.5 MiB 0.0 MiB i = 0 232 324.5 MiB 0.0 MiB while (wh_match and i < min_tag_len): 233 324.5 MiB 0.0 MiB wh_match = wh_match and (test_tags[i][0] == predicted_tags[i][0]) 234 324.5 MiB 0.0 MiB i += 1 235 324.5 MiB 0.0 MiB return wh_match Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/question_classifier.py Line # Mem usage Increment Line Contents ================================================ 237 284.6 MiB 284.6 MiB @profile 238 def classify_question(self, test_question): 239 """ 240 Match a user query with a question in the database based on the classifier we trained and overall features we calculated. 241 Return relevant question. 242 """ 243 284.6 MiB 0.0 MiB if self.classifier is None: 244 raise ValueError("Classifier not initialized") 245 246 #if self.use_new: 247 323.9 MiB 39.3 MiB test_features = self.get_question_features(test_question) 248 #else: 249 # test_features = self.get_question_features_old_algorithm( 250 # test_question) 251 323.9 MiB 0.0 MiB test_vector = dict.fromkeys(self.overall_features, 0) 252 323.9 MiB 0.0 MiB for key in test_features: 253 323.9 MiB 0.0 MiB if key in test_vector: 254 323.9 MiB 0.0 MiB test_vector[key] = test_features[key] 255 #else: 256 # IF A WORD IS NOT IN THE EXISTING FEATURE SET, IT MAY BE A QUESTION WE CANNOT ANSWER. 
257 # test_vector["not related"] += 250 258 323.9 MiB 0.0 MiB test_vector = np.array(list(test_vector.values())) 259 323.9 MiB 0.0 MiB test_vector = test_vector.reshape(1, -1) 260 324.0 MiB 0.1 MiB min_dist = np.min(self.classifier.kneighbors(test_vector, n_neighbors=1)[0]) 261 324.0 MiB 0.0 MiB if min_dist > 150: 262 return "I don't think that's a Statistics related question! Try asking something about the STAT curriculum." 263 264 324.0 MiB 0.0 MiB predicted_question = self.classifier.predict(test_vector)[0] 265 266 324.5 MiB 0.4 MiB wh_words_match = self.validate_WH(test_question, predicted_question) 267 # Uncomment to print whether the WH words match 268 # print("WH Words Match?:", wh_words_match) 269 270 324.5 MiB 0.0 MiB if (not wh_words_match): 271 return "WH Words Don't Match" 272 273 324.5 MiB 0.0 MiB return predicted_question Filename: /Users/mfekadu/GitHub/api/nimbus_nlp/NIMBUS_NLP.py Line # Mem usage Increment Line Contents ================================================ 26 147.1 MiB 147.1 MiB @staticmethod 27 @profile 28 def predict_question(input_question): 29 ''' 30 Runs through variable extraction and the question classifier to 31 predict the intended question. 
32 33 Args: input_question (string) - user input question to answer 34 35 Return: nlp_props (dict) - contains the user's input question, 36 the variable extracted input question, 37 the entity extracted, and the predicted 38 answer 39 40 ''' 41 42 # Instantiate the variable extraction class 43 147.1 MiB 0.0 MiB variable_extraction = Variable_Extraction() 44 45 # Obtain the properties from variable extraction 46 153.5 MiB 6.4 MiB nlp_props = variable_extraction.extract_variables(input_question) 47 48 # Instantiate the question classifier class 49 281.3 MiB 127.9 MiB classifier = QuestionClassifier() 50 284.6 MiB 3.3 MiB classifier.load_latest_classifier() 51 52 # Classify the question and add it to the nlp properties dictionary 53 284.6 MiB 0.0 MiB nlp_props["question class"] = classifier.\ 54 324.5 MiB 39.9 MiB classify_question(nlp_props["normalized question"]) 55 56 324.5 MiB 0.0 MiB return nlp_props {'entity': 'Color Coded', 'tag': 'CLUB', 'normalized entity': 'Color Coded', 'input question': 'Who is the contact for Color Coded?', 'normalized question': 'Who is the contact for [CLUB]?', 'question class': 'Who is the chief contact for [CLUB]?'} Filename: /Users/mfekadu/GitHub/api/QA.py Line # Mem usage Increment Line Contents ================================================ 61 214.3 MiB 214.3 MiB @profile 62 def answer(self, extracted_vars): 63 214.6 MiB 0.3 MiB db_data = self._get_data_from_db(extracted_vars) 64 214.6 MiB 0.0 MiB return self._format_answer(extracted_vars, db_data) Filename: /Users/mfekadu/GitHub/api/nimbus.py Line # Mem usage Increment Line Contents ================================================ 22 147.1 MiB 147.1 MiB @profile 23 def answer_question(self, question): 24 214.3 MiB 67.2 MiB ans_dict = NIMBUS_NLP.predict_question(question) 25 214.3 MiB 0.0 MiB print(ans_dict) 26 214.3 MiB 0.0 MiB try: 27 214.3 MiB 0.0 MiB qa = self.qa_dict[ans_dict["question class"]] 28 except KeyError: 29 return "I'm sorry, I don't understand. 
Please try another question." 30 else: 31 214.6 MiB 0.3 MiB answer = qa.answer(ans_dict) 32 214.6 MiB 0.0 MiB if answer is None: 33 return("I'm sorry, I understand your question but was unable to find an answer. " 34 "Please try another question.") 35 else: 36 214.6 MiB 0.0 MiB return answer ```