from fastapi import FastAPI, responses
import requests
import time
import json

# FastAPI web server
app = FastAPI()

# Server healthcheck
@app.get("/")
async def read_root():
    return {"Status": "Healthy"}

# Model inference API endpoint: forwards the request to the model server
@app.get("/model/{model_id}")
async def web_serve(model_id, sequence_0, sequence_1):
    status = 200
    MODEL_API_URL = 'http://127.0.0.1:8010/model/{}'.format(model_id)
    feed_data = {
        "sequence_0": sequence_0,
        "sequence_1": sequence_1
    }
    # dummy for pre-processing or post-processing
    time.sleep(1)
    session = requests.Session()
    response = session.get(MODEL_API_URL, params=feed_data)
    answer_text = json.loads(response.text)['detail']
    return responses.JSONResponse(status_code=status, content={"detail": answer_text})
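The script above is the web tier: web_serve() only packs the two sentences into query parameters and forwards them to the model server listening on 127.0.0.1:8010; the script below is that model server. A client call against the web tier might look like the following sketch (the web-tier port 8000 is an assumption, only the model-server port 8010 is fixed in the code; the model name bert_neuron00 comes from the loading loop below).

import requests

# Hypothetical client call against the web tier; port 8000 is an assumption.
resp = requests.get(
    "http://127.0.0.1:8000/model/bert_neuron00",
    params={
        "sequence_0": "The company HuggingFace is based in New York City",
        "sequence_1": "HuggingFace's headquarters are situated in Manhattan",
    },
)
print(resp.status_code, resp.json()["detail"])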
from fastapi import FastAPI, responses
import torch, os
import torch_neuron
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Build tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=False)

# Setup some example inputs
sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

max_length = 128
paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")

# Run the original PyTorch model on the compilation example
paraphrase_classification_logits = model(**paraphrase)[0]

# Convert example inputs to a format that is compatible with TorchScript tracing
example_inputs_paraphrase = paraphrase['input_ids'], paraphrase['attention_mask'], paraphrase['token_type_ids']

num_models = 4
models = {}
model_dir = './'

# FastAPI model server
app = FastAPI()

# Server healthcheck
@app.get("/")
async def read_root():
    return {"Status": "Healthy"}

# Model inference API endpoint
@app.get("/model/{model_id}")
async def infer(model_id, sequence_0, sequence_1):
    status = 200
    if model_id in models:
        paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")
        example_inputs_paraphrase = paraphrase['input_ids'], paraphrase['attention_mask'], paraphrase['token_type_ids']
        # Model selection
        model = models[model_id]
        # Verify the TorchScript works on example inputs
        paraphrase_classification_logits_neuron = model(*example_inputs_paraphrase)
        classes = ['not paraphrase', 'paraphrase']
        paraphrase_prediction = paraphrase_classification_logits_neuron[0][0].argmax().item()
        answer_text = 'BERT model "{}" says that "{}" and "{}" are {}'.format(model_id, sequence_0, sequence_1, classes[paraphrase_prediction])
    else:
        status = 404
        answer_text = f"Model {model_id} does not exist. Try a model name up to bert_neuron0{num_models-1}"
    return responses.JSONResponse(status_code=status, content={"detail": answer_text})

# Load the compiled Neuron models and warm each one up at startup
for i in range(num_models):
    model_id = 'bert_neuron0' + str(i)
    print(f" {model_id} ...")
    models[model_id] = torch.jit.load(os.path.join(model_dir, "model", model_id + ".pt"))
    # infer() is an async endpoint and would not actually execute if called
    # without await, so warm up the compiled model directly instead
    models[model_id](*example_inputs_paraphrase)
    print(" ... warmup completed")
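The model/bert_neuron0*.pt files loaded in the loop above are not created anywhere in the code shown here. A minimal sketch of how they could be produced with torch-neuron follows; the output path and saving four identical copies of one compiled model are assumptions, not part of the original post.

# Sketch (assumption): compile the BERT model for Inferentia with torch-neuron
# and save one copy per model slot expected by the model server.
import os
import torch
import torch_neuron  # registers torch.neuron.trace
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=False)

paraphrase = tokenizer.encode_plus(
    "The company HuggingFace is based in New York City",
    "HuggingFace's headquarters are situated in Manhattan",
    max_length=128, padding='max_length', truncation=True, return_tensors="pt",
)
example_inputs = (paraphrase['input_ids'], paraphrase['attention_mask'], paraphrase['token_type_ids'])

# Compile once, then save a copy for each model_id the server will load
model_neuron = torch.neuron.trace(model, example_inputs)
os.makedirs("./model", exist_ok=True)
for i in range(4):
    model_neuron.save(f"./model/bert_neuron0{i}.pt")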
Wouldn't this just be a matter of separating the Model Server from the Web Server?
Run the web server with 4 workers (4 processes)
Run the model server as a single process (see the launch sketch below)
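A minimal launch sketch for that split. Assumptions: the two scripts above are saved as web_server.py and model_server.py, and the web tier listens on port 8000; 8010 is the model-server port hard-coded in web_serve().

import sys
import uvicorn

if __name__ == "__main__":
    tier = sys.argv[1] if len(sys.argv) > 1 else "web"
    if tier == "web":
        # Web tier: 4 worker processes for the lightweight proxy / pre-processing
        uvicorn.run("web_server:app", host="0.0.0.0", port=8000, workers=4)
    else:
        # Model tier: a single process that owns the Neuron-compiled models
        uvicorn.run("model_server:app", host="0.0.0.0", port=8010, workers=1)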
Test results