from fastapi import FastAPI, responses
import requests
import time
import json

# FastAPI web server
app = FastAPI()

# Server healthcheck
@app.get("/")
async def read_root():
    return {"Status": "Healthy"}

# Model inference API endpoint: forwards the request to the model server
@app.get("/model/{model_id}")
async def web_serve(model_id, sequence_0, sequence_1):
    status = 200
    MODEL_API_URL = 'http://127.0.0.1:8010/model/{}'.format(model_id)
    feed_data = {
        "sequence_0": sequence_0,
        "sequence_1": sequence_1
    }
    # dummy for pre-processing or post-processing
    time.sleep(1)
    session = requests.Session()
    response = session.get(MODEL_API_URL, params=feed_data)
    answer_text = json.loads(response.text)['detail']
    return responses.JSONResponse(status_code=status, content={"detail": answer_text})
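The script above is the web tier: web_serve() only packs the two sentences into query parameters and forwards them to the model server listening on 127.0.0.1:8010; the script below is that model server. A client call against the web tier might look like the following sketch (the web-tier port 8000 is an assumption, only the model-server port 8010 is fixed in the code; the model name bert_neuron00 comes from the loading loop below).

import requests

# Hypothetical client call against the web tier; port 8000 is an assumption.
resp = requests.get(
    "http://127.0.0.1:8000/model/bert_neuron00",
    params={
        "sequence_0": "The company HuggingFace is based in New York City",
        "sequence_1": "HuggingFace's headquarters are situated in Manhattan",
    },
)
print(resp.status_code, resp.json()["detail"])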
from fastapi import FastAPI, responses
import torch, os
import torch_neuron
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Build tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=False)

# Setup some example inputs
sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

max_length = 128
paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")

# Run the original PyTorch model on the compilation example
paraphrase_classification_logits = model(**paraphrase)[0]

# Convert example inputs to a format that is compatible with TorchScript tracing
example_inputs_paraphrase = paraphrase['input_ids'], paraphrase['attention_mask'], paraphrase['token_type_ids']

num_models = 4
models = {}
model_dir = './'

# FastAPI model server
app = FastAPI()

# Server healthcheck
@app.get("/")
async def read_root():
    return {"Status": "Healthy"}

# Model inference API endpoint
@app.get("/model/{model_id}")
async def infer(model_id, sequence_0, sequence_1):
    status = 200
    if model_id in models:
        paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")
        example_inputs_paraphrase = paraphrase['input_ids'], paraphrase['attention_mask'], paraphrase['token_type_ids']
        # Model selection
        model = models[model_id]
        # Verify the TorchScript works on example inputs
        paraphrase_classification_logits_neuron = model(*example_inputs_paraphrase)
        classes = ['not paraphrase', 'paraphrase']
        paraphrase_prediction = paraphrase_classification_logits_neuron[0][0].argmax().item()
        answer_text = 'BERT model "{}" says that "{}" and "{}" are {}'.format(model_id, sequence_0, sequence_1, classes[paraphrase_prediction])
    else:
        status = 404
        answer_text = f"Model {model_id} does not exist. Try a model name up to bert_neuron0{num_models-1}"
    return responses.JSONResponse(status_code=status, content={"detail": answer_text})

# Load the compiled Neuron models and warm each one up at startup
for i in range(num_models):
    model_id = 'bert_neuron0' + str(i)
    print(f" {model_id} ...")
    models[model_id] = torch.jit.load(os.path.join(model_dir, "model", model_id + ".pt"))
    # infer() is an async endpoint and would not actually execute if called
    # without await, so warm up the compiled model directly instead
    models[model_id](*example_inputs_paraphrase)
    print(" ... warmup completed")
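The model/bert_neuron0*.pt files loaded in the loop above are not created anywhere in the code shown here. A minimal sketch of how they could be produced with torch-neuron follows; the output path and saving four identical copies of one compiled model are assumptions, not part of the original post.

# Sketch (assumption): compile the BERT model for Inferentia with torch-neuron
# and save one copy per model slot expected by the model server.
import os
import torch
import torch_neuron  # registers torch.neuron.trace
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=False)

paraphrase = tokenizer.encode_plus(
    "The company HuggingFace is based in New York City",
    "HuggingFace's headquarters are situated in Manhattan",
    max_length=128, padding='max_length', truncation=True, return_tensors="pt",
)
example_inputs = (paraphrase['input_ids'], paraphrase['attention_mask'], paraphrase['token_type_ids'])

# Compile once, then save a copy for each model_id the server will load
model_neuron = torch.neuron.trace(model, example_inputs)
os.makedirs("./model", exist_ok=True)
for i in range(4):
    model_neuron.save(f"./model/bert_neuron0{i}.pt")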
Wouldn't this just be a matter of separating the Model Server from the Web Server?
Run the web server with 4 workers (4 processes)
Run the model server as a single process (see the launch sketch below)
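A minimal launch sketch for that split. Assumptions: the two scripts above are saved as web_server.py and model_server.py, and the web tier listens on port 8000; 8010 is the model-server port hard-coded in web_serve().

import sys
import uvicorn

if __name__ == "__main__":
    tier = sys.argv[1] if len(sys.argv) > 1 else "web"
    if tier == "web":
        # Web tier: 4 worker processes for the lightweight proxy / pre-processing
        uvicorn.run("web_server:app", host="0.0.0.0", port=8000, workers=4)
    else:
        # Model tier: a single process that owns the Neuron-compiled models
        uvicorn.run("model_server:app", host="0.0.0.0", port=8010, workers=1)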
Test results