littlemex / inference-samples


Separating the Model Server and the Web Server #6

Closed littlemex closed 2 years ago

littlemex commented 2 years ago
from fastapi import FastAPI, responses
import requests
import time

# FastAPI server
app = FastAPI()

# Server healthcheck
@app.get("/")
async def read_root():
    return {"Status": "Healthy"}

# Model inference API endpoint: forwards the request to the model server
@app.get("/model/{model_id}")
async def web_serve(model_id: str, sequence_0: str, sequence_1: str):

    MODEL_API_URL = 'http://127.0.0.1:8010/model/{}'.format(model_id)

    feed_data = {
        "sequence_0" : sequence_0,
        "sequence_1" : sequence_1
    }

    # Dummy wait standing in for pre-processing or post-processing
    time.sleep(1)

    # Forward the request to the model server. Note that requests is synchronous,
    # so this call blocks the event loop; acceptable for a sample, but an async
    # HTTP client would scale better under concurrent load.
    session = requests.Session()
    model_response = session.get(MODEL_API_URL, params=feed_data)

    # Propagate the model server's status code and answer to the client
    status = model_response.status_code
    answer_text = model_response.json()['detail']

    return responses.JSONResponse(status_code=status, content={"detail": answer_text})
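
For reference, here is a minimal client call against the web server above, assuming it is served with uvicorn on local port 8000 and that a model named bert_neuron00 has been loaded by the model server (the port and model name are assumptions for illustration):

import requests

resp = requests.get(
    "http://127.0.0.1:8000/model/bert_neuron00",
    params={
        "sequence_0": "The company HuggingFace is based in New York City",
        "sequence_1": "HuggingFace's headquarters are situated in Manhattan",
    },
)
print(resp.status_code, resp.json()["detail"])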
littlemex commented 2 years ago
from fastapi import FastAPI, responses
import torch, os

# torch_neuron must be imported so that Neuron-compiled TorchScript models can be loaded
import torch_neuron
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Build tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=False)

# Setup some example inputs
sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

max_length=128
paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")

# Run the original PyTorch model on the compilation example
paraphrase_classification_logits = model(**paraphrase)[0]

# Convert example inputs to a format that is compatible with TorchScript tracing
example_inputs_paraphrase = paraphrase['input_ids'], paraphrase['attention_mask'], paraphrase['token_type_ids']

num_models=4
models={}
model_dir = './'

# FastAPI server
app = FastAPI()

# Server healthcheck
@app.get("/")
async def read_root():
    return {"Status": "Healthy"}

# Model inference API endpoint
@app.get("/model/{model_id}")
async def infer(model_id: str, sequence_0: str, sequence_1: str):

    status = 200
    if model_id in models:

        paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")
        example_inputs_paraphrase = paraphrase['input_ids'], paraphrase['attention_mask'], paraphrase['token_type_ids']

        # model_selection
        model = models[model_id]

        # Verify the TorchScript works on example inputs
        paraphrase_classification_logits_neuron = model(*example_inputs_paraphrase)
        classes = ['not paraphrase', 'paraphrase']
        paraphrase_prediction = paraphrase_classification_logits_neuron[0][0].argmax().item()
        answer_text = 'BERT model "{}" says that "{}" and "{}" are {}'.format(model_id, sequence_0, sequence_1, classes[paraphrase_prediction])

    else:
        status = 404
        answer_text = f"Model {model_id} does not exist. Try a model name from bert_neuron00 up to bert_neuron0{num_models-1}"

    return responses.JSONResponse(status_code=status, content={"detail": answer_text})

# Load the Neuron-compiled models and warm each one up with a single inference
for i in range(num_models):
    model_id = 'bert_neuron0' + str(i)
    print(f"   {model_id} ...")
    models[model_id] = torch.jit.load(os.path.join(model_dir, "model", model_id + ".pt"))

    # infer() is a coroutine, so calling it unawaited would do nothing; warm up the model directly instead
    warmup = tokenizer.encode_plus(sequence_0, sequence_1, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")
    models[model_id](warmup['input_ids'], warmup['attention_mask'], warmup['token_type_ids'])
    print("    ... warmup completed")