96ers / summerizIT

Create APIs for the models #29

Closed CoderHung closed 3 months ago

CoderHung commented 4 months ago

We need to create the translation and summarization APIs.

CoderHung commented 4 months ago

For now there will be 2 translation APIs (mTet, chatGPT) and 2 summarization APIs (bart, chatGPT).
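The request models themselves aren't posted in this thread; a minimal sketch of what they could look like, with the fields inferred from how the endpoints below use them (the exact definitions are assumptions):

from pydantic import BaseModel

# hypothetical request models inferred from the endpoint code below
class TranslationRequest(BaseModel):
    text: str  # text to translate
    isEnglish: bool  # True: English -> Vietnamese, False: Vietnamese -> English

class SummarizationRequest(BaseModel):
    text: str  # text to summarize
    length: int  # target summary length

class TokenRequest(BaseModel):
    text: str  # text to tokenize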

Translation APIs

mTet

@app.get("/mTetTranslate")
async def translate(Tr: TranslationRequest):
    """mTet translation api

    Args:
        Tr (TranslationRequest): translation request

    Returns:
        json response: {"translation" : str}
    """
    # call the mTet translate method
    translation = mTet.translate(Tr.text, Tr.isEnglish)
    return {"translation": translation}

chatGPT

@app.get("/ChatGptTransalate")
async def summarize(Tr: TranslationRequest):
    """chatGpt translate api

    Args:
        Tr (TranslationRequest):

    Raises:
        HTTPException: status_code = 400 if tokens exceed limit

    Returns:
        json response body : {"Translation" : str}
    """
    # encode the input to get tokens
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(Tr.text)
    text_length = len(tokens)

    Language = (
        "english to vietnamese" if Tr.isEnglish else "vietnamese to english"
    )
    # check limit
    if text_length > 16000:
        raise HTTPException(
            status_code=400, detail="Text exceeds maximum token limit"
        )
    # make the chatGpt call (read the key from the environment, never hardcode it)
    client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "Please answer as if you are a natural language processing model made for "
                + Language
                + " translation",
            },
            {
                "role": "user",
                "content": "Please translate the following text : "
                + Tr.text,
            },
        ],
    )

    return {"Translation": completion.choices[0].message}

Summarization APIs

Bart

@app.get("/BartSummarize")
async def summarize(Sr: SummarizationRequest):
    """bart model summarize api
    Args:
        Tr (SummarizationRequest): {"text": str, "length": int}

    Raises:
        HTTPException: status_code = 400 if tokens exceed limit

    Returns:
        json response body : {"Summarization" : str}
    """
    # translate the input to English (bart-large-cnn is an English model)
    english_text = mTet.translate(Sr.text, False)
    # get input tokens
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(english_text)
    # check the token limit and dispatch to the matching summarize call
    if len(tokens) > 2048:
        raise HTTPException(
            status_code=400, detail="Text exceeds maximum token limit"
        )
    elif len(tokens) > 1024:
        # too long for one pass: sliding-window chunking, then a final summary
        return_value = bart.summarize_large_text(
            english_text, 2000, 800, 400, 300, Sr.length + 100, Sr.length
        )
    else:
        # fits in a single pass
        return_value = bart.summarize(english_text, 400, 200)[0]["summary_text"]

    return {"Summarization": return_value}

chatGPT

@app.get("/ChatGptTransalate")
async def summarize(Tr: TranslationRequest):
    """chatGpt translate api

    Args:
        Tr (TranslationRequest):

    Raises:
        HTTPException: status_code = 400 if tokens exceed limit

    Returns:
        json response body : {"Translation" : str}
    """
    # encode the input to get tokens
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(Tr.text)
    text_length = len(tokens)

    Language = (
        "english to vietnamese" if Tr.isEnglish else "vietnamese to english"
    )
    # check limit
    if text_length > 16000:
        raise HTTPException(
            status_code=400, detail="Text exceeds maximum token limit"
        )
    # make chatGpt call
    client = openai.OpenAI(
        api_key="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"
    )
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "Please answer as if you are a natural language processing model made for "
                + Language
                + " translation",
            },
            {
                "role": "user",
                "content": "Please translate the following text : "
                + Tr.text,
            },
        ],
    )

    return {"Translation": completion.choices[0].message}
CoderHung commented 4 months ago

mTet.py

import nltk
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
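# note: nltk.sent_tokenize below needs the punkt tokenizer data,
# which may require a one-time nltk.download("punkt")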

def translate(text, isEnglish):
    """Translate between English and Vietnamese with VietAI/envit5-translation.

    Args:
        text (str): input text
        isEnglish (bool): True if the input is English (translate to Vietnamese),
            False if it is Vietnamese (translate to English)

    Returns:
        str : translation
    """
    # load model
    model_name = "VietAI/envit5-translation"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # envit5 expects the source language as a prefix on the input
    Language = "en: " if isEnglish else "vi: "

    # split the text into segments so each stays under the token threshold
    sentences = nltk.sent_tokenize(text)
    segments = []
    segment_token_count = 0
    segment = []
    threshold_token_count = 300
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        segment_token_count += len(tokens)
        if segment_token_count >= threshold_token_count:
            segments.append(" ".join(segment))
            segment = []
            segment_token_count = len(tokens)
        segment.append(sentence)
    if segment:
        segments.append(" ".join(segment))

    # use the GPU when available, otherwise fall back to the CPU
    if torch.cuda.is_available():
        device = "cuda"
        model.cuda()
    else:
        print("CUDA is not available. Only CPU will be used.")
        device = "cpu"

    # translate each segment and join the results
    translated_segments = []
    for segment in segments:
        outputs = model.generate(
            tokenizer(
                Language + segment, return_tensors="pt", padding=True
            ).input_ids.to(device),
            max_length=512,
        )
        # [4:] strips the leading "en: " / "vi: " prefix from the decoded output
        translated_segments.append(
            tokenizer.batch_decode(outputs, skip_special_tokens=True)[0][4:]
        )
    return " ".join(translated_segments)
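Example usage (hypothetical strings; the first call downloads the VietAI/envit5-translation weights):

print(translate("Hello, how are you?", True))  # English -> Vietnamese
print(translate("Xin chào, bạn khỏe không?", False))  # Vietnamese -> English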

bart.py

import tiktoken
from langchain.text_splitter import NLTKTextSplitter
from transformers import pipeline

# load model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize(text, max_value, min_value):
    """Summarize method called when input length <= 1024 tokens

    Args:
        text (str): input text
        max_value (int): max output token length
        min_value (int): min output token length

    Returns:
        List[Dict[str, str]]: list of a dict containing the summary
    """
    return summarizer(
        text, max_length=max_value, min_length=min_value, do_sample=False
    )

def split_chunk(text, chunk_size, overlap_size):
    """sliding window chunking method using .split()

    Args:
        text (str): input text
        chunk_size (int): length of chunks in words
        overlap_size (int): the overlap size of chunks in words

    Returns:
        List[str]: list of chunks of type str
    """
    chunks = []
    words = text.split()
    start = 0
    while start < len(words):
        end = min(start + chunk_size, len(words))
        chunks.append(" ".join(words[start:end]))
        # slide the window forward, keeping overlap_size words of overlap
        start += chunk_size - overlap_size
    return chunks
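# example: 10 words with chunk_size=4, overlap_size=1 -> the window steps 3 words:
# split_chunk("w0 w1 w2 w3 w4 w5 w6 w7 w8 w9", 4, 1)
# -> ['w0 w1 w2 w3', 'w3 w4 w5 w6', 'w6 w7 w8 w9', 'w9']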

def nltk_chunk(text, chunk_length, overlap_size):
    """sliding window chunking method using NLTKTextSplitter

    Args:
        text (str): input text
        chunk_length (int): length of chunks
        overlap_size (int): the overlap size of chunks

    Returns:
        List[str]: list of chunks of type str
    """
    # e.g. chunk_size=2000, chunk_overlap=1000
    nltk_splitter = NLTKTextSplitter(
        separator=" ", chunk_size=chunk_length, chunk_overlap=overlap_size
    )
    splits = nltk_splitter.split_text(text)
    return splits

def summarize_chunks(chunks, chunk_size, max_value, min_value):
    """Summarize each chunk, returns combined chunk summaries

    Args:
        chunks (List[str]): list of chunk strings
        chunk_size (int): size of chunks in length
        max_value (int): max output token length
        min_value (int): min output token length

    Returns:
        str: the combined chunk summaries
    """
    summarized_chunks = []
    for chunk in chunks:
        # skip trailing chunks that fall below the chunk size
        if len(chunk) < chunk_size:
            continue
        summarized_chunk = summarizer(
            chunk, max_length=max_value, min_length=min_value, do_sample=False
        )
        summarized_chunks.append(summarized_chunk[0]["summary_text"])
    return " ".join(summarized_chunks)

def summarize_large_text(
    text,
    chunk_size,
    overlap_size,
    max_chunk_value,
    min_chunk_value,
    max_value,
    min_value,
):
    """summarize method when 1024 tokens < input token length < 2048 tokens

    Args:
        text (str): text input
        chunk_size (int): size of the chunk in terms of length
        overlap_size (int): the chunks overlap size in terms of length
        max_chunk_value (int): max chunk summary output token length
        min_chunk_value (int): min chunk summary output token length
        max_value (int): max output token length
        min_value (int): min output token length

    Returns:
        str : the summary
    """
    # chunks = split_chunk(text, chunk_size, overlap_size)
    chunks = nltk_chunk(text, chunk_size, overlap_size)

    combined_summary = summarize_chunks(
        chunks, chunk_size, max_chunk_value, min_chunk_value
    )

    # run a final summary on combined summarize chunks text
    final_summary = summarizer(
        combined_summary,
        max_length=max_value,
        min_length=min_value,
        do_sample=False,
    )

    return final_summary[0]["summary_text"]
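Example usage mirroring the /BartSummarize call for a 1024-2048 token input with Sr.length = 150 (the input text here is a placeholder):

long_english_text = "<a 1024-2048 token English text>"
print(summarize_large_text(long_english_text, 2000, 800, 400, 300, 250, 150))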
CoderHung commented 4 months ago

Add an API that returns the tokens and token count of an input (lets users check their input, similar to https://platform.openai.com/tokenizer)

@app.get("/TokenCheck")
async def summarize(Tr: TokenRequest):
    """returns the token and token count of text

    Args:
        Tr (TokenRequest): {"text": str}

    Returns:
        json response body: {"token" : List[str] , "length" : int}
    """
    # get input tokens
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(Tr.text)
    return {
        "tokens": [
            encoding.decode_single_token_bytes(token) for token in tokens
        ],
        "length": len(tokens),
    }
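Example client call, same host assumption as above (FastAPI decodes the token bytes to strings in the JSON body):

import requests

resp = requests.post(
    "http://127.0.0.1:8000/TokenCheck",
    json={"text": "Hello world"},
)
print(resp.json())  # roughly {"tokens": ["Hello", " world"], "length": 2}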
CoderHung commented 4 months ago

Sequence of a bart summarization API call:

1. The bart API is called.
2. Determine the token length of the input.
   2.1. If the input token count is greater than 2048, return an error.
   2.2. If the input token count is below 2048 but above 1024 tokens, use the sliding window chunking method to fit the input into the model (takes more time).
   2.3. If the input token count is below 1024 tokens, call the summarization model as usual.
3. Return the summarized text.

CoderHung commented 4 months ago

The parameters that need to be reviewed are (collected in the sketch below):

- threshold_token_count in mTet.py (the number of tokens per mTet segment)
- chunk_size (the length of the bart chunks)
- overlap_size (the size of the overlap between chunks)
- max_chunk_value and min_chunk_value (the params for summarizing each chunk)
- max_value and min_value (the params used to produce the final summary)
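A hypothetical single place for those tunables; the names are taken from the code above, which currently hardcodes the values at the call sites:

# config.py (hypothetical) - tunables currently hardcoded at the call sites
THRESHOLD_TOKEN_COUNT = 300  # tokens per mTet segment (mTet.py)
CHUNK_SIZE = 2000            # bart chunk length
OVERLAP_SIZE = 800           # overlap between bart chunks
MAX_CHUNK_VALUE = 400        # per-chunk summary max length
MIN_CHUNK_VALUE = 300        # per-chunk summary min length
# final summary bounds come from the request:
#   max_value = Sr.length + 100, min_value = Sr.length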