GTX1050과 RTX2080으로 모델 각각 할당

CutTheWire commented 1 week ago

1050과 2080을 각각 할당하는 방법

facebook/bart-large-mnli와 meta-llama/Llama-3.1-8B-Instruct 모델을 서로 다른 GPU에 각각 올려 처리 속도를 높이는 방법입니다. 이를 위해 PyTorch의 device 설정으로 각 모델을 원하는 GPU에 명시적으로 할당합니다.

방법 1: GPU 분리 작업

1050 GPU: facebook/bart-large-mnli는 입력 질문의 복잡도를 분석하는 데 사용되고, 이 작업은 상대적으로 가벼우므로 메모리가 적은 1050 GPU에 할당하는 것이 합리적입니다.
2080 GPU: meta-llama/Llama-3.1-8B-Instruct 모델은 더 많은 VRAM을 요구하는 응답 생성 작업을 수행하므로 2080 GPU에 할당합니다.

아래 코드는 두 모델을 각각 다른 GPU에 할당한 뒤, 복잡도 분석(1050)과 응답 생성(2080)을 GPU별로 나누어 처리하는 방법을 보여 줍니다.

import torch
import transformers
from torch.cuda.amp import autocast, GradScaler
from accelerate import Accelerator
from dotenv import load_dotenv
import os
from transformers import pipeline

# Resolve the .env file located one level above this file's directory and load it
current_dir = os.path.split(os.path.abspath(__file__))[0]
parent_dir = os.path.split(current_dir)[0]
dotenv_path = os.path.join(parent_dir, '.env')
load_dotenv(dotenv_path)

class LlamaChatModel:
    '''
    Chat wrapper that keeps two models on two separate CUDA devices.

    The Llama causal LM is placed on ``cuda:0`` and a text-classification pipeline on
    ``cuda:1``. ``generate_response`` first classifies the input and then uses the
    resulting label to pick the generation length, so within a single request the two
    devices are used one after the other.

    NOTE(review): this file labels cuda:0 as the RTX 2080 and cuda:1 as the GTX 1050;
    that mapping depends on the host's CUDA device ordering -- confirm on the target box.
    '''

    def __init__(self):
        '''
        Load the tokenizer, the Llama model and the complexity classifier.
        '''
        self.model_id = "meta-llama/Llama-3.1-8B-Instruct"
        self.cache_dir = "./fastapi/ai_model/"
        self.model_kwargs = {
            "torch_dtype": torch.float16,  # half precision for the non-quantized parts
            "trust_remote_code": True,
            "load_in_8bit": True  # 8-bit quantization
        }

        # Hugging Face access token, read from the environment
        self.hf_token = os.getenv("HUGGING_FACE_TOKEN")

        print("토크나이저 로드 중...")
        self.tokenizer = self.load_tokenizer()
        print("모델 로드 중...")

        # split_batches is omitted: it only affects prepared DataLoaders and none is
        # prepared by this class.
        self.accelerator = Accelerator(mixed_precision="fp16", device_placement=True)
        self.device_2080 = torch.device("cuda:0")  # device that hosts the Llama model
        self.model, self.optimizer = self.load_model_with_accelerator()
        self.scaler = GradScaler()  # kept for compatibility; not used by generate_response

        print("모델과 토크나이저 로드 완료!")

        # The classifier lives on the second device; the device object is passed instead
        # of repeating the index as a literal so the two cannot drift apart.
        self.device_1050 = torch.device("cuda:1")
        self.complexity_analyzer = pipeline(
            "text-classification",
            model="facebook/bart-large-mnli",
            device=self.device_1050,
        )

        # Enable gradient checkpointing on the model
        self.model.gradient_checkpointing_enable()

        self.conversation_history = []  # generated replies, each prefixed with "AI: "

    def load_tokenizer(self) -> transformers.PreTrainedTokenizerBase:
        '''
        Load the tokenizer for ``self.model_id``.

        The pad token id is set to the EOS token id.
        :return: the loaded tokenizer
        '''
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            self.model_id,
            cache_dir=self.cache_dir,
            token=self.hf_token
        )
        tokenizer.pad_token_id = tokenizer.eos_token_id
        return tokenizer

    def load_model_with_accelerator(self) -> tuple:
        '''
        Load the causal LM on ``self.device_2080`` and prepare it with the Accelerator.

        The device is chosen at load time through ``device_map``. The previous code
        called ``.to(device)`` on the loaded model, which bitsandbytes-quantized models
        do not support.
        :return: ``(model, optimizer)`` as returned by ``Accelerator.prepare``
        '''
        # Entries in self.model_kwargs win, so a caller-supplied device_map is respected.
        load_kwargs = {"device_map": {"": self.device_2080.index}, **self.model_kwargs}
        model = transformers.AutoModelForCausalLM.from_pretrained(
            self.model_id,
            cache_dir=self.cache_dir,
            token=self.hf_token,
            **load_kwargs
        )
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

        # Prepare model and optimizer with the Accelerator
        model, optimizer = self.accelerator.prepare(model, optimizer)
        return model, optimizer

    def analyze_complexity(self, input_text: str) -> str:
        '''
        Map the classifier's top label for ``input_text`` to a complexity bucket.

        The label is compared case-insensitively because the casing of the label names
        comes from the checkpoint's config, not from this code.
        :param input_text: input text
        :return: "low" for an entailment label, "high" for a contradiction label,
                 otherwise "medium"
        '''
        result = self.complexity_analyzer(input_text)
        label = str(result[0]['label']).upper()
        if "ENTAILMENT" in label:
            return "low"
        if "CONTRADICTION" in label:
            return "high"
        return "medium"

    def adjust_max_tokens(self, complexity: str) -> int:
        '''
        Choose the maximum number of new tokens for a complexity bucket.

        :param complexity: "low", "high", or anything else (treated as medium)
        :return: 100 for "low", 1000 for "high", 500 otherwise
        '''
        if complexity == "low":
            return 100
        if complexity == "high":
            return 1000
        return 500

    def generate_response(self, input_text: str) -> str:
        '''
        Generate a reply for ``input_text`` and record it in ``conversation_history``.

        :param input_text: input text
        :return: the decoded output sequence, stripped of surrounding whitespace
        '''
        # Classify first: the result decides how many tokens may be generated
        complexity = self.analyze_complexity(input_text)
        max_new_tokens = self.adjust_max_tokens(complexity)

        full_input = f"{input_text}"
        input_ids = self.tokenizer.encode(full_input, return_tensors="pt")
        # A single unpadded sequence: every position is a real token. Deriving the mask
        # from pad_token_id would also mask genuine EOS tokens, since pad == EOS here.
        attention_mask = torch.ones_like(input_ids)

        # Stop on this tokenizer's EOS id; the criteria's default id (2) is a fixed
        # number that is not tied to the loaded tokenizer.
        stop_on_eos = self.CustomStoppingCriteria(min_ending_tokens=self.tokenizer.eos_token_id)

        # Call generate directly under fp16 autocast, without gradient tracking
        with autocast(dtype=torch.float16):
            with torch.no_grad():
                output = self.model.generate(
                    input_ids.to(self.device_2080),
                    attention_mask=attention_mask.to(self.device_2080),
                    max_new_tokens=max_new_tokens,
                    do_sample=True,
                    temperature=0.64,
                    top_k=51,
                    top_p=0.63,
                    eos_token_id=self.tokenizer.eos_token_id,
                    pad_token_id=self.tokenizer.eos_token_id,
                    repetition_penalty=1.21,
                    stopping_criteria=transformers.StoppingCriteriaList([stop_on_eos])
                )

        # Release cached GPU memory
        torch.cuda.empty_cache()

        # NOTE(review): output[0] is decoded in full; for a causal LM this presumably
        # includes the prompt tokens as well as the newly generated ones -- confirm
        # whether callers expect the prompt to be echoed.
        response = self.tokenizer.decode(output[0], skip_special_tokens=True)
        self.conversation_history.append(f"AI: {response.strip()}")
        return response.strip()

    class CustomStoppingCriteria(transformers.StoppingCriteria):
        '''
        Stop once the sequence is longer than ``min_length`` and one of its last two
        tokens equals ``min_ending_tokens``.
        '''

        def __init__(self, min_length: int = 10, min_ending_tokens: int = 2):
            '''
            :param min_length: total sequence length (as seen in ``input_ids``) that must
                               be exceeded before stopping is considered
            :param min_ending_tokens: token id that triggers the stop
            '''
            self.min_length = min_length
            self.min_ending_tokens = min_ending_tokens

        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
            '''
            :param input_ids: token ids so far; only row 0 is inspected
            :param scores: unused
            :return: True when generation should stop
            '''
            if input_ids.shape[1] > self.min_length and (
                input_ids[0, -1] == self.min_ending_tokens or input_ids[0, -2] == self.min_ending_tokens):
                return True
            return False

'''
테스트용 코드
'''
if __name__ == "__main__":
    # Interactive smoke test: read lines until the user types "exit" (any casing)
    LCM = LlamaChatModel()
    keep_running = True
    while keep_running:
        print("입력: ")
        user_input = input("")
        if user_input.lower() != "exit":
            response = LCM.generate_response(user_input)
            print(f"응답: {response}")
        else:
            print("종료합니다.")
            keep_running = False

해당 방법의 장단점

장점:

하드웨어 효율성: 각 작업을 최적의 GPU에 할당하여 처리 속도를 높일 수 있습니다. 복잡도 분석 작업을 상대적으로 가벼운 1050에 할당하고, 응답 생성을 메모리 요구량이 큰 2080에서 처리합니다.
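병렬 처리 가능: 두 모델이 서로 다른 GPU에 상주하므로 VRAM을 두고 경쟁하지 않습니다. 다만 위 코드에서는 복잡도 분석 결과로 max_new_tokens를 정한 뒤 응답을 생성하므로 한 요청 안에서는 두 GPU가 순차적으로 동작하며, 처리 시간 절약은 여러 요청을 겹쳐 처리할 때 기대할 수 있습니다.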

단점:

복잡한 시스템 관리: 서로 다른 GPU에 작업을 할당하고 관리하는 것이 복잡할 수 있으며, 오류 발생 시 디버깅이 어려울 수 있습니다.
데이터 전송 오버헤드: GPU 간의 데이터 전송 오버헤드가 발생할 수 있어 실제 성능이 예상만큼 향상되지 않을 수 있습니다.

CutTheWire commented 1 week ago

24.10.10. 해당 기능으로 답변 속도가 얼마나 향상되는지 판단할 예정.

CutTheWire commented 1 week ago

또한 CustomStoppingCriteria 클래스가 답변을 임의로 끝맺지 않도록, CustomStoppingCriteria 클래스는 일단 사용을 중단함.

CutTheWire commented 1 week ago

fastapi\src\utils\AI_Llama_8B.py


'''
CustomStoppingCriteria는 일단 사용 중지.
기본적으로 모델이 제공하는 generate() 메서드는 max_new_tokens, eos_token_id, pad_token_id와 같은 매개변수로 텍스트 생성을 잘 조절할 수 있기 때문,
별도의 CustomStoppingCriteria를 사용하지 않아도 충분히 원하는 대로 동작할 가능성이 높음.
'''
# def generate_response(self, input_text: str) -> str:
#     '''
#     주어진 입력 텍스트에 대한 응답을 생성합니다.
#     :param input_text: 입력 텍스트
#     :return: 생성된 응답 텍스트
#     '''
#     # 복잡도 분석
#     complexity = self.analyze_complexity(input_text)
#     max_new_tokens = self.adjust_max_tokens(complexity)

#     full_input = f"{input_text}"
#     input_ids = self.tokenizer.encode(full_input, return_tensors="pt").to(torch.device("cpu"))
#     attention_mask = (input_ids != self.tokenizer.pad_token_id).long()

#     # Pinned Memory로 전송
#     input_ids = self.allocate_pinned_memory(input_ids)
#     attention_mask = self.allocate_pinned_memory(attention_mask)

#     # Mixed Precision과 함께 generate 함수 직접 호출
#     with autocast(dtype=torch.float16):  # Mixed Precision 사용
#         with torch.no_grad():
#             output = self.model.generate(
#                 input_ids.to(self.accelerator.device, non_blocking=True),  # 비동기로 전송
#                 attention_mask=attention_mask.to(self.accelerator.device, non_blocking=True),  # 비동기로 전송
#                 max_new_tokens=max_new_tokens,
#                 do_sample=True,
#                 temperature=0.64,
#                 top_k=51,
#                 top_p=0.63,
#                 eos_token_id=self.tokenizer.eos_token_id,
#                 pad_token_id=self.tokenizer.eos_token_id,
#                 repetition_penalty=1.21,
#                 stopping_criteria=transformers.StoppingCriteriaList([self.CustomStoppingCriteria()])
#             )

#     # GPU 메모리 비우기
#     torch.cuda.empty_cache()

#     response = self.tokenizer.decode(output[0], skip_special_tokens=True)
#     self.conversation_history.append(f"AI: {response.strip()}")
#     return response.strip()

# class CustomStoppingCriteria(transformers.StoppingCriteria):
#     def __init__(self, min_length: int = 10, min_ending_tokens: int = 2):
#         self.min_length = min_length
#         self.min_ending_tokens = min_ending_tokens

#     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
#         if input_ids.shape[1] > self.min_length and (
#             input_ids[0, -1] == self.min_ending_tokens or input_ids[0, -2] == self.min_ending_tokens):
#             return True
#         return False

CutTheWire commented 6 days ago

fastapi\src\utils\AI_Llama_8B.py

RTX 2080(8GB)에 Llama-3.1-8B-Instruct 모델을 4bit 양자화하여 할당하고, GTX 1050(2GB)에 bart-large-mnli 모델을 할당.

import os
import torch
import transformers
from torch.cuda.amp import autocast, GradScaler
from accelerate import Accelerator
from dotenv import load_dotenv
from transformers import BitsAndBytesConfig, pipeline

# Resolve the .env file located one level above this file's directory and load it
current_dir = os.path.split(os.path.abspath(__file__))[0]
parent_dir = os.path.split(current_dir)[0]
dotenv_path = os.path.join(parent_dir, '.env')
load_dotenv(dotenv_path)

class LlamaChatModel:
    '''
    Chat wrapper that keeps two models on two separate CUDA devices.

    A 4-bit quantized Llama causal LM is placed on ``cuda:0`` and a BART
    text-classification pipeline on ``cuda:1``. ``generate_response`` classifies the
    input first and uses the label to pick the generation length.

    NOTE(review): this file labels cuda:0 as the RTX 2080 and cuda:1 as the GTX 1050;
    that mapping depends on the host's CUDA device ordering -- confirm on the target box.
    '''

    def __init__(self):
        '''
        Load the tokenizer, the Llama model and the complexity classifier.
        '''
        self.cache_dir = "./fastapi/ai_model/"
        self.model_id = "meta-llama/Llama-3.1-8B-Instruct"  # generation model
        self.bart_model_id = "facebook/bart-large-mnli"  # classifier used for complexity
        self.model_kwargs = {
            "torch_dtype": torch.float16,  # half precision for the non-quantized parts
            "trust_remote_code": True,
            "quantization_config": BitsAndBytesConfig(load_in_4bit=True)  # 4-bit quantization
        }

        # Hugging Face access token, read from the environment
        self.hf_token = os.getenv("HUGGING_FACE_TOKEN")

        # Accelerator with fp16 mixed precision, plus the two target devices
        self.accelerator = Accelerator(mixed_precision="fp16")
        self.device_2080 = torch.device("cuda:0")  # hosts the Llama model
        self.device_1050 = torch.device("cuda:1")  # hosts the classifier

        print("토크나이저 로드 중...")
        self.tokenizer = self.load_tokenizer()
        print("모델 로드 중...")
        self.model, self.optimizer = self.load_model_with_accelerator()
        print("복잡도 분석 모델 로드 중...")
        self.complexity_analyzer = self.load_complexity_analyzer()
        self.scaler = GradScaler()  # kept for compatibility; not used by generate_response
        print("모델과 토크나이저 로드 완료!")

        # Enable gradient checkpointing on the model
        self.model.gradient_checkpointing_enable()

        self.conversation_history = []  # initialised empty; not written to in this version

    def load_tokenizer(self) -> transformers.PreTrainedTokenizerBase:
        '''
        Load the tokenizer for ``self.model_id`` (no ``cache_dir`` is passed here).

        The pad token id is set to the EOS token id.
        :return: the loaded tokenizer
        '''
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            self.model_id,
            token=self.hf_token
        )
        tokenizer.pad_token_id = tokenizer.eos_token_id
        return tokenizer

    def load_complexity_analyzer(self) -> transformers.Pipeline:
        '''
        Build the text-classification pipeline on ``self.device_1050``.

        :return: the classification pipeline
        '''
        return pipeline(
            "text-classification",
            model=self.bart_model_id,
            device=self.device_1050.index,
        )

    def load_model_with_accelerator(self) -> tuple:
        '''
        Load the quantized causal LM on ``self.device_2080`` and prepare it with the
        Accelerator.

        The device is chosen at load time through ``device_map``. The previous code
        moved the model with ``model.to(...)`` after loading, which bitsandbytes-quantized
        models reject or restrict depending on the installed library versions.
        :return: ``(model, optimizer)`` as returned by ``Accelerator.prepare``
        '''
        # Entries in self.model_kwargs win, so a caller-supplied device_map is respected.
        load_kwargs = {"device_map": {"": self.device_2080.index}, **self.model_kwargs}
        model = transformers.AutoModelForCausalLM.from_pretrained(
            self.model_id,
            cache_dir=self.cache_dir,
            token=self.hf_token,
            **load_kwargs
        )
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

        # Prepare model and optimizer with the Accelerator
        model, optimizer = self.accelerator.prepare(model, optimizer)
        return model, optimizer

    def analyze_complexity(self, input_text: str) -> str:
        '''
        Map the classifier's top label for ``input_text`` to a complexity bucket.

        The label is compared case-insensitively because the casing of the label names
        comes from the checkpoint's config, not from this code.
        :param input_text: input text
        :return: "low" for an entailment label, "high" for a contradiction label,
                 otherwise "medium"
        '''
        result = self.complexity_analyzer(input_text)
        label = str(result[0]['label']).upper()
        if "ENTAILMENT" in label:
            return "low"
        if "CONTRADICTION" in label:
            return "high"
        return "medium"

    def adjust_max_tokens(self, complexity: str) -> int:
        '''
        Choose the maximum number of new tokens for a complexity bucket.

        :param complexity: "low", "high", or anything else (treated as medium)
        :return: 100 for "low", 500 for "high", 250 otherwise
        '''
        if complexity == "low":
            return 100
        if complexity == "high":
            return 500
        return 250

    def generate_response(self, input_text: str) -> str:
        '''
        Generate a reply for ``input_text``.

        :param input_text: input text
        :return: the decoded output sequence
        '''
        # Classify first: the result decides how many tokens may be generated
        complexity = self.analyze_complexity(input_text)
        max_new_tokens = self.adjust_max_tokens(complexity)

        full_input = f"{input_text}"
        input_ids = self.tokenizer.encode(full_input, return_tensors="pt")
        # A single unpadded sequence: every position is a real token. Deriving the mask
        # from pad_token_id would also mask genuine EOS tokens, since pad == EOS here.
        attention_mask = torch.ones_like(input_ids)

        # Call generate directly under fp16 autocast, without gradient tracking
        with autocast(dtype=torch.float16):
            with torch.no_grad():
                output = self.model.generate(
                    input_ids.to(self.device_2080),
                    attention_mask=attention_mask.to(self.device_2080),
                    max_new_tokens=max_new_tokens,
                    do_sample=True,
                    temperature=0.64,
                    top_k=51,
                    top_p=0.63,
                    eos_token_id=self.tokenizer.eos_token_id,
                    pad_token_id=self.tokenizer.eos_token_id,
                    repetition_penalty=1.21,
                )
        # NOTE(review): output[0] is decoded in full; for a causal LM this presumably
        # includes the prompt tokens as well as the newly generated ones -- confirm
        # whether callers expect the prompt to be echoed.
        response = self.tokenizer.decode(output[0], skip_special_tokens=True)
        return response

if __name__ == "__main__":
    # Interactive smoke test: read lines until the user types "exit" (any casing)
    LCM = LlamaChatModel()
    keep_running = True
    while keep_running:
        print("입력: ")
        user_input = input("")
        if user_input.lower() != "exit":
            response = LCM.generate_response(user_input)
            print(f"응답: {response}")
        else:
            print("종료합니다.")
            keep_running = False

TreeNut-KR / ChatBot-AI