多gpu、fastllm和量化不能同时使用吗

按照这个代码来看，多gpu、fastllm和量化不能同时使用吗
def get_model(args):
    if not args.cpu:
        if torch.cuda.is_available():
            device = f"cuda:{args.gpu}"
        elif torch.backends.mps.is_built():
            device = "mps"
        else:
            device = "cpu"
    else:
        device = "cpu"

    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)

    if args.n_gpus > 1 and enable_multiple_gpus:
        # 如需实现多显卡模型加载,传入"n_gpus"为需求的显卡数量 / To enable Multiple GPUs model loading, please adjust "n_gpus" to the desired number of graphics cards.
        print(f"Runing on {args.n_gpus} GPUs.")
        model = load_model_on_gpus(args.model_path, num_gpus=args.n_gpus)
        model = model.eval()
    elif enable_chatglm_cpp and args.chatglm_cpp:
        print("Using chatglm-cpp to improve performance")
        dtype = "f16"
        if args.quantize in [4, 5, 8]:
            dtype = f"q{args.quantize}_0"
        model = chatglm_cpp.Pipeline(args.model_path, dtype=dtype)
    else:
        model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
        model = model.eval()

        if enable_fastllm and args.fastllm:
            print("fastllm enabled.")
            model = model.half()
            llm.set_device_map(device)
            if args.quantize in [4, 8]:
                model = llm.from_hf(model, dtype=f"int{args.quantize}")
            else:
                model = llm.from_hf(model, dtype="float16")
        else:
            print("chatglm-cpp and fastllm not installed, using transformers.")
            if args.quantize in [4, 8]:
                print(f"Model is quantized to INT{args.quantize} format.")
                model = model.half().quantize(args.quantize)
            model = model.to(device)

    return tokenizer, model
THUDM / CodeGeeX2

多gpu、fastllm和量化不能同时使用吗 #78