# Assumed module-level setup (not part of the original snippet): the function below
# relies on these imports and enable_* flags, which the repo sets up via optional
# imports roughly along these lines.
import torch
from transformers import AutoModel, AutoTokenizer

try:
    import chatglm_cpp  # optional chatglm.cpp backend
    enable_chatglm_cpp = True
except ImportError:
    enable_chatglm_cpp = False
try:
    from fastllm_pytools import llm  # optional fastllm backend
    enable_fastllm = True
except ImportError:
    enable_fastllm = False
try:
    from utils import load_model_on_gpus  # repo-local multi-GPU helper
    enable_multiple_gpus = True
except ImportError:
    enable_multiple_gpus = False


def get_model(args):
    # Pick the device: prefer CUDA, then Apple MPS, otherwise CPU (or force CPU via args.cpu).
    if not args.cpu:
        if torch.cuda.is_available():
            device = f"cuda:{args.gpu}"
        elif torch.backends.mps.is_available():
            device = "mps"
        else:
            device = "cpu"
    else:
        device = "cpu"
    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
    if args.n_gpus > 1 and enable_multiple_gpus:
        # To enable multi-GPU model loading, pass the desired number of GPUs via "n_gpus".
        print(f"Running on {args.n_gpus} GPUs.")
        model = load_model_on_gpus(args.model_path, num_gpus=args.n_gpus)
        model = model.eval()
    elif enable_chatglm_cpp and args.chatglm_cpp:
        print("Using chatglm-cpp to improve performance")
        # Map --quantize 4/5/8 to the corresponding chatglm.cpp dtype; default to f16.
        dtype = "f16"
        if args.quantize in [4, 5, 8]:
            dtype = f"q{args.quantize}_0"
        model = chatglm_cpp.Pipeline(args.model_path, dtype=dtype)
    else:
        model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
        model = model.eval()
        if enable_fastllm and args.fastllm:
            print("fastllm enabled.")
            # Hand the HF model over to fastllm, with optional int4/int8 quantization.
            model = model.half()
            llm.set_device_map(device)
            if args.quantize in [4, 8]:
                model = llm.from_hf(model, dtype=f"int{args.quantize}")
            else:
                model = llm.from_hf(model, dtype="float16")
        else:
print("chatglm-cpp and fastllm not installed, using transformers.")
            if args.quantize in [4, 8]:
                print(f"Model is quantized to INT{args.quantize} format.")
                model = model.half().quantize(args.quantize)
            model = model.to(device)
    return tokenizer, model
Judging from this code, does it mean that multi-GPU, fastllm, and quantization cannot be used at the same time?
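To make the combinations concrete, here is a tiny sketch that only mirrors the branch order of get_model() above; which_loader is my own hypothetical helper, not something from the repo:

from argparse import Namespace

def which_loader(args, enable_multiple_gpus=True, enable_chatglm_cpp=True, enable_fastllm=True):
    # Mirrors the branch order above: only the first matching branch runs.
    if args.n_gpus > 1 and enable_multiple_gpus:
        return "load_model_on_gpus (multi-GPU; args.quantize and args.fastllm are never read)"
    if enable_chatglm_cpp and args.chatglm_cpp:
        return "chatglm_cpp.Pipeline (quantize only selects q4_0/q5_0/q8_0)"
    if enable_fastllm and args.fastllm:
        return "fastllm llm.from_hf (quantize selects int4/int8)"
    return "transformers AutoModel (optional .quantize(4/8))"

# e.g. asking for multi-GPU together with fastllm and quantize=4:
print(which_loader(Namespace(n_gpus=2, chatglm_cpp=False, fastllm=True, quantize=4)))
# -> the multi-GPU branch is selected, so fastllm and quantize are not applied there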