Actually, I had this change locally and forgot to open a PR in time. Could you fold it in according to the following diff? Or I can create a separate PR if that's easier.
diff --git a/examples/inference-deployments/mpt/mpt_7b_ft_handler.py b/examples/inference-deployments/mpt/mpt_7b_ft_handler.py
index 02eb745..65c61dc 100644
--- a/examples/inference-deployments/mpt/mpt_7b_ft_handler.py
+++ b/examples/inference-deployments/mpt/mpt_7b_ft_handler.py
@@ -11,7 +11,7 @@ import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer
-from FasterTransformer.examples.pytorch.gpt.utils.parallel_gpt import ParallelGPT # isort: skip # type: ignore
+from examples.pytorch.gpt.utils.parallel_gpt import ParallelGPT # isort: skip # type: ignore
from scripts.inference.convert_hf_mpt_to_ft import convert_mpt_to_ft # isort: skip # type: ignore
@@ -51,7 +51,8 @@ class MPTFTModelHandler:
ft_lib_path: str,
inference_data_type: str = 'bf16',
int8_mode: int = 0,
- gpus: int = 1):
+ gpus: int = 1,
+ force: bool = False):
"""Fastertransformer model handler for MPT foundation series.
Args:
@@ -61,6 +62,7 @@ class MPTFTModelHandler:
int8_mode (int): The level of quantization to perform. 0: No quantization. All computation in data_type,
1: Quantize weights to int8, all compute occurs in fp16/bf16. Not supported when data_type is fp32
gpus (int): Number of gpus to use for inference (Default: 1)
+ force (bool): Force conversion to FT even if some features may not work as expected in FT (Default: False)
"""
self.device = torch.cuda.current_device()
self.model_name = model_name
@@ -73,7 +75,7 @@ class MPTFTModelHandler:
# Datatype of weights in the HF checkpoint
weight_data_type = 'fp32'
convert_mpt_to_ft(self.model_name, LOCAL_CHECKPOINT_PATH, gpus,
- weight_data_type, False)
+ weight_data_type, force)
if not os.path.isfile(ckpt_config_path):
raise RuntimeError('Failed to create FT checkpoint')
else:
@@ -285,12 +287,18 @@ if __name__ == '__main__':
type=int,
default=1,
help='The number of gpus to use for inference.')
+ parser.add_argument(
+ '--force',
+ action='store_true',
+ help=
+ 'Force conversion to FT even if some features may not work as expected in FT'
+ )
args = parser.parse_args()
model_handle = MPTFTModelHandler(args.name_or_dir, args.ft_lib_path,
args.inference_data_type, args.int8_mode,
- args.gpus)
+ args.gpus, args.force)
inputs = {'input_strings': ['Who is the president of the USA?']}
out = model_handle.predict([inputs])
print(out[0])
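For reference, once the diff is applied the new flag can be exercised either with --force on the command line or by passing force=True to the constructor. Here is a minimal sketch of the constructor path; the model name and ft_lib_path below are placeholders for illustration, not values from the repo:

# Hedged usage sketch: positional args follow the call in __main__ above.
handler = MPTFTModelHandler(
    'mosaicml/mpt-7b',                 # model name or local HF checkpoint dir (placeholder)
    '/path/to/libth_transformer.so',   # ft_lib_path (placeholder)
    'bf16',                            # inference_data_type
    0,                                 # int8_mode
    1,                                 # gpus
    True)                              # force: convert to FT even if some features may not work as expected
out = handler.predict([{'input_strings': ['Who is the president of the USA?']}])
print(out[0])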