mosaicml / examples

Fast and flexible reference benchmarks

Add kwarg to force FT conversion #355

Closed: margaretqian closed this 1 year ago

dskhudia commented 1 year ago

Actually, I had this change locally and forgot to make a PR in time. Could you modify it according to the following diff? Or I can create another PR.

diff --git a/examples/inference-deployments/mpt/mpt_7b_ft_handler.py b/examples/inference-deployments/mpt/mpt_7b_ft_handler.py
index 02eb745..65c61dc 100644
--- a/examples/inference-deployments/mpt/mpt_7b_ft_handler.py
+++ b/examples/inference-deployments/mpt/mpt_7b_ft_handler.py
@@ -11,7 +11,7 @@ import torch
 from torch.nn.utils.rnn import pad_sequence
 from transformers import AutoTokenizer

-from FasterTransformer.examples.pytorch.gpt.utils.parallel_gpt import ParallelGPT  # isort: skip # type: ignore
+from examples.pytorch.gpt.utils.parallel_gpt import ParallelGPT  # isort: skip # type: ignore

 from scripts.inference.convert_hf_mpt_to_ft import convert_mpt_to_ft  # isort: skip # type: ignore

@@ -51,7 +51,8 @@ class MPTFTModelHandler:
                  ft_lib_path: str,
                  inference_data_type: str = 'bf16',
                  int8_mode: int = 0,
-                 gpus: int = 1):
+                 gpus: int = 1,
+                 force: bool = False):
         """Fastertransformer model handler for MPT foundation series.
         Args:
@@ -61,6 +62,7 @@ class MPTFTModelHandler:
             int8_mode (int): The level of quantization to perform. 0: No quantization. All computation in data_type,
                 1: Quantize weights to int8, all compute occurs in fp16/bf16. Not supported when data_type is fp32
             gpus (int): Number of gpus to use for inference (Default: 1)
+            force (bool): Force conversion to FT even if some features may not work as expected in FT (Default: False)
         """
         self.device = torch.cuda.current_device()
         self.model_name = model_name
@@ -73,7 +75,7 @@ class MPTFTModelHandler:
             # Datatype of weights in the HF checkpoint
             weight_data_type = 'fp32'
             convert_mpt_to_ft(self.model_name, LOCAL_CHECKPOINT_PATH, gpus,
-                              weight_data_type, False)
+                              weight_data_type, force)
             if not os.path.isfile(ckpt_config_path):
                 raise RuntimeError('Failed to create FT checkpoint')
         else:
@@ -285,12 +287,18 @@ if __name__ == '__main__':
                         type=int,
                         default=1,
                         help='The number of gpus to use for inference.')
+    parser.add_argument(
+        '--force',
+        action='store_true',
+        help=
+        'Force conversion to FT even if some features may not work as expected in FT'
+    )

     args = parser.parse_args()

     model_handle = MPTFTModelHandler(args.name_or_dir, args.ft_lib_path,
                                      args.inference_data_type, args.int8_mode,
-                                     args.gpus)
+                                     args.gpus, args.force)
     inputs = {'input_strings': ['Who is the president of the USA?']}
     out = model_handle.predict([inputs])
     print(out[0])
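
For reference, a minimal sketch of how the handler would be constructed once this diff is applied. This is not part of the diff itself; the import path, checkpoint name, and library path below are placeholders:

from mpt_7b_ft_handler import MPTFTModelHandler  # placeholder import path

# Hypothetical paths; substitute your own HF checkpoint and FT library.
handler = MPTFTModelHandler(
    'mosaicml/mpt-7b',                # model_name (placeholder)
    '/path/to/libth_transformer.so',  # ft_lib_path (placeholder)
    inference_data_type='bf16',
    int8_mode=0,
    gpus=1,
    force=True,  # new kwarg: forwarded to convert_mpt_to_ft to force conversion
)
out = handler.predict([{'input_strings': ['Who is the president of the USA?']}])
print(out[0])

From the command line, the same behavior should be available by passing the new --force flag (alongside the existing arguments) when running mpt_7b_ft_handler.py.
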
margaretqian commented 1 year ago

@dskhudia for sure, happy to incorporate your changes!

margaretqian commented 1 year ago

Closing in favor of https://github.com/mosaicml/examples/pull/359