endomorphosis opened this issue 3 months ago
Also, the JSON files in the example are no longer supported by the Intel Neural Compressor; as of version 3.0 it reports that this key-value pair is invalid:

```
"method": "HOOKS",
```
```
root@8fb421541c5d:~/optimum-habana/examples/text-generation# QUANT_CONFIG=./quantization_config/maxabs_quant.json python run_generation.py \
    --model_name_or_path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 \
    --use_hpu_graphs \
    --use_kv_cache \
    --limit_hpu_graphs \
    --bucket_size 128 \
    --max_new_tokens 2048 \
    --batch_size 16 \
    --bf16
/usr/local/lib/python3.10/dist-packages/transformers/deepspeed.py:24: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations
  warnings.warn(
08/11/2024 06:00:46 - INFO - __main__ - Single-device run.
Traceback (most recent call last):
  File "/root/optimum-habana/examples/text-generation/run_generation.py", line 692, in <module>
    main()
  File "/root/optimum-habana/examples/text-generation/run_generation.py", line 337, in main
    model, assistant_model, tokenizer, generation_config = initialize_model(args, logger)
  File "/root/optimum-habana/examples/text-generation/utils.py", line 633, in initialize_model
    setup_model(args, model_dtype, model_kwargs, logger)
  File "/root/optimum-habana/examples/text-generation/utils.py", line 261, in setup_model
    model = AutoModelForCausalLM.from_pretrained(
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
    return model_class.from_pretrained(
  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 3376, in from_pretrained
    hf_quantizer.validate_environment(
  File "/usr/local/lib/python3.10/dist-packages/transformers/quantizers/quantizer_fbgemm_fp8.py", line 68, in validate_environment
    raise RuntimeError("Using FP8 quantized models with fbgemm kernels requires a GPU")
RuntimeError: Using FP8 quantized models with fbgemm kernels requires a GPU
```
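The RuntimeError above comes from the quantization metadata stored in the FP8 checkpoint itself: transformers selects the fbgemm FP8 quantizer from the model's config.json, and that quantizer's environment check requires a CUDA GPU, so it fails before any HPU code runs. A quick way to inspect that metadata (a sketch, assuming access to the gated repo):

```
# Sketch: print the quantization metadata baked into the checkpoint's config.json.
# Assumes the gated meta-llama repo is accessible (Hugging Face token configured).
from transformers import AutoConfig

config = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3.1-405B-Instruct-FP8")
print(config.quantization_config)  # the quant_method here is what selects the fbgemm FP8 quantizer
```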
```
QUANT_CONFIG=./quantization_config/maxabs_quant.json python run_generation.py \
    --model_name_or_path meta-llama/Meta-Llama-3.1-8B \
    --use_hpu_graphs \
    --use_kv_cache \
    --limit_hpu_graphs \
    --bucket_size 128 \
    --max_new_tokens 2048 \
    --batch_size 16 \
    --bf16
tokenizer_config.json: 100%|██████████| 50.5k/50.5k [00:00<00:00, 891kB/s]
tokenizer.json: 100%|██████████| 9.09M/9.09M [00:00<00:00, 19.3MB/s]
special_tokens_map.json: 100%|██████████| 73.0/73.0 [00:00<00:00, 639kB/s]
08/11/2024 06:03:56 - INFO - __main__ - Args: Namespace(device='hpu', model_name_or_path='meta-llama/Meta-Llama-3.1-8B', bf16=True, max_new_tokens=2048, max_input_tokens=0, batch_size=16, warmup=3, n_iterations=5, local_rank=0, use_kv_cache=True, use_hpu_graphs=True, dataset_name=None, column_name=None, do_sample=False, num_beams=1, top_k=None, penalty_alpha=None, trim_logits=False, seed=27, profiling_warmup_steps=0, profiling_steps=0, profiling_record_shapes=False, prompt=None, bad_words=None, force_words=None, assistant_model=None, peft_model=None, num_return_sequences=1, token=None, model_revision='main', attn_softmax_bf16=False, output_dir=None, bucket_size=128, bucket_internal=False, dataset_max_samples=-1, limit_hpu_graphs=True, reuse_cache=False, verbose_workers=False, simulate_dyn_prompt=None, reduce_recompile=False, use_flash_attention=False, flash_attention_recompute=False, flash_attention_causal_mask=False, flash_attention_fast_softmax=False, book_source=False, torch_compile=False, ignore_eos=True, temperature=1.0, top_p=1.0, const_serialization_path=None, disk_offload=False, trust_remote_code=False, load_quantized_model=False, parallel_strategy='none', quant_config='', world_size=0, global_rank=0)
08/11/2024 06:03:56 - INFO - __main__ - device: hpu, n_hpu: 0, bf16: True
08/11/2024 06:03:56 - INFO - __main__ - Model initialization took 23.027s
08/11/2024 06:03:56 - INFO - __main__ - Graph compilation...
Warming up iteration 1/3
/usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:567: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:572: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
  warnings.warn(
The attention layers in this model are transitioning from computing the RoPE embeddings internally through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed `position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be removed and `position_embeddings` will be mandatory.
Traceback (most recent call last):
  File "/root/optimum-habana/examples/text-generation/run_generation.py", line 692, in <module>
```
```
root@8fb421541c5d:~/optimum-habana/examples/text-generation# python quantization_tools/unify_measurements.py -g 01234567 -m /root/optimum-habana/examples/text-generation/quantization_config/ -o /root/optimum-habana/examples/text-generation/test_1x_measure/
Traceback (most recent call last):
  File "/root/optimum-habana/examples/text-generation/quantization_tools/unify_measurements.py", line 198, in <module>
```
```
SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=1 python run_lm_eval.py -o llama_405b_load_uint4_model.txt --model_name_or_path hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bf16 --attn_softmax_bf16 --bucket_size=128 --bucket_internal
Traceback (most recent call last):
  File "/root/optimum-habana/examples/text-generation/run_lm_eval.py", line 229, in <module>
```
```
SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=1 python run_lm_eval.py -o acc_load_uint4_model.txt --model_name_or_path hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bf16 --attn_softmax_bf16 --bucket_size=128 --bucket_internal --load_quantized_model
/usr/local/lib/python3.10/dist-packages/transformers/deepspeed.py:24: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations
  warnings.warn(
08/11/2024 21:43:21 - INFO - __main__ - Single-device run.
/usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/hpu/__init__.py:158: UserWarning: torch.hpu.setDeterministic is deprecated and will be removed in next release. Please use torch.use_deterministic_algorithms instead.
  warnings.warn(
[2024-08-11 21:43:23,310] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to hpu (auto detect)
============================= HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_RECIPE_CACHE_PATH =
PT_CACHE_FOLDER_DELETE = 0
PT_HPU_RECIPE_CACHE_CONFIG =
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_LAZY_ACC_PAR_MODE = 1
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
---------------------------: System Configuration :---------------------------
Num CPU Cores : 160
CPU RAM : 1056375276 KB
------------------------------------------------------------------------------
Traceback (most recent call last):
  File "/root/optimum-habana/examples/text-generation/run_lm_eval.py", line 229, in <module>
    main()
  File "/root/optimum-habana/examples/text-generation/run_lm_eval.py", line 195, in main
    model, _, tokenizer, generation_config = initialize_model(args, logger)
  File "/root/optimum-habana/examples/text-generation/utils.py", line 633, in initialize_model
    setup_model(args, model_dtype, model_kwargs, logger)
  File "/root/optimum-habana/examples/text-generation/utils.py", line 250, in setup_model
    from neural_compressor.torch.quantization import load
ImportError: cannot import name 'load' from 'neural_compressor.torch.quantization' (/usr/local/lib/python3.10/dist-packages/neural_compressor/torch/quantization/__init__.py)
```
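This import error suggests the installed neural-compressor does not provide the `load` entry point that `utils.py` expects. A minimal check of what the environment actually exposes (a sketch, run in the same container):

```
# Sketch: report the installed neural-compressor version and whether the
# `load` entry point that optimum-habana's utils.py imports is available.
import neural_compressor
print("neural_compressor version:", neural_compressor.__version__)

try:
    from neural_compressor.torch.quantization import load  # noqa: F401
    print("neural_compressor.torch.quantization.load is importable")
except ImportError as err:
    print("not importable:", err)
```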
System Info

Information

Tasks

An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)

Reproduction
```
QUANT_CONFIG=./quantization_config/maxabs_quant.json TQDM_DISABLE=1 python run_generation.py --model_name_or_path meta-llama/Meta-Llama-3.1-70B-Instruct --attn_softmax_bf16 --use_hpu_graphs --trim_logits --use_kv_cache --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 1 --disk_offload --use_flash_attention --flash_attention_recompute
```
Expected behavior
I am trying to use quantized Llama 3.1 70B models; the command above should load the model and run generation on HPU.