intel / neural-compressor

SOTA low-bit LLM quantization (INT8/FP8/INT4/FP4/NF4) & sparsity; leading model compression techniques on TensorFlow, PyTorch, and ONNX Runtime
https://intel.github.io/neural-compressor/
Apache License 2.0
2.18k stars 252 forks source link

Model Size Increase After PTQ #1968

Closed zhangxu223 closed 1 month ago

zhangxu223 commented 1 month ago

I've encountered an issue where the quantized model size is twice as large as the original model, which contradicts the expected result of reducing the model size after quantization.

Original Model Size: 130.96 MB Quantized Model Size: Approximately 261.92 MB (double the original size)

This is my code:

def eval_func(model):
    model.to(model_device)  
    model.eval()
    all_metrics = []

    loss_func = PITLossWrapper(pairwise_neg_snr, pit_from='pw_mtx')

    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(val_loader1, desc="Evaluating")):  # 使用 tqdm 显示进度条
            mix, sources = tensors_to_device(batch, device=model_device)

            logger.debug(f"Processing batch {batch_idx}...")
            start_time = datetime.now()  # 记录开始时间

            # Forward the network on the mixture.
            forward_start_time = datetime.now()  # Record forward pass start time
            mix = mix.squeeze(0).to(model_device)  # Remove batch dimension and ensure mix is on the correct device

            est_s, _ = model(mix[None,], sources, do_test=True, pretrain=True)
            forward_end_time = datetime.now()  # Record forward pass end time
            forward_elapsed_time = (forward_end_time - forward_start_time).total_seconds()
            logger.debug(f"Batch {batch_idx} forward pass completed in {forward_elapsed_time:.2f} seconds.")

            sources = sources.squeeze(0)[None, :, 0, :]  # Remove batch dimension and use a single channel

            est_s = est_s[:, :, 0, :]  # Ensure the estimated sources have the correct dimensions
            # est_s = est_s[:, :, 0, :]  # Ensure the estimated sources have the correct dimensions
            reordered_s = loss_func(est_s, sources, return_est=True)[-1]

            mix_np = mix[[0], :].cpu().data.numpy()
            sources_np = sources.squeeze(0).cpu().data.numpy()
            reordered_s_np = reordered_s.squeeze(0).cpu().data.numpy()

            metrics_start_time = datetime.now()  # Record metrics computation start time
            sgs_metrics = get_metrics(mix_np, sources_np, reordered_s_np,
                                      sample_rate=8000,
                                      metrics_list=compute_metrics)
            metrics_end_time = datetime.now()  # Record metrics computation end time
            metrics_elapsed_time = (metrics_end_time - metrics_start_time).total_seconds()
            logger.debug(f"Batch {batch_idx} metrics computation completed in {metrics_elapsed_time:.2f} seconds.")

            all_metrics.append(pd.Series(sgs_metrics))

            end_time = datetime.now()  # Record end time
            elapsed_time = (end_time - start_time).total_seconds()
            logger.debug(f"Batch {batch_idx} processed in {elapsed_time:.2f} seconds.")

    # 计算并返回平均指标
    avg_metrics = pd.DataFrame(all_metrics).mean()
    avg_sdr = avg_metrics['sdr']
    logger.info(f"Average SDR: {avg_sdr}")

    return avg_sdr

def main(conf):  # 将 current_time 作为参数传递给 main 函数
    global model, val_loader1, model_device

    # Define model and optimizer
    print("Creating model and optimizer")
    model, optimizer = make_model_and_optimizer(conf)

    model_device = torch.device('cpu')  # 使用 CPU 作为设备

    logger.info(f"Using device: {model_device}")

    # model.to(model_device)  # 移动模型到适当的设备
    exp_dir = conf['main_args']['exp_dir']
    best_ckpt = conf['main_args']['best_ckpt']

    # 加载最好的模型
    if best_ckpt:
        checkpoint = torch.load(best_ckpt, map_location=model_device)  # 在CPU上加载模型
        if 'state_dict' in checkpoint:
            model.load_state_dict(checkpoint['state_dict'], strict=False)
            print(f"Loaded model from {best_ckpt}")
        else:
            print("Checkpoint does not contain 'state_dict'.")
    else:
        print("No best checkpoint provided.")

    train_loader, val_loader = make_dataloaders(**conf['data'], **conf['training'], channels=slice(0, 4))

    # 截取数据集的前, 100个
    def truncate_loader(loader, percentage=0.1):
        dataset = loader.dataset
        original_len = len(dataset)
        truncated_len = int(original_len * percentage)
        indices = list(range(truncated_len))
        return torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False,
                                           num_workers=20,
                                           sampler=torch.utils.data.SubsetRandomSampler(indices))

    val_loader1 = truncate_loader(val_loader, percentage=0.03)

    os.makedirs(exp_dir, exist_ok=True)
    conf_path = os.path.join(exp_dir, 'conf.yml')
    with open(conf_path, 'w') as outfile:
        yaml.safe_dump(conf, outfile)

    # 打印原始模型的评估结果
    # logger.info("Evaluating the original model...")
    # original_metrics = eval_func(model)
    # logger.info("Original FP32 model SDR: %s", original_metrics)

    # Save the FP32 model
    fp32_model_path = "./saved_model/fp32_model.pth"
    torch.save(model.state_dict(), fp32_model_path)
    logger.info("FP32 model has been successfully saved.")
    if not os.path.exists(fp32_model_path):
        torch.save(model.state_dict(), fp32_model_path)
        logger.info("FP32 model has been successfully saved.")
    else:
        logger.info("FP32 model already exists. Skipping saving.")

    # Check the size of the FP32 model
    fp32_model_size = os.path.getsize(fp32_model_path) / (1024 ** 2)  # Convert size to MB
    logger.info(f"FP32 model size: {fp32_model_size:.2f} MB")

    # 使用 neural_compressor 进行后训练量化
    print("Starting post-training quantization...")

    # accuracy_criterion = AccuracyCriterion(higher_is_better=False, criterion="absolute",tolerable_loss=0.01)
    accuracy_criterion = AccuracyCriterion(higher_is_better=False, criterion="absolute", tolerable_loss=0.5)
    tuning_criterion = TuningCriterion(timeout=0, max_trials=50, objective="performance", strategy='basic')

    conf = PostTrainingQuantConfig(
        approach="static",
        tuning_criterion=tuning_criterion,
        accuracy_criterion=accuracy_criterion,
        device="cpu",
        backend="default",
        quant_level=1,
    )

    # 打印量化配置
    logger.info("Quantization Configuration:")
    logger.info(conf)

    # 确保在调用 model 的地方传递了所有必要的参数
    def model_forward_wrapper(model, input):
        mixture, sources = input
        return model(mixture, sources, do_test=False, pretrain=True)

    print(conf)  # 打印量化配置,检查 LayerNorm 的量化设置
    q_model = fit(
        model=model,
        conf=conf,
        calib_dataloader=val_loader1,
        eval_func=eval_func,
        model_forward=model_forward_wrapper)

    logger.info("Post-training quantization completed.")

    # 检查每个层的量化信息
    for name, module in q_model.named_modules():
        if hasattr(module, 'observer'):
            logger.info(f"Layer: {name}, Observer: {module.observer}")

    # Save the quantized model
    q_model_path = f"./saved_model/quantized_model.pth"
    torch.save(q_model.state_dict(), q_model_path)
    logger.info("Quantized model has been successfully saved.")

    # Check the size of the quantized model
    model_size = os.path.getsize(q_model_path) / (1024 ** 2)  # Convert size to MB
    logger.info(f"Quantized model size: {model_size:.2f} MB")

This is my logs:

2024-08-09 15:07:04 [INFO] Using device: cpu
2024-08-09 15:07:04 [INFO] FP32 model has been successfully saved.
2024-08-09 15:07:04 [INFO] FP32 model already exists. Skipping saving.
2024-08-09 15:07:04 [INFO] FP32 model size: 130.87 MB
2024-08-09 15:07:04 [INFO] Quantization Configuration:
2024-08-09 15:07:04 [INFO] <neural_compressor.config.PostTrainingQuantConfig object at 0x000001619B415D90>
2024-08-09 15:07:04 [INFO] Start basic tuning.
2024-08-09 15:07:04 [INFO] Execute the tuning process due to detect the evaluation function.
2024-08-09 15:07:04 [INFO] Adaptor has 5 recipes.
2024-08-09 15:07:04 [INFO] 0 recipes specified by user.
2024-08-09 15:07:04 [INFO] 3 recipes require future tuning.
2024-08-09 15:07:04 [INFO] {
2024-08-09 15:07:04 [INFO]     'PostTrainingQuantConfig': {
2024-08-09 15:07:04 [INFO]         'AccuracyCriterion': {
2024-08-09 15:07:04 [INFO]             'criterion': 'absolute',
2024-08-09 15:07:04 [INFO]             'higher_is_better': False,
2024-08-09 15:07:04 [INFO]             'tolerable_loss': 0.5,
2024-08-09 15:07:04 [INFO]             'absolute': 0.5,
2024-08-09 15:07:04 [INFO]             'keys': <bound method AccuracyCriterion.keys of <neural_compressor.config.AccuracyCriterion object at 0x000001619B415130>>,
2024-08-09 15:07:04 [INFO]             'relative': None
2024-08-09 15:07:04 [INFO]         },
2024-08-09 15:07:04 [INFO]         'approach': 'post_training_static_quant',
2024-08-09 15:07:04 [INFO]         'backend': 'default',
2024-08-09 15:07:04 [INFO]         'calibration_sampling_size': [
2024-08-09 15:07:04 [INFO]             100
2024-08-09 15:07:04 [INFO]         ],
2024-08-09 15:07:04 [INFO]         'device': 'cpu',
2024-08-09 15:07:04 [INFO]         'diagnosis': False,
2024-08-09 15:07:04 [INFO]         'domain': 'auto',
2024-08-09 15:07:04 [INFO]         'example_inputs': 'Not printed here due to large size tensors...',
2024-08-09 15:07:04 [INFO]         'excluded_precisions': [
2024-08-09 15:07:04 [INFO]         ],
2024-08-09 15:07:04 [INFO]         'framework': 'pytorch_fx',
2024-08-09 15:07:04 [INFO]         'inputs': [
2024-08-09 15:07:04 [INFO]         ],
2024-08-09 15:07:04 [INFO]         'model_name': '',
2024-08-09 15:07:04 [INFO]         'ni_workload_name': 'quantization',
2024-08-09 15:07:04 [INFO]         'op_name_dict': None,
2024-08-09 15:07:04 [INFO]         'op_type_dict': None,
2024-08-09 15:07:04 [INFO]         'outputs': [
2024-08-09 15:07:04 [INFO]         ],
2024-08-09 15:07:04 [INFO]         'quant_format': 'default',
2024-08-09 15:07:04 [INFO]         'quant_level': 1,
2024-08-09 15:07:04 [INFO]         'recipes': {
2024-08-09 15:07:04 [INFO]             'smooth_quant': False,
2024-08-09 15:07:04 [INFO]             'smooth_quant_args': {
2024-08-09 15:07:04 [INFO]             },
2024-08-09 15:07:04 [INFO]             'layer_wise_quant': False,
2024-08-09 15:07:04 [INFO]             'layer_wise_quant_args': {
2024-08-09 15:07:04 [INFO]             },
2024-08-09 15:07:04 [INFO]             'fast_bias_correction': False,
2024-08-09 15:07:04 [INFO]             'weight_correction': False,
2024-08-09 15:07:04 [INFO]             'gemm_to_matmul': True,
2024-08-09 15:07:04 [INFO]             'graph_optimization_level': None,
2024-08-09 15:07:04 [INFO]             'first_conv_or_matmul_quantization': True,
2024-08-09 15:07:04 [INFO]             'last_conv_or_matmul_quantization': True,
2024-08-09 15:07:04 [INFO]             'pre_post_process_quantization': True,
2024-08-09 15:07:04 [INFO]             'add_qdq_pair_to_weight': False,
2024-08-09 15:07:04 [INFO]             'optypes_to_exclude_output_quant': [
2024-08-09 15:07:04 [INFO]             ],
2024-08-09 15:07:04 [INFO]             'dedicated_qdq_pair': False,
2024-08-09 15:07:04 [INFO]             'rtn_args': {
2024-08-09 15:07:04 [INFO]             },
2024-08-09 15:07:04 [INFO]             'awq_args': {
2024-08-09 15:07:04 [INFO]             },
2024-08-09 15:07:04 [INFO]             'gptq_args': {
2024-08-09 15:07:04 [INFO]             },
2024-08-09 15:07:04 [INFO]             'teq_args': {
2024-08-09 15:07:04 [INFO]             },
2024-08-09 15:07:04 [INFO]             'autoround_args': {
2024-08-09 15:07:04 [INFO]             }
2024-08-09 15:07:04 [INFO]         },
2024-08-09 15:07:04 [INFO]         'reduce_range': None,
2024-08-09 15:07:04 [INFO]         'TuningCriterion': {
2024-08-09 15:07:04 [INFO]             'max_trials': 50,
2024-08-09 15:07:04 [INFO]             'objective': [
2024-08-09 15:07:04 [INFO]                 'performance'
2024-08-09 15:07:04 [INFO]             ],
2024-08-09 15:07:04 [INFO]             'strategy': 'basic',
2024-08-09 15:07:04 [INFO]             'strategy_kwargs': None,
2024-08-09 15:07:04 [INFO]             'timeout': 0
2024-08-09 15:07:04 [INFO]         },
2024-08-09 15:07:04 [INFO]         'use_bf16': True
2024-08-09 15:07:04 [INFO]     }
2024-08-09 15:07:04 [INFO] }
2024-08-09 15:07:04 [WARNING] [Strategy] Please install `mpi4py` correctly if using distributed tuning; otherwise, ignore this warning.
2024-08-09 15:07:04 [INFO] Attention Blocks: 0
2024-08-09 15:07:04 [INFO] FFN Blocks: 0
2024-08-09 15:07:04 [INFO] Pass query framework capability elapsed time: 62.65 ms
2024-08-09 15:07:04 [INFO] Get FP32 model baseline.
2024-08-09 15:14:26 [INFO] Average SDR: -12.649966512875167
2024-08-09 15:14:26 [INFO] Save tuning history to F:\Beam-Guided-TFDPRNN-PTQ\nc_workspace\2024-08-09_15-07-01\./history.snapshot.
2024-08-09 15:14:26 [INFO] FP32 baseline is: [Accuracy: -12.6500, Duration (seconds): 441.3350]
2024-08-09 15:14:26 [INFO] Fx trace of the entire model failed, We will conduct auto quantization
2024-08-09 15:29:18 [INFO] |*********Mixed Precision Statistics********|
2024-08-09 15:29:18 [INFO] +---------------------+-------+------+------+
2024-08-09 15:29:18 [INFO] |       Op Type       | Total | INT8 | FP32 |
2024-08-09 15:29:18 [INFO] +---------------------+-------+------+------+
2024-08-09 15:29:18 [INFO] | quantize_per_tensor |   10  |  10  |  0   |
2024-08-09 15:29:18 [INFO] |        Conv2d       |   4   |  4   |  0   |
2024-08-09 15:29:18 [INFO] |      dequantize     |   10  |  10  |  0   |
2024-08-09 15:29:18 [INFO] |      GroupNorm      |   1   |  0   |  1   |
2024-08-09 15:29:18 [INFO] |        Linear       |   6   |  6   |  0   |
2024-08-09 15:29:18 [INFO] |      LayerNorm      |   6   |  0   |  6   |
2024-08-09 15:29:18 [INFO] +---------------------+-------+------+------+
2024-08-09 15:29:18 [INFO] Pass quantize model elapsed time: 892880.94 ms
2024-08-09 15:36:26 [INFO] Average SDR: -12.712963051830563
2024-08-09 15:36:26 [INFO] Tune 1 result is: [Accuracy (int8|fp32): -12.7130|-12.6500, Duration (seconds) (int8|fp32): 427.7732|441.3350], Best tune result is: [Accuracy: -12.7130, Duration (seconds): 427.7732]
2024-08-09 15:36:26 [INFO] |***********************Tune Result Statistics**********************|
2024-08-09 15:36:26 [INFO] +--------------------+-----------+---------------+------------------+
2024-08-09 15:36:26 [INFO] |     Info Type      |  Baseline | Tune 1 result | Best tune result |
2024-08-09 15:36:26 [INFO] +--------------------+-----------+---------------+------------------+
2024-08-09 15:36:26 [INFO] |      Accuracy      | -12.6500  |   -12.7130    |    -12.7130      |
2024-08-09 15:36:26 [INFO] | Duration (seconds) | 441.3350  |   427.7732    |    427.7732      |
2024-08-09 15:36:26 [INFO] +--------------------+-----------+---------------+------------------+
2024-08-09 15:36:26 [INFO] [Strategy] Found a model that meets the accuracy requirements.
2024-08-09 15:36:26 [INFO] Save tuning history to F:\Beam-Guided-TFDPRNN-PTQ\nc_workspace\2024-08-09_15-07-01\./history.snapshot.
2024-08-09 15:36:26 [INFO] Specified timeout or max trials is reached! Found a quantized model which meet accuracy goal. Exit.
2024-08-09 15:36:26 [INFO] Save deploy yaml to F:\Beam-Guided-TFDPRNN-PTQ\nc_workspace\2024-08-09_15-07-01\deploy.yaml
2024-08-09 15:36:26 [INFO] Post-training quantization completed.
2024-08-09 15:36:26 [INFO] Quantized model has been successfully saved.
2024-08-09 15:36:26 [INFO] Quantized model size: 261.43 MB

I have two questions:

  1. Why did the model size increase after quantization?

When using Neural Compressor for quantization, I noticed that the size of the quantized model is larger than the original one. I expected the model size to decrease after quantization, but it actually increased. The logs indicate that the model was successfully quantized to int8. Is this behavior normal? Did I successfully quantize the model? Additionally, my FP32 model was initially trained on a GPU, but this post-training quantization (PTQ) was performed on a CPU. Could this be related to the increase in model size?

  1. How can I view the precision of each layer in the model?

Is there a way to inspect the precision of each layer in the quantized model? I would like to verify the precision (e.g., FP32, INT8) at which each layer is operating to better understand the impact of the quantization process.

Kaihui-intel commented 1 month ago

Hello, @zhangxu223 Thanks for your interest in Intel(R) Neural Compressor.

For 1, The model has been successfully quantized, you should use q_model.save("saved_results") to save the quantized model and config. The model size of the best_model.pt file in the folder saved_results is approximately one fourth of the original model. You can refer to the document PTQ.

For 2, you can set op_name_dict and op_type_dict in config class to achieve the purpose.

# set op_type_dict
op_type_dict = {"Conv": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}}
conf = PostTrainingQuantConfig(op_type_dict=op_type_dict)

# or set op_name_dict
op_name_dict = {
    "layer1.0.conv1": {
        "activation": {
            "dtype": ["fp32"],
        },
        "weight": {
            "dtype": ["fp32"],
        },
    }
}
conf = PostTrainingQuantConfig(op_name_dict=op_name_dict)

You can refer to the document specify-quantization-rules.

I hope the above information is useful to you, and I am looking forward to your reply.

zhangxu223 commented 1 month ago

Hello, @zhangxu223 Thanks for your interest in Intel(R) Neural Compressor.

For 1, The model has been successfully quantized, you should use to save the quantized model and config. The model size of the file in the folder is approximately one fourth of the original model. You can refer to the document PTQ.q_model.save("saved_results")``best_model.pt``saved_results

For 2, you can set and in config class to achieve the purpose.op_name_dict``op_type_dict

# set op_type_dict
op_type_dict = {"Conv": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}}
conf = PostTrainingQuantConfig(op_type_dict=op_type_dict)

# or set op_name_dict
op_name_dict = {
    "layer1.0.conv1": {
        "activation": {
            "dtype": ["fp32"],
        },
        "weight": {
            "dtype": ["fp32"],
        },
    }
}
conf = PostTrainingQuantConfig(op_name_dict=op_name_dict)

You can refer to the document specify-quantization-rules.

I hope the above information is useful to you, and I am looking forward to your reply.

Hello, thank you very much for your previous response,it has been very helpful.

After switching to the q_model.save("saved_results") saving method as you suggested, I did notice that the model size decreased compared to the original, but it didn't achieve the expected reduction to one-fourth of the original size. The model only shrank from 141MB to 130MB.

What could be the reason for this?Is it possible that some layers were not successfully quantized? How can I check the precision of each layer after quantization to ensure that all layers have been quantized correctly?

I would appreciate any further suggestions you could provide. Thank you.

Kaihui-intel commented 1 month ago

Thanks for your reply. You can compare which layer changes with the FP32 model by printing the model. Use print(q_model) or print(q_model._model) to display model information. q_model.fp32_model show fp32 model. Additionally, you can use named_modules() or state_dict() to check layer details including precision (e.g dtype=torch.qint8).

zhangxu223 commented 1 month ago

Thanks for your reply. You can compare which layer changes with the FP32 model by printing the model. Use print(q_model) or print(q_model._model) to display model information. q_model.fp32_model show fp32 model. Additionally, you can use named_modules() or state_dict() to check layer details including precision (e.g dtype=torch.qint8).

Thank you very much for your reply!!

I added the following code to check the precision of the quantized model:

for name, param in quantized_model.named_parameters():
    logger.info(f"Parameter Name: {name}, Data Type: {param.dtype}, Shape: {param.shape}")

The results are as follows:

2024-08-14 15:13:33 [INFO] Save tuning history to F:\Beam-Guided-TFDPRNN-PTQ\nc_workspace\2024-08-14_15-09-18\./history.snapshot.
2024-08-14 15:13:33 [INFO] FP32 baseline is: [Accuracy: -12.6132, Duration (seconds): 251.4703]
2024-08-14 15:13:33 [INFO] Fx trace of the entire model failed, We will conduct auto quantization
2024-08-14 15:27:42 [INFO] |*********Mixed Precision Statistics********|
2024-08-14 15:27:42 [INFO] +---------------------+-------+------+------+
2024-08-14 15:27:42 [INFO] |       Op Type       | Total | INT8 | FP32 |
2024-08-14 15:27:42 [INFO] +---------------------+-------+------+------+
2024-08-14 15:27:42 [INFO] | quantize_per_tensor |   10  |  10  |  0   |
2024-08-14 15:27:42 [INFO] |        Conv2d       |   4   |  4   |  0   |
2024-08-14 15:27:42 [INFO] |      dequantize     |   10  |  10  |  0   |
2024-08-14 15:27:42 [INFO] |      GroupNorm      |   1   |  0   |  1   |
2024-08-14 15:27:42 [INFO] |        Linear       |   6   |  6   |  0   |
2024-08-14 15:27:42 [INFO] |      LayerNorm      |   6   |  0   |  6   |
2024-08-14 15:27:42 [INFO] +---------------------+-------+------+------+
2024-08-14 15:27:42 [INFO] Pass quantize model elapsed time: 848758.94 ms
2024-08-14 15:31:45 [INFO] Average SDR: -12.738859626272486
2024-08-14 15:31:45 [INFO] Tune 1 result is: [Accuracy (int8|fp32): -12.7389|-12.6132, Duration (seconds) (int8|fp32): 243.6468|251.4703], Best tune result is: [Accuracy: -12.7389, Duration (seconds): 243.6468]
2024-08-14 15:31:45 [INFO] |***********************Tune Result Statistics**********************|
2024-08-14 15:31:45 [INFO] +--------------------+-----------+---------------+------------------+
2024-08-14 15:31:45 [INFO] |     Info Type      |  Baseline | Tune 1 result | Best tune result |
2024-08-14 15:31:45 [INFO] +--------------------+-----------+---------------+------------------+
2024-08-14 15:31:45 [INFO] |      Accuracy      | -12.6132  |   -12.7389    |    -12.7389      |
2024-08-14 15:31:45 [INFO] | Duration (seconds) | 251.4703  |   243.6468    |    243.6468      |
2024-08-14 15:31:45 [INFO] +--------------------+-----------+---------------+------------------+
2024-08-14 15:31:45 [INFO] [Strategy] Found a model that meets the accuracy requirements.
2024-08-14 15:31:45 [INFO] Save tuning history to F:\Beam-Guided-TFDPRNN-PTQ\nc_workspace\2024-08-14_15-09-18\./history.snapshot.
2024-08-14 15:31:45 [INFO] Specified timeout or max trials is reached! Found a quantized model which meet accuracy goal. Exit.
2024-08-14 15:31:45 [INFO] Save deploy yaml to F:\Beam-Guided-TFDPRNN-PTQ\nc_workspace\2024-08-14_15-09-18\deploy.yaml
2024-08-14 15:31:45 [INFO] Post-training quantization completed.
2024-08-14 15:31:45 [INFO] Save config file and weights of quantized model to F:\Beam-Guided-TFDPRNN-PTQ\saved_model\quantization_model.
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.encoder.bias, Data Type: torch.float32, Shape: torch.Size([256])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.LayerNormalization.weight, Data Type: torch.float32, Shape: torch.Size([256])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.LayerNormalization.bias, Data Type: torch.float32, Shape: torch.Size([256])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.0.rnn_freq.weight_ih_l0, Data Type: torch.float32, Shape: torch.Size([512, 64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.0.rnn_freq.weight_hh_l0, Data Type: torch.float32, Shape: torch.Size([512, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.0.rnn_freq.bias_ih_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.0.rnn_freq.bias_hh_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.0.norm_freq.weight, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.0.norm_freq.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.0.rnn_time.weight_ih_l0, Data Type: torch.float32, Shape: torch.Size([512, 64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.0.rnn_time.weight_hh_l0, Data Type: torch.float32, Shape: torch.Size([512, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.0.rnn_time.bias_ih_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.0.rnn_time.bias_hh_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.0.norm_time.weight, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.0.norm_time.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.1.rnn_freq.weight_ih_l0, Data Type: torch.float32, Shape: torch.Size([512, 64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.1.rnn_freq.weight_hh_l0, Data Type: torch.float32, Shape: torch.Size([512, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.1.rnn_freq.bias_ih_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.1.rnn_freq.bias_hh_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.1.norm_freq.weight, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.1.norm_freq.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.1.rnn_time.weight_ih_l0, Data Type: torch.float32, Shape: torch.Size([512, 64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.1.rnn_time.weight_hh_l0, Data Type: torch.float32, Shape: torch.Size([512, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.1.rnn_time.bias_ih_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.1.rnn_time.bias_hh_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.1.norm_time.weight, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.1.norm_time.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.2.rnn_freq.weight_ih_l0, Data Type: torch.float32, Shape: torch.Size([512, 64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.2.rnn_freq.weight_hh_l0, Data Type: torch.float32, Shape: torch.Size([512, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.2.rnn_freq.bias_ih_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.2.rnn_freq.bias_hh_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.2.norm_freq.weight, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.2.norm_freq.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.2.rnn_time.weight_ih_l0, Data Type: torch.float32, Shape: torch.Size([512, 64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.2.rnn_time.weight_hh_l0, Data Type: torch.float32, Shape: torch.Size([512, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.2.rnn_time.bias_ih_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.2.rnn_time.bias_hh_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.2.norm_time.weight, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.separator.Stack.2.norm_time.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: _model.freq_net1.decoder.bias, Data Type: torch.float32, Shape: torch.Size([2])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.encoder.weight, Data Type: torch.float32, Shape: torch.Size([256, 2, 7, 7])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.encoder.bias, Data Type: torch.float32, Shape: torch.Size([256])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.LayerNormalization.weight, Data Type: torch.float32, Shape: torch.Size([256])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.LayerNormalization.bias, Data Type: torch.float32, Shape: torch.Size([256])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.bottleneck.weight, Data Type: torch.float32, Shape: torch.Size([64, 256, 1, 1])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.bottleneck.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.0.rnn_freq.weight_ih_l0, Data Type: torch.float32, Shape: torch.Size([512, 64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.0.rnn_freq.weight_hh_l0, Data Type: torch.float32, Shape: torch.Size([512, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.0.rnn_freq.bias_ih_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.0.rnn_freq.bias_hh_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.0.fc_freq.weight, Data Type: torch.float32, Shape: torch.Size([64, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.0.fc_freq.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.0.norm_freq.weight, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.0.norm_freq.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.0.rnn_time.weight_ih_l0, Data Type: torch.float32, Shape: torch.Size([512, 64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.0.rnn_time.weight_hh_l0, Data Type: torch.float32, Shape: torch.Size([512, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.0.rnn_time.bias_ih_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.0.rnn_time.bias_hh_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.0.fc_time.weight, Data Type: torch.float32, Shape: torch.Size([64, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.0.fc_time.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.0.norm_time.weight, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.0.norm_time.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.1.rnn_freq.weight_ih_l0, Data Type: torch.float32, Shape: torch.Size([512, 64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.1.rnn_freq.weight_hh_l0, Data Type: torch.float32, Shape: torch.Size([512, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.1.rnn_freq.bias_ih_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.1.rnn_freq.bias_hh_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.1.fc_freq.weight, Data Type: torch.float32, Shape: torch.Size([64, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.1.fc_freq.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.1.norm_freq.weight, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.1.norm_freq.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.1.rnn_time.weight_ih_l0, Data Type: torch.float32, Shape: torch.Size([512, 64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.1.rnn_time.weight_hh_l0, Data Type: torch.float32, Shape: torch.Size([512, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.1.rnn_time.bias_ih_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.1.rnn_time.bias_hh_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.1.fc_time.weight, Data Type: torch.float32, Shape: torch.Size([64, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.1.fc_time.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.1.norm_time.weight, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.1.norm_time.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.2.rnn_freq.weight_ih_l0, Data Type: torch.float32, Shape: torch.Size([512, 64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.2.rnn_freq.weight_hh_l0, Data Type: torch.float32, Shape: torch.Size([512, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.2.rnn_freq.bias_ih_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.2.rnn_freq.bias_hh_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.2.fc_freq.weight, Data Type: torch.float32, Shape: torch.Size([64, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.2.fc_freq.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.2.norm_freq.weight, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.2.norm_freq.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.2.rnn_time.weight_ih_l0, Data Type: torch.float32, Shape: torch.Size([512, 64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.2.rnn_time.weight_hh_l0, Data Type: torch.float32, Shape: torch.Size([512, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.2.rnn_time.bias_ih_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.2.rnn_time.bias_hh_l0, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.2.fc_time.weight, Data Type: torch.float32, Shape: torch.Size([64, 128])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.2.fc_time.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.2.norm_time.weight, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.Stack.2.norm_time.bias, Data Type: torch.float32, Shape: torch.Size([64])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.output_con2d.weight, Data Type: torch.float32, Shape: torch.Size([512, 64, 1, 1])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.separator.output_con2d.bias, Data Type: torch.float32, Shape: torch.Size([512])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.decoder.weight, Data Type: torch.float32, Shape: torch.Size([2, 256, 1, 1])
2024-08-14 15:31:45 [INFO] Parameter Name: fp32_model.freq_net1.decoder.bias, Data Type: torch.float32, Shape: torch.Size([2])
2024-08-14 15:31:45 [INFO] Quantized model has been successfully saved.

I don't understand why the precision of the model I'm getting is still in FP32. Why is that?

Kaihui-intel commented 1 month ago

It seems that the quantized model does not support the parameters or named_parameters attributes. If you compare the parameters of the fp32 model and the int8 model, you will find that the parameters of the int8 model are unreasonable, such as the size of the tensor. I suggest you use print(model.state_dict()) instead, and then you can see that the dtype of the tensor is torch.qint8.

zhangxu223 commented 1 month ago

It seems that the quantized model does not support the parameters or named_parameters attributes. If you compare the parameters of the fp32 model and the int8 model, you will find that the parameters of the int8 model are unreasonable, such as the size of the tensor. I suggest you use print(model.state_dict()) instead, and then you can see that the dtype of the tensor is torch.qint8.

Thank you very much for your prompt reply. I followed your suggestion and conducted an evaluation using the following code:

for key, value in q_model.state_dict().items():
    if isinstance(value, torch.Tensor):
        logger.info(f"Tensor: {key}, Data type: {value.dtype}")
    else:
        logger.info(f"Non-Tensor Parameter: {key}, Type: {type(value)}")

This is output:

2024-08-16 14:46:05 [INFO] Tensor: _model.mvdr.stft_model.enc.filterbank._filters, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.mvdr.stft_model.enc.filterbank.torch_window, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.mvdr.stft_model.dec.filterbank._filters, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.mvdr.stft_model.dec.filterbank.torch_window, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.encoder.bias, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.encoder.module_input_scale_0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.encoder.module_input_zero_point_0, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.encoder.module.weight, Data type: torch.qint8
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.encoder.module.bias, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.encoder.module.scale, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.encoder.module.zero_point, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.LayerNormalization_scale_0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.LayerNormalization_zero_point_0, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack_0_fc_freq_input_scale_0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack_0_fc_freq_input_zero_point_0, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack_0_fc_time_input_scale_0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack_0_fc_time_input_zero_point_0, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack_1_fc_freq_input_scale_0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack_1_fc_freq_input_zero_point_0, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack_1_fc_time_input_scale_0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack_1_fc_time_input_zero_point_0, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack_2_fc_freq_input_scale_0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack_2_fc_freq_input_zero_point_0, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack_2_fc_time_input_scale_0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack_2_fc_time_input_zero_point_0, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.output_con2d_input_scale_0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.output_con2d_input_zero_point_0, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.LayerNormalization.weight, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.LayerNormalization.bias, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.bottleneck.weight, Data type: torch.qint8
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.bottleneck.bias, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.bottleneck.scale, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.bottleneck.zero_point, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.0.rnn_freq.weight_ih_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.0.rnn_freq.weight_hh_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.0.rnn_freq.bias_ih_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.0.rnn_freq.bias_hh_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.0.fc_freq.scale, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.0.fc_freq.zero_point, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Non-Tensor Parameter: _model.freq_net1.separator.Stack.0.fc_freq._packed_params.dtype, Type: <class 'torch.dtype'>
2024-08-16 14:46:05 [INFO] Non-Tensor Parameter: _model.freq_net1.separator.Stack.0.fc_freq._packed_params._packed_params, Type: <class 'tuple'>
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.0.norm_freq.weight, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.0.norm_freq.bias, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.0.rnn_time.weight_ih_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.0.rnn_time.weight_hh_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.0.rnn_time.bias_ih_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.0.rnn_time.bias_hh_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.0.fc_time.scale, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.0.fc_time.zero_point, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Non-Tensor Parameter: _model.freq_net1.separator.Stack.0.fc_time._packed_params.dtype, Type: <class 'torch.dtype'>
2024-08-16 14:46:05 [INFO] Non-Tensor Parameter: _model.freq_net1.separator.Stack.0.fc_time._packed_params._packed_params, Type: <class 'tuple'>
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.0.norm_time.weight, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.0.norm_time.bias, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.1.rnn_freq.weight_ih_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.1.rnn_freq.weight_hh_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.1.rnn_freq.bias_ih_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.1.rnn_freq.bias_hh_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.1.fc_freq.scale, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.1.fc_freq.zero_point, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Non-Tensor Parameter: _model.freq_net1.separator.Stack.1.fc_freq._packed_params.dtype, Type: <class 'torch.dtype'>
2024-08-16 14:46:05 [INFO] Non-Tensor Parameter: _model.freq_net1.separator.Stack.1.fc_freq._packed_params._packed_params, Type: <class 'tuple'>
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.1.norm_freq.weight, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.1.norm_freq.bias, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.1.rnn_time.weight_ih_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.1.rnn_time.weight_hh_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.1.rnn_time.bias_ih_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.1.rnn_time.bias_hh_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.1.fc_time.scale, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.1.fc_time.zero_point, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Non-Tensor Parameter: _model.freq_net1.separator.Stack.1.fc_time._packed_params.dtype, Type: <class 'torch.dtype'>
2024-08-16 14:46:05 [INFO] Non-Tensor Parameter: _model.freq_net1.separator.Stack.1.fc_time._packed_params._packed_params, Type: <class 'tuple'>
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.1.norm_time.weight, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.1.norm_time.bias, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.2.rnn_freq.weight_ih_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.2.rnn_freq.weight_hh_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.2.rnn_freq.bias_ih_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.2.rnn_freq.bias_hh_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.2.fc_freq.scale, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.2.fc_freq.zero_point, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Non-Tensor Parameter: _model.freq_net1.separator.Stack.2.fc_freq._packed_params.dtype, Type: <class 'torch.dtype'>
2024-08-16 14:46:05 [INFO] Non-Tensor Parameter: _model.freq_net1.separator.Stack.2.fc_freq._packed_params._packed_params, Type: <class 'tuple'>
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.2.norm_freq.weight, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.2.norm_freq.bias, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.2.rnn_time.weight_ih_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.2.rnn_time.weight_hh_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.2.rnn_time.bias_ih_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.2.rnn_time.bias_hh_l0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.2.fc_time.scale, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.2.fc_time.zero_point, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Non-Tensor Parameter: _model.freq_net1.separator.Stack.2.fc_time._packed_params.dtype, Type: <class 'torch.dtype'>
2024-08-16 14:46:05 [INFO] Non-Tensor Parameter: _model.freq_net1.separator.Stack.2.fc_time._packed_params._packed_params, Type: <class 'tuple'>
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.2.norm_time.weight, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.Stack.2.norm_time.bias, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.output_con2d.weight, Data type: torch.qint8
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.output_con2d.bias, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.output_con2d.scale, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.separator.output_con2d.zero_point, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.decoder.bias, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.decoder.module_input_scale_0, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.decoder.module_input_zero_point_0, Data type: torch.int64
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.decoder.module.weight, Data type: torch.qint8
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.decoder.module.bias, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.decoder.module.scale, Data type: torch.float32
2024-08-16 14:46:05 [INFO] Tensor: _model.freq_net1.decoder.module.zero_point, Data type: torch.int64

The output shows that only a small portion of the parameters have been quantized to qint8, while most remain in fp32 precision. Could it be related to the fact that I am using PyTorch-based quantization, which might not fully quantize the model? If so, do you have any suggestions for achieving better quantization results? I'm working on a Windows system, so any tips on how to improve the quantization process for better outcomes would be greatly appreciated.

Kaihui-intel commented 1 month ago

To my knowledge, some modules will not be quantized.

https://github.com/pytorch/pytorch/blob/dd69013c7a5f3136814b35a66678f6753bd70aed/torch/ao/quantization/quantization_m.appings.py#L76

setting tolerable_loss in AccuracyCriterion will tune models with better accuracy.

zhangxu223 commented 1 month ago

To my knowledge, some modules will not be quantized.

https://github.com/pytorch/pytorch/blob/dd69013c7a5f3136814b35a66678f6753bd70aed/torch/ao/quantization/quantization_m.appings.py#L76

setting tolerable_loss in AccuracyCriterion will tune models with better accuracy.

Thank you very much!!