It seems that you are using QATQuantizer for PTQ. The correct way to do that is listed below.
from tinynn.graph.quantization.fake_quantize import set_ptq_fake_quantize
quantizer = QATQuantizer(
model, dummy_input, work_dir='out', config={'override_qconfig_func': set_ptq_fake_quantize}
)
Hi @peterjc123, thanks for your reply. I ran some experiments and realized that I have to run model_rewrite() to train my model. Models that skip this step are missing layers and do not produce correct output. So I ran:
raw_model = torch.load(args.model_pretrained_dir)
model = model_rewrite(raw_model, dummy_input, work_dir='TinyNeuralNet/out')
with model_tracer():
quantizer = QATQuantizer(model, dummy_input, work_dir='TinyNeuralNet/out',
config={'asymmetric': True, 'per_tensor': True, 'override_qconfig_func': set_ptq_fake_quantize})
qat_model = quantizer.quantize()
Now the model can be trained, but the training loss and validation metrics are terrible. I expect the losses to go below 1, as with the original model, but the optimizer seems to be stuck around 3.0.
Epoch GPU_mem box_loss cls_loss dfl_loss Instances Size
1/49 2.27GB 3.01 2.835 2.919 114 320: 100%|██████████| 2004/2004 [06:03<00:00, 5.52it/s]
Do you know of any methods I could try to resolve this problem?
It is advisable to first attempt Post-Training Quantization (PTQ) to quickly observe the loss introduced by quantization. If the PTQ results are devastating, such as a drop of more than 30 points, it may indicate that the model is not quantization-friendly, and I will assist in conducting a more in-depth quantization analysis to try to recover the model's quantization accuracy. QAT is recommended when the accuracy loss from PTQ is minimal and you want to further enhance the model's quantization precision.
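For a quick sanity check, that PTQ attempt can look roughly like this (only a sketch: calibrate and validator below stand in for whatever calibration loop and validator you already have):
import torch.quantization as torch_q
from tinynn.graph.quantization.quantizer import PostQuantizer
from tinynn.graph.tracer import model_tracer
from tinynn.util.quantization_analysis_util import graph_error_analysis, layer_error_analysis
with model_tracer():
    quantizer = PostQuantizer(model, dummy_input, work_dir='out')
    ptq_model = quantizer.quantize()
# collect activation statistics on a few batches, without fake quantization
ptq_model.eval()
ptq_model.apply(torch_q.disable_fake_quant)
ptq_model.apply(torch_q.enable_observer)
calibrate(ptq_model, context)
# freeze the observers and evaluate with fake quantization enabled
ptq_model.apply(torch_q.disable_observer)
ptq_model.apply(torch_q.enable_fake_quant)
metrics = validator(model=ptq_model)
# if the drop is large, the error analysis utilities help locate quantization-sensitive layers
graph_error_analysis(ptq_model, dummy_input, metric='cosine')
layer_error_analysis(ptq_model, dummy_input, metric='cosine')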
Alternatively, could you share the YOLOv8 model file (or the open-source repository you used) and your QAT training script?
Thanks @zk1998, Here is my script
import sys, os, argparse, random
sys.path.append(os.getcwd())
RANK = int(os.getenv("RANK", -1))
import time  # used in calibrate()
import numpy as np  # used in setup_seed()
import torch
import torch.nn as nn
import torch.optim as optim
import torch.quantization as torch_q
from ultralytics import YOLO
from ultralytics.yolo.utils.loss import v8DetectionLoss
from tinynn.util.train_util import AverageMeter, DLContext, train, get_device
from tinynn.graph.quantization.quantizer import QATQuantizer
from tinynn.graph.tracer import model_tracer
from tinynn.converter import TFLiteConverter
from tinynn.graph.quantization.algorithm.cross_layer_equalization import cross_layer_equalize, model_rewrite, model_fuse_bn, clear_model_fused_bn
from tinynn.util.bn_restore import model_restore_bn
from tinynn.util.quantization_analysis_util import graph_error_analysis, layer_error_analysis
from tinynn.graph.quantization.fake_quantize import set_ptq_fake_quantize
from datetime import datetime
from pathlib import Path
from ultralytics.yolo.cfg import get_cfg
from ultralytics.utils import *
from ultralytics.yolo.utils import (DEFAULT_CFG, DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, NUM_THREADS, RANK, ROOT,
callbacks, is_git_dir, yaml_load)
from torchsummary import summary
from copy import copy, deepcopy
from ultralytics.yolo import v8
from ultralytics.yolo.utils.checks import check_imgsz
from ultralytics.yolo.data.utils import check_det_dataset
from ultralytics.yolo.data import build_dataloader
from ultralytics.yolo.utils.torch_utils import de_parallel, torch_distributed_zero_first
from ultralytics.yolo.data.dataset import YOLODataset
from ultralytics.yolo.utils.files import increment_path
from ultralytics.utils.torch_utils import (
EarlyStopping,
de_parallel,
init_seeds,
one_cycle,
select_device,
strip_optimizer,
)
def get_default_config(DEFAULT_CFG_PATH="ultralytics/cfg/default.yaml"):
# Default configuration
DEFAULT_CFG_DICT = yaml_load(DEFAULT_CFG_PATH)
for k, v in DEFAULT_CFG_DICT.items():
if isinstance(v, str) and v.lower() == "none":
DEFAULT_CFG_DICT[k] = None
DEFAULT_CFG_KEYS = DEFAULT_CFG_DICT.keys()
DEFAULT_CFG = IterableSimpleNamespace(**DEFAULT_CFG_DICT)
return DEFAULT_CFG
"""Build Dataset
"""
def get_dataloader(model, dataset_path, cfg, data, batch_size=16, rank=0, mode='train'):
def build_dataset(model, img_path, mode='train', batch=None):
gs = max(int(de_parallel(model).stride.max() if model else 0), 32)
return YOLODataset(
img_path=img_path,
imgsz=320,
batch_size=batch,
augment=False, # augmentation
hyp=cfg, # TODO: probably add a get_hyps_from_cfg function
rect=False, # rectangular batches
cache=None,
single_cls=False,
stride=gs,
pad=0.0 if mode == 'train' else 0.5,
prefix=colorstr(f'{mode}: '),
use_segments=False,
use_keypoints=False,
classes=None,
data=data,
fraction=1.0)
assert mode in ['train', 'val']
with torch_distributed_zero_first(rank): # init dataset *.cache only once if DDP
dataset = build_dataset(model, dataset_path, mode, batch_size)
shuffle = mode == 'train'
if getattr(dataset, 'rect', False) and shuffle:
LOGGER.warning("WARNING ⚠️ 'rect=True' is incompatible with DataLoader shuffle, setting shuffle=False")
shuffle = False
workers = 2 if mode == 'train' else 2
return build_dataloader(dataset, batch_size, workers, shuffle, rank)
def get_validator(batch_size, saved_experiment_dir):
cb = callbacks.get_default_callbacks()
overrides = {'imgsz': 320, 'batch': batch_size, 'conf': 0.25, 'iou': 0.6, 'device': 'cuda:0'}
overrides['rect'] = True
overrides['mode'] = 'val'
overrides.update(overrides)
args = get_cfg(cfg=DEFAULT_CFG, overrides=overrides)
args.data = 'sparseml_quantization/data.yaml'
args.task = 'detect'
args.rect = False
args.imgsz = check_imgsz(320, max_dim=1)
return v8.detect.DetectionValidator(save_dir=saved_experiment_dir, args=args, _callbacks=cb)
def calibrate(model, context: DLContext, eval=True):
model.to(device=context.device)
if eval:
model.eval()
else:
model.train()
avg_batch_time = AverageMeter()
with torch.no_grad():
end = time.time()
for i, batch in enumerate(context.train_loader):
if context.max_iteration is not None and i >= context.max_iteration:
break
img = batch['img'].to(context.device, non_blocking=True).float() / 255.0
model(img)
# measure elapsed time
avg_batch_time.update(time.time() - end)
end = time.time()
if i % 10 == 0:
print(f'Calibrate: [{i}/{len(context.train_loader)}]\tTime {avg_batch_time.avg:.5f}\t')
context.iteration += 1
def setup_seed(seed, cuda_deterministic=True):
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
if cuda_deterministic: # slower, more reproducible
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
else: # faster, less reproducible
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
def train(args, model, optimizer, scheduler, criterion, train_loader, validator, context, max_epochs, cfg, saved_experiment_dir):
loss_names = ['box_loss', 'cls_loss', 'dfl_loss']
model = model.to(context.device)
tloss = None
best_fitness = -1e5
early_stopper = 0
for epoch in range(max_epochs):
# For each epoch
pbar = TQDM(enumerate(train_loader), total=len(train_loader))
if epoch == 5:
train_loader.dataset.close_mosaic(hyp=cfg)
train_loader.reset()
# TRAIN!!!
model.train()
print(('\n' + '%11s' * (4 + len(loss_names))) % ('Epoch', 'GPU_mem', *loss_names, 'Instances', 'Size'))
for i, batch in pbar:
# Warm Up!
# ni = i + len(train_loader) * epoch
# nw = max(round(3 * len(train_loader)), 100)
# if ni <= nw:
# xi = [0, nw]
# for j, x in enumerate(optimizer.param_groups):
# x["lr"] = np.interp(
# ni, xi, [0.1 if j == 0 else 0.0, x["initial_lr"] * one_cycle(1, 0.01, max_epochs)(epoch)])
# if "momentum" in x:
# x["momentum"] = np.interp(ni, xi, [0.8, 0.937])
optimizer.zero_grad()
# Preprocess batch!
batch['img'] = batch['img'].to(context.device, non_blocking=True).float() / 255.0
output = model(batch['img'])
loss, loss_items = criterion(output, batch)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0)
optimizer.step()
# Loss infos
tloss = ((tloss * i + loss_items) / (i + 1) if tloss is not None else loss_items)
mem = f"{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}GB" #(GB)
loss_len = tloss.shape[0] if len(tloss.shape) else 1
losses = tloss if loss_len > 1 else torch.unsqueeze(tloss, 0)
# Progress bar
pbar.set_description(
("%11s" * 2 + "%11.4g" * (2 + loss_len))
% (f"{epoch}/{max_epochs-1}", mem, *losses, batch["cls"].shape[0], batch["img"].shape[-1])
)
if (epoch == max_epochs // 3):
print("[INFO] Freeze quantizer parameters!")
model.apply(torch.quantization.disable_observer)
elif (epoch == max_epochs // 3 * 2):
print("[INFO] Freeze batch-norm mean and variance estimates!")
model.apply(torch.nn.intrinsic.qat.freeze_bn_stats)
validate_model = deepcopy(model)
metrics = validator(model=validate_model)
fitness = metrics["fitness"]
if not best_fitness or best_fitness < fitness:
best_fitness = fitness
else:
early_stopper += 1
save_model(epoch, best_fitness, fitness, model, optimizer,
save_path=saved_experiment_dir / "weights")
scheduler.step()
del validate_model
torch.cuda.empty_cache()
if early_stopper == 5:
break
return model
def save_model(epoch, best_fitness, fitness, model, optimizer, save_path):
"""Save model checkpoints based on various conditions."""
ckpt = {
'epoch': epoch,
'best_fitness': best_fitness,
'model': deepcopy(de_parallel(model)),
'optimizer': optimizer.state_dict(),
'date': datetime.now().isoformat()}
try:
import dill as pickle
except ImportError:
import pickle
if not os.path.exists(save_path):
os.makedirs(save_path)
last = save_path / "last.pt"
best = save_path / "best.pt"
torch.save(ckpt, last, pickle_module=pickle)
if best_fitness == fitness:
torch.save(ckpt, best, pickle_module=pickle)
del ckpt
def qat(args):
dataset_yaml_path = '/data/hoangtv23/workspace_AIOT/model_compression_flow/sparseml_quantization/data.yaml'
# uncompressed_model_dir = "yolov8n.pt"
saved_experiment_dir = "TinyNeuralNet/tinynn_runs/exp_yolov8n"
saved_experiment_dir = increment_path(Path(saved_experiment_dir))
context = DLContext()
# Config consists of Augmentation Informations
cfg = get_default_config()
cfg.data = dataset_yaml_path
# Declare Model
raw_model = torch.load(args.model_pretrained_dir)
raw_model.args = cfg
dummy_input = torch.rand(1, 3, 320, 320)
data = check_det_dataset(dataset_yaml_path)
trainset, testset = data['train'], data.get('val') or data.get('test')
device = get_device()
context.device = device
train_loader = get_dataloader(raw_model, trainset, cfg, data, args.batch_size, RANK, "train")
context.train_loader = train_loader
test_loader = get_dataloader(raw_model, testset, cfg, data, args.batch_size*2, RANK, "val")
validator = v8.detect.DetectionValidator(test_loader, save_dir=saved_experiment_dir, args=copy(cfg))
# validator = get_validator(batch_size=args.batch_size, saved_experiment_dir=saved_experiment_dir)
model = model_rewrite(raw_model, dummy_input, work_dir='TinyNeuralNet/out')
# context.max_iteration = 100
# calibrate(model, context=context, eval=True)
with model_tracer():
quantizer = QATQuantizer(model, dummy_input, work_dir='TinyNeuralNet/out',
config={'asymmetric': True, 'per_tensor': True, 'override_qconfig_func': set_ptq_fake_quantize})
qat_model = quantizer.quantize()
qat_model.to(device=device)
""" Build Optimizer!
"""
optimizer = torch.optim.Adam(qat_model.parameters(), lr= args.base_lr, weight_decay=args.weight_decay)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.num_epochs + 1, eta_min=0)
""" Build Criterion!
"""
criterion = v8DetectionLoss(raw_model)
""" Train!
"""
qat_model.train()
qat_model.apply(torch_q.enable_fake_quant)
qat_model.apply(torch_q.enable_observer)
""" Train Model!
"""
qat_model = train(args, qat_model, optimizer, scheduler, criterion, train_loader,
validator, context, args.num_epochs, cfg, saved_experiment_dir)
""" Error Analysis!
"""
dummy_input_real = next(iter(train_loader))['img'].to(device).float() / 255.0  # same device and preprocessing as training
graph_error_analysis(qat_model, dummy_input_real, metric='cosine')
layer_error_analysis(qat_model, dummy_input_real, metric='cosine')
qat_model.apply(torch_q.disable_observer)
metrics = validator(model=deepcopy(qat_model))
print(metrics)
with torch.no_grad():
qat_model.eval()
qat_model.cpu()
qat_model = quantizer.convert(qat_model)
torch.backends.quantized.engine = quantizer.backend
converter = TFLiteConverter(qat_model, dummy_input.cpu(), tflite_path='TinyNeuralNet/out/quantized_model.tflite')
converter.convert()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--workers', type=int, default=4)
parser.add_argument('--num-epochs', type=int, default=50)
parser.add_argument('--batch-size', type=int, default=32)
parser.add_argument('--base-lr', type=float, default=1e-1)
parser.add_argument('--weight-decay', type=float, default=1e-6)
parser.add_argument('--model-pretrained-dir', type=str, default='oto_pruning/cache/version_split_bbox_conf/DetectionModel_compressed.pt')
setup_seed(seed=2048, cuda_deterministic=False)
args = parser.parse_args()
qat(args)
I saved the rewritten model and loaded it with QATQuantizer() with the settings 'rewrite_graph': False, 'force_overwrite': False. But it takes me 20 epochs to reach 0.04 mAP 😃. Although the output difference between the rewritten model and the original one is zero, I really don't know what happened here. Maybe the optimization method used in the QAT example is not correct for my case.
If the rewritten model's mAP shows no difference from the original model, I strongly recommend you first use the rewritten model (the .py file and .pth file in the out dir) to try post-training quantization, and then evaluate the mAP of the ptq_model (after the PyTorch quantization convert step) to see the impact of quantization on mAP without any training.
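As a rough sketch of that (the QDetectionModel / detectionmodel_q names below are placeholders borrowed from later in this thread; use whatever files were actually generated in your out directory):
# load the model code and weights that PostQuantizer generated
from out.detectionmodel_q import QDetectionModel
rewritten = QDetectionModel()
rewritten.load_state_dict(torch.load('out/detectionmodel_q.pth'))
rewritten.eval()
# reuse the already-generated graph instead of tracing and rewriting again
quantizer = PostQuantizer(rewritten, dummy_input, work_dir='out',
                          config={'force_overwrite': False, 'rewrite_graph': False})
ptq_model = quantizer.quantize()
# then calibrate and validate with fake quantization enabled, as in the PTQ sketch earlier in this thread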
BTW, I am trying to reproduce the YOLOv8 quantization pipeline on ImageNet. I will discuss the details with you later.
Thank you very much for your support @zk1998. I have my own version of YOLOv8 PTQ. I am trying the QAT version with your method to produce an integer-arithmetic-only inference model.
@hoangtv2000 Hi, reproducing your issue with QAT would be very time-consuming, so we use PTQ instead. And we believe that if PTQ works, then QAT will certainly work better.
But it takes me 20 epochs to reach 0.04 mAP
Nope, it doesn't sound right. For a correct setup, it should not be dropping that much.
I use the rewrite function in TinyNeuralNet/tinynn/graph/quantization/algorithm/cross_layer_equalization.py:
def model_rewrite(model, dummy_input, work_dir='out') -> nn.Module:
"""rewrite model to non-block style"""
with model_tracer():
graph = trace(model, dummy_input)
model_name = type(model).__name__
model_rewrite = f'{model_name}_cle_Rewrite'
model_name_rewrite_lower = model_rewrite.lower()
model_ns = f'out.{model_name_rewrite_lower}'
model_code_path = os.path.join(work_dir, f'{model_name_rewrite_lower}.py')
model_weights_path = os.path.join(work_dir, f'{model_name_rewrite_lower}.pth')
graph.eliminate_dead_graph_pass()
if not os.path.exists(work_dir):
os.makedirs(work_dir)
graph.generate_code(model_code_path, model_weights_path, model_rewrite)
# Import the new model
rewritten_model = import_from_path(model_ns, model_code_path, model_rewrite)()
rewritten_model.load_state_dict(torch.load(model_weights_path))
# os.unlink(model_weights_path)
return rewritten_model
# args.model_pretrained_dir is the trained and pruned model path.
raw_model = torch.load(args.model_pretrained_dir)
dummy_input = torch.rand(1, 3, 320, 320).to(device)
model = model_rewrite(raw_model, dummy_input, work_dir='TinyNeuralNet/out')
model = model_rewrite(raw_model, dummy_input, work_dir='TinyNeuralNet/out')
Would you please skip this step for now?
In my case, models that are not rewritten are missing layers and cannot produce correct output. Specifically, the traced model lacks some layers for bbox and confidence score prediction, so it cannot be used later. The model losses also do not decrease, so I think rewriting the model is not the problem.
Set up:
raw_model = torch.load(args.model_pretrained_dir)
with model_tracer():
raw_model.to(device)
quantizer = QATQuantizer(raw_model, dummy_input, work_dir='TinyNeuralNet/out',
config={'asymmetric': True, 'per_tensor': True, 'override_qconfig_func': set_ptq_fake_quantize})
qat_model = quantizer.quantize()
Error report:
Ultralytics YOLOv8.0.124 🚀 Python-3.8.18 torch-2.0.0+cu117 CUDA:0 (Tesla V100-PCIE-32GB, 32501MiB)
Epoch GPU_mem box_loss cls_loss dfl_loss Instances Size
Ultralytics YOLOv8.0.124 🚀 Python-3.8.18 torch-2.0.0+cu117 CUDA:0 (Tesla V100-PCIE-32GB, 32501MiB)
[INFO] By passing warm up!!!
Class Images Instances Box(P R mAP50 mAP50-95): 0%| | 0/22 [00:00<?, ?it/s]WARNING ⚠️ NMS time limit 6.900s exceeded
OUTPUT TENSOR SHAPE: [torch.Size([128, 65, 40, 40]), torch.Size([128, 65, 20, 20]), torch.Size([128, 65, 10, 10])]
WARNING ⚠️ NMS time limit 6.900s exceeded
Class Images Instances Box(P R mAP50 mAP50-95): 0%| | 0/22 [00:08<?, ?it/s]
Traceback (most recent call last):
File "TinyNeuralNet/yolov8n_quantization.py", line 391, in <module>
qat(args)
File "TinyNeuralNet/yolov8n_quantization.py", line 358, in qat
qat_model = train(args, qat_model, optimizer, scheduler, criterion, train_loader,
File "TinyNeuralNet/yolov8n_quantization.py", line 240, in train
metrics = validator(model=validate_model)
File "/home/vht/anaconda3/envs/perception/lib/python3.8/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/data/hoangtv23/workspace_AIOT/model_compression_flow/ultralytics/yolo/engine/validator.py", line 171, in __call__
self.update_metrics(preds, batch)
File "/data/hoangtv23/workspace_AIOT/model_compression_flow/ultralytics/yolo/v8/detect/val.py", line 109, in update_metrics
self.confusion_matrix.process_batch(predn, labelsn)
File "/data/hoangtv23/workspace_AIOT/model_compression_flow/ultralytics/yolo/utils/metrics.py", line 246, in process_batch
self.matrix[detection_classes[m1[j]], gc] += 1 # correct
IndexError: index 9 is out of bounds for axis 0 with size 2
with model_tracer():
model = YOLO("yolov8n-cls.pt").model
dummy_input = torch.rand((1, 3, 224, 224))
quantizer = PostQuantizer(model, dummy_input, work_dir='out')
ptq_model = quantizer.quantize()
This use case can be traced correctly.
@hoangtv2000 Actually, we will also perform the rewrite if you call quantizer = QATQuantizer(model, dummy_input, work_dir='TinyNeuralNet/out'). If this fails, then it means the TinyNN Graph Tracer is unable to capture the computation flow of the model. Please ensure that you pass in instances of nn.Module for PostQuantizer or QATQuantizer.
Yes, the original model is nn.Module of course. And the rewritten model is nn.Module and is equal to the original model. So, I think the rewritten model is traced right and the problem comes from the optimizer.
I think I know the reason. It is because the computation logic is slightly different between training mode and evaluation mode. Given the fact that we actually rewrite the model and trace only once in a single mode, it is inevitable that the traced graph won't work for the other mode. You may have to patch the generated model code a little bit so that it works for both training and evaluation.
Thank you @peterjc123. Your suggestion does not seem to work. I split the model into two phases, training and evaluation, in forward(), and the training result does not change. So, I think the problem comes from the optimization.
Hi @hoangtv2000, you can try to quantize the YOLOv8 detection model as below:
You need to use PostQuantizer to properly trace the complete model in eval mode.
with model_tracer():
model = YOLO("yolov8n.pt").model
dummy_input = torch.rand((1, 3, 224, 224))
quantizer = PostQuantizer(model, dummy_input, work_dir='out')
ptq_model = quantizer.quantize()
Now you can see the generated xx_q.py and xx_q.pth files in the ./out directory. These are model files that can be loaded and used. To align the differences between train/eval mode in YOLOv8, you need to modify the return.
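For instance, the patched return of the generated forward() might look roughly like the snippet below (the fake_dequant_* names are borrowed from the patch shared later in this thread and will differ in your generated code):
# inside the generated forward() of the traced model
if self.training:
    # training: return only the per-scale outputs consumed by the loss
    return [fake_dequant_1_0, fake_dequant_1_1, fake_dequant_1_2]
else:
    # eval: also return the decoded outputs expected by the validator
    return [fake_dequant_0_0, fake_dequant_0_1], [fake_dequant_1_0, fake_dequant_1_1, fake_dequant_1_2]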
Then, you can use the QAT script normally, but you need to check whether the QAT fake-quantized model's mAP is still usable before training. This means you should run a validation before line 96 of the QAT script:
with model_tracer():
model = YOLO("yolov8n.pt").model
dummy_input = torch.rand((1, 3, 224, 224))
context = DLContext()
...
# set force_overwrite to False to reuse your modified traced model
quantizer = QATQuantizer(model, dummy_input, work_dir='out1', config={'force_overwrite': False})
ptq_model = quantizer.quantize()
ptq_model.eval()
ptq_model.apply(torch.quantization.disable_fake_quant)
ptq_model.apply(torch.quantization.enable_observer)
calibrate(ptq_model, context)
ptq_model.apply(torch.quantization.disable_observer)
ptq_model.apply(torch.quantization.enable_fake_quant)
validate(ptq_model, context)
ptq_model.apply(torch.quantization.enable_fake_quant)
ptq_model.apply(torch.quantization.enable_observer)
...
Thanks for your response @zk1998, I think I found the cause. I loaded the PTQ rewritten model from your step 1 and validated it, and it preserves the same mAP as the original model. But when I run
quantizer = QATQuantizer(model, dummy_input, work_dir='TinyNeuralNet/out',
config={'asymmetric': True, 'per_tensor': True, 'force_rewrite': False, 'rewrite_graph': False})
qat_model = quantizer.quantize()
the mAP of the QAT model drops to zero. So, I think the problem comes from here.
PS: I set 'rewrite_graph': False because if it is True, QATQuantizer will generate another code .py and model .pth, which would delete my edits to the old .py file.
There is no 'force_rewrite': False, only 'force_overwrite': False, and you should also set 'rewrite_graph': True.
I know but it will delete my edits
if self.training:
return [fake_dequant_1_0, fake_dequant_1_1, fake_dequant_1_2]
else:
return [fake_dequant_0_0, fake_dequant_0_1], [fake_dequant_1_0, fake_dequant_1_1, fake_dequant_1_2]
if I set 'rewrite_graph': True
@hoangtv2000 It will not delete your edits if you pass 'force_overwrite': False, as can be seen here. Clearly, it was caused by the typo in your code (force_rewrite NO!!! force_overwrite YES!!!). So you'll need to remove the out directory and perform the steps in https://github.com/alibaba/TinyNeuralNetwork/issues/337#issuecomment-2205330949 again.
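Putting the corrected keys together, the call should look roughly like this (a sketch; the other config values are carried over from your snippets above):
quantizer = QATQuantizer(
    model, dummy_input, work_dir='TinyNeuralNet/out',
    config={'asymmetric': True, 'per_tensor': True,
            'force_overwrite': False,  # correct key name; 'force_rewrite' does not exist
            'rewrite_graph': True})
qat_model = quantizer.quantize()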
Wow, that was strange; here is my script, and it breaks my model. I will try your solution, thank you for helping me so much.
from TinyNeuralNet.out.detectionmodel_q import QDetectionModel
model = QDetectionModel()
dummy_input = torch.rand(1, 3, 320, 320)
model.load_state_dict(torch.load('TinyNeuralNet/out/detectionmodel_q.pth'))
model.to(device=device)
quantizer = QATQuantizer(model, dummy_input, work_dir='TinyNeuralNet/out',
config={'asymmetric': True, 'per_tensor': True, 'force_rewrite': False, 'rewrite_graph': True})
qat_model = quantizer.quantize()
qat_model.to(device=device)
No, you didn't follow our usage. Come on. It's force_overwrite, not force_rewrite, okay?
If you pass in the rewritten model as shown in this piece of code, then you should pass "rewrite_graph": False. Does it work btw?
I saved the rewritten model and loaded it with QATQuantizer() with the settings 'rewrite_graph': False, 'force_overwrite': False. But it takes me 20 epochs to reach 0.04 mAP 😃. Although the output difference between the rewritten model and the original one is zero, I really don't know what happened here. Maybe the optimization method used in the QAT example is not correct for my case.
I was so silly for not paying attention to the answer, sorry for that.
The first time, I rewrote the model with PostQuantizer and saved it, then ran the rewritten model with QATQuantizer with the settings 'asymmetric': True, 'per_tensor': True, 'force_overwrite': False, 'rewrite_graph': False, and it causes huge losses, even though I set no_catch() in the ultralytics source code and separated the rewritten code for training/evaluation.
Btw, if I calibrate the model before training, I get an AssertionError: min nan should be less than max nan at the input fake-quantization node.
Epoch GPU_mem box_loss cls_loss dfl_loss Instances Size
Traceback (most recent call last):
File "TinyNeuralNet/yolov8n_quantization.py", line 399, in <module>
qat(args)
File "TinyNeuralNet/yolov8n_quantization.py", line 365, in qat
qat_model = train(args, qat_model, optimizer, scheduler, criterion, train_loader,
File "TinyNeuralNet/yolov8n_quantization.py", line 212, in train
output = model(batch['img'])
File "/home/vht/anaconda3/envs/perception/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/data/hoangtv23/workspace_AIOT/model_compression_flow/TinyNeuralNet/out/detectionmodel_q.py", line 190, in forward
model_0_conv = self.model_0_conv(fake_quant_0)
File "/home/vht/anaconda3/envs/perception/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
result = forward_call(*args, **kwargs)
File "/home/vht/anaconda3/envs/perception/lib/python3.8/site-packages/torch/ao/nn/intrinsic/qat/modules/conv_fused.py", line 224, in forward
return self._forward(input)
File "/home/vht/anaconda3/envs/perception/lib/python3.8/site-packages/torch/ao/nn/intrinsic/qat/modules/conv_fused.py", line 101, in _forward
return self._forward_approximate(input)
File "/home/vht/anaconda3/envs/perception/lib/python3.8/site-packages/torch/ao/nn/intrinsic/qat/modules/conv_fused.py", line 114, in _forward_approximate
scaled_weight = self.weight_fake_quant(self.weight * scale_factor.reshape(weight_shape))
File "/home/vht/anaconda3/envs/perception/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/vht/anaconda3/envs/perception/lib/python3.8/site-packages/torch/ao/quantization/fake_quantize.py", line 194, in forward
_scale, _zero_point = self.calculate_qparams()
File "/home/vht/anaconda3/envs/perception/lib/python3.8/site-packages/torch/ao/quantization/fake_quantize.py", line 189, in calculate_qparams
return self.activation_post_process.calculate_qparams()
File "/home/vht/anaconda3/envs/perception/lib/python3.8/site-packages/torch/ao/quantization/observer.py", line 508, in calculate_qparams
return self._calculate_qparams(self.min_val, self.max_val)
File "/home/vht/anaconda3/envs/perception/lib/python3.8/site-packages/torch/ao/quantization/observer.py", line 315, in _calculate_qparams
if not check_min_max_valid(min_val, max_val):
File "/home/vht/anaconda3/envs/perception/lib/python3.8/site-packages/torch/ao/quantization/utils.py", line 317, in check_min_max_valid
assert min_val <= max_val, "min {} should be less than max {}".format(
AssertionError: min nan should be less than max nan
So, how about the fake-quantized, QAT-prepared model's mAP before training, which I mentioned in step 3 of https://github.com/alibaba/TinyNeuralNetwork/issues/337#issuecomment-2205330949?
Hi @zk1998, Here are some of my experiments,
(0) Validate original model
Class Images Instances Box(P R mAP50 mAP50-95): 100%|██████████| 22/22 [00:24<00:00, 1.13s/it]
all 2692 10774 0.77 0.564 0.652 0.42
all 2692 10774 0.77 0.564 0.652
(1) PostQuantize to get the rewritten model => Load the rewritten model and validate model performance
Class Images Instances Box(P R mAP50 mAP50-95): 100%|██████████| 22/22 [00:24<00:00, 1.10s/it]
all 2692 10774 0.77 0.564 0.652 0.42
all 2692 10774 0.77 0.564 0.652 0.42
(2) PostQuantize to get the rewritten model => Load the rewritten model and apply QATQuantizer => Validate model performance
Class Images Instances Box(P R mAP50 mAP50-95): 100%|██████████| 22/22 [01:44<00:00, 4.76s/it]
all 2692 10774 0 0 0 0
all 2692 10774 0 0 0 0
(3) PostQuantize to get the rewritten model => Load the rewritten model and apply QATQuantizer => Calibrate on the training set => Validate model performance
Class Images Instances Box(P R mAP50 mAP50-95): 100%|██████████| 22/22 [02:04<00:00, 5.67s/it]
all 2692 10774 0.0227 0.05 0.00356 0.00121
all 2692 10774 0.0227 0.05 0.00356 0.00121
Hi @hoangtv2000. It seems that the quantization error is unacceptable, and QAT will degrade to training from scratch. I am trying to reproduce the YOLOv8 detection quantization experiment, which will take some time.
Yeah, I tried quantization with PostQuantizer() and it shows the same behavior as the QAT quantization method: the mAPs after quantization drop to zero. I checked whether the QAT- or PTQ-ready model had lost its pretrained weights, but it had not.
So the QAT-ready model still preserves the pretrained weights, but why does its performance drop catastrophically?
with model_tracer():
model = torch.load(args.model_pretrained_dir).cpu()
quantizer = PostQuantizer(model, torch.rand(1, 3, 320, 320), work_dir='TinyNeuralNet/out')
ptq_model = quantizer.quantize()
# Do calibration/inference to get quantization param
ptq_model.eval()
ptq_model.apply(torch_q.disable_fake_quant)
ptq_model.apply(torch_q.enable_observer)
context.max_iteration = 100
calibrate(ptq_model, context)
# Disable observer and enable fake quantization to validate model with quantization error
ptq_model.apply(torch_q.disable_observer)
ptq_model.apply(torch_q.enable_fake_quant)
metrics = validator(model=deepcopy(ptq_model))
And the mAPs come to nearly zero.
(4) PostQuantize to get the rewritten model => Load the rewritten model and apply PostQuantizer => Calibrate on the training set => Validate model performance
Class Images Instances Box(P R mAP50 mAP50-95): 100%|██████████| 22/22 [01:38<00:00, 4.49s/it]
all 2692 10774 0.00177 0.127 0.001 0.000334
@hoangtv2000 Would you please organize the data and the code you used to perform training? I think an end-to-end example helps here because we are new to the YOLO framework. There's just too much info in this thread and we may get distracted by something unrelated.
I tested the dummy input/output and the evaluation mAPs with the rewritten model and the original one. The output tensors on the dummy input are the same, but the mAPs are so different that I think the problem comes from a conflict between the training/evaluation code and the rewritten model. I will investigate the cause and solution further and notify you later.
Update: the rewritten model still preserves the mAP, but the QAT-ready model's mAP changes. So the problem comes from the QATQuantizer.quantize() function; the insertion of FakeQuantize into weights and activations drops performance drastically. I tried the MobileOne experiment and it only reduced performance from 0.8 to 0.56, and it reached 0.78 after a few training epochs. Please look at this file, qat_model_architecture.txt, and let me know if the qat_model is problematic.
from TinyNeuralNet.out.detectionmodel_q import QDetectionModel
model = QDetectionModel()
model.load_state_dict(torch.load('TinyNeuralNet/out/detectionmodel_q.pth'))
model.to(device=device)
metrics = validator(model=deepcopy(model)) # mAP = 0.65, not changed.
quantizer = QATQuantizer(model, dummy_input, work_dir='TinyNeuralNet/out',
config={'asymmetric': True, 'per_tensor': True, 'force_overwrite': False, 'rewrite_graph': False})
qat_model = quantizer.quantize()
qat_model.to(device=device)
metrics = validator(model=deepcopy(qat_model )) # mAP = 0.0286, dropped from scratch
I ran a new experiment on YOLOv5 and its mAP50 also dropped from 0.7 to 0.0.
@hoangtv2000 Would you please organize the data and the code you used to perform training? I think an end-to-end example helps here because we are new to the YOLO framework. There's just too much info in this thread and we may get distracted by something unrelated.
All of the code used to train and evaluate YOLOv8 comes from the official ultralytics repo, and the code for YOLOv5 can be found here.
I just took their pretrained model, loaded it, and tested it on your QAT engine.
It will take some time for us to replicate the issue since we are not familiar with the YOLO pipeline. Let's not talk about unrelated things here and focus on one problem, because there are already plenty of them in this thread. Thanks for your patience and understanding.
Hi @hoangtv2000, I replicated the quantization pipeline on COCO8 using yolov8n.pt.
The script is modified from your code in https://github.com/alibaba/TinyNeuralNetwork/issues/337#issuecomment-2199675183. Here is my example script, and the quantization error is acceptable.
import sys, os, argparse, random
sys.path.append(os.getcwd())
RANK = int(os.getenv("RANK", -1))
import numpy as np  # used in setup_seed()
import torch
from pathlib import Path  # used for the experiment/result directories below
import torch.nn as nn
import torch.optim as optim
import torch.quantization as torch_q
from ultralytics import YOLO
from ultralytics.utils.loss import v8DetectionLoss
from tinynn.util.train_util import AverageMeter, DLContext, train, get_device
from tinynn.graph.quantization.quantizer import QATQuantizer, PostQuantizer
from tinynn.graph.tracer import model_tracer
from ultralytics.cfg import get_cfg
from ultralytics.utils import *
from ultralytics.utils import (DEFAULT_CFG, DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, NUM_THREADS, RANK, ROOT,
callbacks, is_git_dir, yaml_load)
# from torchsummary import summary
from copy import copy, deepcopy
# from ultralytics import v8
import ultralytics.models.yolo as v8
from ultralytics.utils.checks import check_imgsz
from ultralytics.data.utils import check_det_dataset
from ultralytics.data import build_dataloader
from ultralytics.utils.torch_utils import de_parallel, torch_distributed_zero_first
from ultralytics.data.dataset import YOLODataset
from ultralytics.utils.files import increment_path
from ultralytics.utils.torch_utils import (
EarlyStopping,
de_parallel,
init_seeds,
one_cycle,
select_device,
strip_optimizer,
)
def get_default_config(DEFAULT_CFG_PATH="ultralytics/cfg/default.yaml"):
# Default configuration
DEFAULT_CFG_DICT = yaml_load(DEFAULT_CFG_PATH)
for k, v in DEFAULT_CFG_DICT.items():
if isinstance(v, str) and v.lower() == "none":
DEFAULT_CFG_DICT[k] = None
DEFAULT_CFG_KEYS = DEFAULT_CFG_DICT.keys()
DEFAULT_CFG = IterableSimpleNamespace(**DEFAULT_CFG_DICT)
return DEFAULT_CFG
"""Build Dataset
"""
def get_dataloader(model, dataset_path, cfg, data, batch_size=16, rank=0, mode='train'):
def build_dataset(model, img_path, mode='train', batch=None):
gs = max(int(de_parallel(model).stride.max() if model else 0), 32)
return YOLODataset(
img_path=img_path,
imgsz=640,
batch_size=batch,
augment=False, # augmentation
hyp=cfg, # TODO: probably add a get_hyps_from_cfg function
rect=False, # rectangular batches
cache=None,
single_cls=False,
stride=gs,
pad=0.0 if mode == 'train' else 0.5,
prefix=colorstr(f'{mode}: '),
# use_segments=False,
# use_keypoints=False,
classes=None,
data=data,
fraction=1.0)
assert mode in ['train', 'val']
with torch_distributed_zero_first(rank): # init dataset *.cache only once if DDP
dataset = build_dataset(model, dataset_path, mode, batch_size)
shuffle = mode == 'train'
if getattr(dataset, 'rect', False) and shuffle:
LOGGER.warning("WARNING ⚠️ 'rect=True' is incompatible with DataLoader shuffle, setting shuffle=False")
shuffle = False
workers = 2 if mode == 'train' else 2
return build_dataloader(dataset, batch_size, workers, shuffle, rank)
def get_validator(batch_size, saved_experiment_dir):
cb = callbacks.get_default_callbacks()
overrides = {'imgsz': 640, 'batch': batch_size, 'conf': 0.25, 'iou': 0.6, 'device': 'cuda:0'}
overrides['rect'] = True
overrides['mode'] = 'val'
overrides.update(overrides)
args = get_cfg(cfg=DEFAULT_CFG, overrides=overrides)
args.data = 'sparseml_quantization/data.yaml'
args.task = 'detect'
args.rect = False
args.imgsz = check_imgsz(640, max_dim=1)
return v8.detect.DetectionValidator(save_dir=saved_experiment_dir, args=args, _callbacks=cb)
def setup_seed(seed, cuda_deterministic=True):
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
if cuda_deterministic: # slower, more reproducible
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
else: # faster, less reproducible
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
def post_tnn(args):
raw_model = torch.load('yolov8n.pt')
dataset_yaml_path = '/data/zhouye/0603/TinyNeuralNetwork/examples/quantization/yolo/coco8.yml'
cfg = get_default_config(DEFAULT_CFG_PATH='/data/miniconda3/envs/torch113/lib/python3.8/site-packages/ultralytics/cfg/default.yaml')
cfg.data = dataset_yaml_path
data = check_det_dataset(dataset_yaml_path)
trainset, testset = data['train'], data.get('val') or data.get('test')
calib_loader = get_dataloader(raw_model['model'], trainset, cfg, data, args.batch_size, RANK, "val")
device = get_device()
context = DLContext()
context.device = device
context.train_loader = calib_loader
test_loader = get_dataloader(raw_model['model'], testset, cfg, data, args.batch_size, RANK, "val")
saved_experiment_dir = "out/exp_yolov8_og_"
saved_experiment_dir = increment_path(Path(saved_experiment_dir))
validator = v8.detect.DetectionValidator(test_loader, save_dir=saved_experiment_dir, args=copy(cfg))
calib_dir_tmp = "out/exp_yolov8_calib_tmp_"
calib_dir_tmp = increment_path(Path(calib_dir_tmp))
calibrator = v8.detect.DetectionValidator(calib_loader, save_dir=calib_dir_tmp, args=copy(cfg))
calibrator.training = False
model = raw_model['model'].float()
print('origin model validating...')
validator(model=deepcopy(model))
dummy_input = torch.rand((1, 3, 640, 640))
with model_tracer():
# Firstly, you should trace model in eval mode,
# then manually modify the traced model to accommodate the differences between evaluation and training modes.
# quantizer = PostQuantizer(
# model, dummy_input, work_dir='out'
# )
# ptq_model = quantizer.quantize()
# exit()
# Once you generate traced model and modified the output and input, use the code below to set qat quantizer.
from out.detectionmodel_q import QDetectionModel
rewrite_model = QDetectionModel()
rewrite_model.load_state_dict(torch.load('out/detectionmodel_q.pth'))
quantizer = QATQuantizer(rewrite_model, dummy_input, work_dir='out', config={'force_overwrite': False, 'rewrite_graph': False})
ptq_model = quantizer.quantize()
# add attr to fit AutoBackend when validating using ultralytics.
def ptq_fuse(self, verbose=None):
return self
import types
ptq_model.fuse = types.MethodType(ptq_fuse, ptq_model)
ptq_model.stride = model.stride
ptq_model.names = model.names
ptq_model.nc = model.nc
# Use ultralytics's validator to do the calibration (or the training, if using QAT).
# !!!! You must keep the image preprocessing the same as in validation (keep this in mind when training).
ptq_model.apply(torch.quantization.disable_fake_quant)
ptq_model.apply(torch.quantization.enable_observer)
ptq_model.eval()
calibrator.args.imgsz = 640
bar = TQDM(calib_loader, desc="calibrating", total=len(calib_loader))
# we do one inference to simulate qat init state
for batch_i, batch in enumerate(bar):
batch = calibrator.preprocess(batch)
ptq_model(batch["img"])
break
# Disable observer and enable fake quantization to validate model with quantization error
ptq_model = deepcopy(ptq_model)
ptq_model = ptq_model.to(device)
dummy_input = dummy_input.to(device)
ptq_model.apply(torch.quantization.disable_observer)
ptq_model.apply(torch.quantization.enable_fake_quant)
ptq_model(dummy_input)
validator.args.imgsz = 640
# Keep the post-process (DFL block) in float to improve accuracy, since quantizing it hurts the quantized model's mAP a lot.
unq_flag = False
print('here are unquantized module')
for name, module in ptq_model.named_modules():
if 'fake_dequant_inner_0_0_0' in name:
unq_flag = True
if (
# the decoding of boxes and class from multiple-level feature maps is hard to quantize
# btw, yolov8 trainer will not train this block, so it should use float for inference.
(unq_flag and name.split('_')[-1].isdigit() and int(name.split('_')[-1]) >= 22)
):
module.apply(torch.quantization.disable_fake_quant)
print(name)
print('validating fake-quantized model...')
quant_result_dir = "out/exp_yolov8_quant"
quant_result_dir = increment_path(Path(quant_result_dir))
validator = v8.detect.DetectionValidator(test_loader, save_dir=quant_result_dir, args=copy(cfg))
validator(model=deepcopy(ptq_model))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--workers', type=int, default=4)
parser.add_argument('--num-epochs', type=int, default=50)
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--base-lr', type=float, default=1e-1)
parser.add_argument('--weight-decay', type=float, default=1e-6)
parser.add_argument('--model-pretrained-dir', type=str,
default='oto_pruning/cache/version_split_bbox_conf/DetectionModel_compressed.pt')
setup_seed(seed=2048, cuda_deterministic=False)
args = parser.parse_args()
post_tnn(args)
Something you must pay attention to:
So I believe YOLOv8 can achieve full INT8 quantization, whether using PTQ or QAT.
You can check the script to get more details.
Hi @zk1998, I really did it, thank you so much!!!
Hi @peterjc123, I am adapting your quantization-aware training method to my YOLOv8 compression flow. I followed this pipeline to train my model:
And I met this error: the loss metrics of the QAT flow are huge. I think incorrect fake quantization of the input causes this error.
Do you know how to fix it? Thank you.