OML-Team / open-metric-learning

Metric learning and retrieval pipelines, models and zoo.
https://open-metric-learning.readthedocs.io/en/latest/index.html
Apache License 2.0
877 stars 61 forks

Training on the cars196 dataset with Pairwise postprocessing runs into problems #555

Closed snow-wind-001 closed 4 months ago

snow-wind-001 commented 5 months ago

I would like help with a training problem. I used the cars196 dataset to test the training process of Pairwise postprocessing, but training is very slow and the metrics stay very low. Please help me figure out how to set this up.

snow-wind-001 commented 5 months ago

@chang48 @dapladoc @churnikov @alexmelekhin

AlekseySh commented 5 months ago

Hi, @snow-wind-001

Thank you for your interest in OML. What is the problem exactly? Please provide us with more information.

snow-wind-001 commented 5 months ago

My level is very limited, and the problem may also be due to my lack of understanding of this project. I ran the postprocessor.ipynb file and modified it so that the training code saves the model and enables testing, but as training progressed the metrics remained almost unchanged. Please help me point out my mistake.

from pprint import pprint
from pathlib import Path
from typing import Optional, Union

import torch
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader

from oml.datasets.base import DatasetWithLabels, DatasetQueryGallery
from oml.inference.flat import inference_on_dataframe
from oml.metrics.embeddings import EmbeddingMetrics
from oml.miners.pairs import PairsMiner
from oml.models import ConcatSiamese, ViTExtractor
from oml.retrieval.postprocessors.pairwise import PairwiseImagesPostprocessor
from oml.samplers.balance import BalanceSampler
from oml.transforms.images.torchvision import get_normalisation_resize_torch
from oml.utils.download_mock_dataset import download_mock_dataset
from oml.utils.io import download_checkpoint_one_of  # missing from the original snippet; assumed location of this helper

dataset_root = "mock_dataset/" download_mock_dataset(dataset_root)

class CustomViTExtractor(ViTExtractor):
    def __init__(self, arch: str = "vits16", normalise_features: bool = False, use_multi_scale: bool = False, weights: Optional[Union[Path, str]] = None):
        super().__init__(weights=None, arch=arch, normalise_features=normalise_features, use_multi_scale=use_multi_scale)
        if weights is not None:
            self.load_pretrained_weights(weights)

    def load_pretrained_weights(self, weights: Union[Path, str]):
        if isinstance(weights, str) and Path(weights).exists():
            ckpt = torch.load(weights, map_location="cpu")
            state_dict = ckpt["state_dict"] if "state_dict" in ckpt else ckpt
        elif weights in self.pretrained_models:
            pretrained = self.pretrained_models[weights]
            downloaded_weights = download_checkpoint_one_of(url_or_fid_list=pretrained["url"], hash_md5=pretrained["hash"], fname=pretrained["fname"])
            ckpt = torch.load(downloaded_weights, map_location="cpu")
            state_dict = ckpt["state_dict"] if "state_dict" in ckpt else ckpt
        else:
            raise ValueError(f"Cannot locate the weights file at {weights} and it's not a predefined model key.")
        self.model.load_state_dict(state_dict, strict=False)
        pprint("Loaded weights into the model!!")

    @classmethod
    def from_pretrained(cls, path: Union[Path, str], arch: str = "vits16", normalise_features: bool = False, use_multi_scale: bool = False):
        return cls(weights=path, arch=arch, normalise_features=normalise_features, use_multi_scale=use_multi_scale)

extractor = CustomViTExtractor.from_pretrained('/content/best_siamese_model.pth', arch='vits16')
transform = get_normalisation_resize_torch(im_size=64)
embeddings_train, embeddings_val, df_train, df_val = inference_on_dataframe(dataset_root, "df.csv", extractor=extractor, transforms=transform)

siamese = ConcatSiamese(extractor=extractor, mlp_hidden_dims=[100])
optimizer = torch.optim.SGD(siamese.parameters(), lr=1e-6)
miner = PairsMiner(hard_mining=True)
criterion = BCEWithLogitsLoss()

train_dataset = DatasetWithLabels(df=df_train, transform=transform, extra_data={"embeddings": embeddings_train})
batch_sampler = BalanceSampler(train_dataset.get_labels(), n_labels=3, n_instances=4)
train_loader = DataLoader(train_dataset, batch_sampler=batch_sampler)

epochs = 1000
best_map = 0
best_model_path = "best_siamese_model.pth"

for epoch in range(epochs):
    for batch in train_loader:
        ids1, ids2, is_negative_pair = miner.sample(features=batch["embeddings"], labels=batch["labels"])
        probs = siamese(x1=batch["input_tensors"][ids1], x2=batch["input_tensors"][ids2])
        loss = criterion(probs, is_negative_pair.float())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

    if epoch % 20 == 0:
        val_dataset = DatasetQueryGallery(df=df_val, extra_data={"embeddings": embeddings_val}, transform=transform)
        valid_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

        postprocessor = PairwiseImagesPostprocessor(top_n=3, pairwise_model=siamese, transforms=transform)
        calculator = EmbeddingMetrics(postprocessor=postprocessor)
        calculator.setup(num_samples=len(val_dataset))

        for batch in valid_loader:
            calculator.update_data(data_dict=batch)

        metrics = calculator.compute_metrics()
        pprint(metrics['OVERALL']['map'][5])

        current_map = metrics['OVERALL']['map'][5]
        if current_map > best_map:
            best_map = current_map
            torch.save(siamese.state_dict(), best_model_path)
            print(f"New best mAP: {best_map} saved")

siamese.load_state_dict(torch.load(best_model_path))

val_dataset = DatasetQueryGallery(df=df_val, extra_data={"embeddings": embeddings_val}, transform=transform)
valid_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

postprocessor = PairwiseImagesPostprocessor(top_n=3, pairwise_model=siamese, transforms=transform)
calculator = EmbeddingMetrics(postprocessor=postprocessor)
calculator.setup(num_samples=len(val_dataset))

for batch in valid_loader:
    calculator.update_data(data_dict=batch)

metrics = calculator.compute_metrics()
pprint(metrics)

Epoch 782/1000, Loss: 0.723181962966919
Epoch 783/1000, Loss: 0.6584874391555786
Epoch 784/1000, Loss: 0.6846531629562378
Epoch 785/1000, Loss: 0.7408323884010315
Epoch 786/1000, Loss: 0.7202926874160767
Epoch 787/1000, Loss: 0.7067816257476807
Epoch 788/1000, Loss: 0.7001203298568726
Epoch 789/1000, Loss: 0.7115796208381653
Epoch 790/1000, Loss: 0.7126930356025696
Epoch 791/1000, Loss: 0.6817541122436523
Epoch 792/1000, Loss: 0.6886244416236877
Epoch 793/1000, Loss: 0.7140399217605591
Epoch 794/1000, Loss: 0.7016043663024902
Epoch 795/1000, Loss: 0.7147329449653625
Epoch 796/1000, Loss: 0.7019691467285156
Epoch 797/1000, Loss: 0.7078322172164917
Epoch 798/1000, Loss: 0.6625533699989319
Epoch 799/1000, Loss: 0.7391482591629028
Epoch 800/1000, Loss: 0.6767370104789734
Epoch 801/1000, Loss: 0.7265196442604065

Postprocessor's inference has been started... cpu: 100%  1/1 [00:00<00:00,  3.66it/s]

Metrics:
{'OVERALL': {'cmc': {5: tensor(0.7500)}, 'map': {5: tensor(0.4167)}, 'pcf': {0.5: tensor(0.0052)}, 'precision': {5: tensor(0.6250)}}}

@AlekseySh

snow-wind-001 commented 5 months ago

https://colab.research.google.com/drive/1o5CQbbafdKyiooewOHfAqmxJ1b--6UJ8?usp=drive_link

AlekseySh commented 5 months ago

@snow-wind-001 have you tried increasing the learning rate?

There is no access to your Colab.
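For example, a quick thing to try in your snippet (just an illustrative sketch; the exact optimizer and values are up to you, and `siamese` is the model from your code above):

# Illustrative sketch only: lr=1e-6 with plain SGD is likely too small for the
# siamese head to learn anything; try a few orders of magnitude higher, e.g.:
import torch

optimizer = torch.optim.Adam(siamese.parameters(), lr=1e-4)  # instead of SGD(lr=1e-6)

# optionally decay the learning rate over the long run
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=200, gamma=0.5)
# ...and call scheduler.step() once per epoch at the end of the epoch loop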

AlekseySh commented 4 months ago

Check the new example on how to train and use a model as a postprocessor in OML 3.0: https://open-metric-learning.readthedocs.io/en/latest/postprocessing/postprocessing_home.html
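Roughly, the flow on that page boils down to the steps below (an outline only; the class names and arguments such as PairwiseReranker and RetrievalResults.from_embeddings are my assumptions about the 3.0 API and may differ in your version, so follow the linked docs for the exact calls; `extractor`, `siamese`, and `valid_dataset` stand for the objects from the snippet above):

# Outline of the OML 3.0 postprocessing flow (names and arguments are best-effort
# assumptions based on the linked docs page; check the docs for the exact API).
from oml.inference import inference
from oml.metrics import calc_retrieval_metrics_rr
from oml.retrieval import RetrievalResults
from oml.retrieval.postprocessors.pairwise import PairwiseReranker

# 1) embed the validation dataset with the frozen extractor
embeddings = inference(extractor, valid_dataset, batch_size=4, num_workers=0)

# 2) build top-k retrieval results from plain embedding distances
rr = RetrievalResults.from_embeddings(embeddings, valid_dataset, n_items=10)

# 3) re-rank the top of each retrieved list with the trained pairwise (siamese) model
reranker = PairwiseReranker(top_n=3, pairwise_model=siamese, batch_size=4, num_workers=0)
rr_reranked = reranker.process(rr, dataset=valid_dataset)

# 4) compare metrics before / after re-ranking
print(calc_retrieval_metrics_rr(rr, map_top_k=(5,)))
print(calc_retrieval_metrics_rr(rr_reranked, map_top_k=(5,)))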

If you still need help here, just re-open this issue.