Open ciaoyizhen opened 2 weeks ago
MODEL_NAME = "microsoft/conditional-detr-resnet-50" # or "facebook/detr-resnet-50"
IMAGE_SIZE = 480
from datasets import load_dataset
cppe5 = load_dataset("cppe-5")
import numpy as np
import os
from PIL import Image, ImageDraw
categories = cppe5["train"].features["objects"].feature["category"].names
id2label = {index: x for index, x in enumerate(categories, start=0)}
label2id = {v: k for k, v in id2label.items()}
from transformers import AutoImageProcessor
MAX_SIZE = IMAGE_SIZE
image_processor = AutoImageProcessor.from_pretrained(
MODEL_NAME,
do_resize=True,
size={"max_height": MAX_SIZE, "max_width": MAX_SIZE},
do_pad=True,
pad_size={"height": MAX_SIZE, "width": MAX_SIZE},
)
# import albumentations as A
# train_augment_and_transform = A.Compose(
# [
# A.Perspective(p=0.1),
# A.HorizontalFlip(p=0.5),
# A.RandomBrightnessContrast(p=0.5),
# A.HueSaturationValue(p=0.1),
# ],
# bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
# )
# validation_transform = A.Compose(
# [A.NoOp()],
# bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
# )
train_augment_and_transform = None
validation_transform = None
def format_image_annotations_as_coco(image_id, categories, areas, bboxes):
"""Format one set of image annotations to the COCO format
Args:
image_id (str): image id. e.g. "0001"
categories (List[int]): list of categories/class labels corresponding to provided bounding boxes
areas (List[float]): list of corresponding areas to provided bounding boxes
bboxes (List[Tuple[float]]): list of bounding boxes provided in COCO format
([center_x, center_y, width, height] in absolute coordinates)
Returns:
dict: {
"image_id": image id,
"annotations": list of formatted annotations
}
"""
annotations = []
for category, area, bbox in zip(categories, areas, bboxes):
formatted_annotation = {
"image_id": image_id,
"category_id": category,
"iscrowd": 0,
"area": area,
"bbox": list(bbox),
}
annotations.append(formatted_annotation)
return {
"image_id": image_id,
"annotations": annotations,
}
def augment_and_transform_batch(examples, transform, image_processor, return_pixel_mask=False):
"""Apply augmentations and format annotations in COCO format for object detection task"""
images = []
annotations = []
for image_id, image, objects in zip(examples["image_id"], examples["image"], examples["objects"]):
image = np.array(image.convert("RGB"))
# apply augmentations
# output = transform(image=image, bboxes=objects["bbox"], category=objects["category"])
images.append(image)
# format annotations in COCO format
formatted_annotations = format_image_annotations_as_coco(
image_id, objects["category"], objects["area"], objects["bbox"]
)
annotations.append(formatted_annotations)
# Apply the image processor transformations: resizing, rescaling, normalization
result = image_processor(images=images, annotations=annotations, return_tensors="pt")
if not return_pixel_mask:
result.pop("pixel_mask", None)
return result
from functools import partial
# Make transform functions for batch and apply for dataset splits
train_transform_batch = partial(
augment_and_transform_batch, transform=train_augment_and_transform, image_processor=image_processor
)
validation_transform_batch = partial(
augment_and_transform_batch, transform=validation_transform, image_processor=image_processor
)
cppe5["train"] = cppe5["train"].with_transform(train_transform_batch)
cppe5["test"] = cppe5["test"].with_transform(validation_transform_batch)
import torch
def collate_fn(batch):
data = {}
data["pixel_values"] = torch.stack([x["pixel_values"] for x in batch])
data["labels"] = [x["labels"] for x in batch]
if "pixel_mask" in batch[0]:
data["pixel_mask"] = torch.stack([x["pixel_mask"] for x in batch])
return data
from transformers.image_transforms import center_to_corners_format
def convert_bbox_yolo_to_pascal(boxes, image_size):
"""
Convert bounding boxes from YOLO format (x_center, y_center, width, height) in range [0, 1]
to Pascal VOC format (x_min, y_min, x_max, y_max) in absolute coordinates.
Args:
boxes (torch.Tensor): Bounding boxes in YOLO format
image_size (Tuple[int, int]): Image size in format (height, width)
Returns:
torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max)
"""
# convert center to corners format
boxes = center_to_corners_format(boxes)
# convert to absolute coordinates
height, width = image_size
boxes = boxes * torch.tensor([[width, height, width, height]])
return boxes
import numpy as np
from dataclasses import dataclass
from torchmetrics.detection.mean_ap import MeanAveragePrecision
@dataclass
class ModelOutput:
logits: torch.Tensor
pred_boxes: torch.Tensor
@torch.no_grad()
def compute_metrics(evaluation_results, image_processor, threshold=0.0, id2label=None):
"""
Compute mean average mAP, mAR and their variants for the object detection task.
Args:
evaluation_results (EvalPrediction): Predictions and targets from evaluation.
threshold (float, optional): Threshold to filter predicted boxes by confidence. Defaults to 0.0.
id2label (Optional[dict], optional): Mapping from class id to class name. Defaults to None.
Returns:
Mapping[str, float]: Metrics in a form of dictionary {<metric_name>: <metric_value>}
"""
predictions, targets = evaluation_results.predictions, evaluation_results.label_ids
# For metric computation we need to provide:
# - targets in a form of list of dictionaries with keys "boxes", "labels"
# - predictions in a form of list of dictionaries with keys "boxes", "scores", "labels"
image_sizes = []
post_processed_targets = []
post_processed_predictions = []
# Collect targets in the required format for metric computation
for batch in targets:
# collect image sizes, we will need them for predictions post processing
batch_image_sizes = torch.tensor(np.array([x["orig_size"] for x in batch]))
image_sizes.append(batch_image_sizes)
# collect targets in the required format for metric computation
# boxes were converted to YOLO format needed for model training
# here we will convert them to Pascal VOC format (x_min, y_min, x_max, y_max)
for image_target in batch:
boxes = torch.tensor(image_target["boxes"])
boxes = convert_bbox_yolo_to_pascal(boxes, image_target["orig_size"])
labels = torch.tensor(image_target["class_labels"])
post_processed_targets.append({"boxes": boxes, "labels": labels})
# Collect predictions in the required format for metric computation,
# model produce boxes in YOLO format, then image_processor convert them to Pascal VOC format
for batch, target_sizes in zip(predictions, image_sizes):
batch_logits, batch_boxes = batch[1], batch[2]
output = ModelOutput(logits=torch.tensor(batch_logits), pred_boxes=torch.tensor(batch_boxes))
post_processed_output = image_processor.post_process_object_detection(
output, threshold=threshold, target_sizes=target_sizes
)
post_processed_predictions.extend(post_processed_output)
# Compute metrics
metric = MeanAveragePrecision(box_format="xyxy", class_metrics=True)
metric.update(post_processed_predictions, post_processed_targets)
metrics = metric.compute()
# Replace list of per class metrics with separate metric for each class
classes = metrics.pop("classes")
map_per_class = metrics.pop("map_per_class")
mar_100_per_class = metrics.pop("mar_100_per_class")
for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class):
class_name = id2label[class_id.item()] if id2label is not None else class_id.item()
metrics[f"map_{class_name}"] = class_map
metrics[f"mar_100_{class_name}"] = class_mar
metrics = {k: round(v.item(), 4) for k, v in metrics.items()}
return metrics
eval_compute_metrics_fn = partial(
compute_metrics, image_processor=image_processor, id2label=id2label, threshold=0.0
)
from transformers import AutoModelForObjectDetection
model = AutoModelForObjectDetection.from_pretrained(
MODEL_NAME,
id2label=id2label,
label2id=label2id,
ignore_mismatched_sizes=True,
)
from transformers import TrainingArguments
training_args = TrainingArguments(
output_dir="detr_finetuned_cppe5",
num_train_epochs=30,
fp16=False,
per_device_train_batch_size=4,
dataloader_num_workers=0,
learning_rate=5e-5,
lr_scheduler_type="cosine",
weight_decay=1e-4,
max_grad_norm=0.01,
metric_for_best_model="eval_map",
greater_is_better=True,
load_best_model_at_end=True,
eval_strategy="epoch",
save_strategy="epoch",
# save_total_limit=2,
remove_unused_columns=False,
eval_do_concat_batches=False,
# push_to_hub=True,
)
from transformers import Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=cppe5["train"],
eval_dataset=cppe5["test"],
tokenizer=image_processor,
data_collator=collate_fn,
compute_metrics=eval_compute_metrics_fn,
)
trainer.train()
this is the code which is ok in colab but not ok in my computer
this is colab. the loss can be reduce to 1
this is in my computer, I try many time but loss only reduce to 3. the code is same.
Hi @ciaoyizhen thanks for opening an issue!
Can you please add information regarding the environment used in Colab? Is it the same?
@qubvel colab result transformers-cli env
24-08-28 12:04:26.243851: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-28 12:04:26.306691: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-28 12:04:26.319043: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-28 12:04:26.341982: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-08-28 12:04:28.593070: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
WARNING:tensorflow:From /usr/local/lib/python3.10/dist-packages/transformers/commands/env.py:102: is_gpu_available (from tensorflow.python.framework.test_util) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1724846673.592282 704 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1724846673.973127 704 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1724846673.973433 704 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1724846674.474514 704 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1724846674.474981 704 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-08-28 12:04:34.475204: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
I0000 00:00:1724846674.475303 704 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-08-28 12:04:34.480232: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /device:GPU:0 with 13949 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points.
- `transformers` version: 4.42.4
- Platform: Linux-6.1.85+-x86_64-with-glibc2.35
- Python version: 3.10.12
- Huggingface_hub version: 0.23.5
- Safetensors version: 0.4.4
- Accelerate version: 0.32.1
- Accelerate config: not found
- PyTorch version (GPU?): 2.4.0+cu121 (True)
- Tensorflow version (GPU?): 2.17.0 (True)
- Flax version (CPU?/GPU?/TPU?): 0.8.4 (gpu)
- Jax version: 0.4.26
- JaxLib version: 0.4.26
- Using distributed or parallel set-up in script?: <fill in>
- Using GPU in script?: <fill in>
- GPU type: Tesla T4
Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points.
- `transformers` version: 4.44.2
- Platform: Linux-5.15.0-119-generic-x86_64-with-glibc2.31
- Python version: 3.11.8
- Huggingface_hub version: 0.24.6
- Safetensors version: 0.4.4
- Accelerate version: 0.32.1
- Accelerate config: not found
- PyTorch version (GPU?): 2.2.1+cu121 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using distributed or parallel set-up in script?: <fill in>
- Using GPU in script?: <fill in>
- GPU type: NVIDIA GeForce RTX 3060
I rented a 3060. He is also capable of convergence
Also I retrained my macbook once! Still no convergence
I think this is a macbook error???
Hi @ciaoyizhen, thanks for sharing these details! With two different transformers and pytorch versions running in the environment, it's very difficult to compare the two runs or pinpoint what might be the issue, as there could be many contributing factors.
I'd suggest creating a virtual environment for running on each hardware and ensure the installed libraries are the same and then re-run the comparison
System Info
Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points.
transformers
version: 4.44.0Who can help?
@amyeroberts I'm working on an open source project The purpose is to use huggingface to train a task, only need to modify the yaml file can be adapted to a variety of model training. I've already completed the image classification Currently working on object detection object detect This is an unfinished product, and while the training works fine, there is a bug that the model does not converge. So I checked huggingface object detect tutorial. Then I ran him on a colab and he was able to converge, then I ran the same code on my own computer, a macbook, and the model did not converge. I really can't clear what's causing this, can any of you tell me how to fix it?
Information
Tasks
examples
folder (such as GLUE/SQuAD, ...)Reproduction
you can clone my code in github
running code The model does not converge
Expected behavior
the model can be converge