ocean-data-factory-sweden / kso

Notebooks to upload/download marine footage, connect to a citizen science project, train machine learning models and publish marine biological observations.
GNU General Public License v3.0
5 stars 12 forks source link

Tutorial 5 Model training, issue with training the model using #286

Closed Bergylta closed 1 year ago

Bergylta commented 1 year ago

🐛 Bug

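Training a model in Tutorial 5 fails with an AttributeError when the Yolo5m-classifier weights are selected together with the "Object detection model" model type.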

To Reproduce (REQUIRED)

Input: Project: KSO; Select model: Yolo5m-classifier; Model type: Object detection model

mlp.train_yolov5(
    exp_name.value,
    weights.artifact_path,
    project,
    epochs=epochs.value,
    batch_size=batch_size.value,
    img_size=(img_h.value, img_w.value),
)

Output:

AttributeError                            Traceback (most recent call last)
Cell In[16], line 1
----> 1 mlp.train_yolov5(
      2     exp_name.value,
      3     weights.artifact_path,
      4     project,
      5     epochs=epochs.value,
      6     batch_size=batch_size.value,
      7     img_size=(img_h.value, img_w.value),
      8 )

File /usr/src/app/kso-dev/kso_utils/project.py:1255, in MLProjectProcessor.train_yolov5(self, exp_name, weights, project, epochs, batch_size, img_size)
   1251 def train_yolov5(
   1252     self, exp_name, weights, project, epochs=50, batch_size=16, img_size=[640, 640]
   1253 ):
   1254     if self.model_type == 1:
-> 1255         self.modules["train"].run(
   1256             entity=self.team_name,
   1257             data=self.data_path,
   1258             hyp=self.hyp_path,
   1259             weights=weights,
   1260             project=project,
   1261             name=exp_name,
   1262             imgsz=img_size,
   1263             batch_size=int(batch_size),
   1264             epochs=epochs,
   1265             single_cls=False,
   1266             cache_images=True,
   1267             upload_dataset=True,
   1268         )
   1269     elif self.model_type == 2:
   1270         self.modules["train"].run(
   1271             entity=self.team_name,
   1272             data=self.data_path,
   (...)
   1278             epochs=epochs,
   1279         )

File /usr/src/app/kso/yolov5/train.py:627, in run(**kwargs)
    625 for k, v in kwargs.items():
    626     setattr(opt, k, v)
--> 627 main(opt)
    628 return opt

File /usr/src/app/kso/yolov5/train.py:527, in main(opt, callbacks)
    525 # Train
    526 if not opt.evolve:
--> 527     train(opt.hyp, opt, device, callbacks)
    529 # Evolve hyperparameters (optional)
    530 else:
    531     # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
    532     meta = {
    533         'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
    534         'lrf': (1, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
   (...)
    560         'mixup': (1, 0.0, 1.0),  # image mixup (probability)
    561         'copy_paste': (1, 0.0, 1.0)}  # segment copy-paste (probability)

File /usr/src/app/kso/yolov5/train.py:124, in train(hyp, opt, device, callbacks)
    122     weights = attempt_download(weights)  # download if not found locally
    123 ckpt = torch.load(weights, map_location='cpu')  # load checkpoint to CPU to avoid CUDA memory leak
--> 124 model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create
    125 exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else []  # exclude keys
    126 csd = ckpt['model'].float().state_dict()  # checkpoint state_dict as FP32

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1614, in Module.__getattr__(self, name)
   1612     if name in modules:
   1613         return modules[name]
-> 1614 raise AttributeError("'{}' object has no attribute '{}'".format(
   1615     type(self).__name__, name))

AttributeError: 'ClassificationModel' object has no attribute 'yaml'
jannesgg commented 1 year ago

@Bergylta The problem could be this setting: "Model type: Object detection model".

If you are using the classification weights, you should select the classification model type. For object detection, you need to select the yolov5m model instead of the classifier.

Bergylta commented 1 year ago

Thanks @jannesgg, that might have been it. I tried the baseline-Yolov5 model instead, but got a different error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[19], line 1
----> 1 mlp.train_yolov5(
      2     exp_name.value,
      3     weights.artifact_path,
      4     project,
      5     epochs=epochs.value,
      6     batch_size=batch_size.value,
      7     img_size=(img_h.value, img_w.value),
      8 )

File /usr/src/app/kso-dev/kso_utils/project.py:1255, in MLProjectProcessor.train_yolov5(self, exp_name, weights, project, epochs, batch_size, img_size)
   1251 def train_yolov5(
   1252     self, exp_name, weights, project, epochs=50, batch_size=16, img_size=[640, 640]
   1253 ):
   1254     if self.model_type == 1:
-> 1255         self.modules["train"].run(
   1256             entity=self.team_name,
   1257             data=self.data_path,
   1258             hyp=self.hyp_path,
   1259             weights=weights,
   1260             project=project,
   1261             name=exp_name,
   1262             imgsz=img_size,
   1263             batch_size=int(batch_size),
   1264             epochs=epochs,
   1265             single_cls=False,
   1266             cache_images=True,
   1267             upload_dataset=True,
   1268         )
   1269     elif self.model_type == 2:
   1270         self.modules["train"].run(
   1271             entity=self.team_name,
   1272             data=self.data_path,
   (...)
   1278             epochs=epochs,
   1279         )

File /usr/src/app/kso/yolov5/train.py:627, in run(**kwargs)
    625 for k, v in kwargs.items():
    626     setattr(opt, k, v)
--> 627 main(opt)
    628 return opt

File /usr/src/app/kso/yolov5/train.py:527, in main(opt, callbacks)
    525 # Train
    526 if not opt.evolve:
--> 527     train(opt.hyp, opt, device, callbacks)
    529 # Evolve hyperparameters (optional)
    530 else:
    531     # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
    532     meta = {
    533         'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
    534         'lrf': (1, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
   (...)
    560         'mixup': (1, 0.0, 1.0),  # image mixup (probability)
    561         'copy_paste': (1, 0.0, 1.0)}  # segment copy-paste (probability)

File /usr/src/app/kso/yolov5/train.py:187, in train(hyp, opt, device, callbacks)
    184     LOGGER.info('Using SyncBatchNorm()')
    186 # Trainloader
--> 187 train_loader, dataset = create_dataloader(train_path,
    188                                           imgsz,
    189                                           batch_size // WORLD_SIZE,
    190                                           gs,
    191                                           single_cls,
    192                                           hyp=hyp,
    193                                           augment=True,
    194                                           cache=None if opt.cache == 'val' else opt.cache,
    195                                           rect=opt.rect,
    196                                           rank=LOCAL_RANK,
    197                                           workers=workers,
    198                                           image_weights=opt.image_weights,
    199                                           quad=opt.quad,
    200                                           prefix=colorstr('train: '),
    201                                           shuffle=True)
    202 labels = np.concatenate(dataset.labels, 0)
    203 mlc = int(labels[:, 0].max())  # max label class

File /usr/src/app/kso/yolov5/utils/dataloaders.py:123, in create_dataloader(path, imgsz, batch_size, stride, single_cls, hyp, augment, cache, pad, rect, rank, workers, image_weights, quad, prefix, shuffle)
    121     shuffle = False
    122 with torch_distributed_zero_first(rank):  # init dataset *.cache only once if DDP
--> 123     dataset = LoadImagesAndLabels(
    124         path,
    125         imgsz,
    126         batch_size,
    127         augment=augment,  # augmentation
    128         hyp=hyp,  # hyperparameters
    129         rect=rect,  # rectangular batches
    130         cache_images=cache,
    131         single_cls=single_cls,
    132         stride=int(stride),
    133         pad=pad,
    134         image_weights=image_weights,
    135         prefix=prefix)
    137 batch_size = min(batch_size, len(dataset))
    138 nd = torch.cuda.device_count()  # number of CUDA devices

File /usr/src/app/kso/yolov5/utils/dataloaders.py:456, in LoadImagesAndLabels.__init__(self, path, img_size, batch_size, augment, hyp, rect, image_weights, cache_images, single_cls, stride, pad, min_items, prefix)
    454 self.rect = False if image_weights else rect
    455 self.mosaic = self.augment and not self.rect  # load 4 images at a time into a mosaic (only during training)
--> 456 self.mosaic_border = [-img_size // 2, -img_size // 2]
    457 self.stride = stride
    458 self.path = path

TypeError: bad operand type for unary -: 'list'