Refactor code in train.py

pvti commented 4 years ago

Hi @ZQPei, I'm currently working on a tracking-by-detection project closely related to DeepSORT, and I've spent a lot of effort getting your repo to work for me. Would you mind rearranging the code in train.py? Below is my rewrite of train.py, in which each group of related statements is moved into its own function:

import os, time
import numpy as np
import argparse
import torch
import torchvision
import torch.backends.cudnn as cudnn
from model import Net

def get_parser():
    """Build the command-line parser for the feature-extractor training script.

    Returns:
        argparse.ArgumentParser: parser exposing ``--data-dir``, ``--no-cuda``,
        ``--gpu-id``, ``--learning-rate``, ``--interval``/``-i`` and
        ``--resume``/``-r``.
    """
    # Each entry is (option flags, keyword arguments for add_argument). The
    # options are registered in table order, so the --help listing keeps the
    # same ordering.
    option_table = (
        (('--data-dir',), {'default': './Mars', 'type': str, 'help': 'Path to data directory, e.g. ./Mars or ./Market1501'}),
        (('--no-cuda',), {'action': 'store_true'}),
        (("--gpu-id",), {'default': 0, 'type': int}),
        (('--learning-rate',), {'default': 0.1, 'type': float, 'help': 'learning rate'}),
        (('--interval', '-i'), {'default': 20, 'type': int}),
        (('--resume', '-r'), {'action': 'store_true'}),
    )
    cli = argparse.ArgumentParser(description='Train feature extractor for DeepSort')
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli

def setup_device(gpu_id, no_cuda):
    """Choose the device string used for training.

    Args:
        gpu_id: index formatted into the ``"cuda:<id>"`` device string.
        no_cuda: when true, CUDA is not used even if it is available.

    Returns:
        str: ``"cuda:<gpu_id>"`` when CUDA is available and allowed,
        otherwise ``"cpu"``.
    """
    use_gpu = torch.cuda.is_available() and not no_cuda
    if not use_gpu:
        return "cpu"
    # The cuDNN benchmark flag is only switched on for the GPU path.
    cudnn.benchmark = True
    return "cuda:{}".format(gpu_id)

def load_data(data_dir):
    """Create the training and test data loaders.

    Images are read with ``torchvision.datasets.ImageFolder`` from
    ``<data_dir>/train`` and ``<data_dir>/test``.

    Args:
        data_dir: dataset root containing the ``train`` and ``test`` folders.

    Returns:
        tuple: ``(trainloader, testloader, num_classes)`` where
        ``num_classes`` is the number of classes found in the training set.
    """
    train_dir = os.path.join(data_dir, 'train')
    test_dir = os.path.join(data_dir, 'test')

    transforms = torchvision.transforms
    normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

    # Random augmentation is applied to the training split only.
    transform_train = transforms.Compose([
        transforms.RandomCrop((128, 64), padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    # Evaluation must be deterministic: a plain resize to the network input
    # size replaces the random crop/flip, so test accuracy is reproducible and
    # comparable between epochs.
    transform_test = transforms.Compose([
        transforms.Resize((128, 64)),
        transforms.ToTensor(),
        normalize,
    ])

    trainloader = torch.utils.data.DataLoader(
        torchvision.datasets.ImageFolder(train_dir, transform=transform_train),
        batch_size=64, shuffle=True,
    )
    testloader = torch.utils.data.DataLoader(
        torchvision.datasets.ImageFolder(test_dir, transform=transform_test),
        batch_size=64, shuffle=True,
    )
    num_classes = len(trainloader.dataset.classes)
    return trainloader, testloader, num_classes

def define_net(num_classes, resume):
    """Create the network and optionally restore it from the checkpoint.

    Args:
        num_classes: number of output classes passed to ``Net``.
        resume: when true, load weights, best accuracy and epoch from
            ``./checkpoint/ckpt.t7``.

    Returns:
        tuple: ``(net, best_acc, start_epoch)``; ``best_acc`` is ``0.`` and
        ``start_epoch`` is ``0`` when not resuming.

    Raises:
        FileNotFoundError: if ``resume`` is set and the checkpoint file does
            not exist.
    """
    checkpoint_path = './checkpoint/ckpt.t7'
    net = Net(num_classes=num_classes)
    if not resume:
        return net, 0., 0

    # An explicit raise (rather than assert) keeps the check active under -O.
    if not os.path.isfile(checkpoint_path):
        raise FileNotFoundError('Error: no checkpoint file found!')
    print('Loading from ./checkpoint/ckpt.t7')
    # map_location='cpu' lets a checkpoint saved on a GPU be restored on a
    # CPU-only run; the caller moves the network to the target device later.
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    net.load_state_dict(checkpoint['net_dict'])
    return net, checkpoint['acc'], checkpoint['epoch']

def setup_loss_optimizer(net, learning_rate):
    """Create the loss function and the optimizer for ``net``.

    Args:
        net: model whose ``parameters()`` are optimised.
        learning_rate: initial learning rate of the SGD optimizer.

    Returns:
        tuple: ``(criterion, optimizer)`` - a cross-entropy loss and an SGD
        optimizer with momentum 0.9 and weight decay 5e-4.
    """
    sgd = torch.optim.SGD(
        net.parameters(),
        lr=learning_rate,
        momentum=0.9,
        weight_decay=5e-4,
    )
    return torch.nn.CrossEntropyLoss(), sgd

def train(interval, epoch, net, trainloader, device, criterion, optimizer):
    """Run one training epoch over ``trainloader``.

    Args:
        interval: number of batches between two progress print-outs.
        epoch: zero-based epoch index (printed as ``epoch + 1``).
        net: model to train; it is switched to training mode.
        trainloader: iterable of ``(inputs, labels)`` batches supporting
            ``len()``. It must yield at least one sample, otherwise the final
            accuracy computation divides by zero.
        device: device the batches are moved to before the forward pass.
        criterion: loss function called as ``criterion(outputs, labels)``.
        optimizer: optimizer updating the parameters of ``net``.

    Returns:
        tuple: ``(mean loss per batch over the epoch, error rate, net)`` where
        the error rate is ``1 - correct / total``.
    """
    print('\nEpoch: %d'%(epoch+1))
    net.train()
    training_loss = 0.  # loss accumulated since the last progress print
    train_loss = 0.     # loss accumulated over the whole epoch
    correct = 0
    total = 0
    start = time.time()
    for idx, (inputs, labels) in enumerate(trainloader):
        # forward
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = net(inputs)
        loss = criterion(outputs, labels)

        # backward: backward() has to be *called*; a bare attribute access
        # computes no gradients and optimizer.step() then updates nothing.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # accumulating
        batch_loss = loss.item()
        training_loss += batch_loss
        train_loss += batch_loss
        # Predicted class = index of the largest score along dim 1.
        correct += outputs.max(dim=1)[1].eq(labels).sum().item()
        total += labels.size(0)

        # print
        if (idx+1)%interval == 0:
            end = time.time()
            # Report the mean loss of the batches seen since the last print,
            # not the epoch-wide sum divided by the full epoch length.
            print("[progress:{:.1f}%]time:{:.2f}s Loss:{:.5f} Correct:{}/{} Acc:{:.3f}%".format(100.*(idx+1)/len(trainloader), end-start, training_loss/interval, correct, total, 100.*correct/total))
            training_loss = 0.
            start = time.time()
    return train_loss/len(trainloader), 1. - correct/total, net

def test(epoch, net, testloader, device, criterion, best_acc):
    """Evaluate ``net`` on ``testloader`` and checkpoint it on improvement.

    Args:
        epoch: epoch index stored in the checkpoint.
        net: model to evaluate; it is switched to evaluation mode.
        testloader: iterable of ``(inputs, labels)`` batches supporting ``len()``.
        device: device the batches are moved to before the forward pass.
        criterion: loss function called as ``criterion(outputs, labels)``.
        best_acc: best accuracy (in percent) seen so far.

    Returns:
        tuple: ``(mean loss per batch, error rate, best_acc, net)`` where
        ``best_acc`` is updated when this evaluation beats the previous best.
    """
    net.eval()
    loss_sum = 0.
    hits = 0
    seen = 0
    tic = time.time()
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for idx, batch in enumerate(testloader):
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = net(inputs)

            loss_sum += criterion(outputs, labels).item()
            _, predicted = outputs.max(dim=1)
            hits += predicted.eq(labels).sum().item()
            seen += labels.size(0)

    print('Testing ...')
    elapsed = time.time() - tic
    print("[progress:{:.1f}%]time:{:.2f}s Loss:{:.5f} Correct:{}/{} Acc:{:.3f}%".format(100.*(idx+1)/len(testloader), elapsed, loss_sum/len(testloader), hits, seen, 100.*hits/seen))

    # Persist the weights only when the accuracy beats the best one so far.
    acc = 100.*hits/seen
    if acc > best_acc:
        best_acc = acc
        print('Saving parameters to ./checkpoint/ckpt.t7')
        checkpoint = dict(net_dict=net.state_dict(), acc=acc, epoch=epoch)
        if not os.path.isdir('./checkpoint'):
            os.mkdir('checkpoint')
        torch.save(checkpoint, './checkpoint/ckpt.t7')
    return loss_sum/len(testloader), 1. - hits/seen, best_acc, net

def lr_decay(learning_rate, optimizer=None):
    """Multiply the learning rate of every parameter group by a factor.

    Args:
        learning_rate: multiplicative decay factor applied to each group's
            current ``'lr'`` (the parameter name is kept for backward
            compatibility; it is not an absolute learning rate).
        optimizer: optimizer whose ``param_groups`` are updated. When omitted,
            a module-level ``optimizer`` variable is used if one exists.

    Returns:
        The learning rate of the last parameter group after decay, or ``None``
        if the optimizer has no parameter groups.

    Raises:
        NameError: if no optimizer is passed and no module-level ``optimizer``
            exists.
    """
    if optimizer is None:
        # Legacy behaviour: the optimizer used to be looked up as a global.
        optimizer = globals().get('optimizer')
        if optimizer is None:
            raise NameError("name 'optimizer' is not defined; pass it to lr_decay()")
    lr = None
    for params in optimizer.param_groups:
        params['lr'] *= learning_rate
        lr = params['lr']
        print('Learning rate adjusted to {}'.format(lr))
    return lr


def main():
    """Parse the command line, then train and evaluate for 40 epochs."""
    args = get_parser().parse_args()
    device = setup_device(args.gpu_id, args.no_cuda)
    trainloader, testloader, num_classes = load_data(args.data_dir)
    net, best_acc, start_epoch = define_net(num_classes, args.resume)
    net.to(device)
    criterion, optimizer = setup_loss_optimizer(net, args.learning_rate)

    # The decay factor is independent of the initial learning rate; passing
    # args.learning_rate here only worked by coincidence for the default 0.1.
    decay_factor = 0.1
    for epoch in range(start_epoch, start_epoch+40):
        _, _, net = train(args.interval, epoch, net, trainloader, device, criterion, optimizer)
        _, _, best_acc, net = test(epoch, net, testloader, device, criterion, best_acc)
        if (epoch+1)%20 == 0:
            # The optimizer is local to main(), so it must be handed over
            # explicitly; lr_decay cannot find it as a global.
            lr_decay(decay_factor, optimizer)


if __name__ == '__main__':
    main()

I also think we should write the training log to a log directory so that the training progress can be visualized with TensorBoard. Any suggestions? Thanks for reading!

ZQPei commented 4 years ago

Hi, @pvtien96 You can open a pull request and I will merge it to our code. Thank you for your contribution to this repo!

394781865 commented 4 years ago

I have three questions:

1. Why is the model trained without using YOLOv3 for detection?
2. I tried to train a model on the MARS-v160809 dataset; training runs fine, but the test accuracy is very low. What is the problem?
3. How should the model be trained with the MARS-v160809 dataset?

pvti commented 4 years ago

@394781865 I'm not sure whether I truly understood your questions.

1. You can combine DeepSORT with any detection model, such as YOLO and its variants (YOLOv2, YOLOv3, YOLOv4, ...), Faster R-CNN, SSD, ...
2. I didn't manage to test the model successfully.
3. I once had the same question. You should read through all the issues in this repo and work it out yourself. Hope this helps.

china56321 commented 4 years ago

> I have three questions:
>
> 1. Why is the model trained without using YOLOv3 for detection?
> 2. I tried to train a model on the MARS-v160809 dataset; training runs fine, but the test accuracy is very low. What is the problem?
> 3. How should the model be trained with the MARS-v160809 dataset?

Hi, I encountered the same problem. I am training on the MARS dataset and also get very low accuracy, and the training speed is extremely slow: I trained for 3 days and completed only 9 epochs. Did you figure out the problem?

WurmD commented 3 years ago

Hi @pvtien96

I'm curious that you refactored the code above but did not mention/change that the current torchvision.datasets.ImageFolder(train_dir, transform=transform_train) simply does not work with the Market1501 dataset structure.

I created this custom Dataset loader:

import natsort
import os
from PIL import Image

from torch.utils.data import Dataset

class Market1501DataSet(Dataset):
    """Dataset over a flat directory of images labelled by file-name prefix.

    The first four characters of every file name are taken as the class
    label; labels are numbered in order of first appearance in the naturally
    sorted file listing.
    """

    def __init__(self, main_dir, transform):
        """List ``main_dir`` and build the label-to-index mapping.

        Args:
            main_dir: directory whose entries are all treated as images.
            transform: callable applied to each loaded PIL image.
        """
        self.main_dir = main_dir
        self.transform = transform
        self.total_imgs = natsort.natsorted(os.listdir(main_dir))

        # dict.fromkeys drops duplicate prefixes while keeping first-seen
        # order, so every label gets a stable index into ``classes``.
        unique_labels = dict.fromkeys(name[0:4] for name in self.total_imgs)
        self.classes_dict = {
            label: position for position, label in enumerate(unique_labels)
        }
        self.classes = list(self.classes_dict)

    def __len__(self):
        """Return the number of listed image files."""
        return len(self.total_imgs)

    def __getitem__(self, idx):
        """Return ``(image, target)`` where target is the class index."""
        file_name = self.total_imgs[idx]
        rgb_image = Image.open(os.path.join(self.main_dir, file_name)).convert("RGB")
        transformed = self.transform(rgb_image)
        target = self.classes_dict[file_name[0:4]]
        return (transformed, target)

and then in train.py

# The raw "bounding_box_train" / "bounding_box_test" folders are no longer
# referenced: after rearranging the Market1501 dataset structure, the images
# are read from "train" and "test" directly under `root`.
train_dir, test_dir = (os.path.join(root, split) for split in ("train", "test"))

# One dataset per split, each with its own transform pipeline.
market_train = Market1501DataSet(train_dir, transform=transform_train)
market_test = Market1501DataSet(test_dir, transform=transform_test)

# Both loaders use shuffled batches of 64 samples.
trainloader = torch.utils.data.DataLoader(market_train, batch_size=64, shuffle=True)
testloader = torch.utils.data.DataLoader(market_test, batch_size=64, shuffle=True)

What did you do?

(I'd like to discuss this with you and the author, because I'm not sure this is strictly correct: we would be training on 751 classes but then testing on 750 completely different classes that reuse the same index ids, so the network would be confused.)

EDIT: OK, I rearranged the Market1501 dataset so that two new folders, train and test, each contain half of the images of every person, so that we train with 1500 classes and test on the same 1500 classes.

ZQPei / deep_sort_pytorch

Refactor code in train.py #122