fangwei123456 / spikingjelly

SpikingJelly is an open-source deep learning framework for Spiking Neural Network (SNN) based on PyTorch.
https://spikingjelly.readthedocs.io
Other
1.22k stars 233 forks source link

使用sjds.split_to_train_test_set提取训练集、测试集,主函数训练报错 #515

Closed 2ephyrus closed 3 months ago

2ephyrus commented 3 months ago

作者您好! 我想在[Multi-Level Firing with Spiking DS-ResNet: Enabling Better and Deeper Directly-Trained Spiking Neural Networks]这篇文章提供的代码基础上进行N-Caltech数据集的测试,使用sj的sjds.split_to_train_test_set函数得到了trainset与testset,并使用MLF模型在其他数据集上的主函数进行了测试,报错如下:

C:\cb\pytorch_1000000000000\work\aten\src\ATen\native\cuda\Loss.cu:240: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.
Traceback (most recent call last):
  File "E:\GitCode\MLF-DSResNet\train_for_Caltech101.py", line 157, in <module>
    train(args, model, train_loader1, optimizer, device, epoch, writer)
  File "E:\GitCode\MLF-DSResNet\train_for_Caltech101.py", line 53, in train
    loss.backward()
  File "E:\anaconda3\lib\site-packages\torch\_tensor.py", line 487, in backward
    torch.autograd.backward(
  File "E:\anaconda3\lib\site-packages\torch\autograd\__init__.py", line 200, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

提供N-Caltech主函数如下:

import argparse
import torch
from parallel_nets.ResNet_for_NCaltech import *
import torch.optim as optim
import os
import torch.nn.functional as F
import time
from tensorboardX import SummaryWriter
import torch.utils.data
import NCaltech_preprocess

def data_model_load(args, model, kwargs):
    """Build the train/test DataLoaders and optionally restore a checkpoint.

    Args:
        args: parsed CLI namespace (uses batch_size, test_batch_size,
            pretrained, checkpoint_path).
        model: network whose weights are restored in place when
            args.pretrained is set.
        kwargs: extra DataLoader keyword args (e.g. num_workers, pin_memory).

    Returns:
        (train_loader, test_loader, start_epoch) where start_epoch is 0 for a
        fresh run or the saved epoch number when resuming.
    """
    train_dataset, test_dataset = NCaltech_preprocess.splitdataset()
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=args.test_batch_size, shuffle=True, **kwargs)
    if args.pretrained:
        # map_location='cpu' lets a checkpoint saved on CUDA load on any
        # machine; load_state_dict then moves tensors onto the model's device.
        checkpoint = torch.load(args.checkpoint_path, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        start_epoch = checkpoint['epoch']
        print('Pretrained model loaded.')
    else:
        start_epoch = 0
        print('Model loaded.')
    return train_loader, test_loader, start_epoch

def train(args, model, train_loader, optimizer, device, epoch, writer):
    """Run one training epoch.

    Each batch arrives as (data, target) with data shaped
    (batch, TimeStep, ...); the frames are flattened *time-major* into
    (TimeStep * batch, ...) before the forward pass. ``TimeStep`` comes from
    the star-imported network module.
    """
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data_temp, target = data.to(device), target.to(device)
        bs = data_temp.shape[0]
        # Vectorized time-major flatten: (bs, T, ...) -> (T*bs, ...).
        # Identical layout to copying data_temp[:, t] into slot t for every t,
        # but a single reshape instead of TimeStep separate device copies.
        data = data_temp.transpose(0, 1).reshape((TimeStep * bs,) + data_temp.shape[2:])

        output = model(data)

        # cross_entropy needs integer class indices in [0, n_classes); labels
        # outside that range trigger the CUDA device-side assert seen in this
        # issue. (The former `.to(target.device)` was a no-op and is dropped.)
        target = target.long()

        loss = F.cross_entropy(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * args.batch_size, len(train_loader.dataset),
                       100. * batch_idx / len(train_loader), loss.item()))
            if args.tensorboard:
                writer.add_scalar('Train Loss / batch_idx', loss.item(), batch_idx + len(train_loader) * epoch)

def _test(args, model, test_loader, device, writer):
    """Evaluate the model: average cross-entropy loss and top-1 accuracy."""
    model.eval()
    total_loss = 0.
    correct = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):
            data_temp, target = data.to(device), target.to(device)
            bs = data_temp.shape[0]
            # Vectorized time-major flatten (bs, T, ...) -> (T*bs, ...),
            # matching the layout produced in train().
            data = data_temp.transpose(0, 1).reshape((TimeStep * bs,) + data_temp.shape[2:])

            output = model(data)
            # cross_entropy requires integer class indices.
            target = target.long()

            total_loss += F.cross_entropy(output, target, reduction='sum').item()
            pre_result = output.argmax(dim=1, keepdim=True)
            correct += pre_result.eq(target.view_as(pre_result)).sum().item()

    total_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.format(
        total_loss, correct, len(test_loader.dataset),
        accuracy))

    if args.tensorboard:
        # NOTE(review): `epoch` is not a parameter of this function — it
        # resolves to the module-level loop variable set in __main__. This
        # only works when the script is run directly; passing epoch in
        # explicitly would be safer, but is left as-is to keep the call
        # signature unchanged.
        writer.add_scalar('Test Loss / epoch', total_loss, epoch)
        writer.add_scalar('Test Accuracy / epoch', accuracy, epoch)

if __name__ == '__main__':
    # ---- hyper-parameters / CLI ----
    batch_size = 1
    parser = argparse.ArgumentParser(description='train')  # was misspelled 'trian'
    parser.add_argument('--batch-size', type=int, default=batch_size, help='input batch size for training')
    parser.add_argument('--test-batch-size', type=int, default=batch_size, help='input batch size for testing')
    parser.add_argument('--total-epochs', type=int, default=30, help='number of epochs to train')
    parser.add_argument('--lr', type=float, default=0.01, help='learning rate')
    parser.add_argument('--use-cuda', action='store_true', default=True, help='use CUDA training')
    parser.add_argument('--save', action='store_true', default=False, help='save model')
    parser.add_argument('--tensorboard', action='store_true', default=True, help='write tensorboard')
    parser.add_argument('--pretrained', action='store_true', default=False, help='use pre-trained model')
    parser.add_argument('--log-interval', type=int, default=25,
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model-interval', type=int, default=5,
                        help='save model every save_model_interval')
    parser.add_argument('--checkpoint-path', type=str, default='./checkpoint/NC/result_NC.pth',
                        help='path for saving/loading the model checkpoint')  # was a copy-pasted CUDA help string
    args = parser.parse_args()
    use_cuda = args.use_cuda and torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    torch.manual_seed(3)
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    writer = None
    # Create the summaries directory up front: os.listdir raises
    # FileNotFoundError on a fresh checkout otherwise.
    summary_root = './summaries/N_Caltech'
    os.makedirs(summary_root, exist_ok=True)
    writer_path = summary_root + '/result_N_Caltech' + '_' + str(len(os.listdir(summary_root)))
    if args.tensorboard:
        writer = SummaryWriter(writer_path)

    model = resnet14().to(device)

    train_loader1, test_loader1, start_epoch = data_model_load(args, model, kwargs)
    # SGD; momentum_SGD is provided by the star-imported network module.
    optimizer = optim.SGD(model.parameters(), args.lr, momentum=momentum_SGD, weight_decay=1e-3)
    # Decay LR by 0.1 every 10 epochs.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 10, 0.1)
    # Fast-forward the LR schedule when resuming from a checkpoint.
    for _ in range(start_epoch):
        scheduler.step()

    for epoch in range(start_epoch + 1, args.total_epochs + 1):
        start_time = time.time()
        train(args, model, train_loader1, optimizer, device, epoch, writer)
        _test(args, model, test_loader1, device, writer)
        waste_time = time.time() - start_time
        print('One epoch wasting time:{:.0f}s, learning rate:{:.8f}\n'.format(
            waste_time, optimizer.state_dict()['param_groups'][0]['lr']))
        if epoch % args.save_model_interval == 0 and args.save:
            # Ensure the checkpoint directory exists before saving.
            os.makedirs(os.path.dirname(args.checkpoint_path), exist_ok=True)
            state = {'model': model.state_dict(), 'epoch': epoch}
            torch.save(state, args.checkpoint_path)
        scheduler.step()

    if args.tensorboard:
        writer.close()

我的处理函数NCaltech_preprocess:

from spikingjelly.datasets.n_caltech101 import NCaltech101
import spikingjelly.datasets as sjds

# ---- hyper-parameters ----
TS = 5  # number of frames each event stream is integrated into
# ds = [4.29, 3.04] # size = [2, 42, 42]

# Raw string: the old plain literal only worked because '\G' and '\M' are not
# recognized escape sequences, which is a SyntaxWarning on modern Python.
root_dir = r'E:\GitCode\MLF-DSResNet/data/N_Caltech/source'

# NOTE(review): constructing the dataset at import time is a module-level side
# effect — every importer pays the (potentially long) frame-integration cost.
event_set = NCaltech101(root=root_dir, data_type='frame', frames_number=TS, split_by='number')


def splitdataset():
    """Split N-Caltech101 into train/test sets with a 90/10 ratio.

    random_split=False splits each of the 101 classes sequentially
    (front-to-back), so the split is deterministic across runs.
    """
    return sjds.split_to_train_test_set(train_ratio=0.9, origin_dataset=event_set,
                                        num_classes=101, random_split=False)

网络结构移植了其他数据集的代码,目前未发现bug。

2ephyrus commented 3 months ago

这个问题已经被解决,错误原因为输出层的类别没有对应当前数据集。