keris2020 / hackathon


How to use AdamW #90

Open kkh041124 opened 3 years ago

kkh041124 commented 3 years ago

import os
import math
import argparse

import nsml
import torch
import torch.nn as nn
import torchvision.models as models

from data_loader import feed_infer
from data_local_loader import data_loader
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR

from nsml import DATASET_PATH, IS_ON_NSML
from evaluation import evaluation_metrics

from .optimizer import Optimizer

if IS_ON_NSML:
    TRAIN_DATASET_PATH = os.path.join(DATASET_PATH, 'train', 'train_data')
else:
    DATASET_PATH = '/home/dataset/keris/'

class AdamW(Optimizer):
    r"""Implements AdamW algorithm.

The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.

Arguments:
    params (iterable): iterable of parameters to optimize or dicts defining
        parameter groups
    lr (float, optional): learning rate (default: 1e-3)
    betas (Tuple[float, float], optional): coefficients used for computing
        running averages of gradient and its square (default: (0.9, 0.999))
    eps (float, optional): term added to the denominator to improve
        numerical stability (default: 1e-8)
    weight_decay (float, optional): weight decay coefficient (default: 1e-2)
    amsgrad (boolean, optional): whether to use the AMSGrad variant of this
        algorithm from the paper `On the Convergence of Adam and Beyond`_
        (default: False)

.. _Adam\: A Method for Stochastic Optimization:
    https://arxiv.org/abs/1412.6980
.. _Decoupled Weight Decay Regularization:
    https://arxiv.org/abs/1711.05101
.. _On the Convergence of Adam and Beyond:
    https://openreview.net/forum?id=ryQu7f-RZ
"""

def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
             weight_decay=1e-2, amsgrad=False):
    if not 0.0 <= lr:
        raise ValueError("Invalid learning rate: {}".format(lr))
    if not 0.0 <= eps:
        raise ValueError("Invalid epsilon value: {}".format(eps))
    if not 0.0 <= betas[0] < 1.0:
        raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
    if not 0.0 <= betas[1] < 1.0:
        raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
    if not 0.0 <= weight_decay:
        raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
    defaults = dict(lr=lr, betas=betas, eps=eps,
                    weight_decay=weight_decay, amsgrad=amsgrad)
    super(AdamW, self).__init__(params, defaults)

def __setstate__(self, state):
    super(AdamW, self).__setstate__(state)
    for group in self.param_groups:
        group.setdefault('amsgrad', False)

@torch.no_grad()
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue

            # Perform stepweight decay
            p.mul_(1 - group['lr'] * group['weight_decay'])

            # Perform optimization step
            grad = p.grad
            if grad.is_sparse:
                raise RuntimeError('AdamW does not support sparse gradients')
            amsgrad = group['amsgrad']

            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                # Exponential moving average of squared gradient values
                state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                if amsgrad:
                    # Maintains max of all exp. moving avg. of sq. grad. values
                    state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

            exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
            if amsgrad:
                max_exp_avg_sq = state['max_exp_avg_sq']
            beta1, beta2 = group['betas']

            state['step'] += 1
            bias_correction1 = 1 - beta1 ** state['step']
            bias_correction2 = 1 - beta2 ** state['step']

            # Decay the first and second moment running average coefficient
            exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
            exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
            if amsgrad:
                # Maintains the maximum of all 2nd moment running avg. till now
                torch.maximum(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                # Use the max. for normalizing running avg. of gradient
                denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
            else:
                denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

            step_size = group['lr'] / bias_correction1

            p.addcdiv_(exp_avg, denom, value=-step_size)

    return loss

class ClsResNet(models.ResNet):
    """Model definition.

You can use any model for the challenge. Feel free to modify this class.
"""

def forward(self, x, extract=False):
    x = self.conv1(x)
    x = self.bn1(x)
    x = self.relu(x)
    x = self.maxpool(x)

    x = self.layer1(x)
    x = self.layer2(x)
    x = self.layer3(x)
    x = self.layer4(x)

    x = self.avgpool(x)
    x = x.view(x.size(0), -1)

    x = self.fc(x)
    return x

def _infer(model, root_path, loader=None):
    """Local inference function for NSML infer.

Args:
    model: instance. Any model is available.
    root_path: string. Automatically set by NSML.
    loader: instance. Data loader is defined in `data_local_loader.py`.

Returns:
    predictions_str: list of string.
                     ['img_1,1,0,1,0,1,0,0,0', 'img_2,0,1,0,0,1,0,0,0', ...]
"""
model.eval()

if loader is None:
    loader = data_loader(root=os.path.join(root_path))

list_of_fids = []
list_of_preds = []

for idx, (image, fid, _) in enumerate(loader):
    image = image.cuda()
    fc = model(image, extract=True)
    fc = fc.detach().cpu().numpy()
    fc = 1 * (fc > 0.5)

    list_of_fids.extend(fid)
    list_of_preds.extend(fc)

predictions_str = []
for idx, fid in enumerate(list_of_fids):
    test_str = fid
    for pred in list_of_preds[idx]:
        test_str += ',{}'.format(pred)
    predictions_str.append(test_str)

return predictions_str

def bind_nsml(model):
    """NSML binding function.

This function is used for internal process in NSML.
Please modify this module according to your framework.
"""

def save(dir_name, *args, **kwargs):
    os.makedirs(dir_name, exist_ok=True)
    state = {
        'model': model.state_dict(),
    }
    torch.save(state, os.path.join(dir_name, 'model.pth'))
    print('saved')

def load(dir_name, *args, **kwargs):
    state = torch.load(os.path.join(dir_name, 'model.pth'))
    model.load_state_dict(state['model'])
    print('loaded')

def infer(root_path, top_k=1):
    return _infer(model, root_path)

nsml.bind(save=save, load=load, infer=infer)

def load_weight(model):
    """Weight loading function.

You should put your weight file on root directory. The name of weight file
should be 'checkpoint.pth'. If there is no 'checkpoint.pth' on root directory,
the weights will be randomly initialized.
"""
if os.path.isfile('checkpoint.pth'):
    state_dict = torch.load('checkpoint.pth')['state_dict']
    model.load_state_dict(state_dict, strict=True)
else:
    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            m.weight.data.normal_(0, math.sqrt(2. / n))
        elif isinstance(m, nn.BatchNorm2d):
            m.weight.data.fill_(1)
            m.bias.data.zero_()

def local_eval(model, loader, gt_path):
    """Local debugging function.

You can use this function for debugging. You may need dummy gt file.

Args:
    model: instance.
    loader: instance.
    gt_path: string.

Returns:
    metric_result: float. Performance of your method.
"""
pred_path = 'pred.txt'
feed_infer(pred_path, lambda root_path: _infer(model=model,
                                               root_path=root_path,
                                               loader=loader))
metric_result = evaluation_metrics(pred_path, gt_path)
return metric_result

if __name__ == '__main__':
    args = argparse.ArgumentParser()
    args.add_argument("--num_classes", type=int, default=8)

# Arguments for train mode
args.add_argument("--num_epochs", type=int, default=300)
args.add_argument("--base_lr", type=float, default=0.001) #default 0.001 fir 0.001 sec 0.001 thi 0.1 fo 0.01 fi 0.005 si 0.001 se 0.001 ei 0.001 ni 0.001 te 0.0005 el 0.01 tw 0.005 th 0.005 ft 0.005
args.add_argument("--step_size", type=int, default=50)

# These three arguments are reserved for nsml. Do not change.
args.add_argument("--mode", type=str, default="train")
args.add_argument("--iteration", type=str, default='0')
args.add_argument("--pause", type=int, default=0)

config = args.parse_args()

model = ClsResNet(block=models.resnet.BasicBlock,
                  layers=[3, 4, 6, 3],
                  num_classes=config.num_classes)
load_weight(model)
criterion = nn.BCEWithLogitsLoss()

model = model.cuda()
criterion = criterion.cuda()

optimizer1 = AdamW([param for param in model.parameters() if param.requires_grad],
                   lr=config.base_lr, weight_decay=1e-2)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer1, mode='min', factor=0.3,
                                                       patience=7, threshold=0.001, threshold_mode='rel',
                                                       cooldown=0, min_lr=0, eps=1e-04, verbose=False)
# StepLR(optimizer, step_size=20, gamma=0.5)  # default gamma 0.1 fir 0.3 sec 0.5 thi 0.1 fo 0.3 fi 0.3 si 0.3 se 0.3 ei 0.2 ni 0.4 te 0.5 el 0.1 tw 0.1 th 0.1 ft 0.1
if IS_ON_NSML:
    # This NSML block is mandatory. Do not change.
    bind_nsml(model)
    nsml.save('checkpoint')
    if config.pause:
        nsml.paused(scope=locals())

if config.mode == 'train':
    # Local debugging block. This module is not mandatory.
    # But this would be quite useful for troubleshooting.
    train_loader = data_loader(root=DATASET_PATH, split='train')
    val_loader = data_loader(root=DATASET_PATH, split='val')
    num_batches = len(train_loader)

    for epoch in range(config.num_epochs):

        #model.train()
        model.train()

        total_loss = 0.0
        num_images = 0

        for iter_, (image, image_id, label) in enumerate(train_loader):
            image = image.cuda()
            label = label.cuda()

            pred = model(image)
            loss = criterion(pred, label)

            total_loss += loss.item() * image.size(0)
            num_images += image.size(0)

            optimizer1.zero_grad()
            loss.backward()
            optimizer1.step()

        loss_average = total_loss / float(num_images)
        scheduler.step(metrics=loss_average, epoch=epoch)

        if IS_ON_NSML:
            nsml.save(str(epoch + 1))

        gt_label = os.path.join(DATASET_PATH, 'train/train_data/val_label')
        acc = local_eval(model, val_loader, gt_label)

        print(f'[{epoch + 1}/{config.num_epochs}] '
              f'Validation performance: {acc:.3f}')
        nsml.report(step=epoch, val_acc=acc)
        nsml.report(step=epoch, train_loss=loss_average)
        print("loss_average: ",loss_average)
        print("real epoch: ", epoch)

I wrote it like this, but I get the error below. What could the problem be?

Traceback (most recent call last):
  File "main.py", line 18, in <module>
    from .optimizer import Optimizer
ModuleNotFoundError: No module named 'main.optimizer'; 'main' is not a package
User session exited

taeseug commented 3 years ago

Try commenting out the from .optimizer import Optimizer line with # and running it again.
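
For context, a relative import like this only works when main.py is imported as part of a package; when NSML runs main.py as a top-level script there is no parent package, which is what the ModuleNotFoundError is saying. The change is a single line, sketched below:

# from .optimizer import Optimizer  # not needed: torch.optim already provides AdamW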

kkh041124 commented 3 years ago

Now I get this error:

Traceback (most recent call last):
  File "main.py", line 26, in <module>
    class AdamW(Optimizer):
NameError: name 'Optimizer' is not defined

taeseug commented 3 years ago

Since you already have from torch.optim import AdamW, the part that declares class AdamW is not needed. Delete that block and run it again.
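
Put differently, here is a minimal sketch of what is left once the whole class AdamW block is deleted, assuming model and config are defined exactly as in the posted main.py:

from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Build the optimizer directly from torch.optim, keeping only the trainable parameters.
params = [p for p in model.parameters() if p.requires_grad]
optimizer1 = AdamW(params, lr=config.base_lr, weight_decay=1e-2)
scheduler = ReduceLROnPlateau(optimizer1, mode='min', factor=0.3, patience=7)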

taeseug commented 3 years ago

There is a Zoom lecture by 김윤기 today at 7 PM on ways to improve model performance, so be sure to attend. It will make it much easier to understand how to use optimizers.

  1. AdamW
  2. Train for up to 300 epochs
  3. ReduceLROnPlateau
  4. Data augmentation (HorizontalFlip, Rotation)
  5. Increase the model size, e.g. layers=[3, 4, 6, 3]
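
A rough sketch of how items 1 through 4 above could fit together, assuming a torchvision-based pipeline; the transform parameters and the placeholder model are illustrative and not part of the hackathon baseline:

import torch
import torchvision.transforms as T
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Item 4: simple augmentation for the training images (illustrative values).
train_transform = T.Compose([
    T.RandomHorizontalFlip(p=0.5),
    T.RandomRotation(degrees=15),
    T.ToTensor(),
])

# Items 1 and 3: AdamW plus ReduceLROnPlateau driven by the average epoch loss.
model = torch.nn.Linear(10, 8)  # placeholder model, stands in for the ResNet
optimizer = AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=7)

for epoch in range(300):  # item 2: train for up to 300 epochs
    epoch_loss = 0.0      # accumulate the training loss here
    # ... forward pass, loss.backward(), optimizer.step() ...
    scheduler.step(epoch_loss)  # lower the learning rate when the loss plateaus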