Open TangGokwai opened 3 years ago
Were you able to fix this? How did you run it on a single GPU?
Hi! I ran the code on a single GPU. I just commented out the distributed and fp16 portions and it's working fine.
Hi! I ran the code on a single GPU. I just commented out the distributed and fp16 portions and it's working fine.
Hello! Could you tell me which parts of the code were commented out?
Hello! Could you tell me which parts of the code were commented out?
from __future__ import absolute_import, division, print_function
import logging
import argparse
import os
import random
import numpy as np
import time
from datetime import timedelta
import torch
# import torch.distributed as dist
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
# from apex import amp
# from apex.parallel import DistributedDataParallel as DDP
from transfg_model import VisionTransformer, CONFIGS
from utils.scheduler import WarmupLinearSchedule, WarmupCosineSchedule
from utils.data_utils import get_loader
# from utils.dist_util import get_world_size
logger = logging.getLogger(__name__)
class AverageMeter(object):
    """Tracks the most recent value and a running average of a series of numbers."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all accumulated statistics back to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += n * val
        self.avg = self.sum / self.count
def simple_accuracy(preds, labels):
    """Fraction of positions where ``preds`` equals ``labels`` (numpy element-wise mean)."""
    matches = (preds == labels)
    return matches.mean()
# def reduce_mean(tensor, nprocs):
# rt = tensor.clone()
# dist.all_reduce(rt, op=dist.ReduceOp.SUM)
# rt /= nprocs
# return rt
def save_model(args, model):
    """Save the model weights to ``<args.output_dir>/<args.name>_checkpoint.bin``.

    Unwraps a DataParallel/DDP wrapper (``model.module``) before saving so the
    checkpoint's state_dict keys are wrapper-independent.

    FIX: the original had ``if args.fp16: checkpoint['amp'] = amp.state_dict()``,
    but the apex import is commented out at the top of this file, so running
    with ``--fp16`` raised NameError here. Since amp is disabled, we always
    save just the model weights (checkpoint format unchanged for the non-fp16
    path that setup() reads back via ``torch.load(...)['model']``).
    """
    model_to_save = model.module if hasattr(model, 'module') else model
    model_checkpoint = os.path.join(args.output_dir, "%s_checkpoint.bin" % args.name)
    checkpoint = {
        'model': model_to_save.state_dict(),
    }
    torch.save(checkpoint, model_checkpoint)
    logger.info("Saved model checkpoint to [DIR: %s]", args.output_dir)
def setup(args):
    """Build the VisionTransformer for ``args``, load weights, and move it to the device.

    Loads the jax-style pretrained weights from ``args.pretrained_dir`` (an
    ``.npz`` file), and optionally resumes from a TransFG checkpoint produced
    by ``save_model`` when ``args.pretrained_model`` is given.

    Returns:
        (args, model) tuple with the model already on ``args.device``.
    """
    # Prepare model
    config = CONFIGS[args.model_type]
    config.split = args.split
    config.slide_step = args.slide_step

    if args.dataset == "CUB_200_2011":
        num_classes = 200
    elif args.dataset == "car":
        num_classes = 196
    elif args.dataset == "nabirds":
        num_classes = 555
    elif args.dataset == "dog":
        num_classes = 120
    elif args.dataset == "INat2017":
        num_classes = 5089
    else:
        # FIX: previously an unknown dataset fell through with `num_classes`
        # unbound, producing a confusing NameError below. argparse `choices`
        # normally guards this, but setup() may be called directly.
        raise ValueError("Unknown dataset: %s" % args.dataset)

    model = VisionTransformer(config, args.img_size, zero_head=True,
                              num_classes=num_classes, smoothing_value=args.smoothing_value)
    model.load_from(np.load(args.pretrained_dir))
    if args.pretrained_model is not None:
        # Resume from a checkpoint written by save_model() ({'model': state_dict}).
        pretrained_model = torch.load(args.pretrained_model)['model']
        model.load_state_dict(pretrained_model)
    model.to(args.device)
    num_params = count_parameters(model)

    logger.info("{}".format(config))
    logger.info("Training parameters %s", args)
    logger.info("Total Parameter: \t%2.1fM" % num_params)
    return args, model
def count_parameters(model):
    """Return the number of trainable parameters in ``model``, in millions."""
    total = 0
    for p in model.parameters():
        if p.requires_grad:
            total += p.numel()
    return total / 1000000
def set_seed(args):
    """Seed python, numpy, and torch RNGs (plus all CUDA devices when GPUs are used)."""
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)
def valid(args, model, writer, test_loader, global_step):
    """Evaluate `model` over the full `test_loader`, log results, and return accuracy.

    Runs in eval mode with gradients disabled. Per-batch predictions and labels
    are accumulated into single numpy arrays so accuracy is computed over the
    whole validation set, not averaged per batch.
    """
    # Validation!
    eval_losses = AverageMeter()

    logger.info("***** Running Validation *****")
    logger.info("  Num steps = %d", len(test_loader))
    logger.info("  Batch size = %d", args.eval_batch_size)

    model.eval()
    all_preds, all_label = [], []
    # Progress bar is shown only on the main process (rank -1 = non-distributed).
    epoch_iterator = tqdm(test_loader,
                          desc="Validating... (loss=X.X)",
                          bar_format="{l_bar}{r_bar}",
                          dynamic_ncols=True,
                          disable=args.local_rank not in [-1, 0])
    loss_fct = torch.nn.CrossEntropyLoss()
    for step, batch in enumerate(epoch_iterator):
        batch = tuple(t.to(args.device) for t in batch)
        x, y = batch
        with torch.no_grad():
            # NOTE(review): with a single argument the model returns logits only;
            # during training (x, y) it returns (loss, logits) — see train().
            logits = model(x)

            eval_loss = loss_fct(logits, y)
            eval_loss = eval_loss.mean()
            eval_losses.update(eval_loss.item())

            preds = torch.argmax(logits, dim=-1)

        # Grow a single numpy array per epoch instead of keeping a python list
        # of per-batch arrays.
        if len(all_preds) == 0:
            all_preds.append(preds.detach().cpu().numpy())
            all_label.append(y.detach().cpu().numpy())
        else:
            all_preds[0] = np.append(
                all_preds[0], preds.detach().cpu().numpy(), axis=0
            )
            all_label[0] = np.append(
                all_label[0], y.detach().cpu().numpy(), axis=0
            )
        epoch_iterator.set_description("Validating... (loss=%2.5f)" % eval_losses.val)

    all_preds, all_label = all_preds[0], all_label[0]
    accuracy = simple_accuracy(all_preds, all_label)
    accuracy = torch.tensor(accuracy).to(args.device)
    # Distributed accuracy reduction disabled for single-GPU use:
    # dist.barrier()
    # val_accuracy = reduce_mean(accuracy, args.nprocs)
    val_accuracy = accuracy.detach().cpu().numpy()

    logger.info("\n")
    logger.info("Validation Results")
    logger.info("Global Steps: %d" % global_step)
    logger.info("Valid Loss: %2.5f" % eval_losses.avg)
    logger.info("Valid Accuracy: %2.5f" % val_accuracy)
    if args.local_rank in [-1, 0]:
        writer.add_scalar("test/accuracy", scalar_value=val_accuracy, global_step=global_step)

    return val_accuracy
def train(args, model):
    """Train the model with SGD + warmup LR schedule for ``args.num_steps`` steps.

    Evaluates every ``args.eval_every`` optimization steps and checkpoints the
    best model via save_model().

    Bug fixes vs. the original:
      * ``loss.backward()`` sat in an ``else`` branch left dangling after the
        apex/fp16 block was commented out, so with
        ``gradient_accumulation_steps > 1`` backward() was NEVER called and no
        training happened. It now runs for every micro-batch.
      * The live ``if args.fp16:`` gradient-clipping path referenced ``amp``,
        whose import is commented out, and would raise NameError; clipping now
        always operates on ``model.parameters()``.
    """
    if args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir, exist_ok=True)
        writer = SummaryWriter(log_dir=os.path.join("logs", args.name))

    # Per-step (micro-batch) size after splitting across accumulation steps.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Prepare dataset
    train_loader, test_loader = get_loader(args)

    # Prepare optimizer and scheduler
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=0.9,
                                weight_decay=args.weight_decay)
    t_total = args.num_steps
    if args.decay_type == "cosine":
        scheduler = WarmupCosineSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    else:
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    # apex fp16 and DistributedDataParallel setup removed for single-GPU use:
    # if args.fp16:
    #     model, optimizer = amp.initialize(models=model,
    #                                       optimizers=optimizer,
    #                                       opt_level=args.fp16_opt_level)
    #     amp._amp_state.loss_scalers[0]._loss_scale = 2**20
    # if args.local_rank != -1:
    #     model = DDP(model, message_size=250000000, gradient_predivide_factor=get_world_size())

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Total optimization steps = %d", args.num_steps)
    logger.info("  Instantaneous batch size per GPU = %d", args.train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps * (
                    torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)

    model.zero_grad()
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    losses = AverageMeter()
    global_step, best_acc = 0, 0
    start_time = time.time()
    while True:
        model.train()
        epoch_iterator = tqdm(train_loader,
                              desc="Training (X / X Steps) (loss=X.X)",
                              bar_format="{l_bar}{r_bar}",
                              dynamic_ncols=True,
                              disable=args.local_rank not in [-1, 0])
        all_preds, all_label = [], []
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(args.device) for t in batch)
            x, y = batch
            # Model returns (loss, logits) when labels are provided.
            loss, logits = model(x, y)
            loss = loss.mean()

            # Accumulate predictions/labels into one numpy array per epoch for
            # a dataset-level training accuracy.
            preds = torch.argmax(logits, dim=-1)
            if len(all_preds) == 0:
                all_preds.append(preds.detach().cpu().numpy())
                all_label.append(y.detach().cpu().numpy())
            else:
                all_preds[0] = np.append(
                    all_preds[0], preds.detach().cpu().numpy(), axis=0
                )
                all_label[0] = np.append(
                    all_label[0], y.detach().cpu().numpy(), axis=0
                )

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            # FIX: backward() must run for every micro-batch (previously skipped
            # whenever gradient accumulation was enabled).
            loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                # Undo the accumulation scaling for logging purposes.
                losses.update(loss.item() * args.gradient_accumulation_steps)
                # FIX: always clip model.parameters(); the amp path is dead code
                # since the apex import is commented out.
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                # NOTE(review): original code steps the scheduler before the
                # optimizer; kept as-is to preserve the exact LR trajectory.
                scheduler.step()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                epoch_iterator.set_description(
                    "Training (%d / %d Steps) (loss=%2.5f)" % (global_step, t_total, losses.val)
                )
                if args.local_rank in [-1, 0]:
                    writer.add_scalar("train/loss", scalar_value=losses.val, global_step=global_step)
                    writer.add_scalar("train/lr", scalar_value=scheduler.get_lr()[0], global_step=global_step)
                if global_step % args.eval_every == 0:
                    with torch.no_grad():
                        accuracy = valid(args, model, writer, test_loader, global_step)
                    if args.local_rank in [-1, 0]:
                        if best_acc < accuracy:
                            save_model(args, model)
                            best_acc = accuracy
                        logger.info("best accuracy so far: %f" % best_acc)
                    # valid() leaves the model in eval mode; switch back.
                    model.train()
                if global_step % t_total == 0:
                    break

        all_preds, all_label = all_preds[0], all_label[0]
        accuracy = simple_accuracy(all_preds, all_label)
        accuracy = torch.tensor(accuracy).to(args.device)
        # Distributed reduction disabled for single-GPU use:
        # dist.barrier()
        # train_accuracy = reduce_mean(accuracy, args.nprocs)
        train_accuracy = accuracy.detach().cpu().numpy()
        logger.info("train accuracy so far: %f" % train_accuracy)
        losses.reset()
        if global_step % t_total == 0:
            break

    writer.close()
    logger.info("Best Accuracy: \t%f" % best_acc)
    logger.info("End Training!")
    end_time = time.time()
    # Reported in hours.
    logger.info("Total Training Time: \t%f" % ((end_time - start_time) / 3600))
def main():
    """Parse CLI arguments, configure device/logging/seed, build the model, and train.

    FIXES vs. original: corrected help-string typo "Weight deay" -> "Weight decay";
    corrected --num_steps help ("epochs" -> "steps"); removed the misleading
    `default="Aivian"` on --name, which argparse ignores for required arguments.
    """
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument("--name", required=True,
                        help="Name of this run. Used for monitoring.")
    parser.add_argument("--dataset", choices=["CUB_200_2011", "car", "dog", "nabirds", "INat2017"], default="nabirds",
                        help="Which dataset.")
    parser.add_argument('--data_root', type=str, default='/home/aritram21/Aivian')
    parser.add_argument("--model_type", choices=["ViT-B_16", "ViT-B_32", "ViT-L_16",
                                                 "ViT-L_32", "ViT-H_14"],
                        default="ViT-B_16",
                        help="Which variant to use.")
    parser.add_argument("--pretrained_dir", type=str, default="ViT-B_16.npz",
                        help="Where to search for pretrained ViT models.")
    parser.add_argument("--pretrained_model", type=str, default=None,
                        help="load pretrained model")
    parser.add_argument("--output_dir", default="./TransFG_output", type=str,
                        help="The output directory where checkpoints will be written.")
    parser.add_argument("--img_size", default=448, type=int,
                        help="Resolution size")
    parser.add_argument("--train_batch_size", default=16, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--eval_every", default=200, type=int,
                        help="Run prediction on validation set every so many steps."
                             "Will always run one evaluation at the end of training.")
    parser.add_argument("--learning_rate", default=3e-2, type=float,
                        help="The initial learning rate for SGD.")
    parser.add_argument("--weight_decay", default=0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--num_steps", default=10000, type=int,
                        help="Total number of training steps to perform.")
    parser.add_argument("--decay_type", choices=["cosine", "linear"], default="cosine",
                        help="How to decay the learning rate.")
    parser.add_argument("--warmup_steps", default=500, type=int,
                        help="Step of training to perform learning rate warmup for.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O2',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--smoothing_value', type=float, default=0.0,
                        help="Label smoothing value\n")
    parser.add_argument('--split', type=str, default='non-overlap',
                        help="Split method")
    parser.add_argument('--slide_step', type=int, default=12,
                        help="Slide step for overlap split")
    args = parser.parse_args()

    # if args.fp16 and args.smoothing_value != 0:
    #     raise NotImplementedError("label smoothing not supported for fp16 training now")
    # args.data_root = '{}/{}'.format(args.data_root, args.dataset)

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             timeout=timedelta(minutes=60))
        args.n_gpu = 1
    args.device = device
    args.nprocs = torch.cuda.device_count()

    # Setup logging (INFO only on the main process; WARN elsewhere)
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s" %
                   (args.local_rank, args.device, args.n_gpu, bool(args.local_rank != -1), args.fp16))

    # Set seed
    set_seed(args)

    # Model & Tokenizer Setup
    args, model = setup(args)

    # Training
    train(args, model)


if __name__ == "__main__":
    main()
This is the code I used for my own project. This works fine but you'll have to make minor adjustments. Also, it takes a really really long time to train on a single GPU. Just a warning.
Hello! Can you tell me which part of the code has been commented!
from __future__ import absolute_import, division, print_function import logging import argparse import os import random import numpy as np import time from datetime import timedelta import torch # import torch.distributed as dist from tqdm import tqdm from torch.utils.tensorboard import SummaryWriter # from apex import amp # from apex.parallel import DistributedDataParallel as DDP from transfg_model import VisionTransformer, CONFIGS from utils.scheduler import WarmupLinearSchedule, WarmupCosineSchedule from utils.data_utils import get_loader # from utils.dist_util import get_world_size logger = logging.getLogger(__name__) class AverageMeter(object): """Computes and stores the average and current value""" def __init__(self): self.reset() def reset(self): self.val = 0 self.avg = 0 self.sum = 0 self.count = 0 def update(self, val, n=1): self.val = val self.sum += val * n self.count += n self.avg = self.sum / self.count def simple_accuracy(preds, labels): return (preds == labels).mean() # def reduce_mean(tensor, nprocs): # rt = tensor.clone() # dist.all_reduce(rt, op=dist.ReduceOp.SUM) # rt /= nprocs # return rt def save_model(args, model): model_to_save = model.module if hasattr(model, 'module') else model model_checkpoint = os.path.join(args.output_dir, "%s_checkpoint.bin" % args.name) if args.fp16: checkpoint = { 'model': model_to_save.state_dict(), 'amp': amp.state_dict() } else: checkpoint = { 'model': model_to_save.state_dict(), } torch.save(checkpoint, model_checkpoint) logger.info("Saved model checkpoint to [DIR: %s]", args.output_dir) def setup(args): # Prepare model config = CONFIGS[args.model_type] config.split = args.split config.slide_step = args.slide_step if args.dataset == "CUB_200_2011": num_classes = 200 elif args.dataset == "car": num_classes = 196 elif args.dataset == "nabirds": num_classes = 555 elif args.dataset == "dog": num_classes = 120 elif args.dataset == "INat2017": num_classes = 5089 model = VisionTransformer(config, args.img_size, 
zero_head=True, num_classes=num_classes, smoothing_value=args.smoothing_value) model.load_from(np.load(args.pretrained_dir)) if args.pretrained_model is not None: pretrained_model = torch.load(args.pretrained_model)['model'] model.load_state_dict(pretrained_model) model.to(args.device) num_params = count_parameters(model) logger.info("{}".format(config)) logger.info("Training parameters %s", args) logger.info("Total Parameter: \t%2.1fM" % num_params) return args, model def count_parameters(model): params = sum(p.numel() for p in model.parameters() if p.requires_grad) return params/1000000 def set_seed(args): random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.n_gpu > 0: torch.cuda.manual_seed_all(args.seed) def valid(args, model, writer, test_loader, global_step): # Validation! eval_losses = AverageMeter() logger.info("***** Running Validation *****") logger.info(" Num steps = %d", len(test_loader)) logger.info(" Batch size = %d", args.eval_batch_size) model.eval() all_preds, all_label = [], [] epoch_iterator = tqdm(test_loader, desc="Validating... (loss=X.X)", bar_format="{l_bar}{r_bar}", dynamic_ncols=True, disable=args.local_rank not in [-1, 0]) loss_fct = torch.nn.CrossEntropyLoss() for step, batch in enumerate(epoch_iterator): batch = tuple(t.to(args.device) for t in batch) x, y = batch with torch.no_grad(): logits = model(x) eval_loss = loss_fct(logits, y) eval_loss = eval_loss.mean() eval_losses.update(eval_loss.item()) preds = torch.argmax(logits, dim=-1) if len(all_preds) == 0: all_preds.append(preds.detach().cpu().numpy()) all_label.append(y.detach().cpu().numpy()) else: all_preds[0] = np.append( all_preds[0], preds.detach().cpu().numpy(), axis=0 ) all_label[0] = np.append( all_label[0], y.detach().cpu().numpy(), axis=0 ) epoch_iterator.set_description("Validating... 
(loss=%2.5f)" % eval_losses.val) all_preds, all_label = all_preds[0], all_label[0] accuracy = simple_accuracy(all_preds, all_label) accuracy = torch.tensor(accuracy).to(args.device) # dist.barrier() # val_accuracy = reduce_mean(accuracy, args.nprocs) val_accuracy = accuracy.detach().cpu().numpy() logger.info("\n") logger.info("Validation Results") logger.info("Global Steps: %d" % global_step) logger.info("Valid Loss: %2.5f" % eval_losses.avg) logger.info("Valid Accuracy: %2.5f" % val_accuracy) if args.local_rank in [-1, 0]: writer.add_scalar("test/accuracy", scalar_value=val_accuracy, global_step=global_step) return val_accuracy def train(args, model): """ Train the model """ if args.local_rank in [-1, 0]: os.makedirs(args.output_dir, exist_ok=True) writer = SummaryWriter(log_dir=os.path.join("logs", args.name)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps # Prepare dataset train_loader, test_loader = get_loader(args) # Prepare optimizer and scheduler optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=args.weight_decay) t_total = args.num_steps if args.decay_type == "cosine": scheduler = WarmupCosineSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) else: scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) # if args.fp16: # model, optimizer = amp.initialize(models=model, # optimizers=optimizer, # opt_level=args.fp16_opt_level) # amp._amp_state.loss_scalers[0]._loss_scale = 2**20 # Distributed training # if args.local_rank != -1: # model = DDP(model, message_size=250000000, gradient_predivide_factor=get_world_size()) # Train! logger.info("***** Running training *****") logger.info(" Total optimization steps = %d", args.num_steps) logger.info(" Instantaneous batch size per GPU = %d", args.train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * ( torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) model.zero_grad() set_seed(args) # Added here for reproducibility (even between python 2 and 3) losses = AverageMeter() global_step, best_acc = 0, 0 start_time = time.time() while True: model.train() epoch_iterator = tqdm(train_loader, desc="Training (X / X Steps) (loss=X.X)", bar_format="{l_bar}{r_bar}", dynamic_ncols=True, disable=args.local_rank not in [-1, 0]) all_preds, all_label = [], [] for step, batch in enumerate(epoch_iterator): batch = tuple(t.to(args.device) for t in batch) x, y = batch loss, logits = model(x, y) loss = loss.mean() preds = torch.argmax(logits, dim=-1) if len(all_preds) == 0: all_preds.append(preds.detach().cpu().numpy()) all_label.append(y.detach().cpu().numpy()) else: all_preds[0] = np.append( all_preds[0], preds.detach().cpu().numpy(), axis=0 ) all_label[0] = np.append( all_label[0], y.detach().cpu().numpy(), axis=0 ) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps # if args.fp16: # with amp.scale_loss(loss, optimizer) as scaled_loss: # scaled_loss.backward() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: losses.update(loss.item()*args.gradient_accumulation_steps) if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 epoch_iterator.set_description( "Training (%d / %d Steps) (loss=%2.5f)" % (global_step, t_total, losses.val) ) if args.local_rank in [-1, 0]: writer.add_scalar("train/loss", scalar_value=losses.val, global_step=global_step) writer.add_scalar("train/lr", scalar_value=scheduler.get_lr()[0], global_step=global_step) 
if global_step % args.eval_every == 0: with torch.no_grad(): accuracy = valid(args, model, writer, test_loader, global_step) if args.local_rank in [-1, 0]: if best_acc < accuracy: save_model(args, model) best_acc = accuracy logger.info("best accuracy so far: %f" % best_acc) model.train() if global_step % t_total == 0: break all_preds, all_label = all_preds[0], all_label[0] accuracy = simple_accuracy(all_preds, all_label) accuracy = torch.tensor(accuracy).to(args.device) # dist.barrier() # train_accuracy = reduce_mean(accuracy, args.nprocs) train_accuracy = accuracy.detach().cpu().numpy() logger.info("train accuracy so far: %f" % train_accuracy) losses.reset() if global_step % t_total == 0: break writer.close() logger.info("Best Accuracy: \t%f" % best_acc) logger.info("End Training!") end_time = time.time() logger.info("Total Training Time: \t%f" % ((end_time - start_time) / 3600)) def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--name", required = True , default= "Aivian", help="Name of this run. 
Used for monitoring.") parser.add_argument("--dataset", choices=["CUB_200_2011", "car", "dog", "nabirds", "INat2017"], default="nabirds", help="Which dataset.") parser.add_argument('--data_root', type=str, default='/home/aritram21/Aivian') parser.add_argument("--model_type", choices=["ViT-B_16", "ViT-B_32", "ViT-L_16", "ViT-L_32", "ViT-H_14"], default="ViT-B_16", help="Which variant to use.") parser.add_argument("--pretrained_dir", type=str, default="ViT-B_16.npz", help="Where to search for pretrained ViT models.") parser.add_argument("--pretrained_model", type=str, default=None, help="load pretrained model") parser.add_argument("--output_dir", default="./TransFG_output", type=str, help="The output directory where checkpoints will be written.") parser.add_argument("--img_size", default=448, type=int, help="Resolution size") parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--eval_every", default=200, type=int, help="Run prediction on validation set every so many steps." 
"Will always run one evaluation at the end of training.") parser.add_argument("--learning_rate", default=3e-2, type=float, help="The initial learning rate for SGD.") parser.add_argument("--weight_decay", default=0, type=float, help="Weight deay if we apply some.") parser.add_argument("--num_steps", default=10000, type=int, help="Total number of training epochs to perform.") parser.add_argument("--decay_type", choices=["cosine", "linear"], default="cosine", help="How to decay the learning rate.") parser.add_argument("--warmup_steps", default=500, type=int, help="Step of training to perform learning rate warmup for.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--fp16_opt_level', type=str, default='O2', help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--smoothing_value', type=float, default=0.0, help="Label smoothing value\n") parser.add_argument('--split', type=str, default='non-overlap', help="Split method") parser.add_argument('--slide_step', type=int, default=12, help="Slide step for overlap split") args = parser.parse_args() # if args.fp16 and args.smoothing_value != 0: # raise NotImplementedError("label smoothing not supported for fp16 training now") # args.data_root = '{}/{}'.format(args.data_root, args.dataset) # Setup CUDA, GPU & distributed training if args.local_rank == -1: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', timeout=timedelta(minutes=60)) args.n_gpu = 1 args.device = device args.nprocs = torch.cuda.device_count() # Setup logging logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s" % (args.local_rank, args.device, args.n_gpu, bool(args.local_rank != -1), args.fp16)) # Set seed set_seed(args) # Model & Tokenizer Setup args, model = setup(args) # Training train(args, model) if __name__ == "__main__": main()
This is the code I used for my own project. This works fine but you'll have to make minor adjustments. Also, it takes a really really long time to train on a single GPU. Just a warning.
Hello! Can you tell me which part of the code has been commented!
from __future__ import absolute_import, division, print_function import logging import argparse import os import random import numpy as np import time from datetime import timedelta import torch # import torch.distributed as dist from tqdm import tqdm from torch.utils.tensorboard import SummaryWriter # from apex import amp # from apex.parallel import DistributedDataParallel as DDP from transfg_model import VisionTransformer, CONFIGS from utils.scheduler import WarmupLinearSchedule, WarmupCosineSchedule from utils.data_utils import get_loader # from utils.dist_util import get_world_size logger = logging.getLogger(__name__) class AverageMeter(object): """Computes and stores the average and current value""" def __init__(self): self.reset() def reset(self): self.val = 0 self.avg = 0 self.sum = 0 self.count = 0 def update(self, val, n=1): self.val = val self.sum += val * n self.count += n self.avg = self.sum / self.count def simple_accuracy(preds, labels): return (preds == labels).mean() # def reduce_mean(tensor, nprocs): # rt = tensor.clone() # dist.all_reduce(rt, op=dist.ReduceOp.SUM) # rt /= nprocs # return rt def save_model(args, model): model_to_save = model.module if hasattr(model, 'module') else model model_checkpoint = os.path.join(args.output_dir, "%s_checkpoint.bin" % args.name) if args.fp16: checkpoint = { 'model': model_to_save.state_dict(), 'amp': amp.state_dict() } else: checkpoint = { 'model': model_to_save.state_dict(), } torch.save(checkpoint, model_checkpoint) logger.info("Saved model checkpoint to [DIR: %s]", args.output_dir) def setup(args): # Prepare model config = CONFIGS[args.model_type] config.split = args.split config.slide_step = args.slide_step if args.dataset == "CUB_200_2011": num_classes = 200 elif args.dataset == "car": num_classes = 196 elif args.dataset == "nabirds": num_classes = 555 elif args.dataset == "dog": num_classes = 120 elif args.dataset == "INat2017": num_classes = 5089 model = VisionTransformer(config, args.img_size, 
zero_head=True, num_classes=num_classes, smoothing_value=args.smoothing_value) model.load_from(np.load(args.pretrained_dir)) if args.pretrained_model is not None: pretrained_model = torch.load(args.pretrained_model)['model'] model.load_state_dict(pretrained_model) model.to(args.device) num_params = count_parameters(model) logger.info("{}".format(config)) logger.info("Training parameters %s", args) logger.info("Total Parameter: \t%2.1fM" % num_params) return args, model def count_parameters(model): params = sum(p.numel() for p in model.parameters() if p.requires_grad) return params/1000000 def set_seed(args): random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.n_gpu > 0: torch.cuda.manual_seed_all(args.seed) def valid(args, model, writer, test_loader, global_step): # Validation! eval_losses = AverageMeter() logger.info("***** Running Validation *****") logger.info(" Num steps = %d", len(test_loader)) logger.info(" Batch size = %d", args.eval_batch_size) model.eval() all_preds, all_label = [], [] epoch_iterator = tqdm(test_loader, desc="Validating... (loss=X.X)", bar_format="{l_bar}{r_bar}", dynamic_ncols=True, disable=args.local_rank not in [-1, 0]) loss_fct = torch.nn.CrossEntropyLoss() for step, batch in enumerate(epoch_iterator): batch = tuple(t.to(args.device) for t in batch) x, y = batch with torch.no_grad(): logits = model(x) eval_loss = loss_fct(logits, y) eval_loss = eval_loss.mean() eval_losses.update(eval_loss.item()) preds = torch.argmax(logits, dim=-1) if len(all_preds) == 0: all_preds.append(preds.detach().cpu().numpy()) all_label.append(y.detach().cpu().numpy()) else: all_preds[0] = np.append( all_preds[0], preds.detach().cpu().numpy(), axis=0 ) all_label[0] = np.append( all_label[0], y.detach().cpu().numpy(), axis=0 ) epoch_iterator.set_description("Validating... 
(loss=%2.5f)" % eval_losses.val) all_preds, all_label = all_preds[0], all_label[0] accuracy = simple_accuracy(all_preds, all_label) accuracy = torch.tensor(accuracy).to(args.device) # dist.barrier() # val_accuracy = reduce_mean(accuracy, args.nprocs) val_accuracy = accuracy.detach().cpu().numpy() logger.info("\n") logger.info("Validation Results") logger.info("Global Steps: %d" % global_step) logger.info("Valid Loss: %2.5f" % eval_losses.avg) logger.info("Valid Accuracy: %2.5f" % val_accuracy) if args.local_rank in [-1, 0]: writer.add_scalar("test/accuracy", scalar_value=val_accuracy, global_step=global_step) return val_accuracy def train(args, model): """ Train the model """ if args.local_rank in [-1, 0]: os.makedirs(args.output_dir, exist_ok=True) writer = SummaryWriter(log_dir=os.path.join("logs", args.name)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps # Prepare dataset train_loader, test_loader = get_loader(args) # Prepare optimizer and scheduler optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=args.weight_decay) t_total = args.num_steps if args.decay_type == "cosine": scheduler = WarmupCosineSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) else: scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) # if args.fp16: # model, optimizer = amp.initialize(models=model, # optimizers=optimizer, # opt_level=args.fp16_opt_level) # amp._amp_state.loss_scalers[0]._loss_scale = 2**20 # Distributed training # if args.local_rank != -1: # model = DDP(model, message_size=250000000, gradient_predivide_factor=get_world_size()) # Train! logger.info("***** Running training *****") logger.info(" Total optimization steps = %d", args.num_steps) logger.info(" Instantaneous batch size per GPU = %d", args.train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * ( torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) model.zero_grad() set_seed(args) # Added here for reproducibility (even between python 2 and 3) losses = AverageMeter() global_step, best_acc = 0, 0 start_time = time.time() while True: model.train() epoch_iterator = tqdm(train_loader, desc="Training (X / X Steps) (loss=X.X)", bar_format="{l_bar}{r_bar}", dynamic_ncols=True, disable=args.local_rank not in [-1, 0]) all_preds, all_label = [], [] for step, batch in enumerate(epoch_iterator): batch = tuple(t.to(args.device) for t in batch) x, y = batch loss, logits = model(x, y) loss = loss.mean() preds = torch.argmax(logits, dim=-1) if len(all_preds) == 0: all_preds.append(preds.detach().cpu().numpy()) all_label.append(y.detach().cpu().numpy()) else: all_preds[0] = np.append( all_preds[0], preds.detach().cpu().numpy(), axis=0 ) all_label[0] = np.append( all_label[0], y.detach().cpu().numpy(), axis=0 ) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps # if args.fp16: # with amp.scale_loss(loss, optimizer) as scaled_loss: # scaled_loss.backward() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: losses.update(loss.item()*args.gradient_accumulation_steps) if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 epoch_iterator.set_description( "Training (%d / %d Steps) (loss=%2.5f)" % (global_step, t_total, losses.val) ) if args.local_rank in [-1, 0]: writer.add_scalar("train/loss", scalar_value=losses.val, global_step=global_step) writer.add_scalar("train/lr", scalar_value=scheduler.get_lr()[0], global_step=global_step) 
if global_step % args.eval_every == 0: with torch.no_grad(): accuracy = valid(args, model, writer, test_loader, global_step) if args.local_rank in [-1, 0]: if best_acc < accuracy: save_model(args, model) best_acc = accuracy logger.info("best accuracy so far: %f" % best_acc) model.train() if global_step % t_total == 0: break all_preds, all_label = all_preds[0], all_label[0] accuracy = simple_accuracy(all_preds, all_label) accuracy = torch.tensor(accuracy).to(args.device) # dist.barrier() # train_accuracy = reduce_mean(accuracy, args.nprocs) train_accuracy = accuracy.detach().cpu().numpy() logger.info("train accuracy so far: %f" % train_accuracy) losses.reset() if global_step % t_total == 0: break writer.close() logger.info("Best Accuracy: \t%f" % best_acc) logger.info("End Training!") end_time = time.time() logger.info("Total Training Time: \t%f" % ((end_time - start_time) / 3600)) def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--name", required = True , default= "Aivian", help="Name of this run. 
Used for monitoring.") parser.add_argument("--dataset", choices=["CUB_200_2011", "car", "dog", "nabirds", "INat2017"], default="nabirds", help="Which dataset.") parser.add_argument('--data_root', type=str, default='/home/aritram21/Aivian') parser.add_argument("--model_type", choices=["ViT-B_16", "ViT-B_32", "ViT-L_16", "ViT-L_32", "ViT-H_14"], default="ViT-B_16", help="Which variant to use.") parser.add_argument("--pretrained_dir", type=str, default="ViT-B_16.npz", help="Where to search for pretrained ViT models.") parser.add_argument("--pretrained_model", type=str, default=None, help="load pretrained model") parser.add_argument("--output_dir", default="./TransFG_output", type=str, help="The output directory where checkpoints will be written.") parser.add_argument("--img_size", default=448, type=int, help="Resolution size") parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--eval_every", default=200, type=int, help="Run prediction on validation set every so many steps." 
"Will always run one evaluation at the end of training.") parser.add_argument("--learning_rate", default=3e-2, type=float, help="The initial learning rate for SGD.") parser.add_argument("--weight_decay", default=0, type=float, help="Weight deay if we apply some.") parser.add_argument("--num_steps", default=10000, type=int, help="Total number of training epochs to perform.") parser.add_argument("--decay_type", choices=["cosine", "linear"], default="cosine", help="How to decay the learning rate.") parser.add_argument("--warmup_steps", default=500, type=int, help="Step of training to perform learning rate warmup for.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--fp16_opt_level', type=str, default='O2', help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--smoothing_value', type=float, default=0.0, help="Label smoothing value\n") parser.add_argument('--split', type=str, default='non-overlap', help="Split method") parser.add_argument('--slide_step', type=int, default=12, help="Slide step for overlap split") args = parser.parse_args() # if args.fp16 and args.smoothing_value != 0: # raise NotImplementedError("label smoothing not supported for fp16 training now") # args.data_root = '{}/{}'.format(args.data_root, args.dataset) # Setup CUDA, GPU & distributed training if args.local_rank == -1: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', timeout=timedelta(minutes=60)) args.n_gpu = 1 args.device = device args.nprocs = torch.cuda.device_count() # Setup logging logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s" % (args.local_rank, args.device, args.n_gpu, bool(args.local_rank != -1), args.fp16)) # Set seed set_seed(args) # Model & Tokenizer Setup args, model = setup(args) # Training train(args, model) if __name__ == "__main__": main()
This is the code I used for my own project. This works fine but you'll have to make minor adjustments. Also, it takes a really really long time to train on a single GPU. Just a warning.
Thank you very much for your code!
I would like to know whether four GPUs are required to train the model — is that because the model is too big, or because the dataset is too big? If I take the model and train it on a small dataset, is it possible to train with a single GPU? Would the training time be long?
I have trained the model on a single GPU, but if you want to use 4 GPUs you can definitely go for it. The model is pretty big. It is certainly possible to use the model and train it on a small dataset using a single GPU. I don't think the size of the dataset has anything to do with it. Just reduce your batch size so that everything fits on a single GPU and you should be fine. If the dataset is big though, it will take a longer time than if the dataset is small of course.
Hi! I ran the code on a single GPU. I just commented out the distributed and fp16 portions and it's working fine.
Hello, I ran the code on a single GPU, and I also commented out the distributed and fp16 portions. But my accuracy is only 0.01.
CUDA_VISIBLE_DEVICES=0 python3 train.py --dataset car --num_steps 10000 --name sample_run --pretrained_dir ViT-B_16.npz --data_root /root/TransFG/data/car --train_batch_size 1
Is the number of iterations not enough?
Hi @ylshxi I cannot say anything about this because I do not know what dataset you're running it on, what it's size is/class imbalances/quality is. The number of steps look standard. Batch size of 1 is pretty small and could be the issue. I would go with 8 or 16 at least.
Hi @ylshxi I cannot say anything about this because I do not know what dataset you're running it on, what it's size is/class imbalances/quality is. The number of steps look standard. Batch size of 1 is pretty small and could be the issue. I would go with 8 or 16 at least.
Thank you very much for your comments!!! You are right — it was because batch_size was too small. When batch_size was increased to 4, it worked fine with an accuracy of 91.3.
I would like to ask if you can use a single gpu (4060 series graphics card) to achieve the results in the paper 94.8 on the StanfordCar data set. Because of the Stanford car data set that I run with a 3060 graphics card, increasing batch_size to 8 will cause cuda out of memory problems.
Hello! Can you tell me which part of the code has been commented!
from __future__ import absolute_import, division, print_function import logging import argparse import os import random import numpy as np import time from datetime import timedelta import torch # import torch.distributed as dist from tqdm import tqdm from torch.utils.tensorboard import SummaryWriter # from apex import amp # from apex.parallel import DistributedDataParallel as DDP from transfg_model import VisionTransformer, CONFIGS from utils.scheduler import WarmupLinearSchedule, WarmupCosineSchedule from utils.data_utils import get_loader # from utils.dist_util import get_world_size logger = logging.getLogger(__name__) class AverageMeter(object): """Computes and stores the average and current value""" def __init__(self): self.reset() def reset(self): self.val = 0 self.avg = 0 self.sum = 0 self.count = 0 def update(self, val, n=1): self.val = val self.sum += val * n self.count += n self.avg = self.sum / self.count def simple_accuracy(preds, labels): return (preds == labels).mean() # def reduce_mean(tensor, nprocs): # rt = tensor.clone() # dist.all_reduce(rt, op=dist.ReduceOp.SUM) # rt /= nprocs # return rt def save_model(args, model): model_to_save = model.module if hasattr(model, 'module') else model model_checkpoint = os.path.join(args.output_dir, "%s_checkpoint.bin" % args.name) if args.fp16: checkpoint = { 'model': model_to_save.state_dict(), 'amp': amp.state_dict() } else: checkpoint = { 'model': model_to_save.state_dict(), } torch.save(checkpoint, model_checkpoint) logger.info("Saved model checkpoint to [DIR: %s]", args.output_dir) def setup(args): # Prepare model config = CONFIGS[args.model_type] config.split = args.split config.slide_step = args.slide_step if args.dataset == "CUB_200_2011": num_classes = 200 elif args.dataset == "car": num_classes = 196 elif args.dataset == "nabirds": num_classes = 555 elif args.dataset == "dog": num_classes = 120 elif args.dataset == "INat2017": num_classes = 5089 model = VisionTransformer(config, args.img_size, 
zero_head=True, num_classes=num_classes, smoothing_value=args.smoothing_value) model.load_from(np.load(args.pretrained_dir)) if args.pretrained_model is not None: pretrained_model = torch.load(args.pretrained_model)['model'] model.load_state_dict(pretrained_model) model.to(args.device) num_params = count_parameters(model) logger.info("{}".format(config)) logger.info("Training parameters %s", args) logger.info("Total Parameter: \t%2.1fM" % num_params) return args, model def count_parameters(model): params = sum(p.numel() for p in model.parameters() if p.requires_grad) return params/1000000 def set_seed(args): random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.n_gpu > 0: torch.cuda.manual_seed_all(args.seed) def valid(args, model, writer, test_loader, global_step): # Validation! eval_losses = AverageMeter() logger.info("***** Running Validation *****") logger.info(" Num steps = %d", len(test_loader)) logger.info(" Batch size = %d", args.eval_batch_size) model.eval() all_preds, all_label = [], [] epoch_iterator = tqdm(test_loader, desc="Validating... (loss=X.X)", bar_format="{l_bar}{r_bar}", dynamic_ncols=True, disable=args.local_rank not in [-1, 0]) loss_fct = torch.nn.CrossEntropyLoss() for step, batch in enumerate(epoch_iterator): batch = tuple(t.to(args.device) for t in batch) x, y = batch with torch.no_grad(): logits = model(x) eval_loss = loss_fct(logits, y) eval_loss = eval_loss.mean() eval_losses.update(eval_loss.item()) preds = torch.argmax(logits, dim=-1) if len(all_preds) == 0: all_preds.append(preds.detach().cpu().numpy()) all_label.append(y.detach().cpu().numpy()) else: all_preds[0] = np.append( all_preds[0], preds.detach().cpu().numpy(), axis=0 ) all_label[0] = np.append( all_label[0], y.detach().cpu().numpy(), axis=0 ) epoch_iterator.set_description("Validating... 
(loss=%2.5f)" % eval_losses.val) all_preds, all_label = all_preds[0], all_label[0] accuracy = simple_accuracy(all_preds, all_label) accuracy = torch.tensor(accuracy).to(args.device) # dist.barrier() # val_accuracy = reduce_mean(accuracy, args.nprocs) val_accuracy = accuracy.detach().cpu().numpy() logger.info("\n") logger.info("Validation Results") logger.info("Global Steps: %d" % global_step) logger.info("Valid Loss: %2.5f" % eval_losses.avg) logger.info("Valid Accuracy: %2.5f" % val_accuracy) if args.local_rank in [-1, 0]: writer.add_scalar("test/accuracy", scalar_value=val_accuracy, global_step=global_step) return val_accuracy def train(args, model): """ Train the model """ if args.local_rank in [-1, 0]: os.makedirs(args.output_dir, exist_ok=True) writer = SummaryWriter(log_dir=os.path.join("logs", args.name)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps # Prepare dataset train_loader, test_loader = get_loader(args) # Prepare optimizer and scheduler optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=args.weight_decay) t_total = args.num_steps if args.decay_type == "cosine": scheduler = WarmupCosineSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) else: scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) # if args.fp16: # model, optimizer = amp.initialize(models=model, # optimizers=optimizer, # opt_level=args.fp16_opt_level) # amp._amp_state.loss_scalers[0]._loss_scale = 2**20 # Distributed training # if args.local_rank != -1: # model = DDP(model, message_size=250000000, gradient_predivide_factor=get_world_size()) # Train! logger.info("***** Running training *****") logger.info(" Total optimization steps = %d", args.num_steps) logger.info(" Instantaneous batch size per GPU = %d", args.train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * ( torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) model.zero_grad() set_seed(args) # Added here for reproducibility (even between python 2 and 3) losses = AverageMeter() global_step, best_acc = 0, 0 start_time = time.time() while True: model.train() epoch_iterator = tqdm(train_loader, desc="Training (X / X Steps) (loss=X.X)", bar_format="{l_bar}{r_bar}", dynamic_ncols=True, disable=args.local_rank not in [-1, 0]) all_preds, all_label = [], [] for step, batch in enumerate(epoch_iterator): batch = tuple(t.to(args.device) for t in batch) x, y = batch loss, logits = model(x, y) loss = loss.mean() preds = torch.argmax(logits, dim=-1) if len(all_preds) == 0: all_preds.append(preds.detach().cpu().numpy()) all_label.append(y.detach().cpu().numpy()) else: all_preds[0] = np.append( all_preds[0], preds.detach().cpu().numpy(), axis=0 ) all_label[0] = np.append( all_label[0], y.detach().cpu().numpy(), axis=0 ) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps # if args.fp16: # with amp.scale_loss(loss, optimizer) as scaled_loss: # scaled_loss.backward() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: losses.update(loss.item()*args.gradient_accumulation_steps) if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 epoch_iterator.set_description( "Training (%d / %d Steps) (loss=%2.5f)" % (global_step, t_total, losses.val) ) if args.local_rank in [-1, 0]: writer.add_scalar("train/loss", scalar_value=losses.val, global_step=global_step) writer.add_scalar("train/lr", scalar_value=scheduler.get_lr()[0], global_step=global_step) 
if global_step % args.eval_every == 0: with torch.no_grad(): accuracy = valid(args, model, writer, test_loader, global_step) if args.local_rank in [-1, 0]: if best_acc < accuracy: save_model(args, model) best_acc = accuracy logger.info("best accuracy so far: %f" % best_acc) model.train() if global_step % t_total == 0: break all_preds, all_label = all_preds[0], all_label[0] accuracy = simple_accuracy(all_preds, all_label) accuracy = torch.tensor(accuracy).to(args.device) # dist.barrier() # train_accuracy = reduce_mean(accuracy, args.nprocs) train_accuracy = accuracy.detach().cpu().numpy() logger.info("train accuracy so far: %f" % train_accuracy) losses.reset() if global_step % t_total == 0: break writer.close() logger.info("Best Accuracy: \t%f" % best_acc) logger.info("End Training!") end_time = time.time() logger.info("Total Training Time: \t%f" % ((end_time - start_time) / 3600)) def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--name", required = True , default= "Aivian", help="Name of this run. 
Used for monitoring.") parser.add_argument("--dataset", choices=["CUB_200_2011", "car", "dog", "nabirds", "INat2017"], default="nabirds", help="Which dataset.") parser.add_argument('--data_root', type=str, default='/home/aritram21/Aivian') parser.add_argument("--model_type", choices=["ViT-B_16", "ViT-B_32", "ViT-L_16", "ViT-L_32", "ViT-H_14"], default="ViT-B_16", help="Which variant to use.") parser.add_argument("--pretrained_dir", type=str, default="ViT-B_16.npz", help="Where to search for pretrained ViT models.") parser.add_argument("--pretrained_model", type=str, default=None, help="load pretrained model") parser.add_argument("--output_dir", default="./TransFG_output", type=str, help="The output directory where checkpoints will be written.") parser.add_argument("--img_size", default=448, type=int, help="Resolution size") parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--eval_every", default=200, type=int, help="Run prediction on validation set every so many steps." 
"Will always run one evaluation at the end of training.") parser.add_argument("--learning_rate", default=3e-2, type=float, help="The initial learning rate for SGD.") parser.add_argument("--weight_decay", default=0, type=float, help="Weight deay if we apply some.") parser.add_argument("--num_steps", default=10000, type=int, help="Total number of training epochs to perform.") parser.add_argument("--decay_type", choices=["cosine", "linear"], default="cosine", help="How to decay the learning rate.") parser.add_argument("--warmup_steps", default=500, type=int, help="Step of training to perform learning rate warmup for.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--fp16_opt_level', type=str, default='O2', help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--smoothing_value', type=float, default=0.0, help="Label smoothing value\n") parser.add_argument('--split', type=str, default='non-overlap', help="Split method") parser.add_argument('--slide_step', type=int, default=12, help="Slide step for overlap split") args = parser.parse_args() # if args.fp16 and args.smoothing_value != 0: # raise NotImplementedError("label smoothing not supported for fp16 training now") # args.data_root = '{}/{}'.format(args.data_root, args.dataset) # Setup CUDA, GPU & distributed training if args.local_rank == -1: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', timeout=timedelta(minutes=60)) args.n_gpu = 1 args.device = device args.nprocs = torch.cuda.device_count() # Setup logging logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s" % (args.local_rank, args.device, args.n_gpu, bool(args.local_rank != -1), args.fp16)) # Set seed set_seed(args) # Model & Tokenizer Setup args, model = setup(args) # Training train(args, model) if __name__ == "__main__": main()
This is the code I used for my own project. This works fine but you'll have to make minor adjustments. Also, it takes a really really long time to train on a single GPU. Just a warning.
sorry to bother you! I met this problem when I replace your code:
Traceback (most recent call last):
File "train.py", line 394, in cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)
I wanna ask how can I solve this problem. I tried to change "img_size=64" and data_utils.py to reduce image size, but it still didn't work.
Hi @ylshxi I cannot say anything about this because I do not know what dataset you're running it on, what it's size is/class imbalances/quality is. The number of steps look standard. Batch size of 1 is pretty small and could be the issue. I would go with 8 or 16 at least.
- Thank you very much for your comments!!! You are right. It is because batch_ size is too small. When batch_size increased to 4, it's working fine with an accuracy of 91.3.
- I would like to ask if you can use a single gpu (4060 series graphics card) to achieve the results in the paper 94.8 on the StanfordCar data set. Because of the Stanford car data set that I run with a 3060 graphics card, increasing batch_size to 8 will cause cuda out of memory problems.
I'm not sure if I would be qualified to respond to this query, since I am not the creator of the model, so they would know better. But I would expect that you'd need either multiple GPUs or one really big GPU to be able to replicate the score.
in call result = self.forward(*input, kwargs) File "/home/u/haochen/TransFG/models/modeling.py", line 90, in forward mixed_query_layer = self.query(hidden_states) File "/home/u/miniconda3/envs/transfg/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in call* result = self.forward(input, **kwargs) File "/home/u/miniconda3/envs/transfg/lib/python3.7/site-packages/torch/nn/modules/linear.py", line 87, in forward return F.linear(input, self.weight, self.bias) File "/home/u/miniconda3/envs/transfg/lib/python3.7/site-packages/torch/nn/functional.py", line 1612, in linear output = input.matmul(weight.t()) RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling
cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)
I wanna ask how can I solve this problem. I tried to change "img_size=64" and data_utils.py to reduce image size, but it still didn't work.
I'm not sure about this error. I think it may have something to do with your python environment not having a compatible CUDA driver or something weird like that. But again, I am not qualified to answer this question because I am not the creator of this model, but I'd try running the same thing in Python 3.11 environment instead of Python 3.7.
in call result = self.forward(*input, kwargs) File "/home/u/haochen/TransFG/models/modeling.py", line 90, in forward mixed_query_layer = self.query(hidden_states) File "/home/u/miniconda3/envs/transfg/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in call* result = self.forward(input, **kwargs) File "/home/u/miniconda3/envs/transfg/lib/python3.7/site-packages/torch/nn/modules/linear.py", line 87, in forward return F.linear(input, self.weight, self.bias) File "/home/u/miniconda3/envs/transfg/lib/python3.7/site-packages/torch/nn/functional.py", line 1612, in linear output = input.matmul(weight.t()) RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling
cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)
I wanna ask how can I solve this problem. I tried to change "img_size=64" and data_utils.py to reduce image size, but it still didn't work.I'm not sure about this error. I think it may have something to do with your python environment not having a compatible CUDA driver or something weird like that. But again, I am not qualified to answer this question because I am not the creator of this model, but I'd try running the same thing in Python 3.11 environment instead of Python 3.7.
I changed my version and it worked! Thx!!! Package Version
absl-py 2.1.0 Brotli 1.0.9 certifi 2024.2.2 charset-normalizer 2.0.4 contextlib2 21.6.0 contourpy 1.2.1 cycler 0.12.1 fonttools 4.51.0 grpcio 1.62.2 idna 3.4 kiwisolver 1.4.5 Markdown 3.6 MarkupSafe 2.1.5 matplotlib 3.8.4 mkl-fft 1.3.8 mkl-random 1.2.4 mkl-service 2.4.0 ml-collections 0.1.1 numpy 1.26.4 packaging 24.0 pandas 2.2.2 pillow 10.2.0 pip 23.3.1 protobuf 5.26.1 pyparsing 3.1.2 PySocks 1.7.1 python-dateutil 2.9.0.post0 pytz 2024.1 PyYAML 6.0.1 requests 2.31.0 scipy 1.13.0 setuptools 68.2.2 six 1.16.0 tensorboard 2.16.2 tensorboard-data-server 0.7.2 torch 1.12.0 torchaudio 0.12.0 torchvision 0.13.0 tqdm 4.66.2 typing_extensions 4.9.0 tzdata 2024.1 urllib3 2.1.0 Werkzeug 3.0.2 wheel 0.41.2
But I still met this problem... :
Traceback (most recent call last):
File "/home/u/haochen/TransFG/train.py", line 394, in
How do you define the variable amp? Thanks a lot!
in call result = self.forward(*input, kwargs) File "/home/u/haochen/TransFG/models/modeling.py", line 90, in forward mixed_query_layer = self.query(hidden_states) File "/home/u/miniconda3/envs/transfg/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in call* result = self.forward(input, **kwargs) File "/home/u/miniconda3/envs/transfg/lib/python3.7/site-packages/torch/nn/modules/linear.py", line 87, in forward return F.linear(input, self.weight, self.bias) File "/home/u/miniconda3/envs/transfg/lib/python3.7/site-packages/torch/nn/functional.py", line 1612, in linear output = input.matmul(weight.t()) RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling
cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)
I wanna ask how can I solve this problem. I tried to change "img_size=64" and data_utils.py to reduce image size, but it still didn't work.I'm not sure about this error. I think it may have something to do with your python environment not having a compatible CUDA driver or something weird like that. But again, I am not qualified to answer this question because I am not the creator of this model, but I'd try running the same thing in Python 3.11 environment instead of Python 3.7.
I changed my version and it worked! Thx!!! Package Version
absl-py 2.1.0 Brotli 1.0.9 certifi 2024.2.2 charset-normalizer 2.0.4 contextlib2 21.6.0 contourpy 1.2.1 cycler 0.12.1 fonttools 4.51.0 grpcio 1.62.2 idna 3.4 kiwisolver 1.4.5 Markdown 3.6 MarkupSafe 2.1.5 matplotlib 3.8.4 mkl-fft 1.3.8 mkl-random 1.2.4 mkl-service 2.4.0 ml-collections 0.1.1 numpy 1.26.4 packaging 24.0 pandas 2.2.2 pillow 10.2.0 pip 23.3.1 protobuf 5.26.1 pyparsing 3.1.2 PySocks 1.7.1 python-dateutil 2.9.0.post0 pytz 2024.1 PyYAML 6.0.1 requests 2.31.0 scipy 1.13.0 setuptools 68.2.2 six 1.16.0 tensorboard 2.16.2 tensorboard-data-server 0.7.2 torch 1.12.0 torchaudio 0.12.0 torchvision 0.13.0 tqdm 4.66.2 typing_extensions 4.9.0 tzdata 2024.1 urllib3 2.1.0 Werkzeug 3.0.2 wheel 0.41.2
But I still met this problem... : Traceback (most recent call last): File "/home/u/haochen/TransFG/train.py", line 394, in main() File "/home/u/haochen/TransFG/train.py", line 391, in main train(args, model) File "/home/u/haochen/TransFG/train.py", line 252, in train torch.nn.utils.clip_gradnorm(amp.master_params(optimizer), args.max_grad_norm) NameError: name 'amp' is not defined
How do you define the variable amp?Thanks a lot~
I changed "torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)" to "torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)", and it could run. Can I make such changes?
Hi @ylshxi I cannot say anything about this because I do not know what dataset you're running it on, what it's size is/class imbalances/quality is. The number of steps look standard. Batch size of 1 is pretty small and could be the issue. I would go with 8 or 16 at least.
- Thank you very much for your comments!!! You are right. It is because batch_ size is too small. When batch_size increased to 4, it's working fine with an accuracy of 91.3.
- I would like to ask if you can use a single gpu (4060 series graphics card) to achieve the results in the paper 94.8 on the StanfordCar data set. Because of the Stanford car data set that I run with a 3060 graphics card, increasing batch_size to 8 will cause cuda out of memory problems.
I'm not sure if I would be qualified to respond to this query, since I am not the creator of the model, so they would know better. But I would expect that you'd need either multiple GPUs or one really big GPU to be able to replicate the score.
But I can't start. I don't know where to change it. Can you give me some advice? Look forward to your reply!
But I still met this problem... : Traceback (most recent call last): File "/home/u/haochen/TransFG/train.py", line 394, in main() File "/home/u/haochen/TransFG/train.py", line 391, in main train(args, model) File "/home/u/haochen/TransFG/train.py", line 252, in train torch.nn.utils.clip_gradnorm(amp.master_params(optimizer), args.max_grad_norm) NameError: name 'amp' is not defined
How do you define the variable amp?Thanks a lot~
I think amp might be some module I don't know. You can try replacing it with torch.nn.utils.clip_gradnorm(model.parameters(), args.max_grad_norm), and maybe it should be fine. If it works, it works!
Hi @ylshxi I cannot say anything about this because I do not know what dataset you're running it on, what it's size is/class imbalances/quality is. The number of steps look standard. Batch size of 1 is pretty small and could be the issue. I would go with 8 or 16 at least.
- Thank you very much for your comments!!! You are right. It is because batch_ size is too small. When batch_size increased to 4, it's working fine with an accuracy of 91.3.
- I would like to ask if you can use a single gpu (4060 series graphics card) to achieve the results in the paper 94.8 on the StanfordCar data set. Because of the Stanford car data set that I run with a 3060 graphics card, increasing batch_size to 8 will cause cuda out of memory problems.
I'm not sure if I would be qualified to respond to this query, since I am not the creator of the model, so they would know better. But I would expect that you'd need either multiple GPUs or one really big GPU to be able to replicate the score.
- Thank you for your comments. I tried to run once with the 4090 series graphics card, and the number of iterations was 1000000 and the size was 8, with an accuracy of 92.3. After that, I intend to use more GPU to run, hoping to achieve the results in the paper.
- Besides, I would like to ask if you have tried to run your own dataset? Now I want to run the plane's data set. The file structure is as follows: -- fvgc. -- train. -- class1. -- class2. ... -- test. -- class1. -- class2. ...
But I can't start. I don't know where to change it. Can you give me some advice? Look forward to your reply!
I have not used a custom dataset. I simply trained it on NABirds dataset. Sorry!
But I still met this problem... : Traceback (most recent call last): File "/home/u/haochen/TransFG/train.py", line 394, in main() File "/home/u/haochen/TransFG/train.py", line 391, in main train(args, model) File "/home/u/haochen/TransFG/train.py", line 252, in train torch.nn.utils.clip_gradnorm(amp.master_params(optimizer), args.max_grad_norm) NameError: name 'amp' is not defined
How do you define the variable amp?Thanks a lot~
I think amp might be some module I don't know. You can try replacing it with torch.nn.utils.clip_gradnorm(model.parameters(), args.max_grad_norm), and maybe it should be fine. If it works, it works!
It works! Thx a lot!!!
Hi @ylshxi I cannot say anything about this because I do not know what dataset you're running it on, what it's size is/class imbalances/quality is. The number of steps look standard. Batch size of 1 is pretty small and could be the issue. I would go with 8 or 16 at least.
- Thank you very much for your comments!!! You are right. It is because batch_ size is too small. When batch_size increased to 4, it's working fine with an accuracy of 91.3.
- I would like to ask if you can use a single gpu (4060 series graphics card) to achieve the results in the paper 94.8 on the StanfordCar data set. Because of the Stanford car data set that I run with a 3060 graphics card, increasing batch_size to 8 will cause cuda out of memory problems.
I'm not sure if I would be qualified to respond to this query, since I am not the creator of the model, so they would know better. But I would expect that you'd need either multiple GPUs or one really big GPU to be able to replicate the score.
- Thank you for your comments. I tried to run once with the 4090 series graphics card, and the number of iterations was 1000000 and the size was 8, with an accuracy of 92.3. After that, I intend to use more GPU to run, hoping to achieve the results in the paper.
- Besides, I would like to ask if you have tried to run your own dataset? Now I want to run the plane's data set. The file structure is as follows: -- fvgc. -- train. -- class1. -- class2. ... -- test. -- class1. -- class2. ...
But I can't start. I don't know where to change it. Can you give me some advice? Look forward to your reply!
I have not used a custom dataset. I simply trained it on NABirds dataset. Sorry!
It's okay. I've solved this problem, using my dataset. And it works well. Thank you again for your comments !
I have only one GPU. I have set local_rank=-1 and assigned os.environ['CUDA_VISIBLE_DEVICES']='0',but failed to run the code. What do i need to revise to successfully run on one GPU?