import logging
import os
import random
from datetime import datetime
import numpy as np
import torch
from torch import optim
from torch.cuda.amp import GradScaler
try:
import wandb
except ImportError:
wandb = None
try:
import torch.utils.tensorboard as tensorboard
except ImportError:
tensorboard = None
try:
import horovod.torch as hvd
except ImportError:
hvd = None
from torch_ema import ExponentialMovingAverage
from open_clip import create_model_and_transforms, trace_model
from training.data import get_data
from training.distributed import is_master, init_distributed_device, world_info_from_env
from training.logger import setup_logging
from training.params import parse_args
from training.scheduler import cosine_lr
from training.train import train_one_epoch, evaluate
def random_seed(seed=42, rank=0):
    """Seed the torch, NumPy and stdlib ``random`` generators.

    All three generators receive the same value, ``seed + rank``, so passing
    a different ``rank`` yields a different (but reproducible) random stream.

    Args:
        seed: Base seed value.
        rank: Offset added to ``seed`` before seeding.
    """
    effective_seed = seed + rank
    # Same order as before: torch first, then NumPy, then the stdlib.
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(effective_seed)
def main():
    """Run a full train and/or evaluation session driven by command-line args.

    The steps, in order: parse arguments, derive the experiment name, set up
    logging and the (possibly distributed) device, build the model, create the
    optimizer / grad scaler, optionally resume from ``args.resume``, create the
    EMA tracker, load the datasets, set up tensorboard / wandb reporting, and
    finally either evaluate once (no train data) or run the epoch loop with
    periodic evaluation and checkpointing.

    Returns:
        ``-1`` if the log file for this experiment name already exists,
        otherwise ``None``.
    """
    args = parse_args()

    # sanitize model name for filesystem / uri use, easier if we don't use / in name as a rule?
    args.model = args.model.replace('/', '-')

    # get the name of the experiments
    if args.name is None:
        args.name = '-'.join([
            datetime.now().strftime("%Y_%m_%d-%H_%M_%S"),
            f"model_{args.model}",
            f"lr_{args.lr}",
            f"b_{args.batch_size}",
            f"j_{args.workers}",
            f"p_{args.precision}",
        ])

    # discover initial world args early so we can log properly
    args.distributed = False
    args.local_rank, args.rank, args.world_size = world_info_from_env()

    args.log_path = None
    if is_master(args, local=args.log_local):
        log_base_path = os.path.join(args.logs, args.name)
        os.makedirs(log_base_path, exist_ok=True)
        log_filename = f'out-{args.rank}' if args.log_local else 'out.log'
        args.log_path = os.path.join(log_base_path, log_filename)
        if os.path.exists(args.log_path):
            # The message used to contain an unformatted "{}" placeholder that
            # was printed literally; it now matches copy_codebase's wording.
            print(
                "Error. Experiment already exists. Use --name to specify a new experiment."
            )
            return -1

    # Set logger
    args.log_level = logging.DEBUG if args.debug else logging.INFO
    setup_logging(args.log_path, args.log_level)

    # fully initialize distributed device environment
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False
    device = init_distributed_device(args)

    args.wandb = 'wandb' in args.report_to or 'all' in args.report_to
    args.tensorboard = 'tensorboard' in args.report_to or 'all' in args.report_to
    if is_master(args):
        args.tensorboard_path = os.path.join(args.logs, args.name, "tensorboard") if args.tensorboard else ''
        args.checkpoint_path = os.path.join(args.logs, args.name, "checkpoints")
        for dirname in [args.tensorboard_path, args.checkpoint_path]:
            if dirname:
                os.makedirs(dirname, exist_ok=True)
    else:
        args.tensorboard_path = ''
        args.checkpoint_path = ''

    if args.copy_codebase:
        copy_codebase(args)

    assert args.precision in ['amp', 'fp16', 'fp32']
    if args.precision == 'fp16':
        logging.warning(
            'It is recommended to use AMP mixed-precision instead of FP16. '
            'FP16 support needs further verification and tuning, especially for train.')

    if args.horovod:
        logging.info(
            f'Running in horovod mode with multiple processes / nodes. Device: {args.device}.'
            f'Process (global: {args.rank}, local {args.local_rank}), total {args.world_size}.')
    elif args.distributed:
        logging.info(
            f'Running in distributed mode with multiple processes. Device: {args.device}.'
            f'Process (global: {args.rank}, local {args.local_rank}), total {args.world_size}.')
    else:
        logging.info(f'Running with a single process. Device {args.device}.')

    # Seed with rank offset 0 while the model is built, then re-seed with the
    # process rank afterwards.
    random_seed(args.seed, 0)
    model, preprocess_train, preprocess_val = create_model_and_transforms(
        args.model,
        args.pretrained,
        precision=args.precision,
        device=device,
        jit=args.torchscript,
        force_quick_gelu=args.force_quick_gelu,
        pretrained_image=args.pretrained_image,
        scale=(args.scale_min, args.scale_max),
    )
    random_seed(args.seed, args.rank)

    if args.trace:
        model = trace_model(model, batch_size=args.batch_size, device=device)

    if args.lock_image:
        # lock image tower as per LiT - https://arxiv.org/abs/2111.07991
        model.lock_image_tower(
            unlocked_groups=args.lock_image_unlocked_groups,
            freeze_bn_stats=args.lock_image_freeze_bn_stats)

    if args.lock_text:
        # NOTE(review): `loc_text_tower` looks like a typo for `lock_text_tower`;
        # the model class is not visible here, so the call is left unchanged.
        # Confirm against the model definition.
        model.loc_text_tower()

    if args.grad_checkpointing:
        model.set_grad_checkpointing()

    if is_master(args):
        logging.info("Model:")
        logging.info(f"{str(model)}")
        logging.info("Params:")
        params_file = os.path.join(args.logs, args.name, "params.txt")
        with open(params_file, "w") as f:
            for name in sorted(vars(args)):
                val = getattr(args, name)
                logging.info(f" {name}: {val}")
                f.write(f"{name}: {val}\n")

    if args.distributed and not args.horovod:
        if args.use_bn_sync:
            model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
        ddp_args = {}
        if args.ddp_static_graph:
            # this doesn't exist in older PyTorch, arg only added if enabled
            ddp_args['static_graph'] = True
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[device], find_unused_parameters=True, **ddp_args)

    # create optimizer and scaler
    optimizer = None
    scaler = None
    ema = None
    if args.train_data:
        assert not args.trace, 'Cannot train with traced model'

        def exclude(n, p):
            # Parameters matched here go into the weight_decay=0 group.
            return p.ndim < 2 or "bn" in n or "ln" in n or "bias" in n or 'logit_scale' in n

        named_parameters = list(model.named_parameters())
        gain_or_bias_params = [p for n, p in named_parameters if exclude(n, p) and p.requires_grad]
        rest_params = [p for n, p in named_parameters if not exclude(n, p) and p.requires_grad]

        optimizer = optim.AdamW(
            [
                {"params": gain_or_bias_params, "weight_decay": 0.},
                {"params": rest_params, "weight_decay": args.wd},
            ],
            lr=args.lr,
            betas=(args.beta1, args.beta2),
            eps=args.eps,
        )
        if args.horovod:
            optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
            hvd.broadcast_parameters(model.state_dict(), root_rank=0)
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        scaler = GradScaler() if args.precision == "amp" else None

    # optionally resume from a checkpoint
    start_epoch = 0
    if args.resume is not None:
        if os.path.isfile(args.resume):
            # NOTE: torch.load unpickles the file, so only resume from
            # checkpoints that come from a trusted source.
            checkpoint = torch.load(args.resume, map_location=device)
            if 'epoch' in checkpoint:
                # resuming a train checkpoint w/ epoch and optimizer state
                start_epoch = checkpoint["epoch"]
                sd = checkpoint["state_dict"]
                # Strip the 'module.' prefix when the checkpoint keys carry it
                # but the current run is not distributed.
                if not args.distributed and next(iter(sd.items()))[0].startswith('module'):
                    sd = {k[len('module.'):]: v for k, v in sd.items()}
                model.load_state_dict(sd)
                if optimizer is not None:
                    optimizer.load_state_dict(checkpoint["optimizer"])
                if scaler is not None and 'scaler' in checkpoint:
                    scaler.load_state_dict(checkpoint['scaler'])
                logging.info(f"=> resuming checkpoint '{args.resume}' (epoch {start_epoch})")
            else:
                # loading a bare (model only) checkpoint for fine-tune or evaluation
                model.load_state_dict(checkpoint)
                logging.info(f"=> loaded checkpoint '{args.resume}' (epoch {start_epoch})")
        else:
            logging.info("=> no checkpoint found at '{}'".format(args.resume))

    # Create the EMA tracker only now, after any checkpoint weights have been
    # loaded into the model. It was previously created before the resume step,
    # which left its averaged weights at the pre-resume values.
    # NOTE(review): the EMA state itself is not stored in checkpoints, so a
    # resumed run restarts the average from the resumed model weights.
    if args.ema:
        ema = ExponentialMovingAverage(model.parameters(), decay=args.ema_ratio)

    # initialize datasets
    data = get_data(args, (preprocess_train, preprocess_val), epoch=start_epoch)
    assert len(data), 'At least one train or eval dataset must be specified.'

    # create scheduler if train
    scheduler = None
    if 'train' in data and optimizer is not None:
        total_steps = data["train"].dataloader.num_batches * args.epochs
        scheduler = cosine_lr(optimizer, args.lr, args.warmup, total_steps)

    # determine if this worker should save logs and checkpoints. only do so if it is rank == 0
    args.save_logs = args.logs and args.logs.lower() != 'none' and is_master(args)
    writer = None
    if args.save_logs and args.tensorboard:
        assert tensorboard is not None, "Please install tensorboard."
        writer = tensorboard.SummaryWriter(args.tensorboard_path)

    if args.wandb and is_master(args):
        assert wandb is not None, 'Please install wandb.'
        logging.debug('Starting wandb.')
        # Eval-only runs have no 'train' entry; skip instead of raising KeyError.
        if 'train' in data:
            args.train_sz = data["train"].dataloader.num_samples
        if args.val_data is not None:
            args.val_sz = data["val"].dataloader.num_samples
        # you will have to configure this for your project!
        wandb.init(
            project="open-clip",
            notes=args.wandb_notes,
            tags=[],
            config=vars(args),
        )
        if args.debug:
            wandb.watch(model, log='all')
        wandb.save(params_file)
        logging.debug('Finished loading wandb.')

    if 'train' not in data:
        evaluate(model, data, start_epoch, ema, args, writer)
        return

    # Dataset keys that trigger an evaluation pass when present in `data`.
    eval_splits = ('val', 'imagenet-val', 'imagenet-v2', 'ade-val')
    has_eval_data = any(v in data for v in eval_splits)

    # Evaluate once before training starts.
    if has_eval_data:
        evaluate(model, data, start_epoch, ema, args, writer)

    for epoch in range(start_epoch, args.epochs):
        if is_master(args):
            logging.info(f'Start epoch {epoch}')

        train_one_epoch(model, data, epoch, optimizer, scaler, scheduler, ema, args, writer)
        completed_epoch = epoch + 1

        if has_eval_data:
            evaluate(model, data, completed_epoch, ema, args, writer)

        # Saving checkpoints.
        if args.save_logs:
            checkpoint_dict = {
                "epoch": completed_epoch,
                "name": args.name,
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            }
            if scaler is not None:
                checkpoint_dict["scaler"] = scaler.state_dict()

            # Save on the final epoch and on every `save_frequency`-th epoch.
            if completed_epoch == args.epochs or (
                args.save_frequency > 0 and (completed_epoch % args.save_frequency) == 0
            ):
                torch.save(
                    checkpoint_dict,
                    os.path.join(args.checkpoint_path, f"epoch_{completed_epoch}.pt"),
                )
            if args.save_most_recent:
                torch.save(
                    checkpoint_dict,
                    os.path.join(args.checkpoint_path, "epoch_latest.pt"),
                )

    if args.wandb and is_master(args):
        wandb.finish()
def copy_codebase(args):
    """Copy the code tree into ``<args.logs>/<args.name>/code``.

    The tree that gets copied is the directory three levels above the
    resolved path of this file. Entries named ``log``, ``logs`` or ``wandb``
    are skipped.

    Args:
        args: Object providing the ``logs`` and ``name`` attributes.

    Returns:
        ``-1`` (after printing an error) if the destination already exists,
        ``1`` once the copy has finished.
    """
    import shutil

    new_code_path = os.path.join(args.logs, args.name, "code")
    if os.path.exists(new_code_path):
        print(
            f"Error. Experiment already exists at {new_code_path}. Use --name to specify a new experiment."
        )
        return -1

    print(f"Copying codebase to {new_code_path}")
    this_file = os.path.realpath(__file__)
    # Walk up three directory levels from this file.
    source_root = os.path.dirname(os.path.dirname(os.path.dirname(this_file)))
    skip_entries = shutil.ignore_patterns('log', 'logs', 'wandb')
    shutil.copytree(source_root, new_code_path, ignore=skip_entries)
    print("Done copying code.")
    return 1
if __name__ == "__main__":
    import sys

    # main() returns -1 when the experiment already exists; pass that on as
    # the process exit status instead of discarding it (None still exits 0).
    sys.exit(main())
Raccoon's error
Client network socket disconnected before secure TLS connection was established
Your expectation
Extension version: 0.67.0
VS Code version: Code 1.87.2 (863d2581ecda6849923a2118d93a088b0745d9d6, 2024-03-08T15:20:17.278Z)
OS version: Windows_NT x64 10.0.22631
Modes:
Type: Bug
Your question
请用中文,帮我对代码进行逐行解释和说明
Raccoon's error
Client network socket disconnected before secure TLS connection was established
Your expectation
Extension version: 0.67.0 VS Code version: Code 1.87.2 (863d2581ecda6849923a2118d93a088b0745d9d6, 2024-03-08T15:20:17.278Z) OS version: Windows_NT x64 10.0.22631 Modes:
System Info
|Item|Value| |---|---| |CPUs|AMD Ryzen 7 5800H with Radeon Graphics (16 x 3194)| |GPU Status|2d_canvas: enabled
canvas_oop_rasterization: enabled_on
direct_rendering_display_compositor: disabled_off_ok
gpu_compositing: enabled
multiple_raster_threads: enabled_on
opengl: enabled_on
rasterization: enabled
raw_draw: disabled_off_ok
skia_graphite: disabled_off
video_decode: enabled
video_encode: enabled
vulkan: disabled_off
webgl: enabled
webgl2: enabled
webgpu: enabled| |Load (avg)|undefined| |Memory (System)|15.86GB (6.55GB free)| |Process Argv|--crash-reporter-id cf5fccc7-b0b9-4f6e-8be9-3c55ff0ffce5| |Screen Reader|no| |VM|0%|
A/B Experiments
``` vsliv368cf:30146710 vspor879:30202332 vspor708:30202333 vspor363:30204092 vscoreces:30445986 vscod805:30301674 binariesv615:30325510 vsaa593cf:30376535 py29gd2263:30899288 c4g48928:30535728 azure-dev_surveyone:30548225 962ge761:30959799 pythongtdpath:30769146 pythonidxpt:30866567 pythonnoceb:30805159 asynctok:30898717 pythontestfixt:30902429 pythonregdiag2:30936856 pyreplss1:30897532 pythonmypyd1:30879173 pythoncet0:30885854 h48ei257:31000450 pythontbext0:30879054 dsvsc016:30899300 dsvsc017:30899301 dsvsc018:30899302 cppperfnew:31000557 d34g3935:30971562 fegfb526:30981948 bg6jg535:30979843 ccp2r3:30993541 dsvsc020:30976470 pythonait:31006305 gee8j676:30988843 dsvsc021:30996838 gd77d436:30999572 ```