Open ghost opened 5 years ago
# Build the network and, when requested, initialise it from the pretrained
# ShuffleNetV2 checkpoint, copying only the entries the model actually has.
model_path = os.path.join(
    shufflenetv2_path, 'shufflenetv2_x1_69.402_88.374.pth.tar')
model = ShuffleNetV2()
if pretrained:
    print(f"=> loading model '{model_path}'")
    loaded_state = torch.load(model_path)
    model_dict = model.state_dict()
    # 1. keep only the checkpoint entries whose key exists in the model
    pretrained_dict = {}
    for key, tensor in loaded_state.items():
        if key in model_dict:
            pretrained_dict[key] = tensor
    # 2. overwrite the matching entries of the model's own state dict
    model_dict.update(pretrained_dict)
    # 3. load the merged state dict back into the model
    model.load_state_dict(model_dict)
# Save the model (weights and biases) after an epoch so it can be reused later.
filename = f'{check_root_feature}/feature-current.pth'
torch.save(model.state_dict(), filename)
# Save the optimizer state.
filename_opti = f'{check_root_opti}/opti-current.pth'
torch.save(optimizer_feature.state_dict(), filename_opti)

# Load the previous training result (weights and biases) to continue training.
feature_state = torch.load(check_root_feature + '/feature-current.pth')
model.load_state_dict(feature_state)
# Load the optimizer state.
optimizer_state = torch.load(check_root_opti + '/opti-current.pth')
optimizer_feature.load_state_dict(optimizer_state)

# Restore counterpart of torch.save(model.state_dict(), filename).
fileroot = f'{check_root_feature}/feature-current.pth'
model.load_state_dict(torch.load(fileroot))
# Have a look at data: take the first training batch and show it as one grid,
# titled with the class name of every sample.
train_batches = iter(dataloaders['train'])
inputs, classes = next(train_batches)
out = torchvision.utils.make_grid(inputs)
titles = [class_names[label] for label in classes]
imshow(out, title=titles)
用 del 及时删除不再使用的中间变量
使用 async=True 来并行化数据传输和GPU数字运算 (注意: async 自 Python 3.7 起是保留字, PyTorch 0.4 及以后对应的参数名为 non_blocking=True)
# demo.py
import torch
import torchvision.utils as vutils
import numpy as np
import torchvision.models as models
from torchvision import datasets
from tensorboardX import SummaryWriter
# tensorboardX demo: log scalars, images, audio, text, histograms, PR curves
# and an embedding, then export the scalars to JSON.
resnet18 = models.resnet18(False)
writer = SummaryWriter()
sample_rate = 44100
freqs = [262, 294, 330, 349, 392, 440, 440, 440, 440, 440, 440]

for n_iter in range(100):
    dummy_s1 = torch.rand(1)
    dummy_s2 = torch.rand(1)
    # data grouping by `slash`
    writer.add_scalar('data/scalar1', dummy_s1[0], n_iter)
    writer.add_scalar('data/scalar2', dummy_s2[0], n_iter)
    writer.add_scalars('data/scalar_group', {'xsinx': n_iter * np.sin(n_iter),
                                             'xcosx': n_iter * np.cos(n_iter),
                                             'arctanx': np.arctan(n_iter)}, n_iter)

    dummy_img = torch.rand(32, 3, 64, 64)  # output from network
    if n_iter % 10 == 0:
        x = vutils.make_grid(dummy_img, normalize=True, scale_each=True)
        writer.add_image('Image', x, n_iter)

        # Fill the whole 2-second buffer with a cosine tone.  The previous
        # loop ran over range(x.size(0)) -- the channel count of the image
        # grid -- so only the first few samples were ever written and the
        # logged clip was effectively silent.
        # amplitude of sound should in [-1, 1]
        sample_index = torch.arange(sample_rate * 2, dtype=torch.float32)
        phase_step = float(freqs[n_iter // 10] * np.pi / float(sample_rate))
        dummy_audio = torch.cos(sample_index * phase_step)
        writer.add_audio('myAudio', dummy_audio, n_iter, sample_rate=sample_rate)

        writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)

        for name, param in resnet18.named_parameters():
            writer.add_histogram(name, param.clone().cpu().data.numpy(), n_iter)

        # needs tensorboard 0.4RC or later
        writer.add_pr_curve('xoxo', np.random.randint(2, size=100), np.random.rand(100), n_iter)

# Embedding demo on the first 100 MNIST test images.
# NOTE(review): newer torchvision releases expose these attributes as
# `data` / `targets`; `test_data` / `test_labels` are kept for compatibility
# with the version this demo was written for -- confirm against the installed one.
dataset = datasets.MNIST('mnist', train=False, download=True)
images = dataset.test_data[:100].float()
label = dataset.test_labels[:100]
features = images.view(100, 784)
writer.add_embedding(features, metadata=label, label_img=images.unsqueeze(1))

# export scalar data to JSON for external processing
writer.export_scalars_to_json("./all_scalars.json")
writer.close()
# One optimisation step with gradient-norm clipping.
# Reset the gradients before computing new ones.
optimizer.zero_grad()
# The first value returned by the model call is used as the loss; the second
# is kept as `hidden` (presumably the recurrent state for the next batch --
# confirm against the model definition).
loss, hidden = model(data, hidden, targets)
loss.backward()
# https://pytorch.org/docs/master/nn.html#clip-grad-norm
# torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=2)
# torch.nn.utils.clip_grad_value_(parameters, clip_value)
# Clipping sits between backward() and step() so the optimizer applies the
# clipped gradients; args.clip is passed as max_norm.
torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
optimizer.step()
既然在BP过程中会产生梯度消失/爆炸(就是偏导无限接近0,导致长时记忆无法更新),那么最简单粗暴的方法,设定阈值,当梯度小于/大于阈值时,对梯度进行限制.
官方文档里还有其他的: https://blog.csdn.net/dss_dssssd/article/details/83959474
torch.nn.init.uniform_(tensor, a=0, b=1)
, 服从U(a,b)
torch.nn.init.normal_(tensor, mean=0, std=1)
服从N(mean,std)
torch.nn.init.constant_(tensor, val)
初始化整个矩阵为常数val
torch.nn.init.xavier_uniform_(tensor, gain=1)
均匀分布U(−a,a)
torch.nn.init.xavier_normal_(tensor, gain=1)
正态分布N(0,std)
Xavier在tanh中表现得很好,但在ReLU激活函数中表现得很差,所以何恺明提出了针对于ReLU的初始化方法。
该方法在Xavier的基础上再除以2, 也就是说在方差推导过程中, 式子左侧除以2. pytorch也提供了两个版本:
torch.nn.init.kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu')
均匀分布U(−bound,bound)
torch.nn.init.kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu')
正态分布N(0,std)
两函数的参数:
a: 该层后面一层的激活函数中负的斜率(默认为ReLU,此时a=0)
mode: 'fan_in' (default) 或者 'fan_out'. 使用fan_in保持weights的方差在前向传播中不变;使用fan_out保持weights的方差在反向传播中不变
针对于Relu的激活函数,基本使用He initialization,pytorch也是使用kaiming 初始化卷积层参数的
torch.nn.init 里的初始化函数
layer1 = torch.nn.Linear(10,20)
torch.nn.init.xavier_uniform_(layer1.weight)
torch.nn.init.constant_(layer1.bias, 0)
reset_parameters()
方法,并不推荐使用nn.Sequential
或自定义多层网络
torch.nn.Module.apply(fn)
将函数fn
递归的运用在每个子模块上,这些子模块由self.children()
返回.
注意:此种初始化方式采用的是递归,而在python中,对递归层数是有限制的,所以当网络结构很深时,可能会出现递归层数过深的错误
import torch
from torch import nn
# hyper parameters: every layer width is 1 in this toy example
in_dim = n_hidden_1 = n_hidden_2 = out_dim = 1
class Net(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear -> ReLU -> Linear.

    Args:
        in_dim: number of input features.
        n_hidden_1: output width of the first linear layer.
        n_hidden_2: output width of the second linear layer.
        out_dim: number of output features.
    """

    def __init__(self, in_dim, n_hidden_1, n_hidden_2, out_dim):
        super().__init__()
        # All layers live in a single Sequential container named `layer`.
        self.layer = nn.Sequential(
            nn.Linear(in_dim, n_hidden_1),
            nn.ReLU(True),
            nn.Linear(n_hidden_1, n_hidden_2),
            nn.ReLU(True),
            nn.Linear(n_hidden_2, out_dim)
        )

    def forward(self, x):
        """Pass ``x`` through the whole stack and return the result."""
        # __init__ only defines `self.layer`; the former calls to
        # self.layer1 / self.layer2 / self.layer3 raised AttributeError.
        return self.layer(x)
# NOTE: defined outside the class.
# 1. Choose the initialisation scheme according to the layer type.
def weight_init(m):
    """Initialise the parameters of a single module in place.

    Intended to be passed to ``nn.Module.apply``.

    * ``nn.Linear``: Xavier-normal weight, zero bias.
    * ``nn.Conv2d``: Kaiming-normal weight (``fan_out``, ReLU); bias untouched.
    * ``nn.BatchNorm2d``: weight set to 1, bias set to 0.

    Any other module type is left unchanged.  Parameters that do not exist
    (``bias=False`` / ``affine=False``) are skipped instead of raising.

    Args:
        m: the module to initialise.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        # bias is None when the layer was built with bias=False
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
    # Conv2d layers get their own scheme
    elif isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
    # batch-normalisation layers
    elif isinstance(m, nn.BatchNorm2d):
        # weight and bias are None when the layer was built with affine=False
        if m.weight is not None:
            nn.init.constant_(m.weight, 1)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
# 2. Build the network.
model = Net(
    in_dim=in_dim,
    n_hidden_1=n_hidden_1,
    n_hidden_2=n_hidden_2,
    out_dim=out_dim,
)
# 3. Apply weight_init to the submodules.
model.apply(fn=weight_init)
在 __init__ 中迭代循环 self.modules() 来初始化网络参数
import torch
from torch import nn
# hyper parameters: every layer width is 1 in this toy example
in_dim = n_hidden_1 = n_hidden_2 = out_dim = 1
class Net(nn.Module):
    """Small fully connected network that initialises its own parameters.

    The stack is Linear -> ReLU -> Linear -> ReLU -> Linear.  After building
    it, ``__init__`` walks ``self.modules()`` and initialises every module
    according to its type.

    Args:
        in_dim: number of input features.
        n_hidden_1: output width of the first linear layer.
        n_hidden_2: output width of the second linear layer.
        out_dim: number of output features.
    """

    def __init__(self, in_dim, n_hidden_1, n_hidden_2, out_dim):
        super().__init__()
        self.layer = nn.Sequential(
            nn.Linear(in_dim, n_hidden_1),
            nn.ReLU(True),
            nn.Linear(n_hidden_1, n_hidden_2),
            nn.ReLU(True),
            nn.Linear(n_hidden_2, out_dim)
        )
        # Iterate over all modules and initialise their parameters.
        for m in self.modules():
            if isinstance(m, nn.Linear):
                # Example values only; real code need not use these constants.
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, -100)
            # Conv2d layers get their own scheme
            elif isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(
                    m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                # nn.init functions fill a tensor in place, so they must get
                # the parameter itself; `.item()` would hand them a Python
                # float (and fails outright on multi-element parameters).
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Pass ``x`` through the whole stack and return the result."""
        x = self.layer(x)
        return x
# Build the network from the hyper parameters defined above.
model = Net(in_dim, n_hidden_1, n_hidden_2, out_dim)
# Print parameter information.
def print_weight(m):
    """Print the scalar weight and bias of ``m`` if it is an ``nn.Linear``.

    Modules of any other type are ignored.  The values are read with
    ``.item()``, which requires each parameter to hold exactly one element.

    Args:
        m: the module to inspect.
    """
    if not isinstance(m, nn.Linear):
        return
    for label, parameter in (("weight", m.weight), ("bias:", m.bias)):
        print(label, parameter.item())
    print("next...")
# Call print_weight on the submodules of the model via Module.apply.
model.apply(print_weight)
a = torch.randn([2, 3, 4, 4])
a
Out[6]:
tensor([[[[-0.5390, -0.4974, 0.4392, 0.0885],
[ 0.3316, 0.2863, 0.5387, 0.9645],
[-0.2879, -0.0852, 0.1790, 2.1958],
[ 0.2817, 0.4995, 0.6190, -0.5218]],
[[-0.1495, -0.9248, 1.9004, 0.2535],
[-0.9124, 0.7679, -0.2503, 0.0491],
[-1.0860, -0.0838, -0.8773, -1.4696],
[ 1.5713, -0.9741, -0.1584, 1.1361]],
[[-0.1027, 2.1711, 0.0953, -0.9208],
[-1.2121, -1.2770, 1.1427, -0.3149],
[-0.0458, -1.5204, -0.1037, 0.6764],
[ 0.3862, 0.6306, -1.0143, -0.1202]]],
[[[ 1.2074, -1.0920, 0.9833, 0.7729],
[-0.3728, -0.4250, -0.3600, -0.7940],
[ 0.6346, -0.4655, 0.9664, 0.4688],
[-1.0701, 0.0883, 0.2658, 0.0234]],
[[ 1.5879, 0.5017, 0.4271, 0.6949],
[ 0.4801, -0.0612, -1.4131, 0.0766],
[-1.0388, -0.7434, -0.1933, -0.7082],
[ 0.2480, 0.3196, -2.1165, -0.3998]],
[[-0.8971, -0.5938, -1.5611, 0.3487],
[ 1.0478, -0.3852, 0.1441, -2.2990],
[-0.7650, 0.4652, -1.0962, 1.5915],
[ 0.7840, 0.2409, 0.3218, 0.4186]]]])
import torch
from torch import nn
unfold = nn.Unfold(kernel_size=2, dilation=1, padding=0, stride=2)
b = unfold(a)
b
Out[19]:
tensor([[[-0.5390, 0.4392, -0.2879, 0.1790],
[-0.4974, 0.0885, -0.0852, 2.1958],
[ 0.3316, 0.5387, 0.2817, 0.6190],
[ 0.2863, 0.9645, 0.4995, -0.5218],
[-0.1495, 1.9004, -1.0860, -0.8773],
[-0.9248, 0.2535, -0.0838, -1.4696],
[-0.9124, -0.2503, 1.5713, -0.1584],
[ 0.7679, 0.0491, -0.9741, 1.1361],
[-0.1027, 0.0953, -0.0458, -0.1037],
[ 2.1711, -0.9208, -1.5204, 0.6764],
[-1.2121, 1.1427, 0.3862, -1.0143],
[-1.2770, -0.3149, 0.6306, -0.1202]],
[[ 1.2074, 0.9833, 0.6346, 0.9664],
[-1.0920, 0.7729, -0.4655, 0.4688],
[-0.3728, -0.3600, -1.0701, 0.2658],
[-0.4250, -0.7940, 0.0883, 0.0234],
[ 1.5879, 0.4271, -1.0388, -0.1933],
[ 0.5017, 0.6949, -0.7434, -0.7082],
[ 0.4801, -1.4131, 0.2480, -2.1165],
[-0.0612, 0.0766, 0.3196, -0.3998],
[-0.8971, -1.5611, -0.7650, -1.0962],
[-0.5938, 0.3487, 0.4652, 1.5915],
[ 1.0478, 0.1441, 0.7840, 0.3218],
[-0.3852, -2.2990, 0.2409, 0.4186]]])
b.view(2, 12, 2, 2)
Out[18]:
tensor([[[[-0.5390, 0.4392],
[-0.2879, 0.1790]],
[[-0.4974, 0.0885],
[-0.0852, 2.1958]],
[[ 0.3316, 0.5387],
[ 0.2817, 0.6190]],
[[ 0.2863, 0.9645],
[ 0.4995, -0.5218]],
[[-0.1495, 1.9004],
[-1.0860, -0.8773]],
[[-0.9248, 0.2535],
[-0.0838, -1.4696]],
[[-0.9124, -0.2503],
[ 1.5713, -0.1584]],
[[ 0.7679, 0.0491],
[-0.9741, 1.1361]],
[[-0.1027, 0.0953],
[-0.0458, -0.1037]],
[[ 2.1711, -0.9208],
[-1.5204, 0.6764]],
[[-1.2121, 1.1427],
[ 0.3862, -1.0143]],
[[-1.2770, -0.3149],
[ 0.6306, -0.1202]]],
[[[ 1.2074, 0.9833],
[ 0.6346, 0.9664]],
[[-1.0920, 0.7729],
[-0.4655, 0.4688]],
[[-0.3728, -0.3600],
[-1.0701, 0.2658]],
[[-0.4250, -0.7940],
[ 0.0883, 0.0234]],
[[ 1.5879, 0.4271],
[-1.0388, -0.1933]],
[[ 0.5017, 0.6949],
[-0.7434, -0.7082]],
[[ 0.4801, -1.4131],
[ 0.2480, -2.1165]],
[[-0.0612, 0.0766],
[ 0.3196, -0.3998]],
[[-0.8971, -1.5611],
[-0.7650, -1.0962]],
[[-0.5938, 0.3487],
[ 0.4652, 1.5915]],
[[ 1.0478, 0.1441],
[ 0.7840, 0.3218]],
[[-0.3852, -2.2990],
[ 0.2409, 0.4186]]]])
这里可以看出, unfold 操作收集数据时, 是按照滑窗区域里 W->H->C->N 的顺序进行的; 而且, 这个过程本身也是分通道的, 即逐通道收集。
temp.view(1, 2*2*2, 1, 1)
Out[16]:
tensor([[[[2.]],
[[7.]],
[[1.]],
[[3.]],
[[2.]],
[[0.]],
[[1.]],
[[1.]]]])
temp.view(1, 2*2, 2, 1)
Out[17]:
tensor([[[[2.],
[7.]],
[[1.],
[3.]],
[[2.],
[0.]],
[[1.],
[1.]]]])
temp.view(1, 2*2, 1, 2)
Out[18]:
tensor([[[[2., 7.]],
[[1., 3.]],
[[2., 0.]],
[[1., 1.]]]])
temp
Out[30]:
tensor([[[[3., 7., 0., 8.],
[5., 2., 2., 4.],
[1., 1., 4., 4.],
[8., 6., 8., 7.]],
[[3., 3., 4., 0.],
[5., 0., 0., 7.],
[3., 2., 8., 3.],
[8., 5., 8., 8.]]]])
temp.view(1, 2*4, 1, 4)
Out[28]:
tensor([[[[3., 7., 0., 8.]],
[[5., 2., 2., 4.]],
[[1., 1., 4., 4.]],
[[8., 6., 8., 7.]],
[[3., 3., 4., 0.]],
[[5., 0., 0., 7.]],
[[3., 2., 8., 3.]],
[[8., 5., 8., 8.]]]])
F.unfold(temp, (2, 2), 1, 0, 2)
Out[29]:
tensor([[[3., 0., 1., 4.],
[7., 8., 1., 4.],
[5., 2., 8., 8.],
[2., 4., 6., 7.],
[3., 4., 3., 8.],
[3., 0., 2., 3.],
[5., 0., 8., 8.],
[0., 7., 5., 8.]]])
import torch
import torch.nn.functional as F
a = torch.rand((1, 1, 4, 4))
a
Out[5]:
tensor([[[[0.6956, 0.2741, 0.7546, 0.6516],
[0.7810, 0.5884, 0.4314, 0.1446],
[0.4217, 0.5753, 0.0358, 0.3593],
[0.1191, 0.0768, 0.3927, 0.3685]]]])
b =F.unfold(a, 2, stride=2)
b
Out[10]:
tensor([[[0.6956, 0.7546, 0.4217, 0.0358],
[0.2741, 0.6516, 0.5753, 0.3593],
[0.7810, 0.4314, 0.1191, 0.3927],
[0.5884, 0.1446, 0.0768, 0.3685]]])
b.view(1, 4, 2, 2)
Out[9]:
tensor([[[[0.6956, 0.7546],
[0.4217, 0.0358]],
[[0.2741, 0.6516],
[0.5753, 0.3593]],
[[0.7810, 0.4314],
[0.1191, 0.3927]],
[[0.5884, 0.1446],
[0.0768, 0.3685]]]])
torch.addmm 对矩阵mat1和mat2进行矩阵乘操作(用@表示), 再把矩阵mat加到最终结果上。
out = (beta ∗ mat) + (alpha ∗ (mat1 @ mat2))
torch.addmm(beta=1, mat, alpha=1, mat1, mat2, out=None)
torch.mm 对矩阵mat1和mat2进行相乘。
torch.mm(mat1, mat2, out=None)
torch.bmm 对存储在两个批batch1和batch2内的矩阵进行批矩阵乘操作。
torch.bmm(batch1, batch2, out=None)
用法:
>>> batch1 = torch.randn(10, 3, 4)
>>> batch2 = torch.randn(10, 4, 5)
>>> res = torch.bmm(batch1, batch2)
>>> res.size()
torch.Size([10, 3, 5])
主要用在了shufflenet中,是一种修改通道顺序的操作,从各组中抽取特定通道进行组合。
def shuffle_channels(x, groups):
    """Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W].

    The C channels are split into ``groups`` groups by viewing them as
    (g, C/g); the two axes are then swapped to (C/g, g) and flattened back
    into C channels, which interleaves the channels of the different groups.

    Args:
        x: tensor whose ``size()`` unpacks into (N, C, H, W).
        groups: number of channel groups g.

    Returns:
        A tensor of shape (N, C, H, W) with the channels reordered.
    """
    N, C, H, W = x.size()
    grouped = x.view(N, groups, C // groups, H, W)
    swapped = grouped.transpose(1, 2)
    # Swapping axes only changes strides, so the data must be made contiguous
    # before it can be viewed as (N, C, H, W) again.
    return swapped.contiguous().view(N, C, H, W)
作者:急流勇进 来源:CSDN 原文:https://blog.csdn.net/weixin_44538273/article/details/88856239 版权声明:本文为博主原创文章,转载请附上博文链接!
另一份实现:
https://github.com/jaxony/ShuffleNet/blob/e9bf42f0cda8dda518cafffd515654cc04584e7a/model.py#L36
def channel_shuffle(x, groups):
    """Interleave the channels of ``x`` across ``groups`` channel groups.

    The channel axis is viewed as (groups, channels_per_group), the two axes
    are swapped, and the result is flattened back to a single channel axis.

    Args:
        x: tensor whose ``x.data.size()`` unpacks into
            (batchsize, num_channels, height, width).
        groups: number of channel groups.

    Returns:
        A tensor of shape (batchsize, num_channels, height, width) with the
        channels reordered.
    """
    batchsize, num_channels, height, width = x.data.size()
    per_group = num_channels // groups

    # reshape, then swap the group axis with the per-group channel axis
    regrouped = x.view(batchsize, groups, per_group, height, width)
    regrouped = regrouped.permute(0, 2, 1, 3, 4)

    # contiguous() is required before view() once the axes have been swapped.
    # See https://github.com/pytorch/pytorch/issues/764
    regrouped = regrouped.contiguous()

    # flatten back to a single channel axis
    return regrouped.view(batchsize, -1, height, width)
nn.PixelShuffle(upscale_factor)
该操作实现了下图的操作,只是重排了特征,没有其他操作:
具体公式:https://blog.csdn.net/oLingFengYu/article/details/87728077
损失函数整理
https://blog.csdn.net/zhangxb35/article/details/72464152?utm_source=itdadao&utm_medium=referral