本学习笔记主要摘自“深度之眼”,做一个总结,方便查阅。 使用Pytorch版本为1.2
池化层——Pooling Layer线性层——Linear Layer激活函数层——Activation Layer作业1.nn.MaxPool2d 功能:对二维信号(图像)进行最大值池化 主要参数:
kernel_size:池化核尺寸stride:步长padding:填充个数dilation:池化核间隔大小ceil_mode:尺寸向上取整return_indices:记录池化像素索引 测试代码: # -*- coding: utf-8 -*- import os import torch import random import numpy as np import torchvision import torch.nn as nn from torchvision import transforms from matplotlib import pyplot as plt from PIL import Image from tools.common_tools import transform_invert#, set_seed import random random.seed(2) #set_seed(3) # 设置随机种子 # ================================= load img ================================== path_img = os.path.join(os.path.dirname(os.path.abspath(__file__)), "lena.png") img = Image.open(path_img).convert('RGB') # 0~255 # convert to tensor img_transform = transforms.Compose([transforms.ToTensor()]) img_tensor = img_transform(img) img_tensor.unsqueeze_(dim=0) # C*H*W to B*C*H*W # ================================= create convolution layer ================================== # ================ maxpool # flag = 1 flag = 1 if flag: maxpool_layer = nn.MaxPool2d((2, 2), stride=(2, 2)) # input:(i, o, size) weights:(o, i , h, w) img_pool = maxpool_layer(img_tensor) # ================ avgpool # flag = 1 flag = 0 if flag: avgpoollayer = nn.AvgPool2d((2, 2), stride=(2, 2)) # input:(i, o, size) weights:(o, i , h, w) img_pool = avgpoollayer(img_tensor) # ================ avgpool divisor_override # flag = 1 flag = 0 if flag: img_tensor = torch.ones((1, 1, 4, 4)) avgpool_layer = nn.AvgPool2d((2, 2), stride=(2, 2), divisor_override=3) img_pool = avgpool_layer(img_tensor) print("raw_img:\n{}\npooling_img:\n{}".format(img_tensor, img_pool)) # ================ max unpool # flag = 1 flag = 0 if flag: # pooling img_tensor = torch.randint(high=5, size=(1, 1, 4, 4), dtype=torch.float) maxpool_layer = nn.MaxPool2d((2, 2), stride=(2, 2), return_indices=True) img_pool, indices = maxpool_layer(img_tensor) # unpooling img_reconstruct = torch.randn_like(img_pool, dtype=torch.float) maxunpool_layer = nn.MaxUnpool2d((2, 2), stride=(2, 2)) img_unpool = maxunpool_layer(img_reconstruct, 
indices) print("raw_img:\n{}\nimg_pool:\n{}".format(img_tensor, img_pool)) print("img_reconstruct:\n{}\nimg_unpool:\n{}".format(img_reconstruct, img_unpool)) # ================ linear flag = 0 # flag = 0 if flag: inputs = torch.tensor([[1., 2, 3]]) linear_layer = nn.Linear(3, 4) linear_layer.weight.data = torch.tensor([[1., 1., 1.], [2., 2., 2.], [3., 3., 3.], [4., 4., 4.]]) linear_layer.bias.data.fill_(0.5) output = linear_layer(inputs) print(inputs, inputs.shape) print(linear_layer.weight.data, linear_layer.weight.data.shape) print(output, output.shape) # ================================= visualization ================================== print("池化前尺寸:{}\n池化后尺寸:{}".format(img_tensor.shape, img_pool.shape)) img_pool = transform_invert(img_pool[0, 0:3, ...], img_transform) img_raw = transform_invert(img_tensor.squeeze(), img_transform) plt.subplot(122).imshow(img_pool) plt.subplot(121).imshow(img_raw) plt.show() 输出: ```python 池化前尺寸:torch.Size([1, 3, 512, 512]) 池化后尺寸:torch.Size([1, 3, 256, 256])2.nn.AvgPool2d 功能:对二维信号(图像)进行平均池化 主要参数:
kernel_size:池化核尺寸stride:步长padding:填充个数dilation:池化核间隔大小ceil_mode:尺寸向上取整count_include_pad:填充值用于计算divisor_override:除法因子设置:
# ================ avgpool flag = 1 # flag = 0 if flag: avgpoollayer = nn.AvgPool2d((2, 2), stride=(2, 2)) # input:(i, o, size) weights:(o, i , h, w) img_pool = avgpoollayer(img_tensor)输出:
池化前尺寸:torch.Size([1, 3, 512, 512]) 池化后尺寸:torch.Size([1, 3, 256, 256])测试: avgpool divisor_override
# ================ avgpool divisor_override flag = 1 # flag = 0 if flag: img_tensor = torch.ones((1, 1, 4, 4)) avgpool_layer = nn.AvgPool2d((2, 2), stride=(2, 2)) img_pool = avgpool_layer(img_tensor) print("raw_img:\n{}\npooling_img:\n{}".format(img_tensor, img_pool)) # ================================= visualization ================================== #print("池化前尺寸:{}\n池化后尺寸:{}".format(img_tensor.shape, img_pool.shape)) #img_pool = transform_invert(img_pool[0, 0:3, ...], img_transform) #img_raw = transform_invert(img_tensor.squeeze(), img_transform) #plt.subplot(122).imshow(img_pool) #plt.subplot(121).imshow(img_raw) #plt.show()输出:
raw_img: tensor([[[[1., 1., 1., 1.], [1., 1., 1., 1.], [1., 1., 1., 1.], [1., 1., 1., 1.]]]]) pooling_img: tensor([[[[1., 1.], [1., 1.]]]])当设置avgpool_layer = nn.AvgPool2d((2, 2), stride=(2, 2), divisor_override=3)时:
raw_img: tensor([[[[1., 1., 1., 1.], [1., 1., 1., 1.], [1., 1., 1., 1.], [1., 1., 1., 1.]]]]) pooling_img: tensor([[[[1.3333, 1.3333], [1.3333, 1.3333]]]])其中1.3333如何计算: $1.3333 = (1+1+1+1)/3$(因为设置了 divisor_override=3,分母不再是池化窗口内的元素个数4,而是3)。
3.nn.MaxUnpool2d 功能:对二维信号(图像)进行最大值池化上采样 主要参数:
kernel_size:池化核尺寸stride:步长padding:填充个数测试代码:
# ================ max unpool flag = 1 # flag = 0 if flag: # pooling img_tensor = torch.randint(high=5, size=(1, 1, 4, 4), dtype=torch.float) maxpool_layer = nn.MaxPool2d((2, 2), stride=(2, 2), return_indices=True) img_pool, indices = maxpool_layer(img_tensor) # unpooling img_reconstruct = torch.randn_like(img_pool, dtype=torch.float) maxunpool_layer = nn.MaxUnpool2d((2, 2), stride=(2, 2)) img_unpool = maxunpool_layer(img_reconstruct, indices) print("raw_img:\n{}\nimg_pool:\n{}".format(img_tensor, img_pool)) print("img_reconstruct:\n{}\nimg_unpool:\n{}".format(img_reconstruct, img_unpool))输出:
raw_img: tensor([[[[0., 3., 2., 3.], [2., 4., 0., 4.], [4., 0., 4., 4.], [1., 4., 2., 3.]]]]) img_pool: tensor([[[[4., 4.], [4., 4.]]]]) img_reconstruct: tensor([[[[0.4582, 2.4747], [0.2824, 1.3175]]]]) img_unpool: tensor([[[[0.0000, 0.0000, 0.0000, 0.0000], [0.0000, 0.4582, 0.0000, 2.4747], [0.2824, 0.0000, 1.3175, 0.0000], [0.0000, 0.0000, 0.0000, 0.0000]]]])1.nn.Linear 功能:对一维信号(向量)进行线性组合 主要参数:
in_features:输入结点数out_features:输出结点数bias:是否需要偏置计算公式: $y=xW^{T}+bias$ 测试代码:
# ================ linear flag = 1 # flag = 0 if flag: inputs = torch.tensor([[1., 2, 3]]) linear_layer = nn.Linear(3, 4) linear_layer.weight.data = torch.tensor([[1., 1., 1.], [2., 2., 2.], [3., 3., 3.], [4., 4., 4.]]) linear_layer.bias.data.fill_(0) output = linear_layer(inputs) print(inputs, inputs.shape) print(linear_layer.weight.data, linear_layer.weight.data.shape) print(output, output.shape)输出:
tensor([[1., 2., 3.]]) torch.Size([1, 3]) tensor([[1., 1., 1.], [2., 2., 2.], [3., 3., 3.], [4., 4., 4.]]) torch.Size([4, 3]) tensor([[ 6., 12., 18., 24.]], grad_fn=<AddmmBackward>) torch.Size([1, 4])设置 linear_layer.bias.data.fill_(0.5)
tensor([[1., 2., 3.]]) torch.Size([1, 3]) tensor([[1., 1., 1.], [2., 2., 2.], [3., 3., 3.], [4., 4., 4.]]) torch.Size([4, 3]) tensor([[ 6.5000, 12.5000, 18.5000, 24.5000]], grad_fn=<AddmmBackward>) torch.Size([1, 4])激活函数对特征进行非线性变换,赋予多层神经网络以深度的意义。 当没有使用非线性激活层时,多层网络等价于单层线性层。 推导: $\begin{aligned} \boldsymbol{H}_{1} &= \boldsymbol{X} * \boldsymbol{W}_{1} \\ \boldsymbol{H}_{2} &= \boldsymbol{H}_{1} * \boldsymbol{W}_{2} \\ \text{Output} &= \boldsymbol{H}_{2} * \boldsymbol{W}_{3} = \boldsymbol{H}_{1} * \boldsymbol{W}_{2} * \boldsymbol{W}_{3} = \boldsymbol{X} * \left(\boldsymbol{W}_{1} * \boldsymbol{W}_{2} * \boldsymbol{W}_{3}\right) = \boldsymbol{X} * \boldsymbol{W} \end{aligned}$ 从上为推导。 1.nn.Sigmoid 计算公式: $y=\frac{1}{1+e^{-x}}$ 梯度公式: $y^{\prime}=y*(1-y)$ 特性:
输出值在(0,1),符合概率;导数范围是[0,0.25],易导致梯度消失;输出为非0均值,破坏数据分布。2.nn.Tanh 计算公式: $y=\tanh x=\frac{\sinh x}{\cosh x}=\frac{e^{x}-e^{-x}}{e^{x}+e^{-x}}=\frac{2}{1+e^{-2x}}-1$ 梯度公式: $y^{\prime}=1-y^{2}$ 特性:
输出值在(-1,1),数据符合0均值;导数范围是(0,1),易导致梯度消失。3.nn.ReLU 计算公式: $y=\max(0, x)$ 梯度公式: $y^{\prime}=\begin{cases} 1, & x>0 \\ \text{undefined}, & x=0 \\ 0, & x<0 \end{cases}$ 特性:
输出值均为正数,负半轴导致死神经元导数是1,缓解梯度消失,但易引发梯度爆炸4.nn.LeakyReLU
negative_slope:负半轴斜率5.nn.PReLU
init:可学习斜率6.nn.RReLU
lower:均匀分布下限upper:均匀分布上限1.深入理解二维卷积,采用手算的方式实现以下卷积操作,然后用代码验证
采用2个尺寸为3×3的卷积核对3通道的5×5图像进行卷积,padding=0, stride=1,dilation=1(PyTorch中dilation=1表示不使用空洞卷积) 其中 input shape = (3, 5, 5),数据如下 kernel size = 3*3, 第一个卷积核所有权值均为1, 第二个卷积核所有权值均为2, 计算输出的feature map尺寸以及所有像素值 测试代码: conv_layer_1 = nn.Conv2d(3, 1, 3, bias=False) # conv_layer_1 = nn.Conv2d(3, 1, 3, bias=False, padding=1) conv_layer_1.weight.data = torch.ones(conv_layer_1.weight.shape) conv_layer_2 = nn.Conv2d(3, 1, 3, bias=False) # conv_layer_2 = nn.Conv2d(3, 1, 3, bias=False, padding=1) conv_layer_2.weight.data = torch.ones(conv_layer_2.weight.shape)*2 img_tensor = torch.ones((1, 3, 5, 5)) img_tensor[:, 1, :, :] = img_tensor[:, 1, :, :]*2 img_tensor[:, 2, :, :] = img_tensor[:, 2, :, :]*3 img_conv_1 = conv_layer_1(img_tensor) print("卷积前尺寸:{}\n卷积后尺寸:{}".format(img_tensor.shape, img_conv_1.shape)) print("像素值大小:1x1x9+2x1x9+3x1x9 = {}".format(img_conv_1[0, 0, 0, 0].data)) print("=======================================") img_conv_2 = conv_layer_2(img_tensor) print("卷积前尺寸:{}\n卷积后尺寸:{}".format(img_tensor.shape, img_conv_2.shape)) print("像素值大小:1x2x9+2x2x9+3x2x9 = {}".format(img_conv_2[0, 0, 0, 0].data))输出:
卷积前尺寸:torch.Size([1, 3, 5, 5]) 卷积后尺寸:torch.Size([1, 1, 3, 3]) 像素值大小:1x1x9+2x1x9+3x1x9 = 54.0 ======================================= 卷积前尺寸:torch.Size([1, 3, 5, 5]) 卷积后尺寸:torch.Size([1, 1, 3, 3]) 像素值大小:1x2x9+2x2x9+3x2x9 = 108.0 接上题,上下左右四条边均采用padding,padding=1,填充值为0,计算输出的feature map尺寸以及所有像素值 只需把注释修改即可。2.对lena图进行3×3×3的3d卷积,提示:padding=(1, 0, 0)
3d使用例程:
# ================ 3d # flag = 1 flag = 0 if flag: conv_layer = nn.Conv3d(3, 1, (1, 3, 3), padding=(1, 0, 0)) nn.init.xavier_normal_(conv_layer.weight.data) # calculation img_tensor.unsqueeze_(dim=2) # B*C*H*W to B*C*D*H*W img_conv = conv_layer(img_tensor)测试代码:
import torch.nn as nn from PIL import Image from torchvision import transforms from tools.common_tools import transform_invert, set_seed from matplotlib import pyplot as plt set_seed(2) # ================================= load img ================================== path_img = "lena.png" img = Image.open(path_img).convert('RGB') # 0~255 # convert to tensor img_transform = transforms.Compose([transforms.ToTensor()]) img_tensor = img_transform(img) img_tensor.unsqueeze_(dim=0) # C*H*W to B*C*H*W # ================ 3d kernel (1, 3, 3) # flag = 1 flag = 0 if flag: conv_layer = nn.Conv3d(3, 1, (1, 3, 3), padding=(1, 0, 0), bias=False) nn.init.xavier_normal_(conv_layer.weight.data) # calculation img_tensor.unsqueeze_(dim=2) # B*C*H*W to B*C*D*H*W img_conv = conv_layer(img_tensor) # ================================= visualization ================================== print("卷积前尺寸:{}\n卷积后尺寸:{}".format(img_tensor.shape, img_conv.shape)) img_conv = transform_invert(img_conv.squeeze(), img_transform) img_raw = transform_invert(img_tensor.squeeze(), img_transform) plt.subplot(122).imshow(img_conv, cmap='gray') plt.subplot(121).imshow(img_raw) plt.show() # ================ 3d kernel (3, 3, 3) flag = 1 # flag = 0 if flag: conv_layer = nn.Conv3d(3, 1, (3, 3, 3), padding=(1, 0, 0), bias=False) nn.init.xavier_normal_(conv_layer.weight.data) # calculation img_tensor.unsqueeze_(dim=2) # B*C*H*W to B*C*D*H*W img_conv = conv_layer(img_tensor) # ================================= visualization ================================== print("卷积前尺寸:{}\n卷积后尺寸:{}".format(img_tensor.shape, img_conv.shape)) img_conv = transform_invert(img_conv[:, :, ...], img_transform) img_raw = transform_invert(img_tensor.squeeze(), img_transform) plt.subplot(122).imshow(img_conv, cmap='gray') plt.subplot(121).imshow(img_raw) plt.show()输出:
卷积前尺寸:torch.Size([1, 3, 1, 512, 512]) 卷积后尺寸:torch.Size([1, 1, 1, 510, 510])