# 一、torch.nn.Sequential

Sequential 本质是一个模块（即 Module），根据Pytorch中的约定，模块中可以继续添加模块。这意味着我们可以在 Sequential 中添加其它的模块（自然也就可以添加其他的 Sequential）。添加完成后，Sequential 会将这些模块组成一个流水线，输入将依次通过这些模块得到一个输出，如下图所示：

from torch import nn

myseq = nn.Sequential(
# Module 1
# Module 2
# ...
# Module n
)


例如，设网络各层的神经元个数依次为 $20, 10, 5$，隐层激活函数采用 ReLU，则我们的网络可写为

net = nn.Sequential(
nn.Linear(20, 10),
nn.ReLU(),
nn.Linear(10, 5)
)


设输入的 batch 大小为 $3$，将其喂入 net 将返回一个 batch 的输出

torch.manual_seed(42)
X = torch.randn(3, 20)
net(X)
# tensor([[ 0.0092, -0.3154, -0.1202, -0.2654,  0.1336],
#         [-0.0042, -0.2338, -0.1788, -0.5513, -0.6258],
#         [ 0.0731, -0.4427, -0.3108,  0.1791,  0.1614]],


## 1.1 Sequential 的基础操作

print(net)
# Sequential(
#   (0): Linear(in_features=20, out_features=10, bias=True)
#   (1): ReLU()
#   (2): Linear(in_features=10, out_features=5, bias=True)
# )


print(net[0])
# Linear(in_features=20, out_features=10, bias=True)
print(net[1])
# ReLU()
print(len(net))
# 3


net[1] = nn.Sigmoid()
print(net)
# Sequential(
#   (0): Linear(in_features=20, out_features=10, bias=True)
#   (1): Sigmoid()
#   (2): Linear(in_features=10, out_features=5, bias=True)
# )

del net[2]
print(net)
# Sequential(
#   (0): Linear(in_features=20, out_features=10, bias=True)
#   (1): Sigmoid()
# )

net.append(nn.Linear(10, 2))  # 均会添加到末尾
print(net)
# Sequential(
#   (0): Linear(in_features=20, out_features=10, bias=True)
#   (1): Sigmoid()
#   (2): Linear(in_features=10, out_features=2, bias=True)
# )


net = nn.Sequential(
nn.Linear(20, 10),
nn.ReLU(),
nn.Linear(10, 5)
)

for sub_module in net:
print(sub_module)
# Linear(in_features=20, out_features=10, bias=True)
# ReLU()
# Linear(in_features=10, out_features=5, bias=True)


## 1.2 手动实现一个 Sequential

class MySeq(nn.Module):
    """A minimal re-implementation of nn.Sequential.

    Each sub-module is registered under its positional index, and
    ``forward`` chains them together in registration order.
    """

    def __init__(self, *args):
        super().__init__()
        # Storing into self._modules lets nn.Module track the
        # sub-modules (and hence their parameters) automatically.
        for index, sub_module in enumerate(args):
            self._modules[str(index)] = sub_module

    def forward(self, inputs):
        # Feed the output of each sub-module into the next one.
        out = inputs
        for sub_module in self._modules.values():
            out = sub_module(out)
        return out


torch.manual_seed(42)
myseq = MySeq(nn.Linear(20, 10), nn.ReLU(), nn.Linear(10, 5))
X = torch.rand(3, 20)
myseq(X)
# tensor([[ 0.2056, -0.5307, -0.0023, -0.0309,  0.1289],
#         [ 0.0681, -0.4473,  0.2085, -0.1179,  0.1157],
#         [ 0.1187, -0.5331,  0.0530, -0.0466,  0.0874]],


class MySeq(nn.Module):
    """An nn.Sequential work-alike supporting indexing, deletion and append.

    Sub-modules live in ``self._modules`` under stringified consecutive
    integer keys ("0", "1", ...); nn.Module uses that dict for parameter
    registration and printing.
    """

    def __init__(self, *args):
        super().__init__()
        for idx, module in enumerate(args):
            self._modules[str(idx)] = module

    def __getitem__(self, idx):
        return self._modules[str(idx)]

    def __setitem__(self, idx, module):
        # Raise instead of assert: asserts are stripped under `python -O`.
        if not 0 <= idx < len(self):
            raise IndexError(f'index {idx} is out of range')
        self._modules[str(idx)] = module

    def __delitem__(self, idx):
        # Validate first: without this, an out-of-range index silently
        # deleted the *last* module instead of failing.
        if not 0 <= idx < len(self):
            raise IndexError(f'index {idx} is out of range')
        # Shift the trailing modules left so keys stay consecutive.
        for i in range(idx, len(self) - 1):
            self._modules[str(i)] = self._modules[str(i + 1)]
        del self._modules[str(len(self) - 1)]

    def __len__(self):
        return len(self._modules)

    def append(self, module):
        # Keys are always the consecutive range 0..len-1, so len(self) is
        # the next free key. This also works for an empty container (the
        # original `keys()[-1]` lookup raised IndexError in that case).
        self._modules[str(len(self))] = module

    def forward(self, inputs):
        for module in self._modules.values():
            inputs = module(inputs)
        return inputs


## 1.3 Sequential 嵌套

Sequential 本身就是一个模块，而模块可以嵌套模块，这说明 Sequential 可以嵌套 Sequential

seq_1 = nn.Sequential(nn.Linear(15, 10), nn.ReLU(), nn.Linear(10, 5))
seq_2 = nn.Sequential(nn.Linear(25, 15), nn.Sigmoid(), nn.Linear(15, 10))
seq_3 = nn.Sequential(seq_1, seq_2)
print(seq_3)
# Sequential(
#   (0): Sequential(
#     (0): Linear(in_features=15, out_features=10, bias=True)
#     (1): ReLU()
#     (2): Linear(in_features=10, out_features=5, bias=True)
#   )
#   (1): Sequential(
#     (0): Linear(in_features=25, out_features=15, bias=True)
#     (1): Sigmoid()
#     (2): Linear(in_features=15, out_features=10, bias=True)
#   )
# )


print(seq_3[1])
# Sequential(
#   (0): Linear(in_features=25, out_features=15, bias=True)
#   (1): Sigmoid()
#   (2): Linear(in_features=15, out_features=10, bias=True)
# )
print(seq_3[0][1])
# ReLU()


for seq in seq_3:
for module in seq:
print(module)
# Linear(in_features=15, out_features=10, bias=True)
# ReLU()
# Linear(in_features=10, out_features=5, bias=True)
# Linear(in_features=25, out_features=15, bias=True)
# Sigmoid()
# Linear(in_features=15, out_features=10, bias=True)


seq_1 = nn.Sequential(nn.Linear(30, 25), nn.ReLU(), nn.Linear(25, 20))
seq_2 = nn.Sequential(nn.Linear(20, 15), nn.Sigmoid(), nn.Linear(15, 10))
seq_3 = nn.Sequential(seq_1, seq_2)


## 1.4 自定义层

Sequential 中的模块又称为层（layer），我们完全不必局限于 torch.nn 中提供的各种层，通过继承 nn.Module 我们可以自定义层并将其添加到 Sequential 中。

### 1.4.1 不带参数的层

class CenteredLayer(nn.Module):
    """A parameter-free layer that subtracts the mean from its input."""

    def __init__(self):
        super().__init__()

    def forward(self, X):
        # Centering: the output always has (floating-point) zero mean.
        centered = X - X.mean()
        return centered


torch.manual_seed(42)
net = nn.Sequential(nn.Linear(64, 30), CenteredLayer())
X = torch.randn(3, 64)
print(net(X).mean())


### 1.4.2 带参数的层

class Net(nn.Module):
    """A fully connected network with one ReLU hidden layer."""

    def __init__(self, input_nodes, hidden_nodes, output_nodes):
        super().__init__()
        # Keep the layer sizes around for later inspection.
        self.inodes = input_nodes
        self.hnodes = hidden_nodes
        self.onodes = output_nodes
        # input -> hidden -> ReLU -> output
        self.model = nn.Sequential(
            nn.Linear(input_nodes, hidden_nodes),
            nn.ReLU(),
            nn.Linear(hidden_nodes, output_nodes),
        )

    def forward(self, inputs):
        return self.model(inputs)


设输入层、隐层、输出层的神经元个数分别为 $784, 256, 8$，则可如下构建网络并进行前向传播：

torch.manual_seed(42)
net = Net(784, 256, 8)
X = torch.randn(5, 784)
print(net(X))
# tensor([[ 0.2291, -0.3913, -0.1745, -0.2685, -0.2684,  0.0760,  0.0071, -0.0337],
#         [ 0.2084,  0.1235, -0.1054, -0.0508,  0.0194, -0.0429, -0.3269,  0.1890],
#         [-0.0756, -0.4335, -0.1643, -0.1817, -0.2376, -0.1399,  0.2710, -0.3719],
#         [ 0.4110, -0.2428, -0.1021, -0.1019, -0.0550, -0.0890,  0.1430,  0.0881],
#         [ 0.0626, -0.4117,  0.0130,  0.1339, -0.2529, -0.1106, -0.2586,  0.2205]],


# 二、参数管理

## 2.1 nn.Parameter

nn.Parameter 是 Tensor 的子类，可以被视为一种特殊的张量，它可被用作模块的参数，具体使用格式如下：

nn.Parameter(data, requires_grad=True)


module = nn.Linear(3, 3)
type(module.weight)
# torch.nn.parameter.Parameter
type(module.bias)
# torch.nn.parameter.Parameter


""" 代码片段一 """
class Net(nn.Module):
    """Demonstrates that plain tensors are NOT registered as parameters.

    ``weight`` and ``bias`` are ordinary tensors (not nn.Parameter), so
    ``net.parameters()`` yields nothing for this class.
    """

    def __init__(self):
        super().__init__()
        # Plain tensors: nn.Module does not pick these up.
        self.weight = torch.randn(3, 3)
        self.bias = torch.randn(3)

    def forward(self, inputs):
        # No-op forward; this class only illustrates registration.
        return None

net = Net()
print(list(net.parameters()))
# []

""" 代码片段二 """
class Net(nn.Module):
    """Demonstrates nn.Parameter registration.

    Wrapping the tensors in nn.Parameter makes nn.Module register them,
    so ``net.parameters()`` yields both of them.
    """

    def __init__(self):
        super().__init__()
        # nn.Parameter wraps a tensor so the module tracks it.
        self.weight = nn.Parameter(torch.randn(3, 3))
        self.bias = nn.Parameter(torch.randn(3))

    def forward(self, inputs):
        # No-op forward; this class only illustrates registration.
        return None

net = Net()
print(list(net.parameters()))
# [Parameter containing:
# tensor([[-0.4584,  0.3815, -0.4522],
#         [ 2.1236,  0.7928, -0.7095],
#         [-1.4921, -0.5689, -0.2342]], requires_grad=True), Parameter containing:


nn.Parameter 相当于把传入的数据包装成一个参数，如果要直接访问/使用其中的数据而非参数本身，可对 nn.Parameter 对象调用 data 属性：

a = torch.tensor([1, 2, 3]).to(torch.float32)
param = nn.Parameter(a)
print(param)
# Parameter containing:
print(param.data)
# tensor([1., 2., 3.])


## 2.2 参数访问

nn.Module 中有 state_dict() 方法（官网链接），该方法将以字典形式返回模块的所有状态，包括模块的参数和 persistent buffers （博主目前还不太理解后者，暂时略过），字典的键就是对应的参数/缓冲区的名称。

linear_layer = nn.Linear(2, 2)
print(linear_layer.state_dict())
# OrderedDict([('weight', tensor([[ 0.2602, -0.2318],
#         [-0.5192,  0.0130]])), ('bias', tensor([0.5890, 0.2476]))])
print(linear_layer.state_dict().keys())
# odict_keys(['weight', 'bias'])


linear_layer = nn.Linear(2, 1)
print(linear_layer.weight)
# Parameter containing:
print(linear_layer.bias)
# Parameter containing:


## 2.3 参数初始化

默认情况下，PyTorch 会对 Linear 层的权重和偏置做均匀分布初始化：设该层的输入特征数为 $a$，则参数从均匀分布 $U(-1/\sqrt{a},\ 1/\sqrt{a})$ 中随机采样而来。

### 2.3.1 使用内置初始化

class Net(nn.Module):
    """A small 3-2-3 MLP used to demonstrate weight initialization."""

    def __init__(self):
        super().__init__()
        # input(3) -> hidden(2) -> ReLU -> output(3)
        stack = [nn.Linear(3, 2), nn.ReLU(), nn.Linear(2, 3)]
        self.layers = nn.Sequential(*stack)

    def forward(self, X):
        return self.layers(X)


假如我们想将所有的权重参数从标准正态分布 $\mathcal{N}(0, 1)$ 中采样，偏置全部初始化为 $0$，则初始化代码如下

def init_normal(module):
    """Initialize a Linear sub-module: weights ~ N(0, 1), biases = 0.

    Intended to be passed to ``net.apply``, which calls it recursively on
    every sub-module; non-Linear modules (e.g. activation functions) are
    skipped because they have no weight/bias parameters.
    """
    # isinstance is the idiomatic type check (also covers subclasses),
    # unlike the exact `type(module) == nn.Linear` comparison.
    if isinstance(module, nn.Linear):
        nn.init.normal_(module.weight, mean=0, std=1)
        nn.init.zeros_(module.bias)

net = Net()
net.apply(init_normal)
for param in net.parameters():
print(param)
# Parameter containing:
# tensor([[-0.3560,  0.8078, -2.4084],
#         [ 0.1700, -0.3217, -1.3320]], requires_grad=True)
# Parameter containing:
# Parameter containing:
# tensor([[-0.8025, -1.0695],
#         [-1.7031, -0.3068],
# Parameter containing:


net 调用 apply 方法则会递归地对其下所有的子模块应用 init_normal 函数。

### 2.3.2 自定义初始化

假设我们想用如下的混合分布来初始化权重参数 $w$：

$$w \sim \begin{cases} U(5, 10), & prob = 0.25 \\ 0, & prob = 0.5 \\ U(-10, -5), & prob = 0.25 \end{cases}$$

实现思路：先将 $w$ 从均匀分布 $U(-10, 10)$ 中采样，如果 $w$ 落到区间 $(-5, 5)$ 中，则将其置为 $0$。

def my_init(module):
    """Custom init: w ~ U(-10, 10), then zero out entries in (-5, 5).

    Surviving weights end up in [-10, -5] or [5, 10], matching the
    mixture distribution described in the text. Meant for ``net.apply``.
    """
    if isinstance(module, nn.Linear):
        nn.init.uniform_(module.weight, -10, 10)
        # Keep only entries with |w| >= 5. The sample output below (the
        # 0.0000 entries) shows this step, which was missing from the code.
        module.weight.data *= module.weight.data.abs() >= 5

net = Net()
net.apply(my_init)
for param in net.parameters():
print(param)
# Parameter containing:
# tensor([[-0.0000, -5.9610,  8.0000],
# Parameter containing:
# Parameter containing:
# tensor([[ 0.0000, -0.0000],
#         [-6.9569, -9.5102],
# Parameter containing:
# tensor([ 0.2521, -0.1500, -0.1484], requires_grad=True)


## 2.4 参数绑定

net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
nn.Linear(8, 8), nn.ReLU(),
nn.Linear(8, 8), nn.ReLU(),
nn.Linear(8, 1))


shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
shared, nn.ReLU(),
shared, nn.ReLU(),
nn.Linear(8, 1))


## 2.5 模型保存

### 2.5.1 张量的保存

torch.save() 和 torch.load() 可以保存/加载 PyTorch 中的任何对象，使用格式如下:

torch.save(obj, path)


# Save a tensor to disk, then load it back and print it.
t = torch.tensor([1, 2, 3])
path = './models/my_tensor.pt'
torch.save(t, path)

# The original snippet printed an undefined name `a`; the tensor must
# first be restored with torch.load before printing.
t_loaded = torch.load(path)
print(t_loaded)
# tensor([1, 2, 3])


### 2.5.2 保存整个模型

torch.save(model, 'model.pt')


### 2.5.3 保存模型的参数

torch.save(model.state_dict(), 'model_params.pt')


model.load_state_dict(torch.load('model_params.pt'))
model.eval()


# 三、GPU

使用 cuda:i 可以指定第 $i$ 块GPU（从0开始）。 另外，cuda:0 和 cuda 是等价的。

print(torch.cuda.device_count())
# 1


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


## 3.1 将数据移动到GPU

t = torch.zeros(3, 3, device=device)
t.device
# device(type='cuda', index=0)


t = torch.zeros(3, 3)
t.device
# device(type='cpu')
t = t.to(device)
t.device
# device(type='cuda', index=0)


t = torch.zeros(3, 3)
t = t.cuda()
t.device
# device(type='cuda', index=0)


## 3.2 将模型移动到GPU上

""" 方法一 """
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = Net()
net.to(device)

""" 方法二 """
net = Net()
net.cuda()


THE END
