import torch
import torch.nn as nn
import torch.optim as optim
import random
A standard training step proceeds as follows:
1. Forward pass: run the input through the model to produce the intermediate activations and the computation graph, and compute the loss.
2. Backward pass: call loss.backward(), which uses the computation graph to compute the gradients; each gradient has the same shape as its parameter.
3. Parameter update: call optimizer.step() to update the parameters, then clear the gradients.
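Put together, one iteration looks roughly like the following minimal sketch, assuming a model, a loss_fn, an optimizer, and a dataloader have already been created (hypothetical names, for illustration only):

for inputs, targets in dataloader:
    outputs = model(inputs)           # forward pass: builds the computation graph
    loss = loss_fn(outputs, targets)  # compute the loss
    loss.backward()                   # backward pass: fills param.grad for every trainable parameter
    optimizer.step()                  # update the parameters using the gradients
    optimizer.zero_grad()             # clear the gradients before the next iteration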
random.seed(42)  # note: this only seeds Python's random module; torch.manual_seed(42) would be needed to make the torch weight init reproducible

class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(5, 2)  # down projection: 5 -> 2
        self.fc2 = nn.Linear(2, 5)  # up projection: 2 -> 5

    def forward(self, x):
        '''
        x -> fc1 (down projection) -> relu -> fc2 (up projection) -> +x -> output
        '''
        x = x + self.fc2(torch.relu(self.fc1(x)))  # residual connection around the bottleneck
        return x
model = SimpleNN()
input = torch.randn(5, 5)  # note: shadows the built-in input(); kept to match the printouts below

print("====== Let's watch the input pass through each layer ========")
print("input:\n", input)
print("fc1.weight:\n", model.fc1.weight)
print("fc1(x):\n", model.fc1(input))
print("relu(fc1(x))\n", torch.relu(model.fc1(input)))
print("fc2(relu(fc1(x))):\n", model.fc2(torch.relu(model.fc1(input))))
print("x + fc2(relu(fc1(x))):\n", input + model.fc2(torch.relu(model.fc1(input))))

output = model(input)
print("============ Does this match the result of model(input)? ======")
print("model(input):\n", output)
print("The element-wise comparison gives:\n", output == input + model.fc2(torch.relu(model.fc1(input))))
====== Let's watch the input pass through each layer ========
input:
tensor([[-1.2202, 0.2502, -0.2889, -0.1986, -2.8538],
[ 0.4287, -1.6049, 0.5761, 1.1029, -1.6302],
[ 0.4738, 1.0801, -0.6158, -0.0197, -0.6740],
[-0.5638, 0.9001, 0.4496, -0.5458, 0.1249],
[ 1.9979, -1.1930, 1.5334, -0.8725, -0.0448]])
fc1.weight:
 Parameter containing:
tensor([[ 0.0664, 0.0643, -0.1771, -0.3984, -0.3248],
[ 0.1452, -0.1781, 0.1125, 0.2815, 0.0503]], requires_grad=True)
fc1(x):
tensor([[ 1.2801, -0.1998],
[ 0.2011, 0.8952],
[ 0.7247, 0.0215],
[ 0.4057, -0.0852],
[ 0.4344, 0.6810]], grad_fn=<AddmmBackward0>)
relu(fc1(x))
tensor([[1.2801, 0.0000],
[0.2011, 0.8952],
[0.7247, 0.0215],
[0.4057, 0.0000],
[0.4344, 0.6810]], grad_fn=<ReluBackward0>)
fc2(relu(fc1(x))):
tensor([[-3.1179e-01, 9.2656e-01, 6.6133e-01, 7.6778e-01, 1.6566e-01],
[-2.3086e-01, -3.6633e-02, -2.4526e-03, -2.0451e-04, 7.2680e-01],
[-2.0840e-01, 6.7831e-01, 5.9427e-01, 4.4476e-01, 2.5295e-01],
[-1.4424e-01, 5.5479e-01, 5.7692e-01, 2.6476e-01, 2.8756e-01],
[-2.4546e-01, 1.8325e-01, 1.5396e-01, 1.6924e-01, 5.9601e-01]],
grad_fn=<AddmmBackward0>)
x + fc2(relu(fc1(x))):
tensor([[-1.5320, 1.1768, 0.3724, 0.5691, -2.6881],
[ 0.1978, -1.6415, 0.5737, 1.1027, -0.9035],
[ 0.2654, 1.7584, -0.0215, 0.4250, -0.4211],
[-0.7080, 1.4549, 1.0265, -0.2810, 0.4125],
[ 1.7524, -1.0097, 1.6874, -0.7032, 0.5512]], grad_fn=<AddBackward0>)
============ Does this match the result of model(input)? ======
model(input):
tensor([[-1.5320, 1.1768, 0.3724, 0.5691, -2.6881],
[ 0.1978, -1.6415, 0.5737, 1.1027, -0.9035],
[ 0.2654, 1.7584, -0.0215, 0.4250, -0.4211],
[-0.7080, 1.4549, 1.0265, -0.2810, 0.4125],
[ 1.7524, -1.0097, 1.6874, -0.7032, 0.5512]], grad_fn=<AddBackward0>)
The element-wise comparison gives:
tensor([[True, True, True, True, True],
[True, True, True, True, True],
[True, True, True, True, True],
[True, True, True, True, True],
[True, True, True, True, True]])
x -> fc1 (down projection) -> relu -> fc2 (up projection) -> +x -> output
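The point of this down-then-up ("bottleneck") shape is that the extra block stays cheap. At this toy scale the saving is negligible, but at realistic hidden sizes the gap is huge; a rough back-of-the-envelope comparison with illustrative sizes (d = 4096, bottleneck r = 16, not taken from this post):

d, r = 4096, 16
adapter_params = (d * r + r) + (r * d + d)   # down projection + up projection, biases included
full_linear    = d * d + d                   # one full d -> d linear layer, for comparison
print(adapter_params, full_linear)           # roughly 135k vs. 16.8M parameters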
Next we build a base model plus an adapter, keep the base model's parameters frozen during training, and fine-tune only the adapter's parameters.
Model definition
class BaseModel(nn.Module):
    def __init__(self):
        super(BaseModel, self).__init__()
        self.fc1 = nn.Linear(5, 10)
        self.fc2 = nn.Linear(10, 5)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x


class Adapter(nn.Module):
    def __init__(self):
        super(Adapter, self).__init__()
        self.fc1 = nn.Linear(5, 2)   # down projection
        self.fc2 = nn.Linear(2, 5)   # up projection

    def forward(self, x):
        residual = x
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        x = x + residual             # residual connection
        return x


class BasewithAdapter(nn.Module):
    def __init__(self, base_model, adapter):
        super(BasewithAdapter, self).__init__()
        self.base_model = base_model
        self.adapter = adapter
        self.classhead = nn.Linear(5, 1)   # binary classification head
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        base_output = self.base_model(x)
        adapter_output = self.adapter(x)
        output = self.classhead(base_output + adapter_output)
        output = self.sigmoid(output)
        return output
Model initialization
base_model = BaseModel()
adapter = Adapter()
model = BasewithAdapter(base_model, adapter)
Inspect the base_model structure
1 2 3 4 5 6 7 8 print ("==========base_model=========" )print (base_model)print ("-" * 50 )for name, param in base_model.named_parameters(): print (f"parameter name:{name} " ) print (f"parameter value:{param} " ) print ("-" * 50 )
==========base_model=========
BaseModel(
(fc1): Linear(in_features=5, out_features=10, bias=True)
(fc2): Linear(in_features=10, out_features=5, bias=True)
)
--------------------------------------------------
parameter name:fc1.weight
parameter value:Parameter containing:
tensor([[ 0.4336, -0.2527, -0.1431, -0.3255, 0.1239],
[-0.2378, 0.1515, 0.4288, -0.3317, 0.2779],
[ 0.1535, 0.1688, -0.3781, -0.2174, 0.0738],
[-0.2811, -0.3980, -0.3960, -0.4455, 0.2161],
[ 0.1410, -0.3365, 0.3386, 0.1728, 0.0791],
[ 0.1138, -0.2654, 0.3056, 0.0793, 0.2307],
[-0.2616, 0.2112, 0.2274, 0.0392, 0.1044],
[-0.2264, -0.1778, 0.2020, 0.0354, 0.1658],
[-0.2374, 0.0352, -0.1868, 0.2495, 0.0379],
[-0.2995, -0.2185, -0.1470, -0.3039, -0.4207]], requires_grad=True)
--------------------------------------------------
parameter name:fc1.bias
parameter value:Parameter containing:
tensor([-0.0974, 0.1413, 0.2264, 0.2783, -0.0539, -0.0556, -0.3448, -0.3413,
-0.2531, 0.1960], requires_grad=True)
--------------------------------------------------
parameter name:fc2.weight
parameter value:Parameter containing:
tensor([[ 0.2579, -0.2095, -0.1630, 0.2688, 0.2801, 0.2098, -0.2368, -0.0675,
0.2128, 0.1830],
[ 0.2660, 0.1586, 0.0018, -0.2677, -0.0120, 0.1762, 0.0789, 0.0946,
-0.2273, -0.2025],
[ 0.1673, -0.0173, 0.0494, 0.2060, -0.2267, 0.3011, -0.1249, -0.1388,
-0.0258, 0.0965],
[ 0.0940, -0.2441, -0.2963, -0.1249, 0.1784, -0.2184, 0.0044, 0.0368,
0.0734, -0.3110],
[ 0.1679, -0.1773, 0.1796, -0.1042, 0.0288, 0.0196, -0.0735, -0.0327,
-0.0457, 0.0938]], requires_grad=True)
--------------------------------------------------
parameter name:fc2.bias
parameter value:Parameter containing:
tensor([ 0.0478, 0.0522, -0.2679, -0.0112, -0.2772], requires_grad=True)
--------------------------------------------------
Inspect the adapter structure
1 2 3 4 5 6 7 8 print ("==========adapter=========" )print (adapter)print ("-" * 50 )for name, param in adapter.named_parameters(): print (f"parameter name:{name} " ) print (f"parameter value:{param} " ) print ("-" * 50 )
==========adapter=========
Adapter(
(fc1): Linear(in_features=5, out_features=2, bias=True)
(fc2): Linear(in_features=2, out_features=5, bias=True)
)
--------------------------------------------------
parameter name:fc1.weight
parameter value:Parameter containing:
tensor([[-0.1502, -0.0157, 0.1981, -0.3822, -0.1813],
[-0.0734, 0.3920, 0.1197, -0.4180, 0.2841]], requires_grad=True)
--------------------------------------------------
parameter name:fc1.bias
parameter value:Parameter containing:
tensor([ 0.4073, -0.0563], requires_grad=True)
--------------------------------------------------
parameter name:fc2.weight
parameter value:Parameter containing:
tensor([[-0.3924, 0.0132],
[ 0.2919, -0.5051],
[ 0.4481, 0.6499],
[-0.3320, -0.6645],
[ 0.0464, 0.4679]], requires_grad=True)
--------------------------------------------------
parameter name:fc2.bias
parameter value:Parameter containing:
tensor([ 0.4048, -0.1295, 0.1568, -0.2318, 0.0233], requires_grad=True)
--------------------------------------------------
Freeze the base model's parameters so that they will not be updated:
for param in model.base_model.parameters():
    param.requires_grad = False

for name, param in model.named_parameters():
    print(name, param.requires_grad)
base_model.fc1.weight False
base_model.fc1.bias False
base_model.fc2.weight False
base_model.fc2.bias False
adapter.fc1.weight True
adapter.fc1.bias True
adapter.fc2.weight True
adapter.fc2.bias True
classhead.weight True
classhead.bias True
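To double-check the freezing, you can also count the trainable parameters (a small sketch; with the layer sizes above this should report 33 trainable out of 148 total):

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable parameters: {trainable} / {total}")  # adapter + classhead vs. the whole model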
input_data = torch.randn(10, 5)                              # a toy batch: 10 samples, 5 features each
target_data = torch.randint(0, 2, (10,), dtype=torch.float)  # binary targets (0. or 1.)
tensor([[-1.4232, -0.3832, -0.4198, 0.4228, 0.2203],
[-1.4282, 1.8447, -1.2595, -0.9088, -1.3608],
[ 1.6698, -0.1378, -1.5213, 0.6471, 1.0883],
[ 0.9648, 0.0560, -0.2517, -1.3640, -0.8051],
[-1.5178, -0.0528, -1.1300, -1.5049, -0.9609],
[-0.9048, 0.0666, -0.8958, -0.6056, 0.0209],
[ 2.3616, 0.1847, 0.8678, -0.9167, 0.9430],
[-0.3303, 0.4280, -0.2748, -2.5328, -1.4385],
[ 1.8969, -1.8394, -0.0575, -0.0590, -2.4369],
[-0.8545, -1.0343, -0.3162, 0.3940, -1.0028]])
tensor([0., 1., 0., 1., 0., 1., 0., 0., 1., 0.])
Define the loss
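The cell that defines the loss is not reproduced here; judging from the manual check a few cells below (sum of squared errors divided by 10, which matches the printed loss), it was presumably a mean-squared-error loss:

loss_fn = nn.MSELoss()  # presumed definition, inferred from the manual MSE check below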
Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-3)
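Passing model.parameters() works here because parameters with requires_grad=False never receive a gradient, so optimizer.step() simply skips them. An optional variant (a sketch, not what this post runs) is to hand the optimizer only the trainable parameters:

optimizer = optim.Adam(
    (p for p in model.parameters() if p.requires_grad),  # adapter + classhead only
    lr=1e-3,
)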
Run the forward pass
output = model(input_data)
output
tensor([[0.6159],
[0.5352],
[0.5104],
[0.3628],
[0.6889],
[0.6323],
[0.3086],
[0.5092],
[0.1160],
[0.4349]], grad_fn=<SigmoidBackward0>)
loss = loss_fn(output.squeeze(), target_data)
print(f"loss:{loss}")
loss:0.3196709156036377
# verify the loss by computing the mean squared error by hand
print(torch.sum((output.squeeze() - target_data) ** 2) / 10)
tensor(0.3197, grad_fn=<DivBackward0>)
Before running the backward pass, print each parameter's gradient:
1 2 3 4 5 6 7 8 print ("gradient:" )for name, param in model.named_parameters(): print (f"{name} " ) try : print (f"梯度:{param.grad} " ) except : pass
gradient:
base_model.fc1.weight
grad:None
base_model.fc1.bias
grad:None
base_model.fc2.weight
grad:None
base_model.fc2.bias
grad:None
adapter.fc1.weight
grad:None
adapter.fc1.bias
grad:None
adapter.fc2.weight
grad:None
adapter.fc2.bias
grad:None
classhead.weight
grad:None
classhead.bias
grad:None
loss.backward()

for name, param in model.named_parameters():
    print(name)
    print(f"gradient:{param.grad}")
base_model.fc1.weight
gradient:None
base_model.fc1.bias
gradient:None
base_model.fc2.weight
gradient:None
base_model.fc2.bias
gradient:None
adapter.fc1.weight
gradient:tensor([[ 2.0354e-02, -7.9889e-03, -2.1384e-03, 9.3599e-03, -7.7675e-03],
[ 7.1102e-04, 3.8607e-05, 5.5156e-04, -9.6660e-04, -1.3542e-03]])
adapter.fc1.bias
gradient:tensor([-0.0043, -0.0006])
adapter.fc2.weight
gradient:tensor([[ 0.0065, -0.0079],
[ 0.0049, -0.0060],
[ 0.0074, -0.0090],
[ 0.0069, -0.0085],
[-0.0105, 0.0128]])
adapter.fc2.bias
gradient:tensor([-0.0130, -0.0098, -0.0147, -0.0139, 0.0210])
classhead.weight
gradient:tensor([[-0.0263, -0.0578, -0.0509, -0.0682, 0.0748]])
classhead.bias
gradient:tensor([0.0564])
Notice that none of the base model parameters have gradients, because we froze them earlier. Also, each gradient has the same shape as its parameter (shape, rather than "size", which can be ambiguous): every parameter needs a matching gradient to tell it how to update, and that gradient is simply the partial derivative of the loss with respect to the parameter.
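The shape claim is easy to verify in code (a small sketch reusing the model above):

for name, param in model.named_parameters():
    if param.grad is not None:                  # frozen base_model parameters still have grad None
        assert param.grad.shape == param.shape  # every gradient matches its parameter's shape
        print(name, tuple(param.grad.shape))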
Model parameters before the update
print(model)
for name, param in model.named_parameters():
    print(f"name:{name}")
    print(f"param:{param}")
    print("-" * 50)
BasewithAdapter(
(base_model): BaseModel(
(fc1): Linear(in_features=5, out_features=10, bias=True)
(fc2): Linear(in_features=10, out_features=5, bias=True)
)
(adapter): Adapter(
(fc1): Linear(in_features=5, out_features=2, bias=True)
(fc2): Linear(in_features=2, out_features=5, bias=True)
)
(classhead): Linear(in_features=5, out_features=1, bias=True)
(sigmoid): Sigmoid()
)
name:base_model.fc1.weight
param:Parameter containing:
tensor([[-0.3368, 0.1063, -0.0966, -0.0316, 0.0564],
[ 0.0717, 0.1012, -0.0392, -0.1883, -0.1120],
[-0.1414, 0.0691, -0.1963, -0.3627, 0.0400],
[ 0.0006, 0.2310, 0.1643, 0.1907, -0.2957],
[-0.2284, -0.4337, 0.0293, -0.3698, -0.0502],
[ 0.0180, -0.2635, 0.4388, -0.2568, -0.2210],
[-0.3673, -0.3681, 0.1915, -0.3489, 0.1758],
[-0.4011, 0.2684, 0.3424, -0.3033, -0.4273],
[ 0.0496, -0.2914, 0.0873, -0.3749, 0.1635],
[-0.1358, 0.2501, -0.0757, 0.3155, -0.2628]])
--------------------------------------------------
name:base_model.fc1.bias
param:Parameter containing:
tensor([-0.1694, -0.1769, -0.3724, 0.1771, -0.0550, -0.0744, 0.4022, -0.2611,
-0.1714, 0.1903])
--------------------------------------------------
name:base_model.fc2.weight
param:Parameter containing:
tensor([[-0.1357, 0.2190, -0.2924, -0.1935, -0.0521, 0.0705, 0.0679, -0.1433,
0.1924, -0.0517],
[-0.1323, -0.2032, 0.0015, 0.2776, -0.2703, 0.1662, -0.0765, 0.1111,
0.0926, 0.1197],
[-0.1398, 0.1887, 0.1364, -0.1965, -0.0594, 0.3036, -0.0004, -0.0255,
-0.2863, -0.2791],
[ 0.0979, -0.1706, 0.1123, -0.0108, -0.0526, -0.0403, -0.1269, -0.2003,
0.1699, -0.1547],
[ 0.3081, -0.3112, -0.0128, 0.0525, -0.1151, -0.1979, 0.3086, 0.1071,
-0.1751, -0.1322]])
--------------------------------------------------
name:base_model.fc2.bias
param:Parameter containing:
tensor([-0.2160, -0.2652, 0.1969, -0.1783, 0.0823])
--------------------------------------------------
name:adapter.fc1.weight
param:Parameter containing:
tensor([[ 0.2952, -0.0903, 0.3795, 0.2233, -0.3799],
[ 0.3118, -0.3888, 0.0493, 0.2254, 0.2531]], requires_grad=True)
--------------------------------------------------
name:adapter.fc1.bias
param:Parameter containing:
tensor([0.2686, 0.2880], requires_grad=True)
--------------------------------------------------
name:adapter.fc2.weight
param:Parameter containing:
tensor([[-0.6225, 0.0204],
[ 0.2991, 0.3418],
[ 0.3018, -0.1149],
[ 0.5925, -0.2609],
[-0.5754, -0.1206]], requires_grad=True)
--------------------------------------------------
name:adapter.fc2.bias
param:Parameter containing:
tensor([ 0.4099, 0.1889, -0.2469, -0.1469, -0.3643], requires_grad=True)
--------------------------------------------------
name:classhead.weight
param:Parameter containing:
tensor([[-0.2298, -0.1740, -0.2616, -0.2471, 0.3723]], requires_grad=True)
--------------------------------------------------
name:classhead.bias
param:Parameter containing:
tensor([-0.1815], requires_grad=True)
--------------------------------------------------
optimizer.step()
Model parameters after the update
print(model)
for name, param in model.named_parameters():
    print(f"name:{name}")
    print(f"param:{param}")
    print("-" * 50)
BasewithAdapter(
(base_model): BaseModel(
(fc1): Linear(in_features=5, out_features=10, bias=True)
(fc2): Linear(in_features=10, out_features=5, bias=True)
)
(adapter): Adapter(
(fc1): Linear(in_features=5, out_features=2, bias=True)
(fc2): Linear(in_features=2, out_features=5, bias=True)
)
(classhead): Linear(in_features=5, out_features=1, bias=True)
(sigmoid): Sigmoid()
)
name:base_model.fc1.weight
param:Parameter containing:
tensor([[-0.3368, 0.1063, -0.0966, -0.0316, 0.0564],
[ 0.0717, 0.1012, -0.0392, -0.1883, -0.1120],
[-0.1414, 0.0691, -0.1963, -0.3627, 0.0400],
[ 0.0006, 0.2310, 0.1643, 0.1907, -0.2957],
[-0.2284, -0.4337, 0.0293, -0.3698, -0.0502],
[ 0.0180, -0.2635, 0.4388, -0.2568, -0.2210],
[-0.3673, -0.3681, 0.1915, -0.3489, 0.1758],
[-0.4011, 0.2684, 0.3424, -0.3033, -0.4273],
[ 0.0496, -0.2914, 0.0873, -0.3749, 0.1635],
[-0.1358, 0.2501, -0.0757, 0.3155, -0.2628]])
--------------------------------------------------
name:base_model.fc1.bias
param:Parameter containing:
tensor([-0.1694, -0.1769, -0.3724, 0.1771, -0.0550, -0.0744, 0.4022, -0.2611,
-0.1714, 0.1903])
--------------------------------------------------
name:base_model.fc2.weight
param:Parameter containing:
tensor([[-0.1357, 0.2190, -0.2924, -0.1935, -0.0521, 0.0705, 0.0679, -0.1433,
0.1924, -0.0517],
[-0.1323, -0.2032, 0.0015, 0.2776, -0.2703, 0.1662, -0.0765, 0.1111,
0.0926, 0.1197],
[-0.1398, 0.1887, 0.1364, -0.1965, -0.0594, 0.3036, -0.0004, -0.0255,
-0.2863, -0.2791],
[ 0.0979, -0.1706, 0.1123, -0.0108, -0.0526, -0.0403, -0.1269, -0.2003,
0.1699, -0.1547],
[ 0.3081, -0.3112, -0.0128, 0.0525, -0.1151, -0.1979, 0.3086, 0.1071,
-0.1751, -0.1322]])
--------------------------------------------------
name:base_model.fc2.bias
param:Parameter containing:
tensor([-0.2160, -0.2652, 0.1969, -0.1783, 0.0823])
--------------------------------------------------
name:adapter.fc1.weight
param:Parameter containing:
tensor([[ 0.2942, -0.0893, 0.3805, 0.2223, -0.3789],
[ 0.3108, -0.3898, 0.0483, 0.2264, 0.2541]], requires_grad=True)
--------------------------------------------------
name:adapter.fc1.bias
param:Parameter containing:
tensor([0.2696, 0.2890], requires_grad=True)
--------------------------------------------------
name:adapter.fc2.weight
param:Parameter containing:
tensor([[-0.6235, 0.0214],
[ 0.2981, 0.3428],
[ 0.3008, -0.1139],
[ 0.5915, -0.2599],
[-0.5744, -0.1216]], requires_grad=True)
--------------------------------------------------
name:adapter.fc2.bias
param:Parameter containing:
tensor([ 0.4109, 0.1899, -0.2459, -0.1459, -0.3653], requires_grad=True)
--------------------------------------------------
name:classhead.weight
param:Parameter containing:
tensor([[-0.2288, -0.1730, -0.2606, -0.2461, 0.3713]], requires_grad=True)
--------------------------------------------------
name:classhead.bias
param:Parameter containing:
tensor([-0.1825], requires_grad=True)
--------------------------------------------------
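Instead of eyeballing the two parameter dumps, the same comparison can be done programmatically; a sketch of the idea (it snapshots the parameters and would be run around the optimizer.step() call above):

before = {name: p.detach().clone() for name, p in model.named_parameters()}  # snapshot before step()
optimizer.step()
for name, p in model.named_parameters():
    changed = not torch.equal(before[name], p.detach())
    print(name, "changed" if changed else "unchanged")
# expected: every base_model.* parameter is unchanged; every adapter.* and classhead.* parameter has moved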
Gradient accumulation means calling loss.backward() several times but only calling optimizer.step() after every few of those backward passes. Because successive backward() calls add their gradients into param.grad, the single step() then updates the parameters with the combined gradient, which simulates training with a larger batch size.
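A minimal sketch of gradient accumulation, reusing the model, loss_fn and optimizer from above, with an illustrative accumulation factor of 4:

accum_steps = 4                                   # illustrative: number of micro-batches to accumulate
optimizer.zero_grad()
for _ in range(accum_steps):
    micro_input = torch.randn(10, 5)              # stand-in micro-batch
    micro_target = torch.randint(0, 2, (10,), dtype=torch.float)
    loss = loss_fn(model(micro_input).squeeze(), micro_target)
    (loss / accum_steps).backward()               # gradients from each backward() are summed into param.grad
optimizer.step()                                  # a single update using the accumulated (averaged) gradient
optimizer.zero_grad()                             # clear the gradients before the next accumulation cycle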