import torch
import torch.nn as nn
import torch.optim as optim
import random
A standard training step proceeds as follows:
1. Forward pass: run the input through the model to produce the intermediate activations and the computation graph, and compute the loss.
2. Backward pass: call loss.backward(), which uses the computation graph to compute the gradients; each gradient has the same shape as its parameter.
3. Parameter update: call optimizer.step() to update the parameters, then clear the gradients.
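Put together, one iteration looks roughly like the following minimal sketch, assuming a model, a loss_fn, an optimizer, and a dataloader have already been created (hypothetical names, for illustration only):

for inputs, targets in dataloader:
    outputs = model(inputs)           # forward pass: builds the computation graph
    loss = loss_fn(outputs, targets)  # compute the loss
    loss.backward()                   # backward pass: fills param.grad for every trainable parameter
    optimizer.step()                  # update the parameters using the gradients
    optimizer.zero_grad()             # clear the gradients before the next iteration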
random.seed(42)  # note: this only seeds Python's random module; torch.manual_seed(42) would be needed to make the torch weight init reproducible

class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(5, 2)  # down projection: 5 -> 2
        self.fc2 = nn.Linear(2, 5)  # up projection: 2 -> 5

    def forward(self, x):
        '''
        x -> fc1 (down projection) -> relu -> fc2 (up projection) -> +x -> output
        '''
        x = x + self.fc2(torch.relu(self.fc1(x)))  # residual connection around the bottleneck
        return x
model = SimpleNN()
input = torch.randn(5, 5)  # note: shadows the built-in input(); kept to match the printouts below

print("====== Let's watch the input pass through each layer ========")
print("input:\n", input)
print("fc1.weight:\n", model.fc1.weight)
print("fc1(x):\n", model.fc1(input))
print("relu(fc1(x))\n", torch.relu(model.fc1(input)))
print("fc2(relu(fc1(x))):\n", model.fc2(torch.relu(model.fc1(input))))
print("x + fc2(relu(fc1(x))):\n", input + model.fc2(torch.relu(model.fc1(input))))

output = model(input)
print("============ Does this match the result of model(input)? ======")
print("model(input):\n", output)
print("The element-wise comparison gives:\n", output == input + model.fc2(torch.relu(model.fc1(input))))
====== Let's watch the input pass through each layer ========
input:
tensor([[-1.2202, 0.2502, -0.2889, -0.1986, -2.8538],
[ 0.4287, -1.6049, 0.5761, 1.1029, -1.6302],
[ 0.4738, 1.0801, -0.6158, -0.0197, -0.6740],
[-0.5638, 0.9001, 0.4496, -0.5458, 0.1249],
[ 1.9979, -1.1930, 1.5334, -0.8725, -0.0448]])
fc1.weight:
 Parameter containing:
tensor([[ 0.0664, 0.0643, -0.1771, -0.3984, -0.3248],
[ 0.1452, -0.1781, 0.1125, 0.2815, 0.0503]], requires_grad=True)
fc1(x):
tensor([[ 1.2801, -0.1998],
[ 0.2011, 0.8952],
[ 0.7247, 0.0215],
[ 0.4057, -0.0852],
[ 0.4344, 0.6810]], grad_fn=<AddmmBackward0>)
relu(fc1(x))
tensor([[1.2801, 0.0000],
[0.2011, 0.8952],
[0.7247, 0.0215],
[0.4057, 0.0000],
[0.4344, 0.6810]], grad_fn=<ReluBackward0>)
fc2(relu(fc1(x))):
tensor([[-3.1179e-01, 9.2656e-01, 6.6133e-01, 7.6778e-01, 1.6566e-01],
[-2.3086e-01, -3.6633e-02, -2.4526e-03, -2.0451e-04, 7.2680e-01],
[-2.0840e-01, 6.7831e-01, 5.9427e-01, 4.4476e-01, 2.5295e-01],
[-1.4424e-01, 5.5479e-01, 5.7692e-01, 2.6476e-01, 2.8756e-01],
[-2.4546e-01, 1.8325e-01, 1.5396e-01, 1.6924e-01, 5.9601e-01]],
grad_fn=<AddmmBackward0>)
x + fc2(relu(fc1(x))):
tensor([[-1.5320, 1.1768, 0.3724, 0.5691, -2.6881],
[ 0.1978, -1.6415, 0.5737, 1.1027, -0.9035],
[ 0.2654, 1.7584, -0.0215, 0.4250, -0.4211],
[-0.7080, 1.4549, 1.0265, -0.2810, 0.4125],
[ 1.7524, -1.0097, 1.6874, -0.7032, 0.5512]], grad_fn=<AddBackward0>)
============ Does this match the result of model(input)? ======
model(input):
tensor([[-1.5320, 1.1768, 0.3724, 0.5691, -2.6881],
[ 0.1978, -1.6415, 0.5737, 1.1027, -0.9035],
[ 0.2654, 1.7584, -0.0215, 0.4250, -0.4211],
[-0.7080, 1.4549, 1.0265, -0.2810, 0.4125],
[ 1.7524, -1.0097, 1.6874, -0.7032, 0.5512]], grad_fn=<AddBackward0>)
The element-wise comparison gives:
tensor([[True, True, True, True, True],
[True, True, True, True, True],
[True, True, True, True, True],
[True, True, True, True, True],
[True, True, True, True, True]])
x -> fc1 (down projection) -> relu -> fc2 (up projection) -> +x -> output
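The point of this down-then-up ("bottleneck") shape is that the extra block stays cheap. At this toy scale the saving is negligible, but at realistic hidden sizes the gap is huge; a rough back-of-the-envelope comparison with illustrative sizes (d = 4096, bottleneck r = 16, not taken from this post):

d, r = 4096, 16
adapter_params = (d * r + r) + (r * d + d)   # down projection + up projection, biases included
full_linear    = d * d + d                   # one full d -> d linear layer, for comparison
print(adapter_params, full_linear)           # roughly 135k vs. 16.8M parameters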
Next we build a base model plus an adapter, keep the base model's parameters frozen during training, and fine-tune only the adapter's parameters.
Model definition
class BaseModel(nn.Module):
    def __init__(self):
        super(BaseModel, self).__init__()
        self.fc1 = nn.Linear(5, 10)
        self.fc2 = nn.Linear(10, 5)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x


class Adapter(nn.Module):
    def __init__(self):
        super(Adapter, self).__init__()
        self.fc1 = nn.Linear(5, 2)   # down projection
        self.fc2 = nn.Linear(2, 5)   # up projection

    def forward(self, x):
        residual = x
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        x = x + residual             # residual connection
        return x


class BasewithAdapter(nn.Module):
    def __init__(self, base_model, adapter):
        super(BasewithAdapter, self).__init__()
        self.base_model = base_model
        self.adapter = adapter
        self.classhead = nn.Linear(5, 1)   # binary classification head
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        base_output = self.base_model(x)
        adapter_output = self.adapter(x)
        output = self.classhead(base_output + adapter_output)
        output = self.sigmoid(output)
        return output
Model initialization
base_model = BaseModel()
adapter = Adapter()
model = BasewithAdapter(base_model, adapter)
Inspect the base_model structure
1 2 3 4 5 6 7 8 print ("==========base_model=========" )print (base_model)print ("-" * 50 )for name, param in base_model.named_parameters(): print (f"parameter name:{name} " ) print (f"parameter value:{param} " ) print ("-" * 50 )
==========base_model=========
BaseModel(
(fc1): Linear(in_features=5, out_features=10, bias=True)
(fc2): Linear(in_features=10, out_features=5, bias=True)
)
--------------------------------------------------
parameter name:fc1.weight
parameter value:Parameter containing:
tensor([[ 0.4336, -0.2527, -0.1431, -0.3255, 0.1239],
[-0.2378, 0.1515, 0.4288, -0.3317, 0.2779],
[ 0.1535, 0.1688, -0.3781, -0.2174, 0.0738],
[-0.2811, -0.3980, -0.3960, -0.4455, 0.2161],
[ 0.1410, -0.3365, 0.3386, 0.1728, 0.0791],
[ 0.1138, -0.2654, 0.3056, 0.0793, 0.2307],
[-0.2616, 0.2112, 0.2274, 0.0392, 0.1044],
[-0.2264, -0.1778, 0.2020, 0.0354, 0.1658],
[-0.2374, 0.0352, -0.1868, 0.2495, 0.0379],
[-0.2995, -0.2185, -0.1470, -0.3039, -0.4207]], requires_grad=True)
--------------------------------------------------
parameter name:fc1.bias
parameter value:Parameter containing:
tensor([-0.0974, 0.1413, 0.2264, 0.2783, -0.0539, -0.0556, -0.3448, -0.3413,
-0.2531, 0.1960], requires_grad=True)
--------------------------------------------------
parameter name:fc2.weight
parameter value:Parameter containing:
tensor([[ 0.2579, -0.2095, -0.1630, 0.2688, 0.2801, 0.2098, -0.2368, -0.0675,
0.2128, 0.1830],
[ 0.2660, 0.1586, 0.0018, -0.2677, -0.0120, 0.1762, 0.0789, 0.0946,
-0.2273, -0.2025],
[ 0.1673, -0.0173, 0.0494, 0.2060, -0.2267, 0.3011, -0.1249, -0.1388,
-0.0258, 0.0965],
[ 0.0940, -0.2441, -0.2963, -0.1249, 0.1784, -0.2184, 0.0044, 0.0368,
0.0734, -0.3110],
[ 0.1679, -0.1773, 0.1796, -0.1042, 0.0288, 0.0196, -0.0735, -0.0327,
-0.0457, 0.0938]], requires_grad=True)
--------------------------------------------------
parameter name:fc2.bias
parameter value:Parameter containing:
tensor([ 0.0478, 0.0522, -0.2679, -0.0112, -0.2772], requires_grad=True)
--------------------------------------------------
Inspect the adapter structure
1 2 3 4 5 6 7 8 print ("==========adapter=========" )print (adapter)print ("-" * 50 )for name, param in adapter.named_parameters(): print (f"parameter name:{name} " ) print (f"parameter value:{param} " ) print ("-" * 50 )
==========adapter=========
Adapter(
(fc1): Linear(in_features=5, out_features=2, bias=True)
(fc2): Linear(in_features=2, out_features=5, bias=True)
)
--------------------------------------------------
parameter name:fc1.weight
parameter value:Parameter containing:
tensor([[-0.1502, -0.0157, 0.1981, -0.3822, -0.1813],
[-0.0734, 0.3920, 0.1197, -0.4180, 0.2841]], requires_grad=True)
--------------------------------------------------
parameter name:fc1.bias
parameter value:Parameter containing:
tensor([ 0.4073, -0.0563], requires_grad=True)
--------------------------------------------------
parameter name:fc2.weight
parameter value:Parameter containing:
tensor([[-0.3924, 0.0132],
[ 0.2919, -0.5051],
[ 0.4481, 0.6499],
[-0.3320, -0.6645],
[ 0.0464, 0.4679]], requires_grad=True)
--------------------------------------------------
parameter name:fc2.bias
parameter value:Parameter containing:
tensor([ 0.4048, -0.1295, 0.1568, -0.2318, 0.0233], requires_grad=True)
--------------------------------------------------
Freeze the base model's parameters so that they will not be updated:
for param in model.base_model.parameters():
    param.requires_grad = False

for name, param in model.named_parameters():
    print(name, param.requires_grad)
base_model.fc1.weight False
base_model.fc1.bias False
base_model.fc2.weight False
base_model.fc2.bias False
adapter.fc1.weight True
adapter.fc1.bias True
adapter.fc2.weight True
adapter.fc2.bias True
classhead.weight True
classhead.bias True
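To double-check the freezing, you can also count the trainable parameters (a small sketch; with the layer sizes above this should report 33 trainable out of 148 total):

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable parameters: {trainable} / {total}")  # adapter + classhead vs. the whole model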
input_data = torch.randn(10, 5)                              # a toy batch: 10 samples, 5 features each
target_data = torch.randint(0, 2, (10,), dtype=torch.float)  # binary targets (0. or 1.)
tensor([[-1.4232, -0.3832, -0.4198, 0.4228, 0.2203],
[-1.4282, 1.8447, -1.2595, -0.9088, -1.3608],
[ 1.6698, -0.1378, -1.5213, 0.6471, 1.0883],
[ 0.9648, 0.0560, -0.2517, -1.3640, -0.8051],
[-1.5178, -0.0528, -1.1300, -1.5049, -0.9609],
[-0.9048, 0.0666, -0.8958, -0.6056, 0.0209],
[ 2.3616, 0.1847, 0.8678, -0.9167, 0.9430],
[-0.3303, 0.4280, -0.2748, -2.5328, -1.4385],
[ 1.8969, -1.8394, -0.0575, -0.0590, -2.4369],
[-0.8545, -1.0343, -0.3162, 0.3940, -1.0028]])
tensor([0., 1., 0., 1., 0., 1., 0., 0., 1., 0.])
Define the loss
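The cell that defines the loss is not reproduced here; judging from the manual check a few cells below (sum of squared errors divided by 10, which matches the printed loss), it was presumably a mean-squared-error loss:

loss_fn = nn.MSELoss()  # presumed definition, inferred from the manual MSE check below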
Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-3)
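Passing model.parameters() works here because parameters with requires_grad=False never receive a gradient, so optimizer.step() simply skips them. An optional variant (a sketch, not what this post runs) is to hand the optimizer only the trainable parameters:

optimizer = optim.Adam(
    (p for p in model.parameters() if p.requires_grad),  # adapter + classhead only
    lr=1e-3,
)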
Run the forward pass
output = model(input_data)
output
tensor([[0.6159],
[0.5352],
[0.5104],
[0.3628],
[0.6889],
[0.6323],
[0.3086],
[0.5092],
[0.1160],
[0.4349]], grad_fn=<SigmoidBackward0>)
loss = loss_fn(output.squeeze(), target_data)
print(f"loss:{loss}")
loss:0.3196709156036377
# verify the loss by computing the mean squared error by hand
print(torch.sum((output.squeeze() - target_data) ** 2) / 10)
tensor(0.3197, grad_fn=<DivBackward0>)
Before running the backward pass, print each parameter's gradient:
1 2 3 4 5 6 7 8 print ("gradient:" )for name, param in model.named_parameters(): print (f"{name} " ) try : print (f"梯度:{param.grad} " ) except : pass
gradient:
base_model.fc1.weight
grad:None
base_model.fc1.bias
grad:None
base_model.fc2.weight
grad:None
base_model.fc2.bias
grad:None
adapter.fc1.weight
grad:None
adapter.fc1.bias
grad:None
adapter.fc2.weight
grad:None
adapter.fc2.bias
grad:None
classhead.weight
grad:None
classhead.bias
grad:None
loss.backward()

for name, param in model.named_parameters():
    print(name)
    print(f"gradient:{param.grad}")
base_model.fc1.weight
gradient:None
base_model.fc1.bias
gradient:None
base_model.fc2.weight
gradient:None
base_model.fc2.bias
gradient:None
adapter.fc1.weight
gradient:tensor([[ 2.0354e-02, -7.9889e-03, -2.1384e-03, 9.3599e-03, -7.7675e-03],
[ 7.1102e-04, 3.8607e-05, 5.5156e-04, -9.6660e-04, -1.3542e-03]])
adapter.fc1.bias
gradient:tensor([-0.0043, -0.0006])
adapter.fc2.weight
gradient:tensor([[ 0.0065, -0.0079],
[ 0.0049, -0.0060],
[ 0.0074, -0.0090],
[ 0.0069, -0.0085],
[-0.0105, 0.0128]])
adapter.fc2.bias
gradient:tensor([-0.0130, -0.0098, -0.0147, -0.0139, 0.0210])
classhead.weight
gradient:tensor([[-0.0263, -0.0578, -0.0509, -0.0682, 0.0748]])
classhead.bias
gradient:tensor([0.0564])
Notice that none of the base model parameters have gradients, because we froze them earlier. Also, each gradient has the same shape as its parameter (shape, rather than "size", which can be ambiguous): every parameter needs a matching gradient to tell it how to update, and that gradient is simply the partial derivative of the loss with respect to the parameter.
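The shape claim is easy to verify in code (a small sketch reusing the model above):

for name, param in model.named_parameters():
    if param.grad is not None:                  # frozen base_model parameters still have grad None
        assert param.grad.shape == param.shape  # every gradient matches its parameter's shape
        print(name, tuple(param.grad.shape))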
Model parameters before the update
print(model)
for name, param in model.named_parameters():
    print(f"name:{name}")
    print(f"param:{param}")
    print("-" * 50)
BasewithAdapter(
(base_model): BaseModel(
(fc1): Linear(in_features=5, out_features=10, bias=True)
(fc2): Linear(in_features=10, out_features=5, bias=True)
)
(adapter): Adapter(
(fc1): Linear(in_features=5, out_features=2, bias=True)
(fc2): Linear(in_features=2, out_features=5, bias=True)
)
(classhead): Linear(in_features=5, out_features=1, bias=True)
(sigmoid): Sigmoid()
)
name:base_model.fc1.weight
param:Parameter containing:
tensor([[-0.3368, 0.1063, -0.0966, -0.0316, 0.0564],
[ 0.0717, 0.1012, -0.0392, -0.1883, -0.1120],
[-0.1414, 0.0691, -0.1963, -0.3627, 0.0400],
[ 0.0006, 0.2310, 0.1643, 0.1907, -0.2957],
[-0.2284, -0.4337, 0.0293, -0.3698, -0.0502],
[ 0.0180, -0.2635, 0.4388, -0.2568, -0.2210],
[-0.3673, -0.3681, 0.1915, -0.3489, 0.1758],
[-0.4011, 0.2684, 0.3424, -0.3033, -0.4273],
[ 0.0496, -0.2914, 0.0873, -0.3749, 0.1635],
[-0.1358, 0.2501, -0.0757, 0.3155, -0.2628]])
--------------------------------------------------
name:base_model.fc1.bias
param:Parameter containing:
tensor([-0.1694, -0.1769, -0.3724, 0.1771, -0.0550, -0.0744, 0.4022, -0.2611,
-0.1714, 0.1903])
--------------------------------------------------
name:base_model.fc2.weight
param:Parameter containing:
tensor([[-0.1357, 0.2190, -0.2924, -0.1935, -0.0521, 0.0705, 0.0679, -0.1433,
0.1924, -0.0517],
[-0.1323, -0.2032, 0.0015, 0.2776, -0.2703, 0.1662, -0.0765, 0.1111,
0.0926, 0.1197],
[-0.1398, 0.1887, 0.1364, -0.1965, -0.0594, 0.3036, -0.0004, -0.0255,
-0.2863, -0.2791],
[ 0.0979, -0.1706, 0.1123, -0.0108, -0.0526, -0.0403, -0.1269, -0.2003,
0.1699, -0.1547],
[ 0.3081, -0.3112, -0.0128, 0.0525, -0.1151, -0.1979, 0.3086, 0.1071,
-0.1751, -0.1322]])
--------------------------------------------------
name:base_model.fc2.bias
param:Parameter containing:
tensor([-0.2160, -0.2652, 0.1969, -0.1783, 0.0823])
--------------------------------------------------
name:adapter.fc1.weight
param:Parameter containing:
tensor([[ 0.2952, -0.0903, 0.3795, 0.2233, -0.3799],
[ 0.3118, -0.3888, 0.0493, 0.2254, 0.2531]], requires_grad=True)
--------------------------------------------------
name:adapter.fc1.bias
param:Parameter containing:
tensor([0.2686, 0.2880], requires_grad=True)
--------------------------------------------------
name:adapter.fc2.weight
param:Parameter containing:
tensor([[-0.6225, 0.0204],
[ 0.2991, 0.3418],
[ 0.3018, -0.1149],
[ 0.5925, -0.2609],
[-0.5754, -0.1206]], requires_grad=True)
--------------------------------------------------
name:adapter.fc2.bias
param:Parameter containing:
tensor([ 0.4099, 0.1889, -0.2469, -0.1469, -0.3643], requires_grad=True)
--------------------------------------------------
name:classhead.weight
param:Parameter containing:
tensor([[-0.2298, -0.1740, -0.2616, -0.2471, 0.3723]], requires_grad=True)
--------------------------------------------------
name:classhead.bias
param:Parameter containing:
tensor([-0.1815], requires_grad=True)
--------------------------------------------------
optimizer.step()
Model parameters after the update
print(model)
for name, param in model.named_parameters():
    print(f"name:{name}")
    print(f"param:{param}")
    print("-" * 50)
BasewithAdapter(
(base_model): BaseModel(
(fc1): Linear(in_features=5, out_features=10, bias=True)
(fc2): Linear(in_features=10, out_features=5, bias=True)
)
(adapter): Adapter(
(fc1): Linear(in_features=5, out_features=2, bias=True)
(fc2): Linear(in_features=2, out_features=5, bias=True)
)
(classhead): Linear(in_features=5, out_features=1, bias=True)
(sigmoid): Sigmoid()
)
name:base_model.fc1.weight
param:Parameter containing:
tensor([[-0.3368, 0.1063, -0.0966, -0.0316, 0.0564],
[ 0.0717, 0.1012, -0.0392, -0.1883, -0.1120],
[-0.1414, 0.0691, -0.1963, -0.3627, 0.0400],
[ 0.0006, 0.2310, 0.1643, 0.1907, -0.2957],
[-0.2284, -0.4337, 0.0293, -0.3698, -0.0502],
[ 0.0180, -0.2635, 0.4388, -0.2568, -0.2210],
[-0.3673, -0.3681, 0.1915, -0.3489, 0.1758],
[-0.4011, 0.2684, 0.3424, -0.3033, -0.4273],
[ 0.0496, -0.2914, 0.0873, -0.3749, 0.1635],
[-0.1358, 0.2501, -0.0757, 0.3155, -0.2628]])
--------------------------------------------------
name:base_model.fc1.bias
param:Parameter containing:
tensor([-0.1694, -0.1769, -0.3724, 0.1771, -0.0550, -0.0744, 0.4022, -0.2611,
-0.1714, 0.1903])
--------------------------------------------------
name:base_model.fc2.weight
param:Parameter containing:
tensor([[-0.1357, 0.2190, -0.2924, -0.1935, -0.0521, 0.0705, 0.0679, -0.1433,
0.1924, -0.0517],
[-0.1323, -0.2032, 0.0015, 0.2776, -0.2703, 0.1662, -0.0765, 0.1111,
0.0926, 0.1197],
[-0.1398, 0.1887, 0.1364, -0.1965, -0.0594, 0.3036, -0.0004, -0.0255,
-0.2863, -0.2791],
[ 0.0979, -0.1706, 0.1123, -0.0108, -0.0526, -0.0403, -0.1269, -0.2003,
0.1699, -0.1547],
[ 0.3081, -0.3112, -0.0128, 0.0525, -0.1151, -0.1979, 0.3086, 0.1071,
-0.1751, -0.1322]])
--------------------------------------------------
name:base_model.fc2.bias
param:Parameter containing:
tensor([-0.2160, -0.2652, 0.1969, -0.1783, 0.0823])
--------------------------------------------------
name:adapter.fc1.weight
param:Parameter containing:
tensor([[ 0.2942, -0.0893, 0.3805, 0.2223, -0.3789],
[ 0.3108, -0.3898, 0.0483, 0.2264, 0.2541]], requires_grad=True)
--------------------------------------------------
name:adapter.fc1.bias
param:Parameter containing:
tensor([0.2696, 0.2890], requires_grad=True)
--------------------------------------------------
name:adapter.fc2.weight
param:Parameter containing:
tensor([[-0.6235, 0.0214],
[ 0.2981, 0.3428],
[ 0.3008, -0.1139],
[ 0.5915, -0.2599],
[-0.5744, -0.1216]], requires_grad=True)
--------------------------------------------------
name:adapter.fc2.bias
param:Parameter containing:
tensor([ 0.4109, 0.1899, -0.2459, -0.1459, -0.3653], requires_grad=True)
--------------------------------------------------
name:classhead.weight
param:Parameter containing:
tensor([[-0.2288, -0.1730, -0.2606, -0.2461, 0.3713]], requires_grad=True)
--------------------------------------------------
name:classhead.bias
param:Parameter containing:
tensor([-0.1825], requires_grad=True)
--------------------------------------------------
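Instead of eyeballing the two parameter dumps, the same comparison can be done programmatically; a sketch of the idea (it snapshots the parameters and would be run around the optimizer.step() call above):

before = {name: p.detach().clone() for name, p in model.named_parameters()}  # snapshot before step()
optimizer.step()
for name, p in model.named_parameters():
    changed = not torch.equal(before[name], p.detach())
    print(name, "changed" if changed else "unchanged")
# expected: every base_model.* parameter is unchanged; every adapter.* and classhead.* parameter has moved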
Gradient accumulation means calling loss.backward() several times but only calling optimizer.step() after every few of those backward passes. Because successive backward() calls add their gradients into param.grad, the single step() then updates the parameters with the combined gradient, which simulates training with a larger batch size.
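A minimal sketch of gradient accumulation, reusing the model, loss_fn and optimizer from above, with an illustrative accumulation factor of 4:

accum_steps = 4                                   # illustrative: number of micro-batches to accumulate
optimizer.zero_grad()
for _ in range(accum_steps):
    micro_input = torch.randn(10, 5)              # stand-in micro-batch
    micro_target = torch.randint(0, 2, (10,), dtype=torch.float)
    loss = loss_fn(model(micro_input).squeeze(), micro_target)
    (loss / accum_steps).backward()               # gradients from each backward() are summed into param.grad
optimizer.step()                                  # a single update using the accumulated (averaged) gradient
optimizer.zero_grad()                             # clear the gradients before the next accumulation cycle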