# 引言

$$y = w_1x_1 + w_2x_2 + w_3x_3 + b$$

# 损失函数

$$l(w, b) = \frac{1}{2}(xw + b - y)^2$$

$$L(W, b) = \frac{1}{2n}\sum_{i=1}^{n}\left(X^{(i)}W + b - Y^{(i)}\right)^2$$

$$\begin{bmatrix} x_1^{(1)} & x_2^{(1)} & 1 \\ x_1^{(2)} & x_2^{(2)} & 1 \\ x_1^{(3)} & x_2^{(3)} & 1 \end{bmatrix} \cdot \begin{bmatrix} w_1 \\ w_2 \\ b \end{bmatrix} = \begin{bmatrix} y^{(1)} \\ y^{(2)} \\ y^{(3)} \end{bmatrix}$$

$$W = \begin{bmatrix} w_1 \\ w_2 \\ b \end{bmatrix}$$

$$L(W, b) = \frac{1}{2n}\sum_{i=1}^{n}\left(X^{(i)}W - Y^{(i)}\right)^2$$

# 解析解

## 公式

$$\nabla_W L = \frac{1}{n}\sum_{i=1}^{n}\left(X^{(i)}W - Y^{(i)}\right)^{T}X^{(i)}$$

$$W^{(*)T}X^{(i)T}X^{(i)} - Y^{(i)T}X^{(i)} = 0 \\
\Rightarrow W^{(*)T} = Y^{(i)T}X^{(i)}\left(X^{(i)T}X^{(i)}\right)^{-1} \\
\Rightarrow W^{(*)} = \left(X^{(i)T}X^{(i)}\right)^{-1}X^{(i)T}Y^{(i)}$$

## 代码

# Step 1: prepend a column of ones so the bias b can be solved for
# jointly with the weights, then apply the normal equation
#   theta = (X_b^T X_b)^{-1} X_b^T y.
X_b = np.hstack([np.ones((X.shape[0], 1)), X])
theta_best = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y


## 实例

import torch
from torch.utils import data
import numpy as np
import random


def synthetic_data(w, b, num_examples):
    """Generate `y = Xw + b` plus Gaussian noise.

    Features are drawn from N(0, 1); the labels receive N(0, 0.01) noise
    and are returned as a column vector of shape (num_examples, 1).
    """
    features = torch.normal(0, 1, (num_examples, w.shape[0]))
    labels = torch.matmul(features, w) + b
    labels += torch.normal(0, 0.01, labels.shape)
    return features, labels.reshape((-1, 1))


# Ground-truth parameters and a synthetic dataset of 1000 samples.
true_w = torch.tensor([2, -3.4])
true_b = 4.2
features, labels = synthetic_data(true_w, true_b, 1000)


# Convert to NumPy and prepend a column of ones so the bias is solved
# for together with the weights (first entry of theta is then b).
X = np.array(features)
y = np.array(labels)
X_b = np.c_[np.ones((X.shape[0], 1)), X]


# Normal equation: theta = (X_b^T X_b)^{-1} X_b^T y.
theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
theta_best


# 梯度下降

## 理论

$$(w, b) \leftarrow (w, b) - \eta\,\nabla_{(w, b)}L$$

## 随机梯度下降的手动实现代码

def data_iter(batch_size, features, labels):
    """Yield shuffled (features, labels) minibatches.

    The last batch may be smaller than `batch_size` when it does not
    divide the number of examples evenly.
    """
    num_examples = len(features)
    order = list(range(num_examples))
    random.shuffle(order)
    start = 0
    while start < num_examples:
        chosen = torch.tensor(order[start:min(start + batch_size, num_examples)])
        yield features[chosen], labels[chosen]
        start += batch_size


w = torch.normal(0, 0.01, size=(2, 1), requires_grad=True)


def linreg(X, w, b):
    """The linear regression model: return Xw + b.

    The body was missing in the source; without a return value the
    training loop below would receive None from `net(X, w, b)`.
    """
    return torch.matmul(X, w) + b

def squared_loss(y_hat, y):
    """Elementwise halved squared error between predictions and targets."""
    residual = y_hat - y
    return 0.5 * residual ** 2


def sgd(params, lr, batch_size):
    """Minibatch stochastic gradient descent step (in place).

    The update is wrapped in torch.no_grad() because an in-place update
    of a leaf tensor that requires grad otherwise raises a RuntimeError,
    and each gradient is zeroed afterwards so it does not accumulate
    across steps.
    """
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()


lr = 0.03          # learning rate
num_epochs = 3     # full passes over the dataset
batch_size = 10    # minibatch size (was undefined at this point in the source)
net = linreg
loss = squared_loss


for epoch in range(num_epochs):
    for X, y in data_iter(batch_size, features, labels):
        l = loss(net(X, w, b), y)  # minibatch loss
        l.sum().backward()         # back-propagate the summed loss
        sgd([w, b], lr, batch_size)

    # Evaluate on the full dataset without building an autograd graph.
    with torch.no_grad():
        train_l = loss(net(features, w, b), labels)
        print(f'epoch {epoch+1}, loss:{float(train_l.mean()):f}')


epoch 1, loss:0.033043
epoch 2, loss:0.000118
epoch 3, loss:0.000050

# Report how far the learned parameters are from the ground truth.
print(f'w损失：{true_w - w.reshape(true_w.shape)}')
print(f'b损失：{true_b - b}')


## torch中的随机梯度下降

def load_array(data_arrays, batch_size, is_train=True):
    """Construct a PyTorch data iterator over `data_arrays`.

    The return statement was missing in the source, so the function
    returned None. Shuffling is enabled for training (`is_train=True`).
    """
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)


batch_size = 10
# `data_iter` must be the DataLoader built by load_array; the bare
# generator function of the same name from the manual section is not
# itself iterable (iter() on a function raises TypeError).
data_iter = load_array((features, labels), batch_size)
next(iter(data_iter))  # peek at one minibatch


from torch import nn

# A single linear layer (2 inputs -> 1 output) wrapped in Sequential.
# A stray zero-width character (U+200B) that broke this cell was removed.
net = nn.Sequential(nn.Linear(2, 1))

# Notebook-style exploration of the nn.Linear layer's parameters.
# Bare expressions echo values; the "Parameter containing:" and
# "tensor(...)" lines below are captured notebook output, not code.
help(nn.Linear)


net[0].weight


Parameter containing:

net[0].bias


Parameter containing:

help(net[0].weight)


# .data exposes the underlying tensor without autograd tracking.
net[0].weight.data


tensor([[0.6657, 0.1449]])

# In-place initialization: weights ~ N(0, 0.01).
net[0].weight.data.normal_(0, 0.01)


tensor([[-0.0092, 0.0053]])

net[0].weight.data


tensor([[-0.0092, 0.0053]])

# In-place initialization: bias = 0.
net[0].bias.data.fill_(0)


tensor([0.])

net[0].bias.data


tensor([0.])

# Mean-squared-error loss and an SGD optimizer over the net's parameters.
loss = nn.MSELoss()
learning_rate = 0.03
trainer = torch.optim.SGD(net.parameters(), lr=learning_rate)


num_epochs = 3
for epoch in range(num_epochs):
    for X, y in data_iter:
        l = loss(net(X), y)
        trainer.zero_grad()  # was missing: without it gradients accumulate across steps
        l.backward()
        trainer.step()

    # End-of-epoch loss on the full dataset.
    l = loss(net(features), labels)
    print(f'epoch:{epoch+1}, loss:{l:f}')


epoch:1, loss:0.000201
epoch:2, loss:0.000100
epoch:3, loss:0.000100

net[0].weight.data


tensor([[ 1.9992, -3.3989]])

net[0].bias.data


tensor([4.2002])

# Difference between learned and true parameters for the nn-based model.
print(f'w损失：{net[0].weight.data - true_w}')
print(f'b损失：{net[0].bias.data - true_b}')


w损失：tensor([[-0.0008, 0.0011]])
b损失：tensor([0.0002])

THE END
