# 前言

本文是笔者学习中国大学慕课上北京大学曹健老师的 Tensorflow 笔记课程后所做的总结。
在此之前，笔者观看过吴恩达老师的深度学习课程和 CS231n，其中都对几种优化器进行了讲解，并说明了这些优化器为什么有效；但相比之下，曹健老师的讲解更直接，也更便于记忆。

# 一、预备知识和参数说明

待优化参数 $w$，损失函数 $loss$，学习率 $lr$，每次迭代一个 $batch$，$t$ 表示当前 $batch$ 迭代的总次数。

1. 计算t时刻损失函数关于当前参数的梯度

$$g_t=\nabla loss=\dfrac{\partial\, loss}{\partial\left(w_{t}\right)}$$

2. 计算t时刻一阶动量 $m_t$ 和二阶动量 $V_t$

3. 计算t时刻下降梯度：

$$\eta_t=lr\cdot m_t/\sqrt{V_t}$$

4. 计算t+1时刻参数：

$$w_{t+1}=w_t-\eta_t=w_t-lr\cdot m_t/\sqrt{V_t}$$

# 二、随机梯度下降SGD

一阶动量：$m_t=g_t$        二阶动量：$V_t=1$

$$
\begin{aligned}
\eta_t &= lr\cdot m_t/\sqrt{V_t} \\
&= lr\cdot g_t
\end{aligned}
$$

$$
\begin{aligned}
w_{t+1} &= w_t-\eta_t \\
&= w_t-lr\cdot m_t/\sqrt{V_t} \\
&= w_t-lr\cdot g_t
\end{aligned}
$$

# 三、SGDM

$m_t$ 表示各时刻梯度方向的指数滑动平均

一阶动量：$m_t=\beta\cdot m_{t-1}+(1-\beta)\cdot g_t$        二阶动量：$V_t=1$

$$
\begin{aligned}
\eta_t &= lr\cdot m_t/\sqrt{V_t} \\
&= lr\cdot m_t \\
&= lr\cdot\left(\beta\cdot m_{t-1}+(1-\beta)\cdot g_t\right)
\end{aligned}
$$

$$
\begin{aligned}
w_{t+1} &= w_t-\eta_t \\
&= w_t-lr\cdot\left(\beta\cdot m_{t-1}+(1-\beta)\cdot g_t\right)
\end{aligned}
$$

Adagrad（在 SGD 基础上引入二阶动量）：

一阶动量：$m_t=g_t$         二阶动量：$V_t=\sum_{\tau=1}^{t}g_{\tau}^2$

$$
\begin{aligned}
\eta_t &= lr\cdot m_t/\left(\sqrt{V_t}\right) \\
&= lr\cdot g_t/\left(\sqrt{\sum_{\tau=1}^{t}g_{\tau}^2}\right)
\end{aligned}
$$

$$
\begin{aligned}
w_{t+1} &= w_t-\eta_t \\
&= w_t-lr\cdot g_t/\left(\sqrt{\sum_{\tau=1}^{t}g_{\tau}^2}\right)
\end{aligned}
$$

# 四、RMSProp

一阶动量：$m_t=g_t$         二阶动量：$V_t=\beta\cdot V_{t-1}+(1-\beta)\cdot g_t^2$

$$
\begin{aligned}
\eta_t &= lr\cdot m_t/\left(\sqrt{V_t}\right) \\
&= lr\cdot g_t/\left(\sqrt{\beta\cdot V_{t-1}+(1-\beta)\cdot g_t^2}\right)
\end{aligned}
$$

$$
\begin{aligned}
w_{t+1} &= w_t-\eta_t \\
&= w_t-lr\cdot g_t/\left(\sqrt{\beta\cdot V_{t-1}+(1-\beta)\cdot g_t^2}\right)
\end{aligned}
$$

Adam（同时结合 SGDM 的一阶动量和 RMSProp 的二阶动量，并对二者做偏差修正）：

$$m_t=\beta_1\cdot m_{t-1}+(1-\beta_1)\cdot g_t$$

$$\hat{m_t}=\dfrac{m_t}{1-\beta_1^t}$$

$$V_t=\beta_2\cdot V_{t-1}+(1-\beta_2)\cdot g_t^2$$

$$\hat{V_t}=\dfrac{V_t}{1-\beta_2^t}$$

$$
\begin{aligned}
\eta_t &= lr\cdot\hat{m_t}/\left(\sqrt{\hat{V_t}}\right) \\
&= lr\cdot\dfrac{m_t}{1-\beta_1^t}/\sqrt{\dfrac{V_t}{1-\beta_2^t}}
\end{aligned}
$$

$$
\begin{aligned}
w_{t+1} &= w_t-\eta_t \\
&= w_t-lr\cdot\dfrac{m_t}{1-\beta_1^t}/\sqrt{\dfrac{V_t}{1-\beta_2^t}}
\end{aligned}
$$

THE END