Adaptive Learning Rate BP Neural Network [Code]
BP_network.py
# -*- coding: utf-8 -*-
'''
the definition of BP network class
'''
class BP_network:
def __init__(self):
'''
initial variables
'''
        # node number of each layer
self.i_n = 0
self.h_n = 0
self.o_n = 0
# output value for each layer
self.i_v = []
self.h_v = []
self.o_v = []
# parameters (w, t)
self.ih_w = [] # weight for each link
self.ho_w = []
self.h_t = [] # threshold for each neuron
self.o_t = []
        # alternative activation functions and their derivatives, registered by name
self.fun = {
'Sigmoid': Sigmoid,
'SigmoidDerivate': SigmoidDerivate,
'Tanh': Tanh,
'TanhDerivate': TanhDerivate,
# for more, add here
}
        # learning rates (one per neuron, adjusted during training)
self.lr1 = [] # output layer
self.lr2 = [] # hidden layer
def CreateNN(self, ni, nh, no, actfun, learningrate):
'''
        build the BP network and initialize its parameters
@param ni, nh, no: the neuron number of each layer
@param actfun: string, the name of activation function
@param learningrate: learning rate of gradient algorithm
'''
# dependent packages
import numpy as np
        # initialize the node number of each layer
        self.i_n = ni  # input layer
        self.h_n = nh  # hidden layer
        self.o_n = no  # output layer
        # initialize the output value of each layer
self.i_v = np.zeros(self.i_n)
self.h_v = np.zeros(self.h_n)
self.o_v = np.zeros(self.o_n)
        # initialize the weight matrix of each layer (filled with random values below)
self.ih_w = np.zeros([self.i_n, self.h_n])
self.ho_w = np.zeros([self.h_n, self.o_n])
for i in range(self.i_n):
for h in range(self.h_n):
self.ih_w[i][h] = rand(0, 1)
for h in range(self.h_n):
for j in range(self.o_n):
self.ho_w[h][j] = rand(0, 1)
        # initialize the threshold of each neuron
self.h_t = np.zeros(self.h_n)
self.o_t = np.zeros(self.o_n)
for h in range(self.h_n): self.h_t[h] = rand(0, 1)
for j in range(self.o_n): self.o_t[j] = rand(0, 1)
        # set the activation function and its derivative
self.af = self.fun[actfun]
self.afd = self.fun[actfun + 'Derivate']
        # initialize the learning rates
self.lr1 = np.ones(self.o_n) * learningrate
self.lr2 = np.ones(self.h_n) * learningrate
def Pred(self, x):
'''
        forward propagation: predict through the network
@param x: the input array for input layer
'''
# activate input layer
for i in range(self.i_n):
self.i_v[i] = x[i]
# activate hidden layer
for h in range(self.h_n):
total = 0.0
for i in range(self.i_n):
total += self.i_v[i] * self.ih_w[i][h]
self.h_v[h] = self.af(total - self.h_t[h])
# activate output layer
for j in range(self.o_n):
total = 0.0
for h in range(self.h_n):
total += self.h_v[h] * self.ho_w[h][j]
self.o_v[j] = self.af(total - self.o_t[j])
'''
for fixed learning rate
'''
def BackPropagate(self, x, y):
'''
        back-propagation: the implementation of the BP algorithm on one sample
@param x, y: array, input and output of the data sample
'''
# dependent packages
import numpy as np
# get current network output
self.Pred(x)
# calculate the gradient based on output
o_grid = np.zeros(self.o_n)
for j in range(self.o_n):
o_grid[j] = (y[j] - self.o_v[j]) * self.afd(self.o_v[j])
h_grid = np.zeros(self.h_n)
for h in range(self.h_n):
for j in range(self.o_n):
h_grid[h] += self.ho_w[h][j] * o_grid[j]
h_grid[h] = h_grid[h] * self.afd(self.h_v[h])
        # update parameters
for h in range(self.h_n):
for j in range(self.o_n):
self.ho_w[h][j] += self.lr1[j] * o_grid[j] * self.h_v[h]
for i in range(self.i_n):
for h in range(self.h_n):
self.ih_w[i][h] += self.lr2[h] * h_grid[h] * self.i_v[i]
for j in range(self.o_n):
self.o_t[j] -= self.lr1[j] * o_grid[j]
for h in range(self.h_n):
self.h_t[h] -= self.lr2[h] * h_grid[h]
def TrainStandard(self, data_in, data_out):
        '''
        standard BP training (one pass over the training set)
        @param data_in, data_out: arrays of input and target samples
        @return: e, mean error over this pass
        @return: e_k, error of each sample
        '''
e_k = []
for k in range(len(data_in)):
x = data_in[k]
y = data_out[k]
self.BackPropagate(x, y)
# error in train set for each step
y_delta2 = 0.0
for j in range(self.o_n):
y_delta2 += (self.o_v[j] - y[j]) * (self.o_v[j] - y[j])
e_k.append(y_delta2 / 2)
        # mean error over this pass
e = sum(e_k) / len(e_k)
return e, e_k
'''
for dynamic learning rate
'''
def BackPropagate_Dynamic_Lr(self, x, y, d_ho_w_p, d_ih_w_p, d_o_t_p, d_h_t_p, o_grid_p, h_grid_p, alpha):
'''
        the implementation of the BP algorithm on one sample, with a dynamically adjusted learning rate
@param x, y: array, input and output of the data sample
@param d_ho_w_p, d_ih_w_p, d_o_t_p, d_h_t_p: adjust values (delta) of last step
@param o_grid_p, h_grid_p: gradient of last step
@param alpha: forget factor
@return adjust values (delta) of ho_w, ih_w, o_t, h_t,
and gradient value of o_grid, h_grid for this step
'''
# dependent packages
import numpy as np
# get current network output
self.Pred(x)
        # calculate gradients from the current output
        o_grid = np.zeros(self.o_n)  # gradient terms of the output-layer neurons
        for j in range(self.o_n):
            o_grid[j] = (y[j] - self.o_v[j]) * self.afd(self.o_v[j])
        # output-layer gradient = (target - output) * y(1 - y)
        # (Eq. 5.10 in the "watermelon book", Zhou Zhihua's Machine Learning)
        h_grid = np.zeros(self.h_n)  # gradient terms of the hidden-layer neurons
        for h in range(self.h_n):
            for j in range(self.o_n):
                h_grid[h] += self.ho_w[h][j] * o_grid[j]  # sum of (output-layer weight * output-layer gradient)
            h_grid[h] = h_grid[h] * self.afd(self.h_v[h])
        # hidden-layer gradient = bh(1 - bh) * sum(output-layer weight * output-layer gradient)
        # (Eq. 5.15 in the watermelon book; bh is the hidden neuron's output)
        # update output-layer parameters with an adaptive learning rate
        lamda = np.sign(o_grid * o_grid_p)
        # lambda = sign(g(t) * g(t-1)); sign(x) = 1 for x > 0, 0 for x = 0, -1 for x < 0
        o_grid_p = o_grid.copy()  # o_grid is the current gradient, o_grid_p the previous one
        for j in range(self.o_n):
            # basic idea: if two consecutive gradients point in the same direction,
            # lambda is positive and the rate grows, speeding up convergence;
            # if they point in opposite directions, lambda is negative and the rate shrinks
            lr = self.lr1[j] * (3 ** lamda[j])
            # clamp the rate to [0.005, 0.5] so it is never too large or too small
            self.lr1[j] = 0.5 if lr > 0.5 else (0.005 if lr < 0.005 else lr)
        for h in range(self.h_n):
            for j in range(self.o_n):
                # improved output-layer weight update (cf. Eq. 5.11 in the watermelon book);
                # alpha is a forget factor in (0, 1) that strengthens the influence of the
                # current sample and weakens that of the accumulated history
                d_ho_w_p[h][j] = self.lr1[j] * o_grid[j] * self.h_v[h] + alpha * d_ho_w_p[h][j]
                self.ho_w[h][j] += d_ho_w_p[h][j]
        # the same scheme for the hidden layer
        lamda = np.sign(h_grid * h_grid_p)
        h_grid_p = h_grid.copy()
        for h in range(self.h_n):
            # adjust the learning rate
            lr = self.lr2[h] * (3 ** lamda[h])
            self.lr2[h] = 0.5 if lr > 0.5 else (0.005 if lr < 0.005 else lr)
        for i in range(self.i_n):
            for h in range(self.h_n):
                # update the input-to-hidden weights with the same momentum term
                d_ih_w_p[i][h] = self.lr2[h] * h_grid[h] * self.i_v[i] + alpha * d_ih_w_p[i][h]
                self.ih_w[i][h] += d_ih_w_p[i][h]
        # thresholds move against the gradient, with the same momentum term
        for j in range(self.o_n):
            d_o_t_p[j] = -self.lr1[j] * o_grid[j] + alpha * d_o_t_p[j]
            self.o_t[j] += d_o_t_p[j]
        for h in range(self.h_n):
            d_h_t_p[h] = -self.lr2[h] * h_grid[h] + alpha * d_h_t_p[h]
            self.h_t[h] += d_h_t_p[h]
return d_ho_w_p, d_ih_w_p, d_o_t_p, d_h_t_p, o_grid_p, h_grid_p
def TrainStandard_Dynamic_Lr(self, data_in, data_out):
        '''
        standard BP training with a dynamically adjusted learning rate
        @param data_in, data_out: arrays of input and target samples
        @return: e, mean error over this pass
        @return: e_k, error of each sample
        '''
# dependent packages
import numpy as np
d_ih_w_p = np.zeros([self.i_n, self.h_n]) # initial delta values = 0.0
d_ho_w_p = np.zeros([self.h_n, self.o_n])
d_h_t_p = np.zeros(self.h_n)
d_o_t_p = np.zeros(self.o_n)
        o_grid_p = np.zeros(self.o_n)  # initial gradients = 0.0
h_grid_p = np.zeros(self.h_n)
e_k = []
for k in range(len(data_in)):
x = data_in[k]
y = data_out[k]
            d_ho_w_p, d_ih_w_p, d_o_t_p, d_h_t_p, o_grid_p, h_grid_p = \
                self.BackPropagate_Dynamic_Lr(x, y, d_ho_w_p, d_ih_w_p, d_o_t_p, d_h_t_p,
                                              o_grid_p, h_grid_p, 0.2)
# error in train set for each step
y_delta2 = 0.0
for j in range(self.o_n):
y_delta2 += (self.o_v[j] - y[j]) * (self.o_v[j] - y[j])
e_k.append(y_delta2 / 2)
        # mean error over this pass
e = sum(e_k) / len(e_k)
return e, e_k
def PredLabel(self, X):
'''
predict process through the network
@param X: the input sample set for input layer
        @return: y, array of predicted class labels (0, 1, 2, ...), chosen by winner-takes-all
'''
import numpy as np
y = []
for m in range(len(X)):
self.Pred(X[m])
# if self.o_v[0] > 0.5: y.append(1)
# else : y.append(0)
max_y = self.o_v[0]
label = 0
for j in range(1, self.o_n):
if max_y < self.o_v[j]:
label = j
max_y = self.o_v[j]
y.append(label)
return np.array(y)
'''
the definition of activation functions
'''
def Sigmoid(x):
'''
    sigmoid activation function
'''
from math import exp
return 1.0 / (1.0 + exp(-x))
def SigmoidDerivate(y):
    # derivative expressed in terms of the function's output: y = sigmoid(x)
    return y * (1 - y)
def Tanh(x):
'''
    tanh activation function
'''
from math import tanh
return tanh(x)
def TanhDerivate(y):
    # derivative expressed in terms of the function's output: y = tanh(x)
    return 1 - y * y
'''
the definition of random function
'''
def rand(a, b):
'''
random value generation for parameter initialization
    @param a, b: the lower and upper bounds of the random value
'''
from random import random
return (b - a) * random() + a
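The adaptive rule above is easiest to see in isolation. Below is a minimal sketch of just the rate adjustment, with a made-up gradient sequence chosen purely for illustration: each step compares the signs of the current and previous gradient, scales the rate by 3^lambda, and clamps it to [0.005, 0.5].
import numpy as np

lr, g_prev = 0.05, 0.0
for g in [0.30, 0.25, -0.10, 0.12]:  # made-up gradient sequence, for illustration only
    lamda = np.sign(g * g_prev)      # +1: same direction, -1: direction flipped
    lr = min(0.5, max(0.005, lr * 3 ** lamda))
    g_prev = g
    print("lamda = %2d, lr = %.4f" % (lamda, lr))
# lamda =  0, lr = 0.0500  (first step: no previous gradient)
# lamda =  1, lr = 0.1500  (same direction, rate grows)
# lamda = -1, lr = 0.0500  (direction flipped, rate shrinks)
# lamda = -1, lr = 0.0167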
BP_improve.py
# -*- coding: utf-8 -*-
'''
preparation of data
'''
import pandas as pd
import matplotlib.pyplot as plt
# online loading
from urllib.request import urlopen
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
raw_data = urlopen(url) # download the file
attr = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
dataset = pd.read_csv(raw_data, delimiter=",", header=None, names=attr)
#visualization of data
# import seaborn as sns
# sns.pairplot(dataset, hue='species', vars = ['sepal_length','petal_length'],diag_kind = None)
# plt.show()
# generation of input, output, label
# input variables (assignment directly)
X = dataset.iloc[:, :4].values
#print(X)
# label (generated after converting the output column to a categorical variable)
dataset['species'] = dataset['species'].astype('category')
label = dataset['species'].cat.categories
# output 1 (string categories mapped to the integer codes 0, 1, 2 - three classes)
y = dataset['species'].cat.codes.values
# output 2 (generated by one-hot encoding)
Y = pd.get_dummies(dataset['species']).values
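# quick sanity check (Iris has 150 samples, 4 features, 3 classes):
# print(X.shape, y.shape, Y.shape)  # expected: (150, 4) (150,) (150, 3)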
'''
split of train set and test set (using sklearn function)
'''
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y, train_Y, test_Y = train_test_split(X, y, Y, test_size=0.5, random_state=42)
'''
construction of BP network
'''
from BP_network import *
bpn1 = BP_network()  # instantiate a BP network
bpn1.CreateNN(4, 5, 3, actfun='Sigmoid', learningrate=0.05) # build the network
'''
experiment of fixed learning rate
'''
# parameter training with the fixed learning rate set above
e = []
for i in range(1000):
err, err_k = bpn1.TrainStandard(train_X, train_Y)
e.append(err)
# draw the convergence curve of output error by each step of iteration
import matplotlib.pyplot as plt
f1 = plt.figure(1)
plt.xlabel("epochs")
plt.ylabel("error")
plt.ylim(0, 1)
plt.title("training error convergence curve with fixed learning rate")
plt.title("training error convergence curven learning rate = 0.05")
plt.plot(e)
plt.show()
# get the test error in test set
pred = bpn1.PredLabel(test_X)
count = 0
for i in range(len(test_y)):
if pred[i] == test_y[i]: count += 1
test_err = 1 - count / len(test_y)
print("固定学习率的错误率: %.3f" % test_err)
'''
experiment of dynamic learning rate
'''
bpn2 = BP_network()  # instantiate a BP network
bpn2.CreateNN(4, 5, 3, actfun='Sigmoid', learningrate=0.05) # build the network
# parameter training with the dynamic learning rate
e = []
for i in range(1000):
err, err_k = bpn2.TrainStandard_Dynamic_Lr(train_X, train_Y)
e.append(err)
# draw the convergence curve of output error by each step of iteration
# import matplotlib.pyplot as plt
f2 = plt.figure(2)
plt.xlabel("epochs")
plt.ylabel("error")
plt.ylim(0, 1)
plt.title("training error convergence curve with dynamic learning rate")
plt.plot(e)
plt.show()
# get the test error in test set
pred = bpn2.PredLabel(test_X)
count = 0
for i in range(len(test_y)):
if pred[i] == test_y[i]: count += 1
test_err = 1 - count / len(test_y)
print("动态调整学习率的错误率: %.3f" % test_err)
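Finally, the two convergence curves can be overlaid on one set of axes. This is a small optional sketch, assuming the fixed-rate curve was kept as e_fixed after the first experiment above:
f3 = plt.figure(3)
plt.xlabel("epochs")
plt.ylabel("error")
plt.ylim(0, 1)
plt.title("fixed vs. dynamic learning rate")
plt.plot(e_fixed, label="fixed lr = 0.05")
plt.plot(e, label="dynamic lr")
plt.legend()
plt.show()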