本文将利用多层全连接网络的梯度推导结果,直接利用Python循环计算每一层的梯度,并按照梯度下降算法手动更新网络参数。本文不使用TensorFlow等框架的自动求导功能,而是选择没有自动求导功能的Numpy实现网络,并利用Numpy手动计算激活函数和梯度、手动更新网络参数。梯度传播公式针对的是多(4)层全连接层、可以选择使用Sigmoid、ReLU、leakyrelu、tanh等多种激活函数,并且损失函数为均方误差函数的网络类型。
本文重在体现纯Python的实现方法,因此也没有采用Softmax函数对网络输出概率值之和进行约束,而是直接利用均方误差函数计算网络输出与One-hot编码的真实标签之间的误差,这些设计都是为了能直接利用梯度传播公式。对于其他类型的网络,例如损失函数采用交叉熵的网络,需要重新推导梯度传播表达式,但是方法是一样的。正是因为手动推导梯度的方法局限性较大,在实践中采用极少,更多的是利用自动求导工具计算。本分类方法的源代码如下:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns #要注意的是一旦导入了seaborn, matplotlib 的默认作图风格就会被改成seaborn的格式
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
# Dataset configuration.
N_SAMPLES = 2000  # number of sampled points
TEST_SIZE = 0.3  # fraction of the samples held out for testing

# Generate the (moon-shaped) dataset directly with the sklearn helper.
X, y = make_moons(n_samples=N_SAMPLES, noise=0.2, random_state=100)

# Split the 2000 points 7:3 into a training set and a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=42
)

# Print the shapes of the full, training and test sets, one line per set.
for _features, _labels in ((X, y), (X_train, y_train), (X_test, y_test)):
    print(_features.shape, _labels.shape)
# Plot the dataset distribution; X holds 2-D coordinates, y the point labels.
def make_plot(X, y, plot_name, file_name=None, XX=None, YY=None, preds=None, dark=False):
    """Draw a scatter plot of a labelled 2-D dataset and save it to disk.

    When ``XX``, ``YY`` and ``preds`` are all given, a filled contour map of
    the predictions plus the 0.5 level line is drawn underneath the points.

    Args:
        X: array indexed as ``X[:, 0]`` / ``X[:, 1]`` for the two plotted axes.
        y: labels used to colour the points (flattened with ``ravel``).
        plot_name: figure title.
        file_name: path the figure is saved to. Defaults to 'dataset.svg'
            when omitted, which is where the figure was always written before.
        XX: optional grid of x coordinates for the contour background.
        YY: optional grid of y coordinates for the contour background.
        preds: optional predictions; reshaped to ``XX.shape`` before plotting.
        dark: if true use matplotlib's 'dark_background' style, otherwise
            seaborn's 'whitegrid' style.
    """
    if dark:
        plt.style.use('dark_background')
    else:
        sns.set_style("whitegrid")

    plt.figure(figsize=(16, 12))
    axes = plt.gca()
    # The axis labels are mathtext; both must be closed with a trailing '$'.
    axes.set(xlabel="$x_1$", ylabel="$x_2$")
    plt.title(plot_name, fontsize=30)
    plt.subplots_adjust(left=0.20)
    plt.subplots_adjust(right=0.80)

    if XX is not None and YY is not None and preds is not None:
        grid_preds = preds.reshape(XX.shape)
        # Filled background of the predictions, then the 0.5 level contour.
        plt.contourf(XX, YY, grid_preds, 25, alpha=1, cmap=plt.cm.Spectral)
        plt.contour(XX, YY, grid_preds, levels=[.5], cmap="Greys", vmin=0, vmax=.6)

    # Scatter the data points, coloured by label.
    plt.scatter(X[:, 0], X[:, 1], c=y.ravel(), s=40, cmap=plt.cm.Spectral, edgecolors='none')

    # Honour the caller's file name; fall back to the historical default.
    plt.savefig(file_name if file_name is not None else 'dataset.svg')
    plt.show()
    plt.close()
# Draw the distribution of the generated data: X holds the 2-D coordinates,
# y the labels.
make_plot(X, y, plot_name="Classification Dataset Visualization")

# Hyper-parameter of the custom leakyrelu activation: the factor applied to
# negative inputs (and used as the derivative for non-positive values).
p = 0.18
class Layer:
    """Fully connected layer: ``activation(x @ weights + bias)``.

    Attributes:
        weights: weight array; when generated internally its shape is
            ``(n_input, n_neurons)``.
        bias: bias array; when generated internally its shape is
            ``(n_neurons,)``.
        activation: ``None``, 'relu', 'tanh', 'sigmoid' or 'leakyrelu'.
            Any other value is treated as the identity (no activation).
        last_activation: output of the most recent ``activate`` call.
        error, delta: scratch slots initialised to ``None``; they are filled
            in from outside the layer during backpropagation.
    """

    def __init__(self, n_input, n_neurons, activation=None, weights=None, bias=None):
        """Create the layer.

        Args:
            n_input: number of input nodes.
            n_neurons: number of output nodes.
            activation: name of the activation function (see class docs).
            weights: optional pre-built weight array; generated if ``None``.
            bias: optional pre-built bias array; generated if ``None``.
        """
        # Initial parameters are drawn from a normal distribution unless the
        # caller supplies them (weights are drawn before the bias).
        if weights is None:
            weights = np.random.randn(n_input, n_neurons) * np.sqrt(1 / n_neurons)
        if bias is None:
            bias = np.random.randn(n_neurons) * 0.1
        self.weights = weights
        self.bias = bias
        self.activation = activation
        self.last_activation = None
        self.error = None
        self.delta = None

    def activate(self, x):
        """Forward pass: compute, cache and return the layer output for ``x``."""
        r = np.dot(x, self.weights) + self.bias  # X @ W + b
        self.last_activation = self._apply_activation(r)
        return self.last_activation

    def _apply_activation(self, r):
        """Apply the configured activation function to the pre-activation ``r``."""
        if self.activation == 'relu':
            return np.maximum(r, 0)
        if self.activation == 'tanh':
            return np.tanh(r)
        if self.activation == 'sigmoid':
            return 1 / (1 + np.exp(-r))
        if self.activation == 'leakyrelu':
            # Custom leaky ReLU; `p` is the module-level slope hyper-parameter.
            return np.maximum(r, 0) + p * np.minimum(r, 0)
        # None or an unrecognised name: identity.
        return r

    def apply_activation_derivation(self, r):
        """Return the activation derivative evaluated from the layer OUTPUT.

        ``r`` is the value produced by the activation (what ``activate``
        returns), which is why the tanh and sigmoid derivatives are written
        in terms of ``r`` itself.
        """
        if self.activation == 'relu':
            # np.where builds a fresh float array, so the result does not
            # inherit an integer dtype from `r`.
            return np.where(np.asarray(r) > 0, 1.0, 0.0)
        if self.activation == 'tanh':
            return 1 - r ** 2
        if self.activation == 'sigmoid':
            return r * (1 - r)
        if self.activation == 'leakyrelu':
            # Slope 1 for positive values, module-level slope `p` otherwise.
            return np.where(np.asarray(r) > 0, 1.0, p)
        # Identity activation (None or unrecognised name): derivative is 1
        # everywhere, matching the identity fallback of the forward pass.
        return np.ones_like(r)
# Neural network model
class NeuralNetwork:
    """Sequential stack of layers trained with hand-written backpropagation.

    Layers are expected to expose ``weights``, ``bias``, ``last_activation``,
    ``activate(x)`` and ``apply_activation_derivation(r)``; the network stores
    the per-layer ``error`` and ``delta`` on the layer objects.
    """

    def __init__(self):
        self._layers = []  # layer objects, in forward order

    def add_layer(self, layer):
        """Append ``layer`` to the end of the network."""
        self._layers.append(layer)

    def feed_forward(self, X):
        """Forward pass: feed ``X`` through every layer in order."""
        for layer in self._layers:
            X = layer.activate(X)
        return X

    def backpropagation(self, X, y, learning_rate):
        """Run one gradient-descent step for a single sample.

        Args:
            X: input features of one sample (1-D).
            y: target vector for that sample (same length as the output).
            learning_rate: step size applied to the weight and bias updates.
        """
        output = self.feed_forward(X)
        last_index = len(self._layers) - 1

        # Walk the layers from the last one backwards, computing each layer's
        # delta from the error formula and storing it on the layer.
        for i in reversed(range(len(self._layers))):
            layer = self._layers[i]
            if i == last_index:  # output layer (identified by position)
                layer.error = y - output
                layer.delta = layer.error * layer.apply_activation_derivation(output)
            else:  # hidden layer: pull the error back through the next layer
                next_layer = self._layers[i + 1]
                layer.error = np.dot(next_layer.weights, next_layer.delta)
                layer.delta = layer.error * layer.apply_activation_derivation(layer.last_activation)

        # Parameter update.
        for i, layer in enumerate(self._layers):
            # o_i is the output of the previous layer (the sample itself for
            # the first layer), viewed as a 2-D row so that .T is a column.
            o_i = np.atleast_2d(X if i == 0 else self._layers[i - 1].last_activation)
            # `delta` as computed here is the NEGATIVE gradient direction
            # (error = y - output), hence the plus sign in both updates.
            layer.weights += layer.delta * o_i.T * learning_rate
            # The bias gradient is delta itself (its input is the constant 1).
            layer.bias = layer.bias + layer.delta * learning_rate

    def train(self, X_train, X_test, y_train, y_test, learning_rate, max_epochs):
        """Train sample by sample and record metrics every 10 epochs.

        Args:
            X_train, y_train: training samples and their integer class labels.
            X_test, y_test: held-out samples and labels used for accuracy.
            learning_rate: step size passed to ``backpropagation``.
            max_epochs: number of passes over the training set.

        Returns:
            ``(mses, accuracy)``: training MSE values and test accuracies in
            percent, one entry per recorded epoch (epochs 0, 10, 20, ...).
        """
        # One-hot encode the labels. The width follows the output layer so it
        # is not tied to two classes (falls back to 2 for an empty network).
        labels = np.asarray(y_train).astype(int).ravel()
        n_classes = np.shape(self._layers[-1].weights)[1] if self._layers else 2
        y_onehot = np.zeros((labels.shape[0], n_classes))
        y_onehot[np.arange(labels.shape[0]), labels] = 1

        mses = []
        accuracies = []
        for epoch in range(max_epochs):
            for j in range(len(X_train)):  # one sample per update step
                self.backpropagation(X_train[j], y_onehot[j], learning_rate)
            if epoch % 10 == 0:
                mse = np.mean(np.square(y_onehot - self.feed_forward(X_train)))
                mses.append(mse)
                print('Epoch: #%s, MSE: %f' % (epoch, float(mse)))
                acc = self.accuracy(self.predict(X_test), y_test.flatten())
                print('Accuracy: %.2f%%' % (acc * 100))
                accuracies.append(acc * 100)
        return mses, accuracies

    def accuracy(self, y_output, y_test):
        """Fraction of rows whose arg-max output equals the label in ``y_test``."""
        return np.mean((np.argmax(y_output, axis=1) == y_test))

    def predict(self, X_test):
        """Return the raw network outputs for ``X_test``."""
        return self.feed_forward(X_test)
# Build, train and evaluate the 4-layer fully connected network.
nn = NeuralNetwork()  # instantiate the network
nn.add_layer(Layer(2, 25, 'leakyrelu'))   # 2 --> 25
nn.add_layer(Layer(25, 50, 'leakyrelu'))  # 25 --> 50
nn.add_layer(Layer(50, 25, 'leakyrelu'))  # 50 --> 25
nn.add_layer(Layer(25, 2, 'leakyrelu'))   # 25 --> 2
learning_rate = 0.01
max_epochs = 1000
mses, accuracy = nn.train(X_train, X_test, y_train, y_test, learning_rate, max_epochs)

# train() records one MSE / accuracy value every 10 epochs (epochs 0, 10,
# 20, ...), so rebuild the real epoch numbers for the x-axis; plotting
# against the list index would make the 'Epoch' axis off by a factor of 10.
LOG_INTERVAL = 10

# Plot the loss curve.
plt.figure()
plt.plot([LOG_INTERVAL * k for k in range(len(mses))], mses, 'b', label='MSE Loss')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.legend()
plt.savefig('exam5.2 MSE Loss.png')
plt.show()

# Plot the accuracy curve (values are percentages).
plt.figure()
plt.plot([LOG_INTERVAL * k for k in range(len(accuracy))], accuracy, 'r', label='Accuracy rate')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.savefig('exam5.2 Accuracy.png')
plt.show()
运行结果(以激活函数leakyrelu,超参数p=0.18为例):