0%

DNN

课程笔记:深度神经网络。

在非线性问题中,原始特征的数目增加往往会导致实际参数数目爆炸式增长(如果我们要拟合二次曲线,那么是$O(N^2)$,如果是三次,那么是$O(N^3)$),这会对线性回归或逻辑斯蒂回归很不利。

0. 约定

  • $a_i^{(l)}$: the ith activation value in layer l
  • $\theta_{ba}^{(l)}$: the parameter of the edge between $a_a^{(l)}$ and $a_b^{(l+1)}$
  • $a^{(l)}=g(z^{(l)})=g(\theta^{(l-1)}\cdot a^{(l-1)})$

1. Cost function

$J(\theta)$的求法和Logistic Regression基本相同,注意one-versus-all需要将K个Cost求和,同时因为存在多层参数,正则化也需要将所有的参数求和.

2. Activation Function

激活函数有很多种,除了sigmoid,还有ReLU、Tanh等,这些激活函数都是非线性的。这是为了给线性的矩阵乘法引入非线性,提升模型的表达能力。

  • sigmoid将值映射到0-1区间,往往可以表达强度或者概率。
  • ReLU简单,计算量少;负输入直接输出0,使激活变得稀疏(效果上有些类似DropOut)。
  • Tanh经过原点
  • softmax一般用于多分类预测,并且配合CrossEntropy会更容易优化。

3. Back propagation

算法的关键在于求$\delta^{(l)}=\frac{\partial J(\theta)}{\partial z^{(l)}}$,我们可以形象地理解为其对最终结果的偏差所需要负的责任

  1. 依据$J(\theta)$,我们其实可以用微积分的知识很方便地推出$\delta^{(L)}=a^{(L)}-y$,下面都用一个训练样例来说明,因为训练样例的上标容易和层数的上标混淆
  2. 利用微分的链式法则,我们可以方便地求出每一层的

$$
\delta^{(l)}=\frac{\partial J(\theta)}{\partial z^{(L)}}\cdots\frac{\partial z^{(l+1)}}{\partial z^{(l)}}=\delta^{(l+1)}\frac{\partial z^{(l+1)}}{\partial z^{(l)}}=\left((\theta^{(l)})^T\delta^{(l+1)}\right)\odot a^{(l)}\odot(1-a^{(l)})
$$

  3. 求出梯度下降所需要的梯度

$$
\frac{\partial J(\theta)}{\partial \theta^{(l)}}=\frac{\partial J(\theta)}{\partial z^{(l+1)}} \frac{\partial z^{(l+1)}}{\partial \theta^{(l)}}=\delta^{(l+1)}(a^{(l)})^T
$$

4. Gradient checking

一个简单高效的检测back propagation是否有错的办法,是用微积分极限的思想求出梯度的近似值:

$$
\frac{\partial J(\theta)}{\partial \theta_i}\approx\frac{J(\dots,\theta_i+\epsilon,\dots)-J(\dots,\theta_i-\epsilon,\dots)}{2\epsilon}
$$

如果在多次反向传播中,近似的梯度和反向传播计算出的梯度都非常接近,那么说明反向传播的实现没有问题。

5. Random initialization

如果用0初始化参数(例如np.zeros)可能会造成非常糟糕的结果。在数学上可以证明:初始$\theta=0$=>hidden layer的$\delta$和a层内是相同的,单个内部节点到下一层的所有参数都相等。最终的结果是hidden node实际上在计算相同的特征

6. Lab

注:Ng的训练集中0表示为10,在其给出的参考参数中,预测0也是最后一个预测器值最大。

6.1 Logistic regression

Cost function曲线。迭代1000次,学习率0.01,耗时数分钟。最后用训练之后的参数预测原有数据,准确率约为85.4%,尝试过更多次数的迭代,准确率没有明显提升。

6.2 Fully connected neural network

Accuracy曲线。迭代2000次,学习率3,耗时数分钟。训练数据大小4000,测试数据大小1000,无重合。测试准确率最后稳定在91%,尝试过更多次数的迭代,准确率没有明显提升。(如果测试数据和训练数据是同一个集合,那么准确率会偏高)

一件很尴尬的事:在学习率为0.01时nn学习速度很慢,我TM还以为是有bug调了好久。以后切记超参数先随便试试再debug。

6.3 Code

逻辑斯蒂的代码针对之前代码进行了少许修改,下面是最终版本

logistic.py
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat


class Linear:
    """One-vs-all logistic regression trained with batch gradient descent.

    Data is stored column-per-sample: ``x`` has shape ``(nin, m)`` and ``y``
    has shape ``(nout, m)``. ``theta`` has shape ``(nout, nin + 1)``; column 0
    holds the bias weights, which are never regularised.
    """

    @staticmethod
    def sigmoid(z):
        """Return the element-wise logistic function of ``z``."""
        return 1 / (1 + np.exp(-z))

    def __init__(self, nin, nout):
        """Create an untrained model with ``nin`` inputs and ``nout`` outputs."""
        self.nin = nin
        self.nout = nout
        self.theta = np.zeros((nout, nin + 1))
        self.x = np.array([])
        self.nx = 0  # number of training samples
        self.y = np.array([])
        self.mu = []  # per-feature mean, set by feature normalisation
        self.s = []  # per-feature range, set by feature normalisation
        self.reg_param = 0
        self.feature_normed = False

    def set_data(self, training_data, do_feature_norm=True, reg_param=0):
        """Attach the training set.

        :param training_data: pair ``(x, y)`` with ``x`` of shape ``(nin, m)``
            and ``y`` of shape ``(nout, m)``
        :param do_feature_norm: normalise every feature to zero mean and unit
            range; the same transform is then applied inside ``predict``
        :param reg_param: L2 regularisation strength
        """
        self.x = np.asarray(training_data[0])
        self.nx = self.x.shape[1]
        self.y = np.asarray(training_data[1])
        if do_feature_norm:
            self.__feature_norm()
        self.feature_normed = do_feature_norm
        self.reg_param = reg_param

    def learn_debug(self, iteration=1500, lr=0.01):
        """Run gradient descent lazily, yielding the cost along the way.

        Yields the cost before each of the ``iteration`` updates and once
        more after the last one (``iteration + 1`` values in total).
        """
        for _ in range(iteration):
            yield self.j_theta()
            self.__step(lr)
        yield self.j_theta()

    def j_theta(self):
        """Return the regularised cross-entropy cost on the training set."""
        h_theta = self.__forward(self.x)
        data_loss = -(self.y * np.log(h_theta)
                      + (1 - self.y) * np.log(1 - h_theta)).sum() / self.nx
        # Skip column 0: bias weights are not penalised.
        penalty = self.reg_param * np.square(self.theta[:, 1:]).sum() / 2 / self.nx
        return data_loss + penalty

    def predict(self, x):
        """Return the sigmoid outputs for ``x`` (shape ``(nin, k)`` or ``(nin,)``)."""
        x = np.asarray(x)
        if self.feature_normed:
            mu, spread = self.mu, self.s
            if x.ndim == 1:
                # A single sample: flatten the column statistics to match.
                mu, spread = mu.ravel(), spread.ravel()
            x = (x - mu) / spread
        return self.__forward(x)

    def test_accuracy(self, test_data):
        """Return the fraction of samples whose arg-max prediction is correct.

        :param test_data: pair ``(x, y)`` laid out like the training data
        :raises ZeroDivisionError: if ``test_data`` contains no samples
        """
        predicted = np.argmax(self.predict(test_data[0]), axis=0)
        actual = np.argmax(test_data[1], axis=0)
        return float(np.count_nonzero(predicted == actual)) / predicted.size

    def __feature_norm(self):
        # Statistics are taken per feature (across samples, axis=1) so the
        # same transform can be applied to a batch of any size in predict().
        self.mu = self.x.mean(axis=1, keepdims=True)
        spread = (self.x.max(axis=1, keepdims=True)
                  - self.x.min(axis=1, keepdims=True))
        # A constant feature has zero range; leave it centred, not divided by 0.
        spread[spread == 0] = 1
        self.s = spread
        # Build a new array so the caller's data is not modified in place.
        self.x = (self.x - self.mu) / self.s

    def __forward(self, x):
        # Prepend the constant-1 bias input as row 0.
        return Linear.sigmoid(self.theta.dot(np.insert(x, 0, 1, axis=0)))

    def __step(self, lr):
        # Evaluate the gradient at the current theta, before any decay.
        x_aug = np.insert(self.x, 0, 1, axis=0)
        residual = Linear.sigmoid(self.theta.dot(x_aug)) - self.y
        grad = residual.dot(x_aug.transpose()) / self.nx
        # Weight decay for the L2 penalty; the bias column is exempt.
        updated = self.theta.copy()
        updated[:, 1:] *= 1 - lr * self.reg_param / self.nx
        self.theta = updated - lr * grad


if __name__ == "__main__":
    # Train the one-vs-all logistic classifier on the digit data set and plot
    # the training accuracy after every gradient step.
    data = loadmat('ex3data1.mat')

    # One-hot encode the 1-based labels (the notes above say digit 0 is
    # stored as label 10, so it lands in the last column).
    labels = np.rint(data['y'].ravel()).astype(int) - 1
    y = np.zeros((len(labels), 10), dtype=int)
    y[np.arange(len(labels)), labels] = 1

    # Linear expects one sample per column.
    training_data = (data['X'].transpose(), y.transpose())
    s = Linear(400, 10)
    s.set_data(training_data, do_feature_norm=False)
    iteration = 100
    lr = 0.01

    # Pass the hyper-parameters explicitly so the run matches the plot title;
    # without them learn_debug falls back to its own defaults.
    accuracy = [s.test_accuracy(training_data) * 100
                for _ in s.learn_debug(iteration, lr)]

    x = np.linspace(0, len(accuracy), len(accuracy))
    plt.scatter(x, accuracy)
    plt.title("iteration=%d, lr=%f, final=%f" % (iteration, lr, accuracy[-1]))
    plt.show()

下面是神经网络的代码

dnn.py
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat


class NeuralNetwork:
    """Fully connected sigmoid network trained with batch gradient descent.

    Public methods take data row-per-sample: ``x`` of shape ``(m, n_in)`` and
    one-hot ``y`` of shape ``(m, n_out)``. ``thetas[l]`` maps layer ``l`` to
    layer ``l + 1`` and has shape ``(shape[l + 1], shape[l] + 1)``; column 0
    holds the bias weights, which are never regularised.
    """

    @staticmethod
    def sigmoid(z):
        """Return the element-wise logistic function of ``z``."""
        return 1 / (1 + np.exp(-z))

    def __init__(self, shape):
        """
        :param shape: layer sizes, input layer first, output layer last
        """
        self.shape = shape
        self.thetas = []

    def predict(self, x):
        """Return the output activations, shape ``(n_out, m)``, for ``x``."""
        activation = x.transpose()
        for theta in self.thetas:
            activation = NeuralNetwork.sigmoid(
                theta.dot(np.insert(activation, 0, 1, axis=0)))
        return activation

    def test_accuracy(self, data):
        """Return ``(hits, total)`` for arg-max predictions on ``data``."""
        predicted = np.argmax(self.predict(data[0]), axis=0)
        actual = np.argmax(data[1].transpose(), axis=0)
        return int(np.count_nonzero(predicted == actual)), int(predicted.size)

    def j_theta(self, data, reg_param=0):
        """Return the regularised cross-entropy cost of the network on ``data``."""
        y = data[1].transpose()
        m = len(data[0])
        h = self.predict(data[0])
        cost = -(y * np.log(h) + (1 - y) * np.log(1 - h)).sum() / m
        for theta in self.thetas:
            # Skip column 0: bias weights are not penalised.
            cost = cost + reg_param * np.square(theta[:, 1:]).sum() / 2 / m
        return cost

    def _gradient_errors(self, data, reg_param, res, m, epsilon=1e-4):
        """Return numeric-minus-analytic gradient differences, one per weight.

        :param data: pair ``(x, y)``, row-per-sample
        :param reg_param: L2 regularisation strength used by ``j_theta``
        :param res: per-layer gradient sums as returned by ``_back_prop``
            (not yet divided by ``m`` and without the regularisation term)
        :param m: number of samples in ``data``
        :param epsilon: half-width of the central difference
        """
        errors = []
        for layer, theta in enumerate(self.thetas):
            for r in range(theta.shape[0]):
                for c in range(theta.shape[1]):
                    original = theta[r, c]
                    theta[r, c] = original + epsilon
                    cost_plus = self.j_theta(data, reg_param)
                    theta[r, c] = original - epsilon
                    cost_minus = self.j_theta(data, reg_param)
                    # Restore exactly, avoiding accumulated rounding drift.
                    theta[r, c] = original
                    numeric = (cost_plus - cost_minus) / (2 * epsilon)
                    analytic = res[layer][r][c] / m
                    if c > 0:
                        # Regularisation gradient; the bias column has none.
                        analytic += reg_param * original / m
                    errors.append(numeric - analytic)
        return errors

    def _gradient_checking(self, data, reg_param, res, m):
        """Scatter-plot the gradient differences; all points should be near 0."""
        ys = self._gradient_errors(data, reg_param, res, m)
        xs = np.linspace(0, len(ys), len(ys))
        plt.scatter(xs, ys)
        plt.show()

    def train(self, training_data, test_data=None, iteration=1500, lr=0.01, reg_param=0):
        """Randomly initialise the weights and run batch gradient descent.

        :param training_data: pair ``(x, y)``, row-per-sample
        :param test_data: optional pair used to measure accuracy; the
            training set is used when omitted
        :param iteration: number of gradient steps
        :param lr: learning rate
        :param reg_param: L2 regularisation strength
        :return: list with one ``(hits, total)`` tuple per iteration
        """
        self.thetas = [np.random.randn(n_out, n_in + 1)
                       for n_in, n_out in zip(self.shape[:-1], self.shape[1:])]
        history = []
        m = len(training_data[0])
        x = training_data[0].transpose()
        y = training_data[1].transpose()
        eval_data = test_data if test_data is not None else training_data
        step = lr / m
        shrink = 1 - step * reg_param
        for _ in range(iteration):
            delta_theta = self._back_prop(x, y)
            # To debug back-propagation, call
            # self._gradient_checking(training_data, reg_param, delta_theta, m)
            # here, before the weights are updated.
            for layer, grad in enumerate(delta_theta):
                updated = self.thetas[layer].copy()
                # Weight decay matches j_theta: the bias column is exempt.
                updated[:, 1:] *= shrink
                self.thetas[layer] = updated - step * grad
            history.append(self.test_accuracy(eval_data))
        return history

    def _back_prop(self, x, y):
        """Return the per-layer gradient sums of the unregularised cost.

        :param x: inputs, transposed to ``(n_in, m)``, bias row not inserted
        :param y: one-hot targets, transposed to ``(n_out, m)``
        :return: list of arrays shaped like ``self.thetas``, summed over the
            samples (not divided by ``m``)
        """
        # Forward pass, keeping every layer's activations.
        activations = [x]
        for theta in self.thetas:
            activations.append(NeuralNetwork.sigmoid(
                theta.dot(np.insert(activations[-1], 0, 1, axis=0))))

        # Backward pass, from the output layer towards the input.
        delta_theta = [None] * len(self.thetas)
        delta = activations[-1] - y
        for layer in range(len(self.thetas) - 1, -1, -1):
            a_prev = activations[layer]
            delta_theta[layer] = delta.dot(
                np.insert(a_prev, 0, 1, axis=0).transpose())
            if layer > 0:
                # Drop the bias row, then apply the sigmoid derivative. The
                # input layer needs no delta, so it is skipped.
                delta = (self.thetas[layer].transpose().dot(delta)[1:]
                         * (1 - a_prev) * a_prev)
        return delta_theta


if __name__ == "__main__":
    # Load the digit data set and one-hot encode its 1-based labels.
    mat = loadmat('ex4data1.mat')
    one_hot = np.zeros((len(mat['y']), 10), dtype=int)
    for row, label in zip(one_hot, mat['y']):
        row[round(label[0] - 1)] = 1

    # Pair every image with its label and shuffle the pairs in place.
    samples = list(zip(mat['X'], one_hot))
    np.random.shuffle(samples)

    # The first 4000 shuffled samples train the network ...
    training_data = (
        np.array([samples[k][0] for k in range(0, 4000)]),
        np.array([samples[k][1] for k in range(0, 4000)]),
    )
    # ... and the following 1000 are held out for testing.
    test_data = (
        np.array([samples[k][0] for k in range(4000, 5000)]),
        np.array([samples[k][1] for k in range(4000, 5000)]),
    )

    s = NeuralNetwork((400, 25, 10))
    iteration = 2000
    lr = 3
    accuracy_data = s.train(training_data, test_data, iteration, lr)

    # Turn the (hits, total) pairs into fractions and plot them per iteration.
    accuracy = [float(hit) / count for hit, count in accuracy_data]
    x = np.linspace(0, len(accuracy), len(accuracy))
    plt.scatter(x, accuracy)
    plt.title("iteration=%d, lr=%f, final=%f" % (iteration, lr, accuracy[-1]))
    plt.show()