DNN | Xi's Blog

课程笔记：深度神经网络。

在非线性问题中，原始特征数目的增加往往会导致实际特征（多项式项）数目爆炸式增长（如果我们要拟合二次曲线，那么是$O(N^2)$，如果是三次，那么是$O(N^3)$），这会对线性回归或逻辑斯蒂回归很不利。

0. 约定

$a_i^{(l)}$: the ith activation value in layer l
$\theta_{ba}^{(l)}$: the parameter of the edge between $a_a^{(l)}$ and $a_b^{(l+1)}$
$a^{(l)}=g(z^{(l)})=g(\theta^{(l-1)}\cdot a^{(l-1)})$

1. Cost function

$J(\theta)$的求法和Logistic Regression基本相同，注意one-versus-all需要将K个Cost求和，同时因为存在多层参数，正则化也需要将所有的参数求和.

2. Activation Function

激活函数有很多种，除了sigmoid，还有ReLU、Tanh等，这些激活函数都是非线性的。这是为了给线性的矩阵乘法引入非线性性，提升模型的表达能力。

sigmoid将值映射到0-1区间，往往可以表达强度或者概率。
ReLU简单，计算量少；负输入的输出恒为0，会使激活变得稀疏，有类似DropOut的效果。
Tanh经过原点
softmax一般用于多分类预测，并且配合CrossEntropy会更容易优化。

3. Back propagation

算法的关键在于求$\delta^{(l)}=\frac{\partial J(\theta)}{\partial z^{(l)}}$，我们可以形象地理解为其对最终结果的偏差所需要负的责任

依据$J(\theta)$，我们其实可以用微积分的知识很方便地推出$\delta^{(L)}=a^{(L)}-y$，下面都用一个训练样例来说明，因为训练样例的上标容易和层数的上标混淆
利用微分的链式法则，我们可以方便地求出每一层的$\delta^{(l)}$：

$$
\delta^{(l)}=\frac{\partial J(\theta)}{\partial z^{(L)}}\cdots\frac{\partial z^{(l+1)}}{\partial z^{(l)}}=\delta^{(l+1)}\frac{\partial z^{(l+1)}}{\partial z^{(l)}}=\left((\theta^{(l)})^T\delta^{(l+1)}\right)\odot a^{(l)}\odot(1-a^{(l)})
$$

求出梯度下降所需要的梯度

$$
\frac{\partial J(\theta)}{\partial \theta^{(l)}}=\frac{\partial J(\theta)}{\partial z^{(l+1)}} \frac{\partial z^{(l+1)}}{\partial \theta^{(l)}}=\delta^{(l+1)}(a^{(l)})^T
$$

4. Gradient checking

一个简单高效的检测back propagation是否有错的办法，是用微积分中极限的思想求出梯度的近似值：

$$
\frac{\partial J(\theta)}{\partial \theta_i}\approx\frac{J(\dots,\theta_i+\epsilon,\dots)-J(\dots,\theta_i-\epsilon,\dots)}{2\epsilon}
$$

如果在多次反向传播中，近似的梯度和反向传播计算出的梯度都非常接近，那么说明反向传播的实现没有问题。

5. Random initialization

如果用0初始化参数（例如np.zeros）可能会造成非常糟糕的结果。在数学上可以证明：初始$\theta=0$=>hidden layer的$\delta$和a层内是相同的，单个内部节点到下一层的所有参数都相等。最终的结果是hidden node实际上在计算相同的特征

6. Lab

注：Ng的训练集中0表示为10，在其给出的参考参数中，预测0也是最后一个预测器值最大。

6.1 Logistic regression

Cost function曲线。迭代1000次，学习率0.01，耗时数分钟。最后用训练之后的参数预测原有数据，准确率约为85.4%，尝试过更多次数的迭代，准确率没有明显提升。

6.2 Fully connected neural network

Accuracy曲线。迭代2000次，学习率3，耗时数分钟。训练数据大小4000，测试数据大小1000，无重合。测试准确率最后稳定在91%，尝试过更多次数的迭代，准确率没有明显提升。（如果测试数据和训练数据是同一个集合，那么准确率会偏高）

一件很尴尬的事：在学习率为0.01时nn学习速度很慢，我TM还以为是有bug调了好久。以后切记超参数先随便试试再debug。

6.3 Code

逻辑斯蒂的代码针对之前代码进行了少许修改，下面是最终版本

logistic.py

import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat


class Linear:
    """One-vs-all logistic regression trained with batch gradient descent.

    Data layout used by every method: ``x`` is ``(nin, m)`` with one column
    per sample, ``y`` is ``(nout, m)``, and ``theta`` is ``(nout, nin + 1)``
    where column 0 holds the bias weight of each output unit.
    """

    @staticmethod
    def sigmoid(z):
        """Return the element-wise logistic function ``1 / (1 + exp(-z))``."""
        return 1 / (1 + np.exp(-z))

    def __init__(self, nin, nout):
        """Create a model with ``nin`` inputs and ``nout`` outputs, all weights zero."""
        self.nin = nin
        self.nout = nout
        self.theta = np.zeros((nout, nin + 1))
        self.x = np.array([])
        self.nx = 0  # number of training samples (columns of x)
        self.y = np.array([])
        self.mu = []  # per-feature mean, set by feature normalisation
        self.s = []  # per-feature range, set by feature normalisation
        self.reg_param = 0
        self.feature_normed = False

    def set_data(self, training_data, do_feature_norm=True, reg_param=0):
        """Attach the training set and optionally mean/range-normalise it.

        :param training_data: pair ``(x, y)`` in the column-per-sample layout.
        :param do_feature_norm: normalise each feature and remember the
            statistics so that ``predict`` can apply them to new inputs.
        :param reg_param: L2 regularisation strength.
        """
        self.x = np.asarray(training_data[0])
        self.nx = self.x.shape[1]
        self.y = np.asarray(training_data[1])
        if do_feature_norm:
            self.__feature_norm()
        self.feature_normed = do_feature_norm
        self.reg_param = reg_param

    def learn_debug(self, iteration=1500, lr=0.01):
        """Run gradient descent, yielding the cost before each step and once after the last.

        :param iteration: number of gradient steps.
        :param lr: learning rate.
        """
        for _ in range(iteration):
            yield self.j_theta()
            self.__step(lr)
        yield self.j_theta()

    def j_theta(self):
        """Return the regularised cross-entropy cost on the training set."""
        h_theta = self.__forward(self.x)
        # Keep the hypothesis strictly inside (0, 1) so log() never sees 0.
        tiny = np.finfo(float).eps
        h_theta = np.clip(h_theta, tiny, 1 - tiny)
        cost = -(self.y * np.log(h_theta)
                 + (1 - self.y) * np.log(1 - h_theta)).sum() / self.nx
        # Column 0 is the bias weight and is not penalised.
        penalty = self.reg_param * np.square(self.theta[:, 1:]).sum() / 2 / self.nx
        return cost + penalty

    def predict(self, x):
        """Return the sigmoid outputs for ``x`` (same layout as the training ``x``).

        When the training data was normalised, the stored per-feature
        statistics are applied to ``x`` first.
        """
        if not self.feature_normed:
            return self.__forward(x)
        x = np.asarray(x, dtype=float)
        if x.ndim == 1:
            # A single sample given as a flat vector.
            return self.__forward((x - np.ravel(self.mu)) / np.ravel(self.s))
        return self.__forward((x - self.mu) / self.s)

    def test_accuracy(self, test_data):
        """Return the fraction of samples whose arg-max prediction matches the label.

        :param test_data: pair ``(x, y)`` in the column-per-sample layout.
        """
        predicted = np.argmax(self.predict(test_data[0]), axis=0)
        actual = np.argmax(test_data[1], axis=0)
        return float(np.count_nonzero(predicted == actual)) / len(actual)

    def __feature_norm(self):
        """Normalise every feature (row of ``x``) to zero mean and unit range.

        Works on a float copy so the caller's array is left untouched.
        """
        x = np.asarray(self.x, dtype=float)
        self.mu = x.mean(axis=1, keepdims=True)
        spread = x.max(axis=1, keepdims=True) - x.min(axis=1, keepdims=True)
        # A constant feature has zero range; divide by 1 instead of 0.
        spread[spread == 0] = 1.0
        self.s = spread
        self.x = (x - self.mu) / self.s

    def __forward(self, x):
        """Prepend the bias row of ones to ``x`` and return ``sigmoid(theta . x)``."""
        return Linear.sigmoid(self.theta.dot(np.insert(x, 0, 1, axis=0)))

    def __step(self, lr):
        """Apply one gradient-descent update with learning rate ``lr``."""
        with_bias = np.insert(self.x, 0, 1, axis=0)
        grad = (self.__forward(self.x) - self.y).dot(with_bias.transpose()) / self.nx
        # L2 term for every weight except the bias column.
        grad[:, 1:] += self.reg_param / self.nx * self.theta[:, 1:]
        self.theta = self.theta - lr * grad


def main():
    """Train one-vs-all logistic regression on ``ex3data1.mat`` and plot the accuracy.

    The training-set accuracy (in percent) is sampled before every gradient
    step and once more after the last one.
    """
    data = loadmat('ex3data1.mat')
    # Labels are 1..10; label k is stored as one-hot index k - 1.
    labels = np.rint(data['y'].ravel()).astype(int) - 1
    y = np.zeros((len(labels), 10), dtype=int)
    y[np.arange(len(labels)), labels] = 1
    training_data = (data['X'].transpose(), y.transpose())
    s = Linear(400, 10)
    s.set_data(training_data, do_feature_norm=False)
    iteration = 100
    lr = 0.01

    # Pass the settings explicitly so the run matches what the title reports.
    accuracy = [s.test_accuracy(training_data) * 100
                for _ in s.learn_debug(iteration, lr)]

    x = np.arange(len(accuracy))
    plt.scatter(x, accuracy)
    plt.title("iteration=%d, lr=%f, final=%f" % (iteration, lr, accuracy[-1]))
    plt.show()


if __name__ == "__main__":
    main()

下面是神经网络的代码

dnn.py

import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat


class NeuralNetwork:
    """Fully connected sigmoid network trained with batch gradient descent.

    ``shape`` lists the layer sizes from input to output. ``thetas[k]`` has
    shape ``(shape[k + 1], shape[k] + 1)``; column 0 holds the bias weights.
    Public methods take data as ``(x, y)`` with one row per sample; the
    private helpers work on the transposed, column-per-sample layout.
    """

    @staticmethod
    def sigmoid(z):
        """Return the element-wise logistic function ``1 / (1 + exp(-z))``."""
        return 1 / (1 + np.exp(-z))

    def __init__(self, shape):
        """Remember the layer sizes; weights are created by ``train``."""
        self.shape = shape
        self.thetas = []

    def _forward(self, x):
        """Return the activations of every layer, input included.

        :param x: transposed input (one column per sample), bias row not inserted.
        """
        activations = [x]
        for theta in self.thetas:
            with_bias = np.insert(activations[-1], 0, 1, axis=0)
            activations.append(NeuralNetwork.sigmoid(theta.dot(with_bias)))
        return activations

    def predict(self, x):
        """Return the output-layer activations for ``x`` (one row per sample).

        The result has one column per sample.
        """
        return self._forward(x.transpose())[-1]

    def test_accuracy(self, data):
        """Return ``(hit, total)``: correct arg-max predictions and sample count."""
        predicted = np.argmax(self.predict(data[0]), axis=0)
        actual = np.argmax(data[1].transpose(), axis=0)
        return int(np.count_nonzero(predicted == actual)), len(actual)

    def j_theta(self, data, reg_param=0):
        """Return the cross-entropy cost on ``data`` plus the L2 penalty.

        Bias weights (column 0 of every theta) are not penalised.
        """
        y = data[1].transpose()
        m = len(data[0])
        h = self.predict(data[0])
        # Keep the outputs strictly inside (0, 1) so log() never sees 0.
        tiny = np.finfo(float).eps
        h = np.clip(h, tiny, 1 - tiny)
        cost = -(y * np.log(h) + (1 - y) * np.log(1 - h)).sum() / m
        for theta in self.thetas:
            cost = cost + reg_param * np.square(theta[:, 1:]).sum() / 2 / m
        return cost

    def _regularized_gradient(self, delta_theta, m, reg_param):
        """Turn summed back-prop gradients into the gradient of ``j_theta``.

        :param delta_theta: per-layer gradients summed over samples, as
            returned by ``_back_prop``.
        :param m: number of samples the sums were taken over.
        :param reg_param: L2 regularisation strength.
        """
        grads = []
        for theta, summed in zip(self.thetas, delta_theta):
            grad = summed / m
            # Same convention as j_theta: the bias column gets no L2 term.
            grad[:, 1:] += reg_param / m * theta[:, 1:]
            grads.append(grad)
        return grads

    def _numerical_gradient(self, data, reg_param, epsilon=1e-4):
        """Estimate the gradient of ``j_theta`` by central differences.

        Every weight is perturbed in place and then restored to its saved
        value, so ``thetas`` is unchanged on return.
        """
        grads = []
        for theta in self.thetas:
            grad = np.zeros(theta.shape)
            for idx in np.ndindex(*theta.shape):
                saved = theta[idx]
                theta[idx] = saved + epsilon
                upper = self.j_theta(data, reg_param)
                theta[idx] = saved - epsilon
                lower = self.j_theta(data, reg_param)
                theta[idx] = saved
                grad[idx] = (upper - lower) / (2 * epsilon)
            grads.append(grad)
        return grads

    def _gradient_checking(self, data, reg_param, res, m):
        """Plot numerical minus back-prop gradient for every weight.

        All points should lie close to zero when back-propagation is correct.

        :param data: ``(x, y)`` with one row per sample.
        :param reg_param: L2 regularisation strength.
        :param res: summed gradients returned by ``_back_prop``.
        :param m: number of samples in ``data``.
        """
        numeric = self._numerical_gradient(data, reg_param)
        analytic = self._regularized_gradient(res, m, reg_param)
        ys = [diff
              for num, ana in zip(numeric, analytic)
              for diff in (num - ana).ravel()]
        xs = np.linspace(0, len(ys), len(ys))
        plt.scatter(xs, ys)
        plt.show()

    def train(self, training_data, test_data=None, iteration=1500, lr=0.01, reg_param=0):
        """Randomly initialise the weights and run batch gradient descent.

        :param training_data: ``(x, y)`` with one row per sample.
        :param test_data: optional ``(x, y)`` used to measure accuracy; the
            training data is used when omitted.
        :param iteration: number of gradient steps.
        :param lr: learning rate.
        :param reg_param: L2 regularisation strength.
        :return: list with one ``(hit, total)`` accuracy pair per iteration.
        """
        self.thetas = [np.random.randn(n_out, n_in + 1)
                       for n_in, n_out in zip(self.shape[:-1], self.shape[1:])]
        eval_data = training_data if test_data is None else test_data
        ret = []  # accuracy list
        m = len(training_data[0])
        x = training_data[0].transpose()
        y = training_data[1].transpose()
        for _ in range(iteration):
            delta_theta = self._back_prop(x, y)
            # When debugging back-prop, _gradient_checking(training_data,
            # reg_param, delta_theta, m) can be called at this point.
            grads = self._regularized_gradient(delta_theta, m, reg_param)
            self.thetas = [theta - lr * grad
                           for theta, grad in zip(self.thetas, grads)]
            ret.append(self.test_accuracy(eval_data))
        return ret

    def _back_prop(self, x, y):
        """Return the cost gradient of every theta, summed over all samples.

        :param x: transposed input (one column per sample), x_0 not inserted
        :param y: transposed targets (one column per sample)
        :return: list of arrays, one per theta and of the same shape
        """
        a = self._forward(x)
        delta_theta = [None] * len(self.thetas)
        # Output-layer error for sigmoid units with cross-entropy cost.
        delta = a[-1] - y
        for layer in range(len(self.thetas) - 1, -1, -1):
            delta_theta[layer] = delta.dot(np.insert(a[layer], 0, 1, axis=0).transpose())
            if layer > 0:
                # Propagate the error one layer back; [1:] drops the bias row.
                delta = self.thetas[layer].transpose().dot(delta)[1:] * a[layer] * (1 - a[layer])
        return delta_theta


def main():
    """Train a 400-25-10 network on ``ex4data1.mat`` and plot the test accuracy.

    The samples are shuffled and split 80/20 into disjoint training and test
    sets (4000/1000 for the 5000-sample course data set).
    """
    data = loadmat('ex4data1.mat')
    # Labels are 1..10; label k is stored as one-hot index k - 1.
    labels = np.rint(data['y'].ravel()).astype(int) - 1
    one_hot = np.zeros((len(labels), 10), dtype=int)
    one_hot[np.arange(len(labels)), labels] = 1

    # Shuffle by index so features and labels stay aligned.
    order = np.random.permutation(len(labels))
    n_train = len(labels) * 4 // 5
    train_idx, test_idx = order[:n_train], order[n_train:]
    training_data = (data['X'][train_idx], one_hot[train_idx])
    test_data = (data['X'][test_idx], one_hot[test_idx])

    s = NeuralNetwork((400, 25, 10))
    iteration = 2000
    lr = 3
    accuracy_data = s.train(training_data, test_data, iteration, lr)
    accuracy = [float(hit) / count for hit, count in accuracy_data]
    x = np.arange(len(accuracy))
    plt.scatter(x, accuracy)
    plt.title("iteration=%d, lr=%f, final=%f" % (iteration, lr, accuracy[-1]))
    plt.show()


if __name__ == "__main__":
    main()