飞道的博客

深度神经网络(Deep Learning)学习笔记

323人阅读  评论(0)

1、神经网络基本原理

前向传播(Forward-Pass)


对于一个神经元来说,它将前一层网络的所有输出分别乘上权重w1、w2……并求和,再加上偏置b(图中未展现),得到该神经元的输入值;随后通过该神经元的激活函数f(x)输出给下一层神经元,依次传递下去,直到计算出损失函数Loss为止,停止计算。

反向传播(Backward-Pass)


区别于前向传播,反向传播需要在前向传播完成之后才可以执行。反向传播的目的是更新网络中的权重和偏置,也就是w和b,从而使损失函数Loss更小,以符合我们的要求。

2、利用Python进行简单的网络模型搭建

依赖库

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston

对每一层网络创建一个类

由于每一层网络中的神经元个数未知,所以我们可以将一整层神经元当作一个对象进行类的创建,不同层的神经元之间的计算可以使用矩阵形式进行。

class Layers:
    """Base record shared by every layer of the network.

    Stores the layer's node count, display name and trainable flag, plus
    two slots filled in later by the layer subclasses: ``value`` (result
    of the forward pass) and ``gradients`` (gradients stored during the
    backward pass).
    """

    def __init__(self, nodes_num=0, name=None, is_trainable=False):
        # Slots written later by forward()/backward() of the subclasses.
        self.gradients = dict()
        self.value = None
        # Static description of the layer.
        self.is_trainable = is_trainable
        self.name = name
        self.nodes_num = nodes_num

    def __repr__(self):
        # A layer prints as its name.
        return f'{self.name}'

初始化的时候我们需要nodes_num、name、is_trainable这3个参数,分别代表该层神经元个数、该层网络的名字以及该层是否有权重w和偏置b需要训练。除此之外,还需要初始化self.value和self.gradients,分别用来储存该层网络前向计算后的输出值以及反向传播时对不同参数求得的梯度值,方便调用。

输入层(输入数据)

class Placeholder(Layers):
    """Input node that feeds raw data (samples or labels) into the network.

    It has no weights, bias or activation: ``forward`` simply exposes the
    data stored in ``self.x`` as ``self.value``.  Layers built on top of a
    placeholder append themselves to its ``outputs`` list.
    """

    def __init__(self, nodes_num=0, inputs=None, name=None, is_trainable=False):
        super().__init__(nodes_num=nodes_num, name=name, is_trainable=is_trainable)
        self.x = inputs
        # Consumers of this node; filled in by the layers that take it as input.
        self.outputs = []

    def forward(self):
        """Expose the stored input unchanged as this node's value."""
        self.value = self.x

    def backward(self):
        """Store the gradient of the loss with respect to this input.

        The gradients reported by all consumers are summed, so a
        placeholder feeding several layers receives every contribution
        instead of only the last one.  Nothing is stored when the node has
        no consumers.
        """
        if not self.outputs:
            return
        self.gradients[self] = sum(n.gradients[self] for n in self.outputs)

输入层初始化的时候,需要一个self.x和self.outputs。该层的输入为一系列的数据,故前向传播的时候不需要任何的权重、偏置以及激活函数,只需要把数据送入到下一层。此处的outputs没有进行任何操作,在后面就会发现,后一层的网络会调用前一层的outputs参数,并将后一层加入到前一层的outputs中,从而让整个神经网络在反向传播的过程中知道自己的后一层是哪个。(此处可能会有少许难理解,整个流程多走几遍就明白了)

隐藏层(激活函数)

class Sigmoid(Layers):
    """Fully connected layer followed by a sigmoid activation.

    The layer computes ``sigmoid(w_matrix . x + b)``, where ``x`` is the
    value of the input layer.  ``w_matrix`` is drawn from a normal
    distribution with shape ``(nodes_num, inputs.nodes_num)`` and ``b`` is
    one random integer bias shared by all nodes.
    """

    def __init__(self, nodes_num=0, inputs=None, name=None, is_trainable=False):
        super().__init__(nodes_num=nodes_num, name=name, is_trainable=is_trainable)
        self.x = inputs
        self.w_matrix = np.random.normal(size=[self.nodes_num, self.x.nodes_num])
        self.b = np.random.randint(0, 9)
        self.outputs = []
        # Register with the input layer so that it can find this consumer
        # during the backward pass.
        self.x.outputs.append(self)

    def x_value_before_activate(self):
        """Return the pre-activation ``z = w_matrix . x + b``."""
        return np.dot(self.w_matrix, self.x.value) + self.b

    def _sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))``."""
        return 1. / (1 + np.exp(-1 * x))

    def partial(self):
        """Return the sigmoid derivative ``s * (1 - s)`` at the pre-activation."""
        # Evaluate the sigmoid once and reuse it for both factors.
        activated = self._sigmoid(self.x_value_before_activate())
        return activated * (1 - activated)

    def forward(self):
        """Store the activated output in ``self.value``."""
        self.value = self._sigmoid(self.x_value_before_activate())

    def backward(self):
        """Store gradients for the input, the weights and the bias.

        The upstream gradient is the sum over all consumers in
        ``self.outputs`` so that fan-out is handled by the chain rule
        rather than keeping only the last consumer's contribution.
        Nothing is stored when the layer has no consumers.
        """
        if not self.outputs:
            return
        upstream = sum(n.gradients[self] for n in self.outputs)
        # Gradient of the loss with respect to the pre-activation z.
        delta = upstream * self.partial()
        # assumes self.x.value is a 1-D vector, giving a (1, n_in) row -- TODO confirm
        inputs_row = np.array([self.x.value])
        delta_column = np.transpose(np.array([delta]))
        self.gradients[self.x] = np.dot(np.transpose(self.w_matrix), delta)
        # Outer product: column of deltas times row of inputs.
        self.gradients['w_matrix'] = np.matmul(delta_column, inputs_row)
        # The bias is a shared scalar, so its gradient is the sum of deltas.
        self.gradients['b'] = np.sum(delta)


class ReLU(Layers):
    """Fully connected layer followed by a ReLU activation.

    The layer computes ``max(0, w_matrix . x + b)`` element-wise, where
    ``x`` is the value of the input layer.  ``w_matrix`` is drawn from a
    normal distribution with shape ``(nodes_num, inputs.nodes_num)`` and
    ``b`` is one random integer bias shared by all nodes.
    """

    def __init__(self, nodes_num=0, inputs=None, name=None, is_trainable=False):
        super().__init__(nodes_num=nodes_num, name=name, is_trainable=is_trainable)
        self.x = inputs
        self.w_matrix = np.random.normal(size=[self.nodes_num, self.x.nodes_num])
        self.b = np.random.randint(0, 9)
        self.outputs = []
        # Register with the input layer so that it can find this consumer
        # during the backward pass.
        self.x.outputs.append(self)

    def x_value_before_activate(self):
        """Return the pre-activation ``z = w_matrix . x + b``."""
        return np.dot(self.w_matrix, self.x.value) + self.b

    def partial(self):
        """Return the ReLU derivative: 1 where ``z > 0``, otherwise 0."""
        p_vector = self.x_value_before_activate()
        # Zero the non-positive entries first; the remaining positives become 1.
        p_vector[p_vector <= 0] = 0
        p_vector[p_vector > 0] = 1
        return p_vector

    def forward(self):
        """Store the rectified output in ``self.value``."""
        self.value = self.x_value_before_activate()
        self.value[self.value <= 0] = 0

    def backward(self):
        """Store gradients for the input, the weights and the bias.

        The upstream gradient is the sum over all consumers in
        ``self.outputs`` so that fan-out is handled by the chain rule
        rather than keeping only the last consumer's contribution.
        Nothing is stored when the layer has no consumers.
        """
        if not self.outputs:
            return
        upstream = sum(n.gradients[self] for n in self.outputs)
        # Gradient of the loss with respect to the pre-activation z.
        delta = upstream * self.partial()
        # assumes self.x.value is a 1-D vector, giving a (1, n_in) row -- TODO confirm
        inputs_row = np.array([self.x.value])
        delta_column = np.transpose(np.array([delta]))
        self.gradients[self.x] = np.dot(np.transpose(self.w_matrix), delta)
        # Outer product: column of deltas times row of inputs.
        self.gradients['w_matrix'] = np.matmul(delta_column, inputs_row)
        # The bias is a shared scalar, so its gradient is the sum of deltas.
        self.gradients['b'] = np.sum(delta)

相较于输入层,隐藏层多了权重w和偏置b。唯一的不同是:在前向计算的时候,先将上一层输出的值value(后面用x表示)经过z=wx+b,随后再将z输入至激活函数进行运算,输出value;在反向传播的时候同理,先对激活函数求导,再对z=wx+b求梯度。

self.x.outputs.append(self)这行代码的用途就是让前一层网络知道自己后一层网络是哪个,从而让反向传播连起来,上面有讲。

中间层的激活函数可以有很多种,这里只列出了Sigmoid和ReLU两种,感兴趣的可以按照模板自行添加其他新的激活函数。

输出层(损失函数)

class Mean(Layers):
    """Linear output layer combined with a mean-squared-error loss.

    The prediction is ``y_hat = w_matrix . x + b`` and the loss stored in
    ``self.value`` is ``mean((y - y_hat) ** 2)``, where ``y`` is the label
    node and ``x`` the previous layer.  Being the last layer, it keeps no
    ``outputs`` list.
    """

    def __init__(self, nodes_num=0, y=None, x=None, name=None, is_trainable=False):
        super().__init__(nodes_num=nodes_num, name=name, is_trainable=is_trainable)
        self.x = x
        self.y = y
        self.w_matrix = np.random.normal(size=[self.nodes_num, self.x.nodes_num])
        self.b = np.random.randint(0, 9)
        # Register with the previous layer so it can read this layer's
        # gradient during the backward pass.
        self.x.outputs.append(self)

    def y_hat_value(self):
        """Return the linear prediction ``w_matrix . x + b``."""
        return np.dot(self.w_matrix, self.x.value) + self.b

    def forward(self):
        """Store the mean squared error between label and prediction."""
        self.value = np.mean((self.y.value - self.y_hat_value()) ** 2)

    def backward(self):
        """Store gradients of the loss for label, input, weights and bias.

        ``forward`` averages the squared residuals, so every derivative
        carries the factor ``2 / N`` with ``N`` the number of residuals.
        For a single output (``N == 1``) this is the plain factor 2.
        """
        x = np.array([self.x.value])
        # Compute the prediction once and reuse the residual below.
        residual = self.y.value - self.y_hat_value()
        scale = 2.0 / np.size(residual)
        # d(loss)/d(y_hat); the minus sign comes from d(y - y_hat)/d(y_hat).
        before_activate = -scale * residual
        before_activate_m = np.transpose(np.array([before_activate]))
        self.gradients[self.y] = scale * residual
        self.gradients[self.x] = np.dot(np.transpose(self.w_matrix), before_activate)
        # Outer product: column of output gradients times row of inputs.
        self.gradients['w_matrix'] = np.matmul(before_activate_m, x)
        # The bias is a shared scalar, so its gradient is the summed output gradient.
        self.gradients['b'] = np.sum(before_activate)


class SoftMax(Layers):
    """Linear output layer combined with softmax and cross-entropy loss.

    The prediction is ``y_hat = softmax(w_matrix . x + b)`` and the loss
    stored in ``self.value`` is ``-dot(y, log(y_hat))``, where ``y`` is the
    label node and ``x`` the previous layer.  Being the last layer, it
    keeps no ``outputs`` list.
    """

    def __init__(self, nodes_num=0, y=None, x=None, name=None, is_trainable=False):
        super().__init__(nodes_num=nodes_num, name=name, is_trainable=is_trainable)
        self.x = x
        self.y = y
        self.w_matrix = np.random.normal(size=[self.nodes_num, self.x.nodes_num])
        self.b = np.random.randint(0, 9)
        # Register with the previous layer so it can read this layer's
        # gradient during the backward pass.
        self.x.outputs.append(self)

    def y_hat_value(self):
        """Return the softmax of the linear output ``w_matrix . x + b``."""
        logits = np.dot(self.w_matrix, self.x.value) + self.b
        # Subtracting the largest logit leaves the softmax unchanged but
        # keeps np.exp from overflowing on large values.
        exponentials = np.exp(logits - np.max(logits))
        return exponentials / np.sum(exponentials)

    def forward(self):
        """Store the cross-entropy between label and prediction."""
        self.value = - np.dot(self.y.value, np.log(self.y_hat_value()))

    def backward(self):
        """Store gradients of the loss for the input, weights and bias."""
        x = np.array([self.x.value])
        # Gradient of the cross-entropy with respect to the logits.
        before_activate = self.y_hat_value() * np.sum(self.y.value) - self.y.value
        before_activate_m = np.transpose(np.array([before_activate]))
        self.gradients[self.x] = np.dot(np.transpose(self.w_matrix), before_activate)
        # Outer product: column of logit gradients times row of inputs.
        self.gradients['w_matrix'] = np.matmul(before_activate_m, x)
        # The bias is a shared scalar, so its gradient is the summed logit gradient.
        self.gradients['b'] = np.sum(before_activate)

输出层与其他层不同的是:作为最后一层,它不需要outputs参数;同时,它的输入除了上一层神经网络的输出之外,还需要样本的标签(label),用来计算网络的损失。

同样,损失函数可以有很多种,这里只列出了均方差和Softmax两种损失函数。

优化器

def sgd(layers, learning_rate=1e-2):
    """Apply one gradient-descent step to every trainable layer.

    For each layer whose ``is_trainable`` flag is set, ``w_matrix`` and
    ``b`` are moved against the gradients stored under the keys
    ``'w_matrix'`` and ``'b'`` of ``layer.gradients``, scaled by
    ``learning_rate``.  Other layers are left untouched.
    """
    for layer in layers:
        if not layer.is_trainable:
            continue
        weights = np.asarray(layer.w_matrix)
        weight_grads = np.asarray(layer.gradients['w_matrix'])
        layer.w_matrix = weights - weight_grads * learning_rate
        layer.b -= layer.gradients['b'] * learning_rate

为了更新网络中的权重和偏置,这里采用最常见的SGD(随机梯度下降)优化器。

利用模型进行预测

def predict(node, Loss, test, order, monitor=False):
    """Run a forward-only pass on ``test`` and return the prediction.

    ``test`` is stored as the input of ``node`` and the label value of the
    loss layer is reset to 0 before ``forward_and_backward`` is run in
    predict mode.  Returns a tuple ``(largest output, index of the largest
    output)`` taken from ``Loss.y_hat_value()``.
    """
    node.x = test
    Loss.y.value = 0
    forward_and_backward(order, monitor=monitor, predict_mode=True)
    # Evaluate the output layer once and reuse it for both results.
    y_hat = Loss.y_hat_value()
    return np.max(y_hat), np.argmax(y_hat)

网络搭建

# Load the Boston housing data and keep only feature column 5 as the input.
# NOTE(review): load_boston is deprecated/removed in recent scikit-learn
# releases -- confirm the installed version still provides it.
data = load_boston()
X_, y_ = data['data'], data['target']
X_rm = X_[:, 5]

# Build the network: two placeholders (sample and label), two sigmoid
# hidden layers and a mean-squared-error output layer.
x = Placeholder(nodes_num=1, inputs=None, name='x', is_trainable=False)
y = Placeholder(nodes_num=1, inputs=None, name='y', is_trainable=False)

Layer1 = Sigmoid(nodes_num=100, inputs=x, name='Layer1', is_trainable=True)
Layer2 = Sigmoid(nodes_num=100, inputs=Layer1, name='Layer2', is_trainable=True)
Loss = Mean(nodes_num=1, y=y, x=Layer2, name='Loss', is_trainable=True)

# Execution order for the forward pass; the backward pass walks it in reverse.
order = [x, Layer1, Layer2, Loss]

# Train the model.
losses = []

EPOCHS = 100

# Number of single-sample SGD steps per epoch.
batch_size = 100

for e in range(EPOCHS):

    print('这是第{}轮'.format(e+1))

    batch_loss = 0

    for _ in range(batch_size):
        # Draw uniformly from ALL samples; the previous
        # range(len(X_rm) - 1) could never pick the last one.
        index = np.random.choice(len(X_rm))
        x.x = X_rm[index:index + 1]
        y.value = y_[index:index + 1]

        forward_and_backward(order, monitor=False)
        sgd(order, learning_rate=1e-3)

        batch_loss += Loss.value

    losses.append(batch_loss / batch_size)
    print('本轮Loss:{}'.format(batch_loss / batch_size))

# Predict over an evenly spaced grid of inputs and plot the fitted curve
# on top of the raw data.
num = np.linspace(min(X_rm), max(X_rm), 2000)
pre_num = []
for sample in num:
    predicted, _ = predict(x, Loss, [sample], order, monitor=False)
    pre_num.append(predicted)
plt.scatter(X_rm, y_)
plt.plot(num, pre_num)
plt.show()

这里使用了波士顿房价的数据集进行测试,首先我们需要两个输入层x,y,将波士顿的房价信息输入进去,接着根据需要搭建多层神经网络(层数可自行选择,每一层的神经元个数以及网络层数并非越多越好),除了输入层,其它层都具有权重和偏置,故is_trainable为True,接着我们需要将网络层连接顺序记录下来放在数组order中,以便于计算机知道不同层的运行顺序(当然你也可以稍加修改让电脑自行记录搭建顺序,不过这些都是小事)。

后面的内容就和其他神经网络框架一样,喂入数据,采用最简单的SGD优化器对模型进行训练,最终输入模型进行预测。

图中曲线即为利用训练好的模型预测出来的房价走势图。

完整代码

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston


class Layers:
    """Base record shared by every layer of the network.

    Stores the layer's node count, display name and trainable flag, plus
    two slots filled in later by the layer subclasses: ``value`` (result
    of the forward pass) and ``gradients`` (gradients stored during the
    backward pass).
    """

    def __init__(self, nodes_num=0, name=None, is_trainable=False):
        # Slots written later by forward()/backward() of the subclasses.
        self.gradients = dict()
        self.value = None
        # Static description of the layer.
        self.is_trainable = is_trainable
        self.name = name
        self.nodes_num = nodes_num

    def __repr__(self):
        # A layer prints as its name.
        return f'{self.name}'


class Placeholder(Layers):
    """Input node that feeds raw data (samples or labels) into the network.

    It has no weights, bias or activation: ``forward`` simply exposes the
    data stored in ``self.x`` as ``self.value``.  Layers built on top of a
    placeholder append themselves to its ``outputs`` list.
    """

    def __init__(self, nodes_num=0, inputs=None, name=None, is_trainable=False):
        super().__init__(nodes_num=nodes_num, name=name, is_trainable=is_trainable)
        self.x = inputs
        # Consumers of this node; filled in by the layers that take it as input.
        self.outputs = []

    def forward(self):
        """Expose the stored input unchanged as this node's value."""
        self.value = self.x

    def backward(self):
        """Store the gradient of the loss with respect to this input.

        The gradients reported by all consumers are summed, so a
        placeholder feeding several layers receives every contribution
        instead of only the last one.  Nothing is stored when the node has
        no consumers.
        """
        if not self.outputs:
            return
        self.gradients[self] = sum(n.gradients[self] for n in self.outputs)


class Sigmoid(Layers):
    """Fully connected layer followed by a sigmoid activation.

    The layer computes ``sigmoid(w_matrix . x + b)``, where ``x`` is the
    value of the input layer.  ``w_matrix`` is drawn from a normal
    distribution with shape ``(nodes_num, inputs.nodes_num)`` and ``b`` is
    one random integer bias shared by all nodes.
    """

    def __init__(self, nodes_num=0, inputs=None, name=None, is_trainable=False):
        super().__init__(nodes_num=nodes_num, name=name, is_trainable=is_trainable)
        self.x = inputs
        self.w_matrix = np.random.normal(size=[self.nodes_num, self.x.nodes_num])
        self.b = np.random.randint(0, 9)
        self.outputs = []
        # Register with the input layer so that it can find this consumer
        # during the backward pass.
        self.x.outputs.append(self)

    def x_value_before_activate(self):
        """Return the pre-activation ``z = w_matrix . x + b``."""
        return np.dot(self.w_matrix, self.x.value) + self.b

    def _sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))``."""
        return 1. / (1 + np.exp(-1 * x))

    def partial(self):
        """Return the sigmoid derivative ``s * (1 - s)`` at the pre-activation."""
        # Evaluate the sigmoid once and reuse it for both factors.
        activated = self._sigmoid(self.x_value_before_activate())
        return activated * (1 - activated)

    def forward(self):
        """Store the activated output in ``self.value``."""
        self.value = self._sigmoid(self.x_value_before_activate())

    def backward(self):
        """Store gradients for the input, the weights and the bias.

        The upstream gradient is the sum over all consumers in
        ``self.outputs`` so that fan-out is handled by the chain rule
        rather than keeping only the last consumer's contribution.
        Nothing is stored when the layer has no consumers.
        """
        if not self.outputs:
            return
        upstream = sum(n.gradients[self] for n in self.outputs)
        # Gradient of the loss with respect to the pre-activation z.
        delta = upstream * self.partial()
        # assumes self.x.value is a 1-D vector, giving a (1, n_in) row -- TODO confirm
        inputs_row = np.array([self.x.value])
        delta_column = np.transpose(np.array([delta]))
        self.gradients[self.x] = np.dot(np.transpose(self.w_matrix), delta)
        # Outer product: column of deltas times row of inputs.
        self.gradients['w_matrix'] = np.matmul(delta_column, inputs_row)
        # The bias is a shared scalar, so its gradient is the sum of deltas.
        self.gradients['b'] = np.sum(delta)


class ReLU(Layers):
    """Fully connected layer followed by a ReLU activation.

    The layer computes ``max(0, w_matrix . x + b)`` element-wise, where
    ``x`` is the value of the input layer.  ``w_matrix`` is drawn from a
    normal distribution with shape ``(nodes_num, inputs.nodes_num)`` and
    ``b`` is one random integer bias shared by all nodes.
    """

    def __init__(self, nodes_num=0, inputs=None, name=None, is_trainable=False):
        super().__init__(nodes_num=nodes_num, name=name, is_trainable=is_trainable)
        self.x = inputs
        self.w_matrix = np.random.normal(size=[self.nodes_num, self.x.nodes_num])
        self.b = np.random.randint(0, 9)
        self.outputs = []
        # Register with the input layer so that it can find this consumer
        # during the backward pass.
        self.x.outputs.append(self)

    def x_value_before_activate(self):
        """Return the pre-activation ``z = w_matrix . x + b``."""
        return np.dot(self.w_matrix, self.x.value) + self.b

    def partial(self):
        """Return the ReLU derivative: 1 where ``z > 0``, otherwise 0."""
        p_vector = self.x_value_before_activate()
        # Zero the non-positive entries first; the remaining positives become 1.
        p_vector[p_vector <= 0] = 0
        p_vector[p_vector > 0] = 1
        return p_vector

    def forward(self):
        """Store the rectified output in ``self.value``."""
        self.value = self.x_value_before_activate()
        self.value[self.value <= 0] = 0

    def backward(self):
        """Store gradients for the input, the weights and the bias.

        The upstream gradient is the sum over all consumers in
        ``self.outputs`` so that fan-out is handled by the chain rule
        rather than keeping only the last consumer's contribution.
        Nothing is stored when the layer has no consumers.
        """
        if not self.outputs:
            return
        upstream = sum(n.gradients[self] for n in self.outputs)
        # Gradient of the loss with respect to the pre-activation z.
        delta = upstream * self.partial()
        # assumes self.x.value is a 1-D vector, giving a (1, n_in) row -- TODO confirm
        inputs_row = np.array([self.x.value])
        delta_column = np.transpose(np.array([delta]))
        self.gradients[self.x] = np.dot(np.transpose(self.w_matrix), delta)
        # Outer product: column of deltas times row of inputs.
        self.gradients['w_matrix'] = np.matmul(delta_column, inputs_row)
        # The bias is a shared scalar, so its gradient is the sum of deltas.
        self.gradients['b'] = np.sum(delta)


class Mean(Layers):
    """Linear output layer combined with a mean-squared-error loss.

    The prediction is ``y_hat = w_matrix . x + b`` and the loss stored in
    ``self.value`` is ``mean((y - y_hat) ** 2)``, where ``y`` is the label
    node and ``x`` the previous layer.  Being the last layer, it keeps no
    ``outputs`` list.
    """

    def __init__(self, nodes_num=0, y=None, x=None, name=None, is_trainable=False):
        super().__init__(nodes_num=nodes_num, name=name, is_trainable=is_trainable)
        self.x = x
        self.y = y
        self.w_matrix = np.random.normal(size=[self.nodes_num, self.x.nodes_num])
        self.b = np.random.randint(0, 9)
        # Register with the previous layer so it can read this layer's
        # gradient during the backward pass.
        self.x.outputs.append(self)

    def y_hat_value(self):
        """Return the linear prediction ``w_matrix . x + b``."""
        return np.dot(self.w_matrix, self.x.value) + self.b

    def forward(self):
        """Store the mean squared error between label and prediction."""
        self.value = np.mean((self.y.value - self.y_hat_value()) ** 2)

    def backward(self):
        """Store gradients of the loss for label, input, weights and bias.

        ``forward`` averages the squared residuals, so every derivative
        carries the factor ``2 / N`` with ``N`` the number of residuals.
        For a single output (``N == 1``) this is the plain factor 2.
        """
        x = np.array([self.x.value])
        # Compute the prediction once and reuse the residual below.
        residual = self.y.value - self.y_hat_value()
        scale = 2.0 / np.size(residual)
        # d(loss)/d(y_hat); the minus sign comes from d(y - y_hat)/d(y_hat).
        before_activate = -scale * residual
        before_activate_m = np.transpose(np.array([before_activate]))
        self.gradients[self.y] = scale * residual
        self.gradients[self.x] = np.dot(np.transpose(self.w_matrix), before_activate)
        # Outer product: column of output gradients times row of inputs.
        self.gradients['w_matrix'] = np.matmul(before_activate_m, x)
        # The bias is a shared scalar, so its gradient is the summed output gradient.
        self.gradients['b'] = np.sum(before_activate)


class SoftMax(Layers):
    """Linear output layer combined with softmax and cross-entropy loss.

    The prediction is ``y_hat = softmax(w_matrix . x + b)`` and the loss
    stored in ``self.value`` is ``-dot(y, log(y_hat))``, where ``y`` is the
    label node and ``x`` the previous layer.  Being the last layer, it
    keeps no ``outputs`` list.
    """

    def __init__(self, nodes_num=0, y=None, x=None, name=None, is_trainable=False):
        super().__init__(nodes_num=nodes_num, name=name, is_trainable=is_trainable)
        self.x = x
        self.y = y
        self.w_matrix = np.random.normal(size=[self.nodes_num, self.x.nodes_num])
        self.b = np.random.randint(0, 9)
        # Register with the previous layer so it can read this layer's
        # gradient during the backward pass.
        self.x.outputs.append(self)

    def y_hat_value(self):
        """Return the softmax of the linear output ``w_matrix . x + b``."""
        logits = np.dot(self.w_matrix, self.x.value) + self.b
        # Subtracting the largest logit leaves the softmax unchanged but
        # keeps np.exp from overflowing on large values.
        exponentials = np.exp(logits - np.max(logits))
        return exponentials / np.sum(exponentials)

    def forward(self):
        """Store the cross-entropy between label and prediction."""
        self.value = - np.dot(self.y.value, np.log(self.y_hat_value()))

    def backward(self):
        """Store gradients of the loss for the input, weights and bias."""
        x = np.array([self.x.value])
        # Gradient of the cross-entropy with respect to the logits.
        before_activate = self.y_hat_value() * np.sum(self.y.value) - self.y.value
        before_activate_m = np.transpose(np.array([before_activate]))
        self.gradients[self.x] = np.dot(np.transpose(self.w_matrix), before_activate)
        # Outer product: column of logit gradients times row of inputs.
        self.gradients['w_matrix'] = np.matmul(before_activate_m, x)
        # The bias is a shared scalar, so its gradient is the summed logit gradient.
        self.gradients['b'] = np.sum(before_activate)


def sgd(layers, learning_rate=1e-2):
    """Apply one gradient-descent step to every trainable layer.

    For each layer whose ``is_trainable`` flag is set, ``w_matrix`` and
    ``b`` are moved against the gradients stored under the keys
    ``'w_matrix'`` and ``'b'`` of ``layer.gradients``, scaled by
    ``learning_rate``.  Other layers are left untouched.
    """
    for layer in layers:
        if not layer.is_trainable:
            continue
        weights = np.asarray(layer.w_matrix)
        weight_grads = np.asarray(layer.gradients['w_matrix'])
        layer.w_matrix = weights - weight_grads * learning_rate
        layer.b -= layer.gradients['b'] * learning_rate


def forward_and_backward(order, monitor=False, predict_mode=False):
    """Run the layers listed in ``order`` for training or for prediction.

    In training mode (the default) every layer's ``forward`` is called in
    the given order, then every layer's ``backward`` in reverse order.  In
    predict mode only ``forward`` is called, and the last entry of
    ``order`` is skipped.  With ``monitor`` set, each visited layer is
    printed before it runs.
    """
    if predict_mode:
        # Forward only, leaving out the final entry of the order.
        for layer in order[:-1]:
            if monitor:
                print("前向计算Node:{}".format(layer))
            layer.forward()
        return

    # One full pass: forward through all layers, then backward in reverse.
    for layer in order:
        if monitor:
            print("前向计算Node:{}".format(layer))
        layer.forward()

    for layer in reversed(order):
        if monitor:
            print("后向传播Node:{}".format(layer))
        layer.backward()


def predict(node, Loss, test, order, monitor=False):
    """Run a forward-only pass on ``test`` and return the prediction.

    ``test`` is stored as the input of ``node`` and the label value of the
    loss layer is reset to 0 before ``forward_and_backward`` is run in
    predict mode.  Returns a tuple ``(largest output, index of the largest
    output)`` taken from ``Loss.y_hat_value()``.
    """
    node.x = test
    Loss.y.value = 0
    forward_and_backward(order, monitor=monitor, predict_mode=True)
    # Evaluate the output layer once and reuse it for both results.
    y_hat = Loss.y_hat_value()
    return np.max(y_hat), np.argmax(y_hat)


# Load the Boston housing data and keep only feature column 5 as the input.
# NOTE(review): load_boston is deprecated/removed in recent scikit-learn
# releases -- confirm the installed version still provides it.
data = load_boston()
X_, y_ = data['data'], data['target']
X_rm = X_[:, 5]

# Build the network: two placeholders (sample and label), two sigmoid
# hidden layers and a mean-squared-error output layer.
x = Placeholder(nodes_num=1, inputs=None, name='x', is_trainable=False)
y = Placeholder(nodes_num=1, inputs=None, name='y', is_trainable=False)

Layer1 = Sigmoid(nodes_num=100, inputs=x, name='Layer1', is_trainable=True)
Layer2 = Sigmoid(nodes_num=100, inputs=Layer1, name='Layer2', is_trainable=True)
Loss = Mean(nodes_num=1, y=y, x=Layer2, name='Loss', is_trainable=True)

# Execution order for the forward pass; the backward pass walks it in reverse.
order = [x, Layer1, Layer2, Loss]

# Train the model.
losses = []
EPOCHS = 100

# Number of single-sample SGD steps per epoch.
batch_size = 100

for e in range(EPOCHS):

    print('这是第{}轮'.format(e+1))

    batch_loss = 0

    for _ in range(batch_size):
        # Draw uniformly from ALL samples; the previous
        # range(len(X_rm) - 1) could never pick the last one.
        index = np.random.choice(len(X_rm))
        x.x = X_rm[index:index + 1]
        y.value = y_[index:index + 1]

        forward_and_backward(order, monitor=False)
        sgd(order, learning_rate=1e-3)

        batch_loss += Loss.value

    losses.append(batch_loss / batch_size)
    print('本轮Loss:{}'.format(batch_loss / batch_size))

# Predict over an evenly spaced grid of inputs and plot the fitted curve
# on top of the raw data.
num = np.linspace(min(X_rm), max(X_rm), 2000)
pre_num = []
for sample in num:
    predicted, _ = predict(x, Loss, [sample], order, monitor=False)
    pre_num.append(predicted)
plt.scatter(X_rm, y_)
plt.plot(num, pre_num)
plt.show()


转载:https://blog.csdn.net/weixin_43130561/article/details/115532292
查看评论
* 以上用户言论只代表其个人观点,不代表本网站的观点或立场