Code implementations for 《机器学习》 (the "watermelon book").
The neural-network notation used here is not the one from the book, but a formulation I am used to (the two are equivalent anyway). I am not fond of the book's notation: it treats the threshold as a separate quantity, and the input-layer and hidden-layer parameters are also handled separately, which I find cumbersome.
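For reference, the two notations are related as follows (a sketch: the book's threshold $\theta$ is absorbed into the weight vector as a bias attached to a constant input of 1):

$$
y = f\Big(\sum_{i=1}^{d} w_i x_i - \theta\Big)
  = f\big(\tilde{\mathbf{w}}^{\top}\tilde{\mathbf{x}}\big),
\qquad
\tilde{\mathbf{x}} = [x_1,\dots,x_d,\,1]^{\top},\quad
\tilde{\mathbf{w}} = [w_1,\dots,w_d,\,-\theta]^{\top}.
$$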
import sys
sys.path.append('/home/aistudio/external-libraries')
import numpy as np
np.set_printoptions(suppress=True)  # disable scientific notation when printing arrays
import pandas as pd
import matplotlib.pyplot as plt
path = "work/西瓜数据集3.0.txt"
data = pd.read_csv(path)
data.shape
Since the discrete attributes are categorical, they need one-hot encoding.
from sklearn import preprocessing
enc = preprocessing.OneHotEncoder()
enc.fit(data.iloc[:, :6])  # the first 6 columns are the discrete attributes
a = np.array(enc.transform(data.iloc[:, :6]).toarray())
b = np.array(data.iloc[:, 6]).reshape(data.shape[0], 1)  # column 6 is kept as-is (not one-hot encoded)
X = np.c_[a, b]
enc.fit(data.iloc[:, 7:])  # the remaining column is the label, also one-hot encoded
y = np.array(enc.transform(data.iloc[:, 7:]).toarray())
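A quick sanity check of the encoded arrays (a small sketch; it only prints the shapes produced by the cells above):
print(X.shape, y.shape)  # feature matrix after encoding, and one-hot labels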
According to the rule of thumb $m = \log_2 n$ (where $m$ is the number of hidden-layer units and $n$ the number of input-layer units), the hidden layer here should have 5 units.
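As a quick check (a sketch, run after X has been built above), the rule of thumb can be evaluated directly; with roughly 18-20 input columns after one-hot encoding it should come out a little above 4, hence the choice of 5 hidden units:
print(X.shape[1], np.log2(X.shape[1]))  # input width after encoding, and its base-2 logarithm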
hidden_size = 5
input_size = X.shape[1]
num_labels = y.shape[1]
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
def separate_params(params, hidden_size, input_size, num_labels):
    # unpack the flat parameter vector into the two weight matrices:
    # theta1: (hidden_size, input_size + 1), theta2: (num_labels, hidden_size + 1)
    theta1 = np.reshape(params[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1)))
    theta2 = np.reshape(params[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1)))
    return [theta1, theta2]
def forward_propagate(params, X, hidden_size, input_size, num_labels):
    theta = separate_params(params, hidden_size, input_size, num_labels)
    n = X.shape[0]
    ones = np.ones([n, 1])
    a_0 = np.c_[X, ones]  # append a column of ones for the bias term
    z_1 = a_0 @ theta[0].T
    a_1 = np.c_[sigmoid(z_1), ones]  # same here
    z_2 = a_1 @ theta[1].T
    res = sigmoid(z_2)
    return a_0, z_1, a_1, z_2, res
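Written out, a forward pass computes (a sketch of what the function above does, with $\Theta_1$, $\Theta_2$ the two weight matrices returned by separate_params):

$$
\mathbf{a}_0 = [\mathbf{x},\,1],\qquad
\mathbf{a}_1 = [\sigma(\mathbf{a}_0 \Theta_1^{\top}),\,1],\qquad
\hat{\mathbf{y}} = \sigma(\mathbf{a}_1 \Theta_2^{\top}),
$$

where $\sigma$ is the sigmoid and the appended 1's carry the bias terms.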
def create_params(hidden_size, input_size, num_labels):
    # subtracting 0.5 makes the random initial values both positive and negative
    return (np.random.random(size=hidden_size * (input_size + 1) + num_labels * (hidden_size + 1)) - 0.5) * 0.25
params = create_params(hidden_size, input_size, num_labels)
Test it.
forward_propagate(params, X[0:1], hidden_size, input_size, num_labels)
def cost(params, X, y, hidden_size, input_size, num_labels):
    a_0, z_1, a_1, z_2, y_pre = forward_propagate(params, X, hidden_size, input_size, num_labels)
    return np.sum((y_pre - y) ** 2) / 2  # squared-error cost
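This is the usual squared-error loss, $E = \frac{1}{2}\sum_{j}(\hat{y}_j - y_j)^2$, summed over the output units (and over samples when more than one row is passed in).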
cost(params, X[0:1], y[0:1], hidden_size, input_size, num_labels)
def sigmoid_gradient(z):  # derivative of the sigmoid
    return np.multiply(sigmoid(z), (1 - sigmoid(z)))
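The formula follows from differentiating the sigmoid directly: $\sigma'(z) = \frac{e^{-z}}{(1 + e^{-z})^2} = \sigma(z)\,(1 - \sigma(z))$.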
def backprop_one(params, X, y, hidden_size, input_size, num_labels, learning_rate):
    theta = separate_params(params, hidden_size, input_size, num_labels)
    a_0, z_1, a_1, z_2, y_pre = forward_propagate(params, X, hidden_size, input_size, num_labels)
    # Append a column to z_1: delta_1 should have only hidden_size columns, but theta[1]
    # has hidden_size + 1 columns; the extra one comes from the appended all-ones column.
    z_1 = np.c_[z_1, 1]
    gradient = [None, None]
    delta_2 = (y_pre - y) * y_pre * (1 - y_pre)
    gradient[1] = delta_2.T @ a_1
    delta_1 = delta_2 @ theta[1] * sigmoid_gradient(z_1)
    gradient[0] = delta_1[:, :-1].T @ a_0  # drop the bias column again
    return np.concatenate((np.ravel(gradient[0]), np.ravel(gradient[1]))) * learning_rate
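For a single sample, the code above computes the following gradients (a sketch; up to a sign these are the book's error terms $g_j$ and $e_h$, and the function additionally multiplies the result by learning_rate):

$$
\boldsymbol{\delta}_2 = (\hat{\mathbf{y}} - \mathbf{y}) \odot \hat{\mathbf{y}} \odot (1 - \hat{\mathbf{y}}),\qquad
\boldsymbol{\delta}_1 = (\boldsymbol{\delta}_2 \Theta_2) \odot \sigma'(\mathbf{z}_1),
$$
$$
\frac{\partial E}{\partial \Theta_2} = \boldsymbol{\delta}_2^{\top}\mathbf{a}_1,\qquad
\frac{\partial E}{\partial \Theta_1} = \boldsymbol{\delta}_1^{\top}\mathbf{a}_0,
$$

where $\odot$ is the element-wise product and the bias column of $\boldsymbol{\delta}_1$ is discarded before forming $\partial E/\partial \Theta_1$.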
learning_rate = 0.1
backprop_one(params, X[0:1], y[0:1], hidden_size, input_size, num_labels, learning_rate)
def backprop_all(params, X, y, hidden_size, input_size, num_labels, learning_rate):
    n = X.shape[0]
    gradient = create_params(hidden_size, input_size, num_labels)
    gradient -= gradient  # zero vector of the right length
    for i in range(n):  # accumulate the per-sample gradients
        temp = backprop_one(params, X[i:i + 1], y[i:i + 1], hidden_size, input_size, num_labels, learning_rate)
        gradient += temp
    c = cost(params, X, y, hidden_size, input_size, num_labels)
    return c, gradient / n  # cost plus the averaged gradient, as required by jac=True below
from scipy.optimize import minimize  # optimizer
# jac=True: the objective (backprop_all) returns both the cost and a gradient
# (here scaled by learning_rate) in a single call.
fmin = minimize(fun=backprop_all, x0=params, args=(X, y, hidden_size, input_size, num_labels, learning_rate),
                method='TNC', jac=True, options={'maxiter': 250})
fmin
cost(fmin.x, X, y, hidden_size, input_size, num_labels)
forward_propagate(fmin.x, X, hidden_size, input_size, num_labels)
The results look good. As a further check, the same code is run on a larger 10-class dataset (the handwritten-digit data in ex4data1.mat).
from scipy.io import loadmat
data = loadmat('/home/aistudio/data/data20138/ex4data1.mat')
data
X = np.array(data['X'])
y_prev = data['y']
y = enc.fit_transform(data['y']).toarray()
hidden_size = 25
num_labels = 10
input_size = X.shape[1]
params = create_params(hidden_size, input_size, num_labels)
learning_rate = 0.1
fmin = minimize(fun=backprop_all, x0=params, args=(X, y, hidden_size, input_size, num_labels, learning_rate),
method='TNC', jac=True, options={'maxiter': 250}, )
fmin
cost(fmin.x, X, y, hidden_size, input_size, num_labels)
a_0, z_1, a_1, z_2, y_pre = forward_propagate(fmin.x, X, hidden_size, input_size, num_labels)
y_pred = np.array(np.argmax(y_pre, axis=1) + 1)  # the original labels run from 1 to 10, hence the +1
correct = [1 if a == b else 0 for (a, b) in zip(y_pred, y_prev)]
accuracy = sum(map(int, correct)) / float(len(correct))
print('accuracy = {0}%'.format(accuracy * 100))
The next problem to solve is XOR, so the data are first constructed by hand.
X = np.array([[1, 0], [1, 1], [0, 0], [0, 1]])
y = np.array([[1], [0], [0], [1]])
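An RBF (radial basis function) network is used here: a hidden layer of Gaussian radial basis units with fixed centres, followed by a trainable output layer. A sketch of the version implemented below (the width of each basis function is fixed to 1, and a sigmoid is applied to the output):

$$
\rho(\mathbf{x}, \mathbf{c}_i) = e^{-\lVert \mathbf{x}-\mathbf{c}_i\rVert^2},\qquad
\hat{y} = \sigma\Big(\sum_i w_i\, \rho(\mathbf{x}, \mathbf{c}_i) + b\Big),
$$

where the $\mathbf{c}_i$ are the chosen centres and only the output weights $w_i$ and bias $b$ are trained.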
def gaussian_radial_base(X, c):  # Gaussian radial basis function (width fixed to 1)
    res = np.ones([X.shape[0], c.shape[0]])
    for i in range(c.shape[0]):
        res[:, i] = np.exp(-np.sum((X - c[i]) ** 2, axis=1))
    return res
The centres are simply chosen by hand...
c = np.array([[1, 1], [0, 0]])
c
gaussian_radial_base(np.array([[1, 1], [0, 0]]), c)
Rewrite forward and backward propagation for the RBF network.
hidden_size = 2
num_labels = 1
input_size = X.shape[1]
params = (np.random.random(hidden_size + 1) - 0.5) * 0.25
learning_rate = 0.1
def forward_propagate(params, X, hidden_size, input_size, num_labels, c):
    theta = params.reshape(1, hidden_size + 1)  # only the output-layer weights are trained
    n = X.shape[0]
    a_0 = X
    z_1 = gaussian_radial_base(a_0, c)  # hidden layer: Gaussian basis expansion around the centres c
    a_1 = np.c_[z_1, np.ones([X.shape[0], 1])]  # append the bias column
    z_2 = a_1 @ theta.T
    res = sigmoid(z_2)
    return a_0, z_1, a_1, z_2, res
def backprop_one(params, X, y, hidden_size, input_size, num_labels, learning_rate, c):
    theta = params.reshape(1, hidden_size + 1)
    a_0, z_1, a_1, z_2, y_pre = forward_propagate(params, X, hidden_size, input_size, num_labels, c)
    delta_2 = (y_pre - y) * y_pre * (1 - y_pre)
    gradient = delta_2.T @ a_1  # the earlier layer has no trainable parameters
    return gradient * learning_rate
backprop_one(params, X[0:1], y[0:1], hidden_size, input_size, num_labels, learning_rate, c)
def backprop_all(params, X, y, hidden_size, input_size, num_labels, learning_rate, c):
    a_0, z_1, a_1, z_2, y_pre = forward_propagate(params, X, hidden_size, input_size, num_labels, c)
    cost = np.sum((y - y_pre) ** 2)  # squared-error cost over all samples
    gradient = 0
    for i in range(X.shape[0]):  # accumulate the per-sample gradients
        gradient += backprop_one(params, X[i:i + 1], y[i:i + 1], hidden_size, input_size, num_labels, learning_rate, c)
    return cost, gradient.ravel()
backprop_all(params, X, y, hidden_size, input_size, num_labels, learning_rate, c)
gmin = minimize(fun=backprop_all, x0=params, args=(X, y, hidden_size, input_size, num_labels, learning_rate, c),
method='TNC', jac=True, options={'maxiter': 250}, )
gmin
a_0, z_1, a_1, z_2, y_pre = forward_propagate(gmin.x, X, hidden_size, input_size, num_labels, c)
y_pre
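As a final check (a small sketch using the variables just computed), thresholding the network outputs at 0.5 should recover the XOR targets:
print((y_pre > 0.5).astype(int).ravel())  # predicted labels
print(y.ravel())                          # true XOR labels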
The outputs are essentially correct, so the XOR problem is solved.