Code implementations from 《机器学习》 (the "watermelon book").
import numpy as np
np.set_printoptions(suppress=True)  # suppress scientific notation
import pandas as pd
import matplotlib.pyplot as plt
path = 'work/西瓜数据集alpha.txt'  # load the data
data = pd.read_csv(path)
data.head()
# map '是' (yes) / '否' (no) in 'Good melon' to 1 / 0
data.loc[data['Good melon'] == '是', 'Good melon'] = 1
data.loc[data['Good melon'] == '否', 'Good melon'] = 0
data.head()
# split out X and y
X = data.iloc[:, :2]
y = data.iloc[:, 2]
print(X)
print(y)
Next, visualize the data.
yes = data[data['Good melon'].isin([1])]
no = data[data['Good melon'].isin([0])]
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(yes['Density'], yes['Sugar content'], marker='o', c='b', label='Yes')
ax.scatter(no['Density'], no['Sugar content'], marker='x', c='r', label='No')
ax.legend()
ax.set_xlabel('Density')
ax.set_ylabel('Sugar content')
plt.show()
The sigmoid function: $$\mathrm{sigmoid}(x) = \frac{1}{1 + \mathrm{e}^{-x}}$$ The log-likelihood (our loss function): $$\ell(\boldsymbol\beta) = \sum_{i = 1}^{m}\left(-y_i\boldsymbol\beta^\mathrm{T}\hat{\boldsymbol{x}}_i + \ln(1 + \mathrm{e}^{\boldsymbol\beta^\mathrm{T}\hat{\boldsymbol{x}}_i})\right)$$ Taking the partial derivatives (the derivation is just straightforward expansion, so it is omitted here): $$\begin{aligned} \frac{\partial\ell(\boldsymbol\beta)}{\partial\beta_i}&=(X^\mathrm{T})_i\left(\frac{1}{1+\mathrm{e}^{-X\boldsymbol\beta}}-\boldsymbol{y}\right)\\ &=(X^{\mathrm{T}})_i(\mathrm{sigmoid}(X\boldsymbol\beta)-\boldsymbol{y}) \end{aligned}$$ where $(X^\mathrm{T})_i$ is the $i$-th row of $X^\mathrm{T}$.
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def cost(params, X, y):
    beta = params[None].T  # reshape the 1-D parameter vector into a column vector
    res = np.multiply(-y, (X @ beta)) + np.log(1 + np.exp(X @ beta))
    return np.sum(res)

def gradient(params, X, y):
    beta = params[None].T
    grad = np.zeros([beta.shape[0], 1])  # one gradient entry per parameter
    for i in range(beta.shape[0]):
        grad[i] = X.T[i][None] @ (sigmoid(X @ beta) - y)  # (X^T)_i (sigmoid(X beta) - y)
    return grad.T[0]  # back to a 1-D array, as the optimizer expects
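One caveat: for large positive scores np.exp(X @ beta) overflows. If that ever bites, a numerically stabler variant of cost (a minimal sketch using np.logaddexp, which evaluates ln(1 + e^t) without overflow) would be:

def cost_stable(params, X, y):
    beta = params[None].T
    # np.logaddexp(0, t) == log(exp(0) + exp(t)) == log(1 + e^t), computed stably
    res = np.multiply(-y, (X @ beta)) + np.logaddexp(0, X @ beta)
    return np.sum(res)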
Let's try it out; variables with the suffix _v are the vectorized versions of the corresponding data.
params = np.array([1, 1, 1])
y_v = np.array([y], dtype=float).T
X_v = np.array(X)
X_v = np.insert(X_v, axis=1, values=np.ones(X_v.shape[0]), obj=2)  # append a column of ones so the bias (intercept) term folds into beta
gradient(params, X_v, y_v), cost(params, X_v, y_v)
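Before handing these to an optimizer, it's worth sanity-checking the analytic gradient against finite differences; scipy.optimize.check_grad does exactly this, and a result near zero (tiny relative to the gradient's magnitude) means the two agree:

import scipy.optimize as opt
# compares gradient(...) against a finite-difference approximation of cost(...)
print(opt.check_grad(cost, gradient, params.astype(float), X_v, y_v))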
We use SciPy's truncated Newton (TNC) solver to find the optimal parameters (search for it if you're curious how it works).
import scipy.optimize as opt
result = opt.fmin_tnc(func=cost, x0=params, fprime=gradient, args=(X_v, y_v))
result
result_v = result[0]
cost(result_v, X_v, y_v)
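For what it's worth, fmin_tnc is the legacy interface; the same TNC solver is available through SciPy's unified opt.minimize API, which should land on an equivalent optimum (a sketch):

res = opt.minimize(fun=cost, x0=params.astype(float), jac=gradient, args=(X_v, y_v), method='TNC')
res.x, res.fun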
def predict(params, X):
    beta = params[None].T  # parameter column vector
    probability = sigmoid(X @ beta)
    return np.array([[1 if x >= 0.5 else 0 for x in probability]]).T  # threshold at 0.5
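A quick usage example for predict: score the training set itself (only a sanity check, since we are evaluating on the same data we trained on):

train_pre = predict(result_v, X_v)
print('training accuracy:', np.mean(train_pre == y_v))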
x1_max = X_v[:, 0].max()
x1_min = X_v[:, 0].min()
x2_max = X_v[:, 1].max()
x2_min = X_v[:, 1].min()
x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]  # dense 500x500 coordinate grid over the data range
x_pre = np.stack((x1.flat, x2.flat), axis=1)  # flatten the grid into (x1, x2) coordinate pairs
x_pre = np.insert(x_pre, values=np.ones(x_pre.shape[0]), obj=2, axis=1)  # add the bias column, matching X_v
x_pre
import matplotlib as mpl
y_pre = predict(result_v, x_pre)
y_pre = y_pre.reshape(x1.shape)
cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
fig, ax = plt.subplots(figsize=(12, 8))
plt.pcolormesh(x1, x2, y_pre, cmap=cm_light)  # color the mesh by predicted class
ax.scatter(yes['Density'], yes['Sugar content'], marker='o', c='b', label='Yes')
ax.scatter(no['Density'], no['Sugar content'], marker='x', c='r', label='No')
ax.legend()
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
ax.set_xlabel('Density')
ax.set_ylabel('Sugar content')
plt.show()
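Since logistic regression is a linear classifier, the boundary in the mesh above is exactly the line $\boldsymbol\beta^\mathrm{T}\hat{\boldsymbol{x}} = 0$; it can also be drawn directly (a sketch, assuming result_v holds [b0, b1, b2] in the same column order as X_v, bias last):

b0, b1, b2 = result_v
x1_line = np.linspace(x1_min, x1_max, 100)
x2_line = -(b0 * x1_line + b2) / b1  # solve b0*x1 + b1*x2 + b2 = 0 for x2
plt.plot(x1_line, x2_line, 'g')
plt.scatter(yes['Density'], yes['Sugar content'], marker='o', c='b')
plt.scatter(no['Density'], no['Sugar content'], marker='x', c='r')
plt.show()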
The fit is pretty mediocre... probably because the training set is tiny. Let's try the same functions on a larger dataset (ex2data1.txt: two exam scores and an admission label).
path = 'work/ex2data1.txt'
exams = pd.read_csv(path, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
X = np.c_[exams.iloc[:, :2], np.ones([exams.shape[0], 1])]
y = np.array(exams.iloc[:, 2])[None].T
beta_v = np.ones(3)
beta_v
cost(beta_v, X, y)
import scipy.optimize as opt
result = opt.fmin_tnc(func=cost, x0=beta_v, fprime=gradient, args=(X, y))
result
cost(result[0], X, y)
The book gives the formula directly, so the implementation is straightforward: $$\boldsymbol{w} = \mathbf{S}_w^{-1}(\boldsymbol\mu_0 - \boldsymbol\mu_1)$$ where $\boldsymbol\mu_i$ is the mean vector of class $i$ and $\mathbf{S}_w = \boldsymbol\Sigma_0 + \boldsymbol\Sigma_1$ is the within-class scatter matrix.
def LDA(X_0, X_1):
    mu_0 = X_0.mean(axis=0).reshape(1, X_0.shape[1]).T  # class-0 mean as a column vector
    mu_1 = X_1.mean(axis=0).reshape(1, X_1.shape[1]).T  # class-1 mean as a column vector
    Sigma_0 = (X_0 - mu_0.T).T @ (X_0 - mu_0.T)  # class-0 scatter matrix
    Sigma_1 = (X_1 - mu_1.T).T @ (X_1 - mu_1.T)  # class-1 scatter matrix
    S_w = Sigma_0 + Sigma_1  # within-class scatter
    w = np.linalg.inv(S_w) @ (mu_0 - mu_1)  # w = S_w^{-1} (mu_0 - mu_1)
    return w
X_0 = np.array(data[data['Good melon'].isin([0])].iloc[:, 0:2], dtype='float')
X_1 = np.array(data[data['Good melon'].isin([1])].iloc[:, 0:2], dtype='float')
w = LDA(X_0, X_1)
w
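As a cross-check (a sketch assuming scikit-learn is installed): for two classes, sklearn's LinearDiscriminantAnalysis learns coefficients proportional to the same S_w-whitened mean difference, so its normalized coef_ should match our normalized w up to sign:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
X_all = np.vstack([X_0, X_1])
y_all = np.array([0] * len(X_0) + [1] * len(X_1))
lda = LinearDiscriminantAnalysis().fit(X_all, y_all)
print(lda.coef_ / np.linalg.norm(lda.coef_))  # direction found by sklearn
print((w / np.linalg.norm(w)).T)              # our direction (may differ by sign)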
def mapping(w, X):
    # orthogonally project each sample onto the line through the origin with direction w
    k = w[1] / w[0]  # slope of the projection line
    new_X = ((k * X[:, 1] + X[:, 0]) / (k * k + 1)).reshape(1, X.shape[0])  # projected x-coordinates
    new_X = np.insert(new_X, values=k * new_X, obj=1, axis=0)  # y = k * x on the line
    return new_X.T
np.insert(w, values=0, axis=1, obj=0).T  # display only: the origin and w as two points on the projection line
new_yes = mapping(w, X_1)
new_no = mapping(w, X_0)
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(new_yes[:, 0], new_yes[:, 1], marker='^', c='b', label='new_Yes')
ax.scatter(new_no[:, 0], new_no[:, 1], marker='*', c='r', label='new_No')
ax.scatter(yes['Density'], yes['Sugar content'], marker='o', c='b', label='Yes')
ax.scatter(no['Density'], no['Sugar content'], marker='x', c='r', label='No')
ax.legend()
ax.set_xlabel('Density')
ax.set_ylabel('Sugar content')
plt.show()
The obvious straight line is the data after being projected down to one dimension.
For a real-world test we use the Blood Transfusion Service Center dataset from UCI.
Many UCI datasets are large with lots of features; this one is nice and small.
path = 'work/transfusion.data'  # load the data
data = pd.read_csv(path)
data.head()
Shuffle the rows randomly.
rand_data = np.random.permutation(data)
rand_data
As before, split into X and y and vectorize.
X_v = rand_data[:, 0:4]
X_v = np.insert(X_v, values=1, obj=4, axis=1)
y_v = rand_data[:, [4]]
Now run 10-fold cross-validation.
def accuracy(pre, y):
    return np.sum(pre == y) / len(pre)

def data_split_test(seq, X, y):
    # split the data into `seq` folds; train on seq-1 folds, test on the held-out one
    beta_v = np.zeros(5)
    total = 0
    ran = int(X.shape[0] / seq)  # fold size
    for i in range(seq):
        rest_X = X[i * ran:(i + 1) * ran]  # held-out test fold
        rest_y = y[i * ran:(i + 1) * ran]
        temp_X = X[list(range(0, i * ran)) + list(range((i + 1) * ran, X.shape[0]))]  # training folds
        temp_y = y[list(range(0, i * ran)) + list(range((i + 1) * ran, y.shape[0]))]
        result = opt.fmin_tnc(func=cost, x0=beta_v, fprime=gradient, args=(temp_X, temp_y))
        pre_res = predict(result[0], rest_X)
        total += accuracy(pre_res, rest_y)
    return total / seq  # mean accuracy over the folds
acc = data_split_test(10, X_v, y_v)
print("accuracy is {}%".format(acc * 100))
The leave-one-out run below takes a long time to train... run it when you have time, or just skip it if you can't wait.
acc = data_split_test(X_v.shape[0], X_v, y_v)
print("accuracy is {}%".format(acc * 100))
The final accuracies come out about the same.