我的博客: https://yunist.cn ¶

《机器学习》西瓜书第 3 章编程实例¶

数据准备¶

import numpy as np
np.set_printoptions(suppress=True)  # 禁用科学计数
import pandas as pd
import matplotlib.pyplot as plt

path = 'work/西瓜数据集alpha.txt'  # 导入数据
data = pd.read_csv(path)
data.head()

# 将'是', '否'换成'Y', 'N'
data['Good melon'][data['Good melon'].isin(['是'])] = 1
data['Good melon'][data['Good melon'].isin(['否'])] = 0
data.head()

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until

# 分离出 X 与 y
X = data.iloc[:, :2]
y = data.iloc[:, 2]
print(X)
print(y)

    Density  Sugar content
0     0.697          0.460
1     0.774          0.376
2     0.634          0.264
3     0.608          0.318
4     0.556          0.215
5     0.403          0.237
6     0.481          0.149
7     0.437          0.211
8     0.666          0.091
9     0.243          0.267
10    0.245          0.057
11    0.343          0.099
12    0.639          0.161
13    0.657          0.198
14    0.360          0.370
15    0.593          0.042
16    0.719          0.103
0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
Name: Good melon, dtype: object

接下来进行可视化.

yes = data[data['Good melon'].isin([1])]
no = data[data['Good melon'].isin([0])]
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(yes['Density'], yes['Sugar content'], marker='o', c='b', label='Yes')
ax.scatter(no['Density'], no['Sugar content'], marker='x', c='r', label='No')
ax.legend()
ax.set_xlabel('Density')
ax.set_ylabel('Sugar content')
plt.show()

对率回归(逻辑回归)¶

$Sigmoid$ 函数: $$\frac{1}{1 + e^{-x}}$$ 对数似然(损失函数): $$\ell(\boldsymbol\beta) = \sum_{i = 1}^{m}(-y_i\boldsymbol\beta^\mathrm{T}\hat{\boldsymbol{x}}_i + ln(1 + \mathrm{e}^{\boldsymbol\beta^\mathrm{T}\hat{\boldsymbol{x}}_i}))$$ 对其求偏导(过程比较简单就不展开了, 就是简单的展开计算): $$\begin{aligned} \frac{\partial\ell(\boldsymbol\beta)}{\partial\beta_i}&=(X^\mathrm{T})_i(\frac{1}{1+\mathrm{e}^{-X\beta}}-\boldsymbol{y})\\\\ &=(X^{\mathrm{T}})_i(sigmoid(X\beta)-\boldsymbol{y}) \end{aligned}$$ 其中 $(X^\mathrm{T})_i$ 是 $X^\mathrm{T}$ 的第 $i$ 行.

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def cost(params, X, y):
    beta = params[None].T
    res = np.multiply(-y, (X @ beta)) + np.log(1 + np.exp(X @ beta))
    return np.sum(res)

def gradient(params, X, y):
    beta = params[None].T
    grad = np.zeros([1, beta.shape[0]]).T  # 定义梯度数组存储梯度
    for i in range(beta.shape[0]):
        grad[i] = X.T[i][None] @ (sigmoid(X @ beta) - y)
    return grad.T[0]

试试效果, 其中带后缀 _v 的是某个数据的向量版本.

params = np.array([1, 1, 1])
y_v = np.array([y], dtype=float).T
X_v = np.array(X)
X_v = np.insert(X_v, axis=1, values=np.ones(X_v.shape[0]), obj=2)  # 为了更方便的表示配置项, 添加值全 1 的项
gradient(params, X_v, y_v), cost(params, X_v, y_v)

(array([3.16583886, 0.8786374 , 6.43813213]), 17.638839972291155)

我们用 SciPy's truncated newton (TNC) 实现寻找最优参数 (是啥自己百度) .

import scipy.optimize as opt
result = opt.fmin_tnc(func=cost, x0=beta_v, fprime=gradient, args=(X_v, y_v))
result

(array([ 3.15720884, 12.52156329, -4.4283509 ]), 36, 1)

result_v = result[0]
cost(result_v, X_v, y_v)

8.68366062959816

def predict(params, X):
    theta = params[None].T
    probability = sigmoid(X @ theta)
    return np.array([[1 if x >= 0.5 else 0 for x in probability]]).T

x1_max = X_v[:, 0].max()
x1_min = X_v[:, 0].min()
x2_max = X_v[:, 1].max()
x2_min = X_v[:, 1].min()
x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]  # np.mgrid的用法自行百度
x_pre = np.stack((x1.flat, x2.flat), axis=1)  # 合成坐标
x_pre = np.insert(x_pre, values=np.ones(x_pre.shape[0]), obj=2, axis=1)
x_pre

array([[0.243     , 0.042     , 1.        ],
       [0.243     , 0.04283768, 1.        ],
       [0.243     , 0.04367535, 1.        ],
       ...,
       [0.774     , 0.45832465, 1.        ],
       [0.774     , 0.45916232, 1.        ],
       [0.774     , 0.46      , 1.        ]])

import matplotlib as mpl
y_pre = predict(result_v, x_pre)
y_pre = y_pre.reshape(x1.shape)
cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
fig, ax = plt.subplots(figsize=(12, 8))
plt.pcolormesh(x1, x2, y_pre, cmap=cm_light)  # 网格
ax.scatter(yes['Density'], yes['Sugar content'], marker='o', c='b', label='Yes')
ax.scatter(no['Density'], no['Sugar content'], marker='x', c='r', label='No')
ax.legend()
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
ax.set_xlabel('Density')
ax.set_ylabel('Sugar content')
plt.show()

效果好一般... 应该是训练集太小的缘故.

训练 iris 数据集¶

注意 ! 该数据集只有iris中的两个特征.¶

path = 'work/ex2data1.txt'
iris = pd.read_csv(path, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])

X = np.c_[iris.iloc[:, :2], np.ones([1, iris.shape[0]]).T]
y = np.array(iris.iloc[:, 2])[None].T

beta_v = np.ones(3)
beta_v

array([1., 1., 1.])

cost(beta_v, X, y)

4306.107727791531

import scipy.optimize as opt
result = opt.fmin_tnc(func=cost, x0=beta_v, fprime=gradient, args=(X, y))
result

(array([  0.20606315,   0.20129743, -25.14002914]), 52, 1)

cost(result[0], X, y)

20.349776924826397

线性判别分析(PDA)¶

公式直接给了, 实现就很简单了.

$$\boldsymbol{w} = \mathrm{S}^{-1}_w(\boldsymbol{\mu}_0 - \boldsymbol{\mu}_1)$$$$\begin{aligned} \mathrm{S}_w &= \Sigma_0 + \Sigma_1\\\\ &=\sum_{\boldsymbol{x}\in X_0}(\boldsymbol{x} - \boldsymbol{\mu}_0)(\boldsymbol{x} - \boldsymbol{\mu}_0)^\mathrm{T} + \sum_{\boldsymbol{x}\in X_1}(\boldsymbol{x} - \boldsymbol{\mu}_1)(\boldsymbol{x} - \boldsymbol{\mu}_1)^\mathrm{T} \end{aligned}$$

$\boldsymbol\mu$ 是均值.

def PDA(X_0, X_1):
    mu_0 = X_0.mean(axis=0).reshape(1, X_0.shape[1]).T
    mu_1 = X_1.mean(axis=0).reshape(1, X_1.shape[1]).T
    Sigma_0 = (X_0 - mu_0.T).T @ (X_0 - mu_0.T)
    Sigma_1 = (X_1 - mu_1.T).T @ (X_1 - mu_1.T)
    S_w = Sigma_0 + Sigma_1
    w = np.linalg.inv(S_w) @ (mu_0 - mu_1)  # 求矩阵的逆
    return w

X_0 = np.array(data[data['Good melon'].isin([0])].iloc[:, 0:2], dtype='float')
X_1 = np.array(data[data['Good melon'].isin([1])].iloc[:, 0:2], dtype='float')

w = PDA(X_0, X_1)
w

array([[-0.14650982],
       [-0.73871557]])

def mapping(w, X):
    k = w[1] / w[0]
    new_X = ((k * X[:, 1] + X[:, 0]) / (k * k + 1)).reshape(1, X.shape[0])
    new_X = np.insert(new_X, values=k * new_X, obj=1, axis=0)
    return new_X.T

np.insert(w, values=0, axis=1, obj=0).T

array([[ 0.        ,  0.        ],
       [-0.14650982, -0.73871557]])

new_yes = mapping(w, X_1)
new_no = mapping(w, X_0)
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(new_yes[:, 0], new_yes[:, 1], marker='^', c='b', label='new_Yes')
ax.scatter(new_no[:, 0], new_no[:, 1], marker='*', c='r', label='new_No')
ax.scatter(yes['Density'], yes['Sugar content'], marker='o', c='b', label='Yes')
ax.scatter(no['Density'], no['Sugar content'], marker='x', c='r', label='No')
ax.legend()
ax.set_xlabel('Density')
ax.set_ylabel('Sugar content')
plt.show()

那条明显的直线就是降维后的数据.

UCI 数据集的训练¶

这里我们选择了输血服务中心数据集.

UCI 的数据集好多都很大, 特征很多, 这个比较简单.

path = 'work/transfusion.data'  # 导入数据
data = pd.read_csv(
    path,
)
data.head()

$10$ 折交叉验证法¶

随机打乱数据.

rand_data = np.random.permutation(data)
rand_data

array([[   2,   14, 3500,   57,    1],
       [  16,    3,  750,   50,    0],
       [  14,    2,  500,   14,    0],
       ...,
       [   2,    1,  250,    2,    0],
       [  23,    3,  750,   41,    0],
       [   2,    2,  500,   23,    0]])

同样是分离 X , y 且向量化.

X_v = rand_data[:, 0:4]
X_v = np.insert(X_v, values=1, obj=4, axis=1)
y_v = rand_data[:, [4]]

开始 10 折交叉验证.

def accuracy(pre, y):
    return np.sum(pre == y) / len(pre)

def data_split_test(seq, X, y):
    beta_v = np.array([[0, 0, 0, 0, 0]]).T
    sum = 0
    ran = int(X.shape[0] / seq)
    for i in range(seq):
        rest_X = X[i * ran:(i + 1) * ran]
        rest_y = y[i * ran:(i + 1) * ran]
        temp_X = X[list(range(0, i * ran)) + list(range((i + 1) * ran, X.shape[0]))]
        temp_y = y[list(range(0, i * ran)) + list(range((i + 1) * ran, y.shape[0]))]
        result = opt.fmin_tnc(func=cost, x0=beta_v, fprime=gradient, args=(temp_X, temp_y))
        pre_res = predict(result[0], rest_X)
        sum += accuracy(pre_res, rest_y)
    return sum / seq

acc = data_split_test(10, X_v, y_v)
print("accuracy is {}%".format(acc * 100))

accuracy is 76.0810810810811%

留一法¶

asdfafdasdf...sdf

训练要好久...有时间慢慢训练吧, 等不下去就算了.

acc = data_split_test(X_v.shape[0], X_v, y_v)
print("accuracy is {}%".format(acc * 100))

accuracy is 76.20320855614973%

最终正确率都差不多.

	Density	Sugar content	Good melon
0	0.697	0.460	是
1	0.774	0.376	是
2	0.634	0.264	是
3	0.608	0.318	是
4	0.556	0.215	是

	Recency (months)	Frequency (times)	Monetary (c.c. blood)	Time (months)	whether he/she donated blood in March 2007
0	2	50	12500	98	1
1	0	13	3250	28	1
2	1	16	4000	35	1
3	2	20	5000	45	1
4	1	24	6000	77	0

我的博客: https://yunist.cn¶

《机器学习》西瓜书 第 3 章 编程实例¶