k近邻算法-3.算法应用

算法具体应用

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn import datasets

加载手写数据集

digits = datasets.load_digits()  # load scikit-learn's bundled handwritten-digits dataset

sklearn 内置的手写数字数据集（load_digits）共有 1797 个样本（其来源的 UCI 原始数据集共 5620 个样本），每个样本有 64 个特征，对应一幅 8×8 手写数字图像的各个像素点；样本的标签为 0-9 的手写数字。其数据集描述如下：

样本结构：

数据可视化，查看某个样本的特征和结果：

# Feature matrix and label vector of the loaded dataset.
x, y = digits.data, digits.target
# Pick the sample stored at index 222.
some_digit = x[222]
# A sample is a flat vector of 64 features; fold it back into an 8x8 grid
# so it can be rendered as an image.
some_digit_image = some_digit.reshape((8, 8))
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)
plt.show()

查看此数据的结果：

封装之前的代码，实现手写数据集的预测

定义K近邻算法(KNN.py)：

import numpy as np
from math import sqrt
from collections import Counter
class KNNClassifier:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    The classifier memorises the training set in ``fit`` and, for every
    sample passed to ``predict``, returns the most frequent label among the
    ``k`` training samples closest to it.
    """

    def __init__(self, k):
        """Create a classifier that votes among the ``k`` nearest samples.

        Raises:
            AssertionError: if ``k`` is smaller than 1.
        """
        assert k >= 1, 'k must be valid'
        self.k = k
        self._x_train = None
        self._y_train = None

    def fit(self, _x_train, _y_train):
        """Store the training samples and labels; return the classifier itself.

        Args:
            _x_train: 2-D array-like of training samples, one row per sample.
            _y_train: 1-D array-like of labels, one per training sample.

        Returns:
            ``self``, so calls can be chained.
        """
        # np.asarray leaves ndarrays untouched but lets plain (nested) lists
        # be passed in; predict relies on the ``.shape`` attribute.
        self._x_train = np.asarray(_x_train)
        self._y_train = np.asarray(_y_train)
        return self

    def predict(self, x_predict):
        """Predict a label for every row of ``x_predict``.

        Args:
            x_predict: 2-D array-like with the same number of columns as the
                training samples.

        Returns:
            A NumPy array holding one predicted label per input row.

        Raises:
            AssertionError: if ``fit`` has not been called, if the stored
                samples and labels differ in length, or if the number of
                features does not match the training data.
        """
        assert self._x_train is not None and self._y_train is not None, \
            "must fit before predict!"
        assert self._x_train.shape[0] == self._y_train.shape[0], \
            "the size of x_train must equal to the size of y_train"
        x_predict = np.asarray(x_predict)
        assert self._x_train.shape[1] == x_predict.shape[1], \
            "the feature number of x must be equal to x_train"
        y_predict = [self._predict(x) for x in x_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Return the predicted label for a single sample ``x`` (1-D array)."""
        assert self._x_train.shape[1] == x.shape[0], \
            "the feature number of x must be equal to x_train"
        # Euclidean distance from x to every training sample, computed in one
        # vectorised expression instead of a Python-level loop over the rows.
        distances = np.sqrt(np.sum((self._x_train - x) ** 2, axis=1))
        # A stable sort keeps equidistant neighbours in training-set order,
        # so tie-breaking does not depend on the sort implementation.
        nearest = np.argsort(distances, kind="stable")
        # Labels of the k closest training samples.
        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(topK_y)
        # Only the single most frequent label is needed.
        return votes.most_common(1)[0][0]

    def __repr__(self):
        return 'KNN(k=%d)' % self.k

定义模型选择库(model_selection.py)

import numpy as np
# Split a dataset into training and test subsets
def train_test_split(x, y, test_ratio=0.2, seed=None):
    """Randomly partition ``x`` and ``y`` into training and test subsets.

    Args:
        x: array of samples, indexed along its first axis.
        y: array of labels with the same first-axis length as ``x``.
        test_ratio: fraction of samples (0.0 to 1.0) placed in the test
            subset; the test size is ``int(len(x) * test_ratio)``.
        seed: optional seed for NumPy's global random generator. When given
            (including ``0``), the split is reproducible. Note that seeding
            changes the state of ``np.random`` for the whole process.

    Returns:
        The tuple ``(x_train, x_test, y_train, y_test)``.

    Raises:
        AssertionError: if ``x`` and ``y`` differ in length or ``test_ratio``
            lies outside ``[0.0, 1.0]``.
    """
    assert x.shape[0] == y.shape[0],\
        "the size of x must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0,\
        "test_ratio must be valid"
    # Compare against None rather than testing truthiness, otherwise a seed
    # of 0 would be silently ignored and the split would not be reproducible.
    if seed is not None:
        np.random.seed(seed)
    shuffle_index = np.random.permutation(len(x))
    test_size = int(len(x) * test_ratio)
    # The first test_size shuffled indices form the test subset; the
    # remainder forms the training subset.
    test_index = shuffle_index[:test_size]
    train_index = shuffle_index[test_size:]
    x_train = x[train_index]
    x_test = x[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    return x_train, x_test, y_train, y_test

使用自己封装的库：

from mylib.model_selection import train_test_split
from mylib.KNN import KNNClassifier
# Hold out 20% of the samples as a test set; the rest is used for training.
x_train,x_test,y_train,y_test = train_test_split(x, y,test_ratio=0.2)
# Build the classifier with k=3 neighbours.
my_clf = KNNClassifier(k=3)
my_clf.fit(x_train,y_train)
# Predicted labels for the held-out test samples.
y_predict = my_clf.predict(x_test)

验证算法的准确率：

# Accuracy = fraction of test samples whose predicted label equals the true
# label. NumPy is imported as ``np`` in this session, so use that alias.
score = np.sum(y_predict == y_test) / len(y_test)

封装，实现解耦：

# metrics.py   (metrics 意为衡量标准)
import numpy
import math
def accuracy_score(y_true, y_predict):
    """Return the fraction of entries in ``y_predict`` equal to ``y_true``.

    Raises:
        AssertionError: if the two arrays differ in first-axis length.
    """
    assert y_true.shape[0] == y_predict.shape[0], \
        "size of y_true must be equal to the size of y_predict"
    # Element-wise comparison gives a boolean mask; summing it counts hits.
    matches = y_true == y_predict
    n_correct = numpy.sum(matches)
    return n_correct / len(y_true)
# KNN.py     添加求准确率方法
from .metrics import accuracy_score
def score(self, x_test, y_test):
    """Predict labels for ``x_test`` and return their accuracy against ``y_test``."""
    return accuracy_score(y_test, self.predict(x_test))