KNN 分类程序

# coding: utf-8
import numpy as np
import operator
import matplotlib
from numpy import *
import matplotlib.pyplot as plt
import os


def CreateDataSet():
  """Return a tiny hard-coded sample set for trying out the classifier.

  Returns:
    A tuple ``(group, label)`` where ``group`` is a 4x2 float array of
    points and ``label`` is the list of class names, one per row.
  """
  # Two points near (1, 1) belong to class 'a', two near (0, 0) to class 'b'.
  label = list('aabb')
  points = [[1.0, 1.1], [1.0, 1.0], [0.0, 0.0], [0.0, 0.1]]
  return np.array(points), label


def Classify(intx, datax, label, k):
  """Classify one sample by majority vote among its k nearest neighbours.

  Args:
    intx: The sample to classify (a sequence or array of feature values).
    datax: 2-D array of training samples, one row per sample.
    label: Sequence of class labels, indexed like the rows of ``datax``.
    k: Number of nearest neighbours that vote. Values larger than the
      number of training rows are clamped to that number.

  Returns:
    The label with the most votes. On a tie, the label that was seen first
    while walking the neighbours from nearest to farthest wins.

  Raises:
    IndexError: If no vote is cast (``k`` < 1 or ``datax`` has no rows).
  """
  datasize = datax.shape[0]
  # Broadcasting subtracts every training row from intx without the
  # explicit tiled copy.
  diffmat = np.asarray(intx) - datax
  sqdistence = (diffmat ** 2).sum(axis=1)  # row-wise sum of squared differences
  distence = sqdistence ** 0.5  # Euclidean distance to each training row
  sorteddistenceindicies = distence.argsort()  # row indices, nearest first
  classcount = {}
  # Never ask for more neighbours than there are training rows.
  for i in range(min(k, datasize)):
    voteilabel = label[sorteddistenceindicies[i]]
    classcount[voteilabel] = classcount.get(voteilabel, 0) + 1
  # Stable sort by vote count, descending; ties keep insertion order.
  sortedclasscount = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)
  return sortedclasscount[0][0]
def file2matrix(filename):
  """Load a tab-separated data file into a feature matrix and a label list.

  Each non-blank line is split on tabs. The first three fields become one
  row of the feature matrix and the last field is parsed as an integer
  label. Blank lines (for example a trailing empty line) are skipped.

  Args:
    filename: Path of the text file to read.

  Returns:
    A tuple ``(returnmat, labels)`` where ``returnmat`` is an N x 3 float
    array and ``labels`` is a list of N ints, N being the number of
    non-blank lines.

  Raises:
    ValueError: If a field cannot be parsed as a number.
  """
  # The with-statement closes the file even if reading fails.
  with open(filename, mode="r") as fr:
    # Skip blank lines so they neither add all-zero rows nor break parsing.
    records = [line.split("\t") for line in fr if line.strip()]
  returnmat = np.zeros((len(records), 3))  # one row of 3 features per record
  labels = []
  for index, listfromline in enumerate(records):
    returnmat[index, :] = [float(value) for value in listfromline[0:3]]
    # int() tolerates the trailing newline left on the last field.
    labels.append(int(listfromline[-1]))
  return returnmat, labels

def autonorm(datax):
  """Rescale every column of ``datax`` to the range [0, 1] (min-max scaling).

  Args:
    datax: 2-D array with one sample per row and one feature per column.

  Returns:
    A tuple ``(newval, ranges, minval)``: the normalized array, the
    per-column ``max - min``, and the per-column minimum.
    A column whose values are all equal has range 0; its normalized
    values are 0 rather than NaN.
  """
  minval = datax.min(0)  # per-column minimum
  maxval = datax.max(0)  # per-column maximum
  ranges = maxval - minval
  # Divide constant columns by 1 instead of 0 to avoid a 0/0 -> NaN result.
  divisor = np.where(ranges == 0, 1, ranges)
  # Broadcasting applies the per-column vectors to every row.
  newval = (datax - minval) / divisor
  return newval, ranges, minval

def datingClassTest():
  """Hold-out evaluation of the classifier on ``datingTestSet2.txt``.

  The first 10% of the rows are classified (k = 5) against the remaining
  90%, and the resulting error rate is printed.

  NOTE: with fewer than 10 rows the hold-out set is empty and the final
  division raises ZeroDivisionError.
  """
  hoRatio = 0.1  # fraction of rows held out for testing
  features, answers = file2matrix("datingTestSet2.txt")
  scaled, _ranges, _mins = autonorm(features)  # min-max normalization
  total = features.shape[0]  # number of rows
  holdout = int(total * hoRatio)
  # Everything after the held-out prefix serves as the training set.
  train_x = scaled[holdout:total, :]
  train_y = answers[holdout:total]
  misses = 0  # count of wrong predictions
  for row in range(holdout):
    predicted = Classify(scaled[row, :], train_x, train_y, 5)
    if predicted != answers[row]:
      misses = misses + 1.0
  print("error rate :%f" % (misses / holdout))

def plot():
  """Show pairwise scatter plots of the three features in ``datingTestSet2.txt``.

  Draws three side-by-side panels (feature 0 vs 1, 0 vs 2, 1 vs 2) in one
  figure. Marker colour and size are both set to 15 times the integer
  label, so the classes are visually distinct.
  """
  filename = "datingTestSet2.txt"
  dataX, labels = file2matrix(filename)
  marker_values = 15 * np.array(labels)  # used for both colour and size
  fig = plt.figure()
  feature_pairs = [(0, 1), (0, 2), (1, 2)]
  # Lay the panels out on a single 1x3 grid, positions 1..3, so that they
  # sit next to each other instead of overlapping.
  for position, (xcol, ycol) in enumerate(feature_pairs, start=1):
    ax = fig.add_subplot(1, 3, position)
    ax.scatter(dataX[:, xcol], dataX[:, ycol], c=marker_values, s=marker_values)
  plt.show()

if __name__ == "__main__":
  # Script entry point: run the hold-out evaluation.
  # Call plot() here instead to visualize the data set.
  datingClassTest()

 搬运门

posted @ 2019-07-24 21:59  啦啦啦天啦噜  阅读(287)  评论(0编辑  收藏  举报