# Lesson 5 KNN分类算法 - Facebook预测签到地点的案例

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

def facebook_demo_0(
    src_path="D:/BaiduNetdiskDownload/Python3天快速入门机器学习项目资料/机器学习day2资料/02-代码/FBlocation/train.csv",
    dst_path="D:/BaiduNetdiskDownload/Python3天快速入门机器学习项目资料/机器学习day2资料/02-代码/FBlocation/train2.csv",
):
    """Cut the full check-in data set down to a small spatial window.

    Reads the CSV at ``src_path``, keeps only the rows with
    ``2 < x < 2.5`` and ``1.0 < y < 1.5``, writes them to ``dst_path``
    and prints the shape of the reduced frame.

    Args:
        src_path: CSV file to read; must contain ``x`` and ``y`` columns.
        dst_path: CSV file the reduced data is written to.

    Returns:
        None.
    """
    # Load the full data set.
    data = pd.read_csv(src_path)

    # The full data set is too large to work with, so narrow it to a
    # small x/y window first.
    data = data.query("x < 2.5 & x > 2 & y < 1.5 & y > 1.0")

    # index=False: do not persist the DataFrame index, otherwise the file
    # gains a spurious unnamed first column when it is read back.
    data.to_csv(dst_path, index=False)
    print(data.shape)
    return None


def facebook_demo(
    data_path="D:/BaiduNetdiskDownload/Python3天快速入门机器学习项目资料/机器学习day2资料/02-代码/FBlocation/train2.csv",
    n_neighbors=7,
    min_checkins=3,
    random_state=None,
):
    """Train and evaluate a KNN classifier that predicts ``place_id``.

    Loads the check-in CSV, derives day / weekday / hour features from the
    ``time`` column, drops places with too few check-ins, splits the data
    into train and test sets, standardizes the features, fits a
    ``KNeighborsClassifier`` and prints the predictions and the test-set
    accuracy. Intermediate results are printed along the way.

    Args:
        data_path: CSV file to read. The code uses the columns ``row_id``,
            ``x``, ``y``, ``accuracy``, ``time`` and ``place_id``.
        n_neighbors: Number of neighbours passed to ``KNeighborsClassifier``.
        min_checkins: A place is kept only if it has strictly more than
            this many check-ins.
        random_state: Forwarded to ``train_test_split``; pass an int for a
            reproducible split. ``None`` keeps the split random.

    Returns:
        None.
    """
    # Load the data.
    data = pd.read_csv(data_path)

    # Convert the "time" column, interpreted as seconds, to datetimes.
    time_value = pd.to_datetime(data["time"], unit="s")
    print(time_value)

    # Wrap the values in a DatetimeIndex so that calendar fields such as
    # weekday, year, month and day are available as attributes.
    date = pd.DatetimeIndex(time_value)

    print(date.weekday)
    print(date.year)

    # Add the derived time features as new columns.
    data["day"] = date.day
    data["weekday"] = date.weekday
    data["hour"] = date.hour
    print(data.head())

    # Count rows per place_id so rarely visited places can be identified.
    place_count = data.groupby("place_id").count()
    print(place_count)

    # After count() every column holds a per-place row count, so row_id is
    # used here simply as one of those count columns.
    place_count = place_count[place_count["row_id"] > min_checkins]
    print(place_count)

    # The index of place_count holds the surviving place_id values.
    print(place_count.index)

    # Boolean mask: True for rows whose place_id survived the filter.
    data_index = data["place_id"].isin(place_count.index)
    # Keep only those rows.
    data = data[data_index]
    print(data.head())

    # Select the feature matrix x and the target y.
    x = data[["x", "y", "accuracy", "day", "weekday", "hour"]]
    y = data["place_id"]
    print(x.head())
    print(y.head())

    # Split into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, random_state=random_state
    )

    # Standardize the features. The scaler is fitted on the training set
    # only and then applied unchanged to the test set.
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # Fit the KNN classifier.
    estimator = KNeighborsClassifier(n_neighbors=n_neighbors)
    estimator.fit(x_train, y_train)

    # Predict on the test set and compare with the true labels.
    y_predict = estimator.predict(x_test)
    print("y_predict:", y_predict)
    print("对比预测值和测试值", y_predict == y_test)

    # Accuracy on the test set.
    score = estimator.score(x_test, y_test)
    print("准确率为:", score)
    # Original author's note: the accuracy is low, which they attribute to
    # the small amount of data.
    return None

if __name__ == "__main__":
    # Run the demo only when this file is executed as a script, so that
    # importing the module does not trigger data loading and training.
    facebook_demo()

 

# posted @ 2020-11-22 22:38  IWing  阅读(181)  评论(0)    收藏  举报