import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
def facebook_demo_0():
    # Load the data
    data = pd.read_csv("D:/BaiduNetdiskDownload/Python3天快速入门机器学习项目资料/机器学习day2资料/02-代码/FBlocation/train.csv")
    # The dataset is too large, so narrow the coordinate range first
    data = data.query("x < 2.5 & x > 2 & y < 1.5 & y > 1.0")
    data.to_csv("D:/BaiduNetdiskDownload/Python3天快速入门机器学习项目资料/机器学习day2资料/02-代码/FBlocation/train2.csv")
    print(data.shape)
    return None
def facebook_demo():
    # Load the (reduced) data
    data = pd.read_csv("D:/BaiduNetdiskDownload/Python3天快速入门机器学习项目资料/机器学习day2资料/02-代码/FBlocation/train2.csv")
    # Convert the "time" timestamp feature into datetime values (year, month, day, hour, minute, second)
    time_value = pd.to_datetime(data["time"], unit="s")
    print(time_value)
    # Convert time_value into a DatetimeIndex
    date = pd.DatetimeIndex(time_value)
    # With a DatetimeIndex, date.weekday gives the day of the week directly;
    # year, month, day, etc. are available the same way
    print(date.weekday)
    print(date.year)
    data["day"] = date.day
    data["weekday"] = date.weekday
    data["hour"] = date.hour
    print(data.head())
    # Group by place_id to count check-ins per place and find places with few check-ins
    place_count = data.groupby("place_id").count()
    print(place_count)
    # Keep places with more than 3 check-ins; every column holds the same count,
    # so filtering on row_id is equivalent to filtering on any other column
    place_count = place_count.query("row_id > 3")
    print(place_count)
    # place_count.index is the place_id
    print(place_count.index)
    # Boolean mask: which rows of data have a place_id that appears in place_count
    data_index = data["place_id"].isin(place_count.index)
    # Filter data with the boolean mask
    data = data[data_index]
    print(data.head())
    # Select the target y and the features x
    x = data[["x", "y", "accuracy", "day", "weekday", "hour"]]
    y = data["place_id"]
    print(x.head())
    print(y.head())
    # Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y)
    # Standardize the features (fit on the training set, apply to the test set)
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    # Train the KNN classifier
    estimator = KNeighborsClassifier(n_neighbors=7)
    estimator.fit(x_train, y_train)
    # Predict on the test set
    y_predict = estimator.predict(x_test)
    print("y_predict:", y_predict)
    print("Compare predictions with true test labels:", y_predict == y_test)
    # Evaluate accuracy on the test set
    score = estimator.score(x_test, y_test)
    print("Accuracy:", score)
    # The accuracy is low because the reduced dataset is small
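    # Optional sketch (not in the original tutorial): GridSearchCV is imported above
    # but never used. The candidate n_neighbors values and cv=3 below are illustrative
    # assumptions showing how k could be tuned with cross-validation.
    param_grid = {"n_neighbors": [3, 5, 7, 9]}
    grid_estimator = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=3)
    grid_estimator.fit(x_train, y_train)
    print("Best n_neighbors:", grid_estimator.best_params_)
    print("Best cross-validated accuracy:", grid_estimator.best_score_)
    print("Test accuracy of the best model:", grid_estimator.score(x_test, y_test))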
    return None
if __name__ == "__main__":
    facebook_demo()