-- 数据准备
create database taobao;
use taobao;
create table UserBehavior(
user_id int,
item_id int,
item_category int,
behavior_type varchar(10),
user_geohash varchar(10),
times datetime,
amount decimal(5,2)
);
show variables like '%secure%';-- 查看安全路径,随后将表格粘贴到该安全路径下,
load data infile "C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/UserBehavior.csv"
into table UserBehavior
fields terminated by ','
ignore 1 lines;
select * from UserBehavior limit 10;
select count(*) from UserBehavior;#1048575
-- 数据清洗
-- 缺失值处理
select
count(user_id),
count(item_id),
count(item_category),
count(behavior_type),
count(user_geohash),
count(times),
count(amount)
from UserBehavior;
-- 异常值检查
select min(times),max(times),min(amount),max(amount) from userbehavior;
-- 重复记录处理
select distinct * from userbehavior;
-- 字段处理:根据times字段增加计算字段用户行为日期、周和小时,排除后续分析不需要的user_geohash字段,并将筛选后的结果保存到新表
#数据保存方式方法
#保存方式一:直接导出表格csv文件
select
user_id,
item_id,
item_category,
behavior_type,
date(times) as 日期,
hour(times) as 小时,
date_format(times,'%w') as 星期,
amount
from (select distinct * from userbehavior) as t #从去重的表里面去查询
limit 100;
#保存方式二:代码保存(至安全路径)
select
user_id,
item_id,
item_category,
behavior_type,
date(times) as 日期,
hour(times) as 小时,
date_format(times,'%w') as 星期,
amount
from (select distinct * from userbehavior) as t
limit 100
into outfile "C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/t2.csv"-- 保存到安全路径并命名为t2
fields terminated by ','
lines terminated by '\r\n';
#保存方式三:视图
#视图:优点 1.简化SQL语句,快速提取,将复杂查询语句进行封装;2.安全,保护数据,设置访问权限 让不同级别用户看到不同的视图;缺点 操作视图比直接操作基础表要慢
create view userbehavior_new as
select
user_id,
item_id,
item_category,
behavior_type,
date(times) as 日期,
hour(times) as 小时,
date_format(times,'%w') as 星期,
amount
from (select distinct * from userbehavior) as t;
select * from userbehavior_new limit 100;
select count(*) from userbehavior_new;