官方数据分析1

大数据处理与应用hive

# 1. Edit the cloud host's /etc/hosts: map the internal IP to hostname
# hadoop000 so root on this host can ssh to hadoop000 without a password.

vi /etc/hosts

172.18.39.103 hadoop000

hostnamectl set-hostname hadoop000
bash

# Verify passwordless login works (assumes ssh keys already generated/authorized)
ssh hadoop000
exit


# 2. Format the HDFS filesystem ############################################
# NOTE(review): formatting wipes existing HDFS metadata — only run on first setup.

hdfs namenode -format


# 3. Start the Hadoop cluster ##############################################


# Start only the NameNode daemon on this machine:
hadoop-daemon.sh start namenode  
 


# Or start/stop all daemons (HDFS + YARN) with one script:
start-all.sh



# 4. Start the MySQL service ###############################################
# Start MySQL (Hive's metastore backend):
systemctl start mysqld.service



# 5. Initialize the Hive metastore schema (dbType mysql), then enter the
# Hive client and create the hive database #################################
schematool -dbType mysql -initSchema


# Inside the Hive CLI — create the hive database:
create database hive;

use hive;


#数据分析#################################
########################################

# 1. Create the project database ###########################################
# Only the IF NOT EXISTS form is kept: the original also ran a plain
# `create database project;`, which errors when the database already exists,
# so re-running the worksheet would abort here. IF NOT EXISTS is idempotent.
create database if not exists project;

# 2. In the project database, create the theft table (columns per the task
# description, fields delimited by ',')
use project;

-- Staging table for theft case records loaded from theft.csv.
-- All columns are kept as string: the later queries show that loss is text
-- like '…元' (split on '元') and report_time is text like '2021年05月…'
-- (matched with substr), so typed columns would not parse the raw CSV.
 create table if not exists project.theft(
id string,
case_type string,
case_subtype string,
casename string,
loss string,
case_source string,
time_toplimit string,
time_lowerlimit string,
address string,
accept_time string,
report_time string
)
row format delimited fields terminated by ',';


show tables;


# 3. Load the cloud-host file /root/college/theft.csv into managed table theft
# (LOCAL = read from the client filesystem, not HDFS; INTO = append, not overwrite)

load data local inpath '/root/college/theft.csv' into table project.theft;


#4.统计2021年5月份发生的案件总数(以报案时间为准),结果写入云主机/root/theft/result01/000000_0文件中


-- Count of cases reported in May 2021 (report_time begins with '2021年05月');
-- the single count is written to /root/theft/result01/000000_0 on the host.
INSERT OVERWRITE LOCAL DIRECTORY '/root/theft/result01'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT COUNT(report_time) AS num
FROM project.theft
WHERE SUBSTR(report_time, 1, 8) = '2021年05月';




#5.统计2021年4月份经济损失总额(以报案时间为准),结果写入云主机/root/theft/result02/000000_0文件中


-- 5. Total economic loss for April 2021 (by report time), written to the
-- cloud host at /root/theft/result02/000000_0.
insert overwrite local directory '/root/theft/result02'
row format delimited fields terminated by '\t'
-- loss is text like '1200元': drop the trailing '元' with split and cast
-- explicitly to double instead of relying on implicit string coercion.
select sum(cast(split(loss,'元')[0] as double)) sum_loss from project.theft
-- BUG FIX: substr(...,1,8) returns 8 characters but '2021年4月' is only 7,
-- so the original predicate could never match any row. Use the zero-padded
-- month '2021年04月' (the same format the May query in task 4 matches on).
where substr(report_time,1,8)='2021年04月';


-- Alternative form of task 5: same total, stripping the trailing '元' with
-- SUBSTR/LENGTH instead of split. Two fixes vs the original:
--   * LOCAL added — without it the result is written to HDFS, but the task
--     requires the cloud-host path /root/theft/result02;
--   * the month literal is zero-padded to '2021年04月' so the 8-character
--     substr(report_time,1,8) can actually match ('2021年4月' has only 7).
INSERT OVERWRITE LOCAL DIRECTORY '/root/theft/result02'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT SUM(CAST(SUBSTR(loss, 1, LENGTH(loss) - 1) AS FLOAT)) as total_loss
FROM project.theft
WHERE substr(report_time,1,8)='2021年04月';

#6.查询案发频次最高的地区及对应的案发频次,结果写入云主机/root/theft/result03/000000_0文件中


-- 6. Location with the highest case frequency, written to the cloud host at
-- /root/theft/result03/000000_0. Two fixes vs the original:
--   * the theft table has no `district` column — the location field is
--     `address` (see the CREATE TABLE above);
--   * the task requires a local path under /root/theft, so write to
--     LOCAL DIRECTORY '/root/theft/result03' instead of the HDFS path
--     '/user/hadoop/result03'.
INSERT OVERWRITE LOCAL DIRECTORY '/root/theft/result03'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT address, count(*) as freq
FROM project.theft
GROUP BY address
ORDER BY freq DESC
LIMIT 1;



# 7. Case subtype with the smallest total economic loss, written to the cloud
# host at /root/theft/result04/000000_0. (The query was missing from the
# sheet; added following the pattern of the other tasks.)

insert overwrite local directory '/root/theft/result04'
row format delimited fields terminated by '\t'
-- loss is text like '1200元': strip the trailing '元' and cast to double
select case_subtype, sum(cast(split(loss,'元')[0] as double)) sum_loss
from project.theft
group by case_subtype
order by sum_loss asc
limit 1;
#8.统计在A市C区案件中的各案件损失金额前三的副类别,结果写入云主机/root/theft/result05/000000_0文件中

# 8. Top-3 case subtypes by total loss among cases in A市C区, written to the
# cloud host at /root/theft/result05/000000_0.
# BUG FIX: the original spelled 'local' and 'limit' with the digit 1
# ('1ocal', '1imit'), which is a Hive syntax error; also cast the loss
# amount explicitly instead of relying on implicit string-to-number coercion.
insert overwrite local directory '/root/theft/result05'
row format delimited fields terminated by '\t' 
select case_subtype,sum(cast(split(loss,'元')[0] as double)) sum_loss
from project.theft
where address='A市C区'
group by case_subtype
order by sum_loss desc limit 3;


#9.统计盗窃居民小区车辆案件数,结果写入云主机/root/theft/result06/000000_0文件中

-- Number of cases whose subtype is '盗窃居民小区车辆' (vehicle theft in a
-- residential community); the count goes to /root/theft/result06/000000_0.
INSERT OVERWRITE LOCAL DIRECTORY '/root/theft/result06'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT COUNT(case_subtype) AS num
FROM project.theft
WHERE case_subtype = '盗窃居民小区车辆';

# 10. Case subtype with the largest total economic loss in June 2021 (by
# report time), written to the cloud host at /root/theft/result07/000000_0.
# (The query was missing from the sheet; month literal is zero-padded to
# '2021年06月' to match the 8-character substr, same format as task 4.)

insert overwrite local directory '/root/theft/result07'
row format delimited fields terminated by '\t'
select case_subtype, sum(cast(split(loss,'元')[0] as double)) sum_loss
from project.theft
where substr(report_time,1,8)='2021年06月'
group by case_subtype
order by sum_loss desc
limit 1;
#https://blog.csdn.net/lulu001128/article/details/131031727
posted @ 2023-06-09 23:24  Cuckoo~  阅读(81)  评论(0)    收藏  举报