30. After saving job results to HDFS with saveAsTextFile, how to import that data into HBase
1. Project
# 1. Generate the weekly report result file part-10000
calc_all_g37.py
import json

def json_data(row):
    # Build one HBase record per row: the row key is the reversed role_id
    # (string reversal), and the row's fields are stored as JSON under 'info:'.
    o = row.asDict()
    o_hive = {
        'row_key': str(row['role_id'])[::-1],
        'value': {
            'info:': json.dumps(o)
        }
    }
    return json.dumps(o_hive)
MONDAY=20171002
rets.rdd.setName('mark.g37').map(json_data).saveAsTextFile('/home/workspace/g37/weekly/date=%s' % MONDAY)
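As a quick sanity check, json_data can be tried on a single Row before running the full job (the field values here are made up):
from pyspark.sql import Row

sample = Row(role_id=123456789, server='s1', role_name='demo')  # hypothetical values
print(json_data(sample))
# -> one JSON line whose 'row_key' is the reversed role_id ('987654321')
#    and whose 'value' maps 'info:' to the row fields serialized as JSON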
# 2. Create an external Hive table over that HDFS directory (the RegexSerDe pattern '^(.*?)$' puts each whole line into the single source column)
CREATE EXTERNAL TABLE `g37_weekly`(
`source` string COMMENT '')
PARTITIONED BY (
`date` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
'columns.comments'='1',
'input.regex'='^(.*?)$')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://neophdfs/home/workspace/g37/weekly'
# 3. Repair the Hive table partitions with the MSCK command (run from a script)
The principle is quite simple: when it runs, Hive finds partition directories that exist under the table's HDFS path but are missing from the metastore, and adds their partition metadata to the metastore.
sudo -u hive hive --auxpath /usr/lib/hive/lib/neop-hive.jar:/usr/lib/hive/lib/fastjson.jar:/usr/lib/hive/lib/neop-hadoop-lzo.jar -e "MSCK REPAIR TABLE sparkuser.g37_weekly"
The first run prints a message along the lines of: Tables not in metastore: g37_weekly
Run it again and it completes normally.
# 4. At this point, querying the table in Hive returns the data
# 5. Check the partitions: show partitions g37_weekly
date=20171002
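Since the RegexSerDe leaves the whole JSON line in the single source column, it can be unpacked with get_json_object when querying; a minimal check of steps 4 and 5 from PySpark (assuming a Hive-enabled SparkSession named spark):
df = spark.sql(
    "SELECT get_json_object(source, '$.row_key') AS row_key, source "
    "FROM sparkuser.g37_weekly WHERE `date` = '20171002'")
df.show(2, truncate=False)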
# 6. Create the HBase table in the hbase shell
create 'g37:weekly_20171002', 'info'
# 7. Import the data into HBase:
monday=20171002
cd /home/workspace/hbase-importer/app && sudo -u sparkuser ./start-spark-hbase-importer.sh sparkuser.g37_weekly g37:weekly_${monday} "$monday"
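start-spark-hbase-importer.sh is an in-house script, so its internals are not shown here. For reference only, a generic way to push the same JSON lines into HBase from PySpark is the TableOutputFormat route from the Spark examples (a sketch, not the actual importer: the ZooKeeper host is made up, a SparkContext named sc is assumed, and the Python converters require the spark-examples jar on the classpath):
import json

def to_put(line):
    # (row key, [row key, column family, qualifier, value]) as expected by StringListToPutConverter
    d = json.loads(line)
    return (d['row_key'], [d['row_key'], 'info', '', d['value']['info:']])

conf = {
    "hbase.zookeeper.quorum": "zk-host",  # hypothetical ZooKeeper quorum
    "hbase.mapred.outputtable": "g37:weekly_20171002",
    "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
    "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
    "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable",
}

sc.textFile('/home/workspace/g37/weekly/date=20171002') \
    .map(to_put) \
    .saveAsNewAPIHadoopDataset(
        conf=conf,
        keyConverter="org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter",
        valueConverter="org.apache.spark.examples.pythonconverters.StringListToPutConverter")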
# 8. Check whether the data was imported into HBase
scan 'g37:weekly_20171002',{LIMIT=>2}
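The same check can also be done from Python over the HBase Thrift gateway (optional; assumes the happybase package and a running Thrift server, and the host name below is made up):
import happybase

conn = happybase.Connection('hbase-thrift-host')  # hypothetical Thrift host
table = conn.table('g37:weekly_20171002')
for key, data in table.scan(limit=2):  # same as LIMIT=>2 in the shell scan
    print(key, data)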
2. MSCK
I. Introduction
Hive partitions are usually added with ALTER TABLE ... ADD PARTITION, but sometimes partition directories are copied straight into the table directory with HDFS put/cp commands; with many directories that means running many ALTER statements, which is tedious. Hive provides a "Recover Partition" feature for exactly this case, invoked as MSCK REPAIR TABLE.
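For a single directory, the manual equivalent of what MSCK automates looks like this (a sketch via spark.sql; the same statement also works in the Hive CLI):
spark.sql(
    "ALTER TABLE sparkuser.g37_weekly ADD IF NOT EXISTS PARTITION (`date`='20171002') "
    "LOCATION 'hdfs://neophdfs/home/workspace/g37/weekly/date=20171002'")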
Save the results to HDFS (a second example, this time for the g83 weekly data)
import json

def get_josn(row):
    # Flatten one row into the three columns of the Hive table created below:
    # role_id \t server \t source, where source holds the full record as JSON.
    o = {
        'role_id': row['role_id'],
        'server': row['server'],
        'role_name': row['role_name'],
        'os_name': row['os_name'],
        'total_num': row['total_num'] if row['total_num'] else 0,
        'top1_num': row['top1_num'] if row['top1_num'] else 0,
        'top10_num': row['top10_num'] if row['top10_num'] else 0,
        'kill_num': row['kill_num'] if row['kill_num'] else 0,
        'top_kill': row['top_kill'] if row['top_kill'] else 0,
        'all_num_lastdate': Sunday  # Sunday: the week's end date, defined elsewhere in the job
    }
    return row['role_id'] + '\t' + row['server'] + '\t' + json.dumps(o)
rets.na.fill(0).rdd.saveAsPickleFile('/home/workspace/g83/career/week/pickle_'+Sunday)
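The saveAsPickleFile call above writes a binary copy of the rows. For the tab-delimited Hive table created next to pick them up, the mapped lines would also be written as plain text into a date=... partition directory; a sketch, assuming the same date=YYYYMMDD layout as in the g37 example:
rets.na.fill(0).rdd \
    .map(get_josn) \
    .saveAsTextFile('/home/workspace/g83/career/week/date=' + Sunday)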
Create the Hive table
CREATE EXTERNAL TABLE `sparkuser.g83_week`(
`role_id` string,
`server` string,
`source` string)
PARTITIONED BY (
`date` string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://neophdfs/home/workspace/g83/career/week'
Repair (add) the partitions
sudo -u hive hive --auxpath /usr/lib/hive/lib/neop-hive.jar:/usr/lib/hive/lib/fastjson.jar:/usr/lib/hive/lib/neop-hadoop-lzo.jar -e "MSCK REPAIR TABLE sparkuser.g83_week"
or, from inside the Hive CLI:
hive> msck repair table sparkuser.g83_week;

II. Notes
Every time new files are generated on HDFS, run the repair again; otherwise the new partition is never associated with the table and its data cannot be queried from Hive.
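If the pipeline itself runs from PySpark, the repair can be issued right after each write (a sketch, assuming a Hive-enabled SparkSession named spark):
# run right after each new partition directory is written
spark.sql("MSCK REPAIR TABLE sparkuser.g83_week")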