31. Spark: RDD and DataFrame
0. Preface
Useful references:
http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#
https://stackoverflow.com/questions/37580782/pyspark-collect-set-or-collect-list-with-groupby
http://blog.csdn.net/sparkexpert/article/details/51042970
1. My understanding
A: First obtain the SparkContext
MONDAY = '20171002'
print 'link...'
conf = SparkConf()
app_name = "mark.charts.g37.weekly" + ".%s" % MONDAY
sc, hsc = get_context(conf, appName=app_name)
print 'link success'
B: Save the intermediate variables in a namedtuple and pass it as the parameter for the calculation
from collections import namedtuple
Param = namedtuple('Param', 'sc dt product uid_dict b_uid_set hsc')
param = Param(sc, dt, product, uid_dict, b_uid_set, hsc)
calc_g37(param)
C: Take the parameter and start the calculation:
The first kind needs no further processing. The result set is usually fairly small, for example the dungeon statistics via count(*). The SQL is not actually executed at this point; it only runs when the final action is performed during result aggregation (a minimal illustration follows the function below).
# Dungeon (fuben) statistics
def calc_g37_fuben(para):
global MONDAY
global SUNDAY
# run counts per dungeon type
sql_fuben = """
select a.role_id,o.yuhun_cnt,b.xiezu_cnt,c.juexing_cnt,d.haoyoufb_cnt,ee.shiju_cnt,f.nianshou_cnt,g.tupo_cnt
from(
select role_id
from g37.logoutrole
where date between {monday} and {sunday}
group by role_id
) as a
left join(
select role_id,count(*) as yuhun_cnt
from g37.battle_yuhun
where date between {monday} and {sunday}
group by role_id) as o on (a.role_id = o.role_id)
left join(
select role_id,count(*) as xiezu_cnt
from g37.battle_xiezu
where date between {monday} and {sunday}
group by role_id) as b on (a.role_id = b.role_id)
left join(
select role_id,count(*) as juexing_cnt
from g37.battle_juexing
where date between {monday} and {sunday}
group by role_id) as c on (a.role_id = c.role_id)
left join(
select role_id,count(*) as haoyoufb_cnt
from g37.battle_haoyoufb
where date between {monday} and {sunday}
group by role_id) as d on (a.role_id = d.role_id)
left join(
select role_id,count(*) as shiju_cnt
from g37.battle
where date between {monday} and {sunday}
group by role_id) as ee on (a.role_id = ee.role_id)
left join(
select role_id,count(*) as nianshou_cnt
from g37.battle
where date between {monday} and {sunday}
group by role_id) as f on (a.role_id = f.role_id)
left join(
select role_id,count(*) as tupo_cnt
from g37.battle_singlebattle
where date between {monday} and {sunday}
group by role_id) as g on (a.role_id = g.role_id)
""".format(monday=MONDAY, sunday=SUNDAY)
print 'fuben complete'
return para.hsc.sql(sql_fuben)
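The laziness can be checked with a tiny stand-alone sketch (illustrative only; it assumes hsc is the HiveContext returned by get_context and that the table exists):
# hypothetical example: building the DataFrame launches no Spark job
df = hsc.sql("select role_id, count(*) as cnt from g37.battle_yuhun "
             "where date between 20171002 and 20171008 group by role_id")
print df.count()  # only this action actually runs the SQL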
The second kind does need processing: the rows usually carry a lot of useless information, so the DataFrame has to be wrapped and filtered a second time. This splits into a few sub-cases.
The simplest can be handled with a delimiter. For example the skin (pifu) statistics, where the result ends up in the form 1,2,3,4 (see the small sketch after this snippet).
sql='''
select role_id,concat_ws(',',collect_list(fn.json(source,"$.currency_id"))) as pifu
from g37.currency
where fn.json(source,'$.code_op') = '1'
and date between {monday} and {sunday}
group by role_id
'''.format(monday=MONDAY, sunday=SUNDAY)
para.hsc.sql(sql)
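On the Python side the delimited column is simply split back into a list; a small sketch with a made-up value, mirroring how json_data() in the full script handles the pifu column:
row_pifu = "1,2,3,4"  # illustrative value of the pifu column produced by concat_ws above
pifu_list = row_pifu.strip(',').split(',') if row_pifu and row_pifu.strip() else []
print pifu_list  # ['1', '2', '3', '4']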
A slightly more complex case needs nested concatenation: for example the yuhun statistics, which carry not only an id but also a star rating, so the result ends up in the form 1:123,2:456,3:789,4:987,5:654,6:321 (parsing is sketched after the function below).
# Yuhun statistics
def calc_g37_yuhun(para):
global MONDAY
global SUNDAY
sql_yuhun = """
select role_id,concat_ws(',',collect_list(concat_ws(":", fn.json(source,"$.detail.qua"), fn.json(source,"$.detail.itemId")))) as detail
from g37.currency_yuhun
where fn.json(source,'$.code_op')='1'
and date between {monday} and {sunday}
group by role_id
""".format(monday=MONDAY, sunday=SUNDAY)
ret_yuhun = para.hsc.sql(sql_yuhun)
print 'yuhun complete'
return ret_yuhun
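Unpacking the star:id pairs into per-star buckets is just as direct; a sketch with a made-up value, mirroring the json_data() logic in the full script:
detail = "1:123,2:456,3:789"  # illustrative value of the detail column
buckets = {'1': [], '2': [], '3': [], '4': [], '5': [], '6': []}
for pair in detail.strip(',').split(','):
    star, yuhun_id = pair.split(':')
    buckets[star].append(int(yuhun_id))
print buckets  # {'1': [123], '2': [456], '3': [789], ...}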
The most complex case requires re-wrapping the DataFrame: filter out the useless information, massage the data as needed, and convert it back into a new DataFrame. The new DataFrame can then be aggregated with the functions pyspark provides, ready for the final action. The duel (doufa) statistics are an example; a condensed sketch of the pattern follows the function below.
def calc_g37_doufa(para):
global MONDAY
global SUNDAY
sql_doufa = """
select a.role_id as role_id ,b.d_role_name as d_role_name,a.source as source from
(
select role_id,fn.json(source,"$.d_role_id") as d_role_id,source
from g37.battle_doufa
where date between {monday} and {sunday}
)as a
left join
(
select distinct(role_id) ,fn.json(source,'$.role_name') as d_role_name
from g37.battle_doufa
where date between {monday} and {sunday}
)as b on (a.d_role_id = b.role_id)
""".format(monday=MONDAY, sunday=SUNDAY)
# process the data and return a new (role_id, json) record
def json_data_doufa(df):
o = {
'role_id': '',
'role_name': '',
'role_lv': '',
'duan': '',
'score': '',
'battle_result': '',
'd_role_id': '',
'd_role_name': '',
'd_role_lv': '',
'battle_list': '',
'd_battle_list': ''
}
if df['source'] is not None and df['source'].strip():  # make sure the source string is not empty
doufa = json.loads(df['source'])
o = {
'role_id': doufa['role_id'],
'role_name': doufa['role_name'],
'role_lv':doufa['role_lv'],
'duan': doufa['duan'],
'score': doufa['score'],
'battle_result':doufa['battle_result'],
'd_role_id': doufa['d_role_id'],
'd_role_name': df['d_role_name'],
'd_role_lv': doufa['d_role_lv'],
'battle_list': doufa['battle_list'],
'd_battle_list': doufa['d_battle_list']
}
return df['role_id'], json.dumps(o)
print time.strftime('%Y-%m-%d %X', time.localtime())
# ret_doufa.rdd.map(json_data_doufa).saveAsTextFile('/home/workspace/g37/weekly/date=%s' % MONDAY)
ret_doufa = para.hsc.sql(sql_doufa).rdd.map(json_data_doufa)
schema = StructType([
StructField("role_id", StringType(), True),
StructField("source", StringType(), True)])
sqlContext = HiveContext(para.sc)
ret_doufa = sqlContext.createDataFrame(ret_doufa, schema)
ret_doufa.printSchema()
ret_doufa = ret_doufa.groupby("role_id").agg(F.collect_set("source")).withColumnRenamed('collect_set(source)', 'source')
ret_doufa.printSchema()
def save_fun(df):
info_doufa = []
if df['source'] is not None:  # make sure it is not empty
for doufa in df['source']:
doufa = json.loads(doufa)
duan = doufa['duan']
score = doufa['score']
battle_list = doufa['battle_list']
battle_list_final = []
d_battle_list = doufa['d_battle_list']
d_battle_list_final = []
if battle_list is not None and len(battle_list) > 0:
for item in battle_list:
o_t = {
'star': item['star'],
'level': item['level'],
'shishen_id': item['shishen_id'],
}
battle_list_final.append(o_t)
if d_battle_list is not None and len(d_battle_list) > 0:
for item in d_battle_list:
o_t = {
'star': item['star'],
'level': item['level'],
'shishen_id': item['shishen_id'],
}
d_battle_list_final.append(o_t)
o = {
'role_id': doufa['role_id'],
'role_name': doufa['role_name'],
'role_lv': doufa['role_lv'] if doufa['role_lv'] is not None else 0,
'duan': duan,
'score': score,
'battle_result': doufa['battle_result'],
'd_role_name': doufa['d_role_name'],
'd_role_lv': doufa['d_role_lv'] if doufa['d_role_lv'] is not None else 0,
'd_role_id': doufa['d_role_id'],
'battle_list': battle_list_final,
'd_battle_list': d_battle_list_final
}
info_doufa.append(o)
o = {
'role_id': df['role_id'],
'doufa': info_doufa
}
return json.dumps(o)
# ret_doufa.rdd.map(save_fun).saveAsTextFile('/home/workspace/g37/weekly/date=%s' % MONDAY)
# print ret_doufa.count()
# for row in ret_doufa.collect():
# print row
print 'doufa complete'
return ret_doufa
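Stripped to its essentials, the doufa pattern is: map each Row to a (role_id, json string) pair, rebuild a DataFrame with an explicit schema, then aggregate with collect_set. A minimal sketch (sc, hsc and sql_doufa are assumed to be defined as above):
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([StructField("role_id", StringType(), True),
                     StructField("source", StringType(), True)])
# 1. reshape each Row into a (key, json-string) pair
pairs = hsc.sql(sql_doufa).rdd.map(lambda row: (row['role_id'], row['source']))
# 2. rebuild a DataFrame with the explicit schema
df = HiveContext(sc).createDataFrame(pairs, schema)
# 3. collect every record belonging to the same role_id into one set
grouped = df.groupBy("role_id").agg(F.collect_set("source")) \
            .withColumnRenamed('collect_set(source)', 'source')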
2. Problems and code from the project
The DataFrame returned by each step is joined with the others, and the action is performed at the very end. The full code is listed after the short sketch below.
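A condensed, illustrative sketch of that flow (it trims the column selection to two fields and assumes param, MONDAY and the calc_* functions from the listing):
import json

rets_fuben = calc_g37_fuben(param)    # lazy DataFrame, no SQL has run yet
rets_ziyuan = calc_g37_ziyuan(param)  # lazy DataFrame
rets = rets_ziyuan.join(rets_fuben, rets_ziyuan["role_id"] == rets_fuben["role_id"], "outer") \
                  .select(rets_ziyuan.role_id, rets_fuben.yuhun_cnt)
# nothing has executed so far; the action below runs all the SQL and the joins
rets.rdd.map(lambda row: json.dumps(row.asDict())) \
    .saveAsTextFile('/home/workspace/g37/weekly/date=%s' % MONDAY)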
#!/usr/bin/env python
# encoding=utf-8
import json
import os
import traceback
import get_date
from pyspark import SparkConf
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from pyspark.sql import functions as F
import threading
import base64
import time
from collections import namedtuple
LOGGER = None
MONDAY, SUNDAY = get_date.getDayLastWeekMondayAndSunday()
MONDAY = MONDAY.replace('-', '')
SUNDAY = SUNDAY.replace('-', '')
NUM_EXEC = None
DEBUG = None
threads = []
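# shishen.txt holds a Python dict literal mapping rarity keys '5'/'4'/'3'/'2' to lists of shikigami ids (ssr/sr/r/n)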
f = open("shishen.txt", "r")
sss = eval(f.readline())
ssrs = sss['5']
srs = sss['4']
rs = sss['3']
ns = sss['2']
NS = 'g37'
HEADER = {
'Accept': 'application/json',
'Content-Type': 'application/json',
}
COL_NAME = 'info'
COL_CSTR = base64.b64encode(COL_NAME)
Param = namedtuple('Param', 'sc dt product uid_dict b_uid_set hsc')
def _q(obj):
global DEBUG
if DEBUG:
import pprint
import StringIO
from pyspark.sql.dataframe import DataFrame as DataFrame
io = StringIO.StringIO()
if isinstance(obj, DataFrame):
io.write(pprint.pformat(obj.take(10)))
else:
io.write(pprint.pformat(obj))
print io.getvalue()
def get_context(conf, appName=None):
import pyspark
IS_SPARK_2 = hasattr(pyspark.sql, 'SparkSession')
if not IS_SPARK_2:
from pyspark import SparkContext
get_logger().debug('init context')
sc = SparkContext(appName=appName, conf=conf)
get_logger().debug('begin to broadcast')
get_logger().debug('init hive context')
from pyspark.sql import HiveContext
hsc = HiveContext(sc)
else:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
import py4j
if os.environ.get("SPARK_EXECUTOR_URI"):
SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"])
SparkContext._ensure_initialized()
try:
# Try to access HiveConf, it will raise exception if Hive is not added
SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
spark = SparkSession.builder \
.appName(appName) \
.enableHiveSupport() \
.getOrCreate()
except py4j.protocol.Py4JError:
spark = SparkSession.builder.appName(appName).getOrCreate()
except TypeError:
spark = SparkSession.builder.appName(appName).getOrCreate()
sc = spark.sparkContext
get_logger().debug('init sparksession')
from pyspark.sql import SparkSession
hsc = SparkSession(sc).builder.appName(appName).enableHiveSupport().getOrCreate()
_q(sc.getConf().getAll())
_q(hsc.sql('show databases'))
return sc, hsc
# multithreading helper
class MyThread(threading.Thread):
def __init__(self, func, args=()):
super(MyThread, self).__init__()
self.func = func
self.args = args
def run(self):
self.result = self.func(self.args)
def get_result(self):
try:
return self.result  # if the caller does not join() the child thread, self.result may not exist yet and this raises
except Exception:
return None
def get_logger():
global LOGGER
if LOGGER is not None:
return LOGGER
import logging
logger = logging.getLogger('calc_all_g37.log')
logger.setLevel(logging.ERROR) # DEBUG
fmt = logging.Formatter('%(levelname)s:%(name)s:%(message)s')
h = logging.StreamHandler()
h.setFormatter(fmt)
logger.addHandler(h)
LOGGER = logger
return logger
# Dungeon (fuben) statistics
def calc_g37_fuben(para):
global MONDAY
global SUNDAY
# run counts per dungeon type
sql_fuben = """
select a.role_id,o.yuhun_cnt,b.xiezu_cnt,c.juexing_cnt,d.haoyoufb_cnt,ee.shiju_cnt,f.nianshou_cnt,g.tupo_cnt
from(
select role_id
from g37.logoutrole
where date between {monday} and {sunday}
group by role_id
) as a
left join(
select role_id,count(*) as yuhun_cnt
from g37.battle_yuhun
where date between {monday} and {sunday}
group by role_id) as o on (a.role_id = o.role_id)
left join(
select role_id,count(*) as xiezu_cnt
from g37.battle_xiezu
where date between {monday} and {sunday}
group by role_id) as b on (a.role_id = b.role_id)
left join(
select role_id,count(*) as juexing_cnt
from g37.battle_juexing
where date between {monday} and {sunday}
group by role_id) as c on (a.role_id = c.role_id)
left join(
select role_id,count(*) as haoyoufb_cnt
from g37.battle_haoyoufb
where date between {monday} and {sunday}
group by role_id) as d on (a.role_id = d.role_id)
left join(
select role_id,count(*) as shiju_cnt
from g37.battle
where date between {monday} and {sunday}
group by role_id) as ee on (a.role_id = ee.role_id)
left join(
select role_id,count(*) as nianshou_cnt
from g37.battle
where date between {monday} and {sunday}
group by role_id) as f on (a.role_id = f.role_id)
left join(
select role_id,count(*) as tupo_cnt
from g37.battle_singlebattle
where date between {monday} and {sunday}
group by role_id) as g on (a.role_id = g.role_id)
""".format(monday=MONDAY, sunday=SUNDAY)
print 'fuben complete'
return para.hsc.sql(sql_fuben)
# Resource (ziyuan) statistics
def calc_g37_ziyuan(para):
global MONDAY
global SUNDAY
ziyuan_sql = '''
select a.role_id,a.server,a.qiandao,o.role_name,o.role_level,a.online_cnt,g.money_get,b.money_spend,c.gouyu_get,d.gouyu_spend,ee.enegy_get,f.enegy_spend,h.pifu
from(
select role_id, server,count(*) as qiandao, sum(cast(fn.json(source,'$.online_time') as int)) as online_cnt
from g37.logoutrole
where date between {monday} and {sunday}
group by role_id,server
) as a
left join (
select role_id,fn.json(source,'$.role_name') as role_name, fn.json(source,'$.role_level') as role_level
from g37.logoutrole
where date between {monday} and {sunday}
) as o on (a.role_id = o.role_id)
left join(
select role_id,sum(cast(fn.json(source,'$.num') as int)) as money_spend
from g37.currency_money
where fn.json(source,'$.code_op') = '2'
and date between {monday} and {sunday}
group by role_id) as b on (a.role_id = b.role_id)
left join(
select role_id,sum(cast(fn.json(source,'$.num') as int)) as gouyu_get
from g37.currency_gouyu
where fn.json(source,'$.code_op') = '1'
and date between {monday} and {sunday}
group by role_id) as c on (a.role_id = c.role_id)
left join(
select role_id,sum(cast(fn.json(source,'$.num') as int)) as gouyu_spend
from g37.currency_gouyu
where fn.json(source,'$.code_op') = '2'
and date between {monday} and {sunday}
group by role_id) as d on (a.role_id = d.role_id)
left join(
select role_id,sum(cast(fn.json(source,'$.num') as int)) as enegy_get
from g37.currency_enegy
where fn.json(source,'$.code_op') = '1'
and date between {monday} and {sunday}
group by role_id) as ee on (a.role_id = ee.role_id)
left join(
select role_id,sum(cast(fn.json(source,'$.num') as int)) as enegy_spend
from g37.currency_enegy
where fn.json(source,'$.code_op') = '2'
and date between {monday} and {sunday}
group by role_id) as f on (a.role_id = f.role_id)
left join(
select role_id,sum(cast(fn.json(source,'$.num') as int)) as money_get
from g37.currency_money
where fn.json(source,'$.code_op') = '1'
and date between {monday} and {sunday}
group by role_id
) as g on (a.role_id = g.role_id)
left join(
select role_id,concat_ws(',',collect_list(fn.json(source,"$.currency_id"))) as pifu
from g37.currency
where fn.json(source,'$.code_op') = '1'
and date between {monday} and {sunday}
group by role_id
) as h on (a.role_id = h.role_id)
'''.format(monday=MONDAY, sunday=SUNDAY)
print 'ziyuan complete'
return para.hsc.sql(ziyuan_sql)
# Yuhun statistics
def calc_g37_yuhun(para):
global MONDAY
global SUNDAY
sql_yuhun = """
select role_id,concat_ws(',',collect_list(concat_ws(":", fn.json(source,"$.detail.qua"), fn.json(source,"$.detail.itemId")))) as detail
from g37.currency_yuhun
where fn.json(source,'$.code_op')='1'
and date between {monday} and {sunday}
group by role_id
""".format(monday=MONDAY, sunday=SUNDAY)
def json_data_yuhun(df):
details = json.loads("["+df['detail']+"]")
item = {'1': [], '2': [], '3': [], '4': [], '5': [], '6': []}
for detail in details:
yuhun_id = detail['itemId']
star = detail['qua']
item[str(star)].append(int(yuhun_id))
o = {
'role_id': df['role_id'],
# 'server': df['server'],
'info_yuhun': item
}
return json.dumps(o)
ret_yuhun = para.hsc.sql(sql_yuhun)
print 'yuhun complete'
return ret_yuhun
# ret = ret_yuhun.rdd.map(json_data_yuhun).collect()
# count = 0
# for row in ret:
# count += 1
# if count == 2:
# break
# print row
# print row['role_id'], row['server'], row['yuhun_cnt']
# save_ret = rs_yuhun.rdd.map(save_func).\
# saveAsTextFile('/home/workspace/kukulcan/src/g37/task/date=%s' % MONDAY)
# print 'complete'
# Shikigami (shishen) statistics
def calc_g37_shishen(para):
global MONDAY
global SUNDAY
sql_shishen = """
select role_id,concat_ws(',',collect_list(fn.json(source,"$.currency_id"))) as currency_id
from g37.currency_qys
where fn.json(source,'$.code_op')='1'
and date between {monday} and {sunday}
group by role_id
""".format(monday=MONDAY, sunday=SUNDAY)
f = open("shishen.txt", "r")
sss = eval(f.readline())
ssrs = sss['5']
srs = sss['4']
rs = sss['3']
ns = sss['2']
def json_data_shishen(df):
currency_ids = df['currency_id'].strip(',').split(',')
item = {'ssr': [], 'sr': [], 'r': [], 'n': []}
for currency_id in currency_ids:
if int(currency_id) in ssrs:
item['ssr'].append(int(currency_id))
elif int(currency_id) in srs:
item['sr'].append(int(currency_id))
elif int(currency_id) in rs:
item['r'].append(int(currency_id))
elif int(currency_id) in ns:
item['n'].append(int(currency_id))
o = {
'role_id': df['role_id'],
# 'server': df['server'],
'info_shishen': item
}
return json.dumps(o)
ret_shishen = para.hsc.sql(sql_shishen)
print 'shishen complete'
return ret_shishen
# ret = ret_shishen.rdd.map(json_data_shishen).collect()
# count = 0
# for row in ret:
# count += 1
# if count == 2:
# break
# print row
# print row['role_id'], row['server'], row['yuhun_cnt']
# save_ret = rs_yuhun.rdd.map(save_func).\
# saveAsTextFile('/home/workspace/kukulcan/src/g37/task/date=%s' % MONDAY)
# print 'complete'
# Duel (doufa) statistics
def calc_g37_doufa(para):
global MONDAY
global SUNDAY
# sql_doufa = """
# select a.role_id,concat_ws('###',collect_list(concat_ws('@@@',b.d_role_name,a.source))) as source from
# (
# select role_id,fn.json(source,"$.d_role_id") as d_role_id,source
# from g37.battle_doufa
# where date between {monday} and {sunday}
# )as a
# left join
# (
# select distinct(role_id) ,fn.json(source,'$.role_name') as d_role_name
# from g37.battle_doufa
# where date between {monday} and {sunday}
# )as b on (a.d_role_id = b.role_id)
# group by a.role_id
# """.format(monday=MONDAY, sunday=SUNDAY)
# sql_doufa = """
# select role_id ,concat_ws(',',collect_set(source)) as source
# from (
# select t.role_id as role_id,concat(
# '{',
# '\"role_id\":\"',NVL(t.role_id,''),
# '\",\"role_name\":\"',NVL(fn.json(t.source,'$.role_name'),''),
# '\",\"role_lv\":',NVL(fn.json(t.source,'$.role_lv'),''),
# ',\"d_role_id\":\"',NVL(fn.json(t.source,'$.d_role_id'),''),
# '\",\"d_role_name\":\"',NVL(b.d_role_name,''),
# '\",\"d_role_lv\":',NVL(fn.json(t.source,'$.d_role_lv'),''),
# ',\"duan\":',NVL(fn.json(t.source,'$.duan'),''),
# ',\"score\":',NVL(fn.json(t.source,'$.score'),''),
# ',\"battle_list\":',NVL(fn.json(t.source,'$.battle_list'),''),
# ',\"d_battle_list\":',NVL(fn.json(t.source,'$.d_battle_list'),''),
# ',\"score\":',NVL(fn.json(t.source,'$.score'),''),
# ',\"battle_result\":',NVL(fn.json(t.source,'$.battle_result'),''),
# '}') as source
# from (
# select role_id,fn.json(source,"$.d_role_id") as d_role_id,source
# from g37.battle_doufa
# where date between %s and %s
# )as t
# left join
# (
# select distinct(role_id) ,fn.json(source,'$.role_name') as d_role_name
# from g37.battle_doufa
# where date between %s and %s
# )as b on (t.d_role_id = b.role_id)
# )as a group by role_id
# """ % (MONDAY, SUNDAY, MONDAY, SUNDAY)#format(monday=MONDAY, sunday=SUNDAY)
# sql_doufa = """
# select role_id,concat_ws(',',collect_list(source)) as source
# from g37.battle_doufa
# where date between {monday} and {sunday}
# group by role_id
# """.format(monday=MONDAY, sunday=SUNDAY)
sql_doufa = """
select a.role_id as role_id ,b.d_role_name as d_role_name,a.source as source from
(
select role_id,fn.json(source,"$.d_role_id") as d_role_id,source
from g37.battle_doufa
where date between {monday} and {sunday}
)as a
left join
(
select distinct(role_id) ,fn.json(source,'$.role_name') as d_role_name
from g37.battle_doufa
where date between {monday} and {sunday}
)as b on (a.d_role_id = b.role_id)
""".format(monday=MONDAY, sunday=SUNDAY)
# process the data and return a new (role_id, json) record
def json_data_doufa(df):
o = {
'role_id': '',
'role_name': '',
'role_lv': '',
'duan': '',
'score': '',
'battle_result': '',
'd_role_id': '',
'd_role_name': '',
'd_role_lv': '',
'battle_list': '',
'd_battle_list': ''
}
if df['source'] is not None and df['source'].strip():  # make sure the source string is not empty
doufa = json.loads(df['source'])
o = {
'role_id': doufa['role_id'],
'role_name': doufa['role_name'],
'role_lv':doufa['role_lv'],
'duan': doufa['duan'],
'score': doufa['score'],
'battle_result':doufa['battle_result'],
'd_role_id': doufa['d_role_id'],
'd_role_name': df['d_role_name'],
'd_role_lv': doufa['d_role_lv'],
'battle_list': doufa['battle_list'],
'd_battle_list': doufa['d_battle_list']
}
return df['role_id'], json.dumps(o)
print time.strftime('%Y-%m-%d %X', time.localtime())
# ret_doufa.rdd.map(json_data_doufa).saveAsTextFile('/home/workspace/g37/weekly/date=%s' % MONDAY)
ret_doufa = para.hsc.sql(sql_doufa).rdd.map(json_data_doufa)
schema = StructType([
StructField("role_id", StringType(), True),
StructField("source", StringType(), True)])
sqlContext = HiveContext(para.sc)
ret_doufa = sqlContext.createDataFrame(ret_doufa, schema)
ret_doufa.printSchema()
ret_doufa = ret_doufa.groupby("role_id").agg(F.collect_set("source")).withColumnRenamed('collect_set(source)', 'source')
ret_doufa.printSchema()
def save_fun(df):
info_doufa = []
if df['source'] is not None:  # make sure it is not empty
for doufa in df['source']:
doufa = json.loads(doufa)
duan = doufa['duan']
score = doufa['score']
battle_list = doufa['battle_list']
battle_list_final = []
d_battle_list = doufa['d_battle_list']
d_battle_list_final = []
if battle_list is not None and len(battle_list) > 0:
for item in battle_list:
o_t = {
'star': item['star'],
'level': item['level'],
'shishen_id': item['shishen_id'],
}
battle_list_final.append(o_t)
if d_battle_list is not None and len(d_battle_list) > 0:
for item in d_battle_list:
o_t = {
'star': item['star'],
'level': item['level'],
'shishen_id': item['shishen_id'],
}
d_battle_list_final.append(o_t)
o = {
'role_id': doufa['role_id'],
'role_name': doufa['role_name'],
'role_lv': doufa['role_lv'] if doufa['role_lv'] is not None else 0,
'duan': duan,
'score': score,
'battle_result': doufa['battle_result'],
'd_role_name': doufa['d_role_name'],
'd_role_lv': doufa['d_role_lv'] if doufa['d_role_lv'] is not None else 0,
'd_role_id': doufa['d_role_id'],
'battle_list': battle_list_final,
'd_battle_list': d_battle_list_final
}
info_doufa.append(o)
o = {
'role_id': df['role_id'],
'doufa': info_doufa
}
return json.dumps(o)
print 'doufa complete'
return ret_doufa
def test_date():
global MONDAY
global SUNDAY
print MONDAY, SUNDAY
def json_data(df):
global ssrs
global srs
global rs
global ns
# yuhun
yuhun_item = {'1': [], '2': [], '3': [], '4': [], '5': [], '6': []}
if df['detail'] is not None and df['detail'].strip():  # make sure the detail string is not empty
details = df['detail'].strip(',').split(',')
for detail in details:
yuhun_id = detail.split(':')[1]
star = detail.split(':')[0]
yuhun_item[str(star)].append(int(yuhun_id))
# shishen
shishen_item = {'ssr': [], 'sr': [], 'r': [], 'n': []}
if df['currency_id'] is not None and df['currency_id'].strip():  # make sure the currency_id string is not empty
currency_ids = df['currency_id'].strip(',').split(',')
for currency_id in currency_ids:
if int(currency_id) in ssrs:
shishen_item['ssr'].append(int(currency_id))
elif int(currency_id) in srs:
shishen_item['sr'].append(int(currency_id))
elif int(currency_id) in rs:
shishen_item['r'].append(int(currency_id))
elif int(currency_id) in ns:
shishen_item['n'].append(int(currency_id))
# doufa
info_doufa = []
if df['source'] is not None:  # make sure it is not empty
# doufas = json.loads(df['source'])
for doufa in df['source']:
doufa = json.loads(doufa)
duan = doufa['duan']
score = doufa['score']
battle_list = doufa['battle_list']
battle_list_final = []
d_battle_list = doufa['d_battle_list']
d_battle_list_final = []
if battle_list is not None and len(battle_list) > 0:
for item in battle_list:
o_t = {
'star': item['star'],
'level': item['level'],
'shishen_id': item['shishen_id'],
}
battle_list_final.append(o_t)
if d_battle_list is not None and len(d_battle_list) > 0:
for item in d_battle_list:
o_t = {
'star': item['star'],
'level': item['level'],
'shishen_id': item['shishen_id'],
}
d_battle_list_final.append(o_t)
o = {
'role_id':doufa['role_id'],
'role_name': doufa['role_name'],
'role_lv': doufa['role_lv'] if doufa['role_lv'] is not None else 0,
'duan': duan,
'score': score,
'battle_result': doufa['battle_result'],
'd_role_name': doufa['d_role_name'],
'd_role_lv': doufa['d_role_lv'] if doufa['d_role_lv'] is not None else 0,
'd_role_id': doufa['d_role_id'],
'battle_list': battle_list_final,
'd_battle_list': d_battle_list_final
}
info_doufa.append(o)
role_id = df['role_id'] if df['role_id'] is not None else "role_id_null"
pifu = df['pifu'].strip(',').split(',') if df['pifu'] is not None and df['pifu'].strip() else df['pifu']
o = {
'role_id': role_id,
'server': df['server'],
'role_name': df['role_name'],
'role_level': df['role_level'],
'info_fuben': {
'yuhun_cnt': df['yuhun_cnt'],
'xiezu_cnt': df['xiezu_cnt'],
'juexing_cnt': df['juexing_cnt'],
'haoyoufb_cnt': df['haoyoufb_cnt'],
'shiju_cnt': df['shiju_cnt'],
'nianshou_cnt': df['nianshou_cnt'],
'tupo_cnt': df['tupo_cnt']
},
'info_ziyuan': {
'money_get': df['money_get'],
'money_spend': df['money_spend'],
'gouyu_get': df['gouyu_get'],
'gouyu_spend': df['gouyu_spend'],
'enegy_get': df['enegy_get'],
'enegy_spend': df['enegy_spend'],
'online_cnt': df['online_cnt'],
'pifu': pifu,
'qiandao': df['qiandao']
},
'info_yuhun': yuhun_item,
'info_shishen': shishen_item,
'info_doufa': info_doufa
}
o_hive = {
'row_key': str(role_id)[::-1],
'value': {
'info:': json.dumps(o),
}
}
return json.dumps(o_hive)
# return str(role_id)[::-1] + '\t' + json.dumps(o)
def start_cal(para):
t_fuben = MyThread(calc_g37_fuben, args=para)
t_ziyuan = MyThread(calc_g37_ziyuan, args=para)
t_yuhun = MyThread(calc_g37_yuhun, args=para)
t_shishen = MyThread(calc_g37_shishen, args=para)
t_doufa = MyThread(calc_g37_doufa, args=para)
threads.append(t_fuben)
threads.append(t_ziyuan)
threads.append(t_yuhun)
threads.append(t_shishen)
threads.append(t_doufa)
for t in threads:
t.setDaemon(True)
t.start()
# wait for all calc threads to finish before collecting their results
for t in threads:
t.join()
rets_ziyuan = t_ziyuan.get_result()
rets_fuben = t_fuben.get_result()
rets_yuhun = t_yuhun.get_result()
rets_shishen = t_shishen.get_result()
rets_doufa = t_doufa.get_result()
print 'join 1'
# resource stats joined with dungeon stats
rets = rets_ziyuan.join(rets_fuben, [rets_ziyuan["role_id"] == rets_fuben["role_id"]], "outer") \
.select(rets_ziyuan.role_id, rets_ziyuan.server, rets_ziyuan.role_name, rets_ziyuan.role_level,
rets_ziyuan.money_get, rets_ziyuan.money_spend, rets_ziyuan.gouyu_get, rets_ziyuan.gouyu_spend,rets_ziyuan.enegy_get, rets_ziyuan.enegy_spend, rets_ziyuan.online_cnt,rets_ziyuan.pifu,rets_ziyuan.qiandao,
rets_fuben.nianshou_cnt, rets_fuben.yuhun_cnt, rets_fuben.haoyoufb_cnt, rets_fuben.shiju_cnt,rets_fuben.juexing_cnt, rets_fuben.xiezu_cnt, rets_fuben.tupo_cnt
)
print 'join 2'
# add yuhun stats
rets = rets.join(rets_yuhun, [rets["role_id"] == rets_yuhun["role_id"]],"outer") \
.select(rets.role_id, rets.server, rets.role_name, rets.role_level,
rets.money_get, rets.money_spend, rets.gouyu_get, rets.gouyu_spend, rets.enegy_get, rets.enegy_spend,rets.online_cnt, rets.pifu,rets.qiandao,
rets.nianshou_cnt, rets.yuhun_cnt, rets.haoyoufb_cnt, rets.shiju_cnt, rets.juexing_cnt, rets.xiezu_cnt,rets.tupo_cnt,
rets_yuhun.detail
)
print 'join 3'
# add shishen stats
rets = rets.join(rets_shishen, [rets["role_id"] == rets_shishen["role_id"]], "outer") \
.select(rets.role_id, rets.server, rets.role_name, rets.role_level,
rets.money_get, rets.money_spend, rets.gouyu_get, rets.gouyu_spend, rets.enegy_get, rets.enegy_spend,rets.online_cnt,rets.pifu,rets.qiandao,
rets.nianshou_cnt, rets.yuhun_cnt, rets.haoyoufb_cnt, rets.shiju_cnt, rets.juexing_cnt, rets.xiezu_cnt,rets.tupo_cnt,
rets.detail,
rets_shishen.currency_id
)
print 'join 4'
# add doufa stats
rets = rets.join(rets_doufa, [rets["role_id"] == rets_doufa["role_id"]], "outer") \
.select(rets.role_id, rets.server, rets.role_name, rets.role_level,
rets.money_get, rets.money_spend, rets.gouyu_get, rets.gouyu_spend, rets.enegy_get, rets.enegy_spend, rets.online_cnt,rets.pifu,rets.qiandao,
rets.nianshou_cnt, rets.yuhun_cnt, rets.haoyoufb_cnt, rets.shiju_cnt, rets.juexing_cnt, rets.xiezu_cnt, rets.tupo_cnt,
rets.detail,
rets.currency_id,
rets_doufa.source
).distinct()
print 'join complete'
rets.printSchema()
print time.strftime('%Y-%m-%d %X', time.localtime())
print 'handle json data '
rets.rdd.setName('mark.g37').map(json_data).saveAsTextFile('/home/workspace/g37/weekly/date=%s' % MONDAY)
print time.strftime('%Y-%m-%d %X', time.localtime())
if __name__ == '__main__':
test_date()
product = 'g37'
try:
print 'link...'
conf = SparkConf()
app_name = "mark.charts.g37.weekly" + ".%s" % MONDAY
sc, hsc = get_context(conf, appName=app_name)
print 'link success'
uid_dict = {}
b_uid_set = sc.broadcast(set(uid_dict.keys()))
param = Param(sc, MONDAY, product, uid_dict, b_uid_set, hsc)
start_cal(param)
except:
get_logger().error(traceback.format_exc())
raise
print 'main quit'
sc.stop()
3. Other
A: Launch script
#!/bin/bash
cd `dirname $0`
curdir=`pwd`
HIVE_CLASSPATH=$(find /usr/lib/hive/lib/ -name '*.jar' -not -name 'guava*' -print0 | sed 's/\x0/,/g')
source /home/env/sparkenv/bin/activate
#./fetch_regex.sh
if [[ -z "$dt" ]]; then
dt=`date "+%Y-%m-%d"`
fi
echo $dt
shift
hour=`date "+%H"`
if (( 10#$hour < 18 )); then  # force base 10 so hours 08/09 are not parsed as octal
hour=13
else
hour=21
fi
kinit -kt /home/hadoop/keytab/sparkuser.keytab sparkuser@HADOOP.163.GZ
#SPARK_SUBMIT="/home/spark-1.5.1-bin-without-hadoop/bin/spark-submit"
SPARK_SUBMIT="/home/workspace/spark-2.1.1-bin-hadoop2.6/bin/spark-submit"
#SPARK_SUBMIT='/usr/bin/spark-submit'
#MASTER="local[12]"
MASTER="yarn"
TOTAL_CP=$(/usr/bin/hadoop classpath):/usr/lib/hadoop/lib/:/usr/lib/hadoop/lib/native:/usr/lib/hadoop/lib/hadoop-lzo.jar:/usr/lib/hadoop-0.20-mapreduce/hadoop-core-2.6.0-mr1-cdh5.5.2.jar
export SPARK_LIBRARY_PATH=${TOTAL_CP}
export LD_LIBRARY_PATH=${TOTAL_CP}
export HADOOP_CONF_DIR=/etc/hive/conf
export SPARK_PRINT_LAUNCH_COMMAND=1
export TMPDIR=/home/tmp
export NUM_EXEC=200
export DEBUG=1
JAR_LIST="""/usr/lib/hadoop/lib/hadoop-lzo.jar
/usr/lib/hadoop-0.20-mapreduce/hadoop-core-2.6.0-mr1-cdh5.5.2.jar
/usr/lib/hive/lib/fastjson.jar
/usr/lib/hive/lib/neop-hive.jar
/usr/lib/hive/lib/commons-beanutils-1.7.0.jar
/usr/lib/hive/lib/commons-beanutils-core-1.8.0.jar
/usr/lib/hive/lib/hive-hwi-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-shims-scheduler-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-ant-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-contrib-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-exec-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-shims-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-jdbc-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-metastore-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-common-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-service-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-jdbc-1.1.0-cdh5.5.2-standalone.jar
/usr/lib/hive/lib/hive-shims-0.23-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-serde-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-shims-common-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-cli-1.1.0-cdh5.5.2.jar
/home/workspace/kukulcan/src/g37/task/neop-hadoop-lzo.jar
"""
###TOTAL_CP_COMMA=/usr/lib/hadoop/lib/hadoop-lzo.jar,${HIVE_CLASSPATH},/usr/lib/hadoop-0.20-mapreduce/hadoop-core-2.6.0-mr1-cdh5.5.2.jar
TOTAL_CP_COMMA=`echo $JAR_LIST | sed 's/ /,/g'`
SCRIPT=`pwd`
SCRIPTPATH=`dirname $SCRIPT`
echo "scriptpath:$SCRIPTPATH"
#--driver-java-options "-Dlog4j.configuration=file://${SCRIPTPATH}/log4j.properties" \
#$SPARK_SUBMIT --master $MASTER --queue root.tech \
# --num-executors $NUM_EXEC \
# --driver-class-path ${TOTAL_CP_COMMA} \
# --files /etc/hive/conf/hive-site.xml \
# --driver-java-options "-Dlog4j.configuration=file://${SCRIPTPATH}/l10.prod.log4j.prop" \
# --conf "spark.executor.extraClassPath=${TOTAL_CP_COMMA}" \
# --conf "spark.executor.extraLibraryPath=${TOTAL_CP}" \
# --conf "spark.eventLog.enabled=true" \
# --conf "spark.eventLog.compree=true" \
# --conf "spark.eventLog.dir=hdfs://neophdfs/user/spark/applicationHistory" \
# --py-files logic.py,mod_score.py,mod_score_l10.py \
# --jars ${TOTAL_CP_COMMA} \
# --executor-memory 1500M --driver-memory 8G calc_l10.py $dt
# --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer"\
$SPARK_SUBMIT --master $MASTER --queue root.tech \
--driver-class-path ${TOTAL_CP_COMMA} \
--files /etc/hive/conf/hive-site.xml \
--conf "spark.sql.shuffle.partitions=10000" \
--driver-java-options "-Dlog4j.configuration=file:///home/workspace/kukulcan/src/g37/task/g37.prod.log4j.prop" \
--conf "spark.executor.extraClassPath=${TOTAL_CP_COMMA}" \
--conf "spark.executor.extraLibraryPath=${TOTAL_CP}" \
--conf "spark.eventLog.enabled=true" \
--conf "spark.eventLog.compress=true" \
--conf "spark.eventLog.dir=hdfs://neophdfs/user/spark/applicationHistory" \
--conf "spark.local.dir=/home/tmp" \
--conf "spark.driver.maxResultSize=4g" \
--py-files logic.py,calc_g37.py \
--jars ${TOTAL_CP_COMMA} \
--conf "spark.dynamicAllocation.enabled=true" \
--conf "spark.shuffle.service.enabled=true" \
--conf "spark.dynamicAllocation.minExecutors=80" \
--conf "spark.dynamicAllocation.maxExecutors=1000" \
--conf "spark.speculation=true" \
--executor-memory 3000M --driver-memory 16G calc_all_g37.py -h $hour $dt
monday=`python -c 'import datetime; print str((datetime.datetime.now() - datetime.timedelta(days=7) + datetime.timedelta(days=-(datetime.datetime.now().weekday()))).strftime("%Y-%m-%d 00:00:00"))[0:10].replace("-","")'`
sudo -u hive kinit -kt /home/hadoop/keytab/sparkuser.keytab sparkuser@HADOOP.163.GZ
sudo -u sparkuser kinit -kt /home/hadoop/keytab/sparkuser.keytab sparkuser@HADOOP.163.GZ
sudo -u hive hive --auxpath /usr/lib/hive/lib/neop-hive.jar:/usr/lib/hive/lib/fastjson.jar:/usr/lib/hive/lib/neop-hadoop-lzo.jar -e "MSCK REPAIR TABLE sparkuser.g37_weekly"
cd /home/workspace/hbase-importer/app && sudo -u sparkuser ./start-spark-hbase-importer.sh sparkuser.g37_weekly g37:weekly_${monday} "$monday"
cd $curdir && ./clean_hdfs.sh