34.spark读取本地文件转换为json保存到hadoop上
0.目录结构

路径:/home/makexu/zqzjhive;导出的内容放在 /home/makexu/zqzjhive/log/ 目录下。zq_hive_task.sh 负责删除 7 天前的文件,并调用 dump.py 生成昨天的文件。
1.文件的准备
# mysql CLI command template for the CSA database; shared by the dump_* helpers
# below.  latin1 is forced presumably to round-trip the stored GBK/GB2312 bytes
# unchanged -- TODO confirm against the database's actual charset.
database_cmd.MYSQL_CMD_DB2 = MYSQL_CMD_DB2 = """mysql -h'%s' -u'%s' -p'%s' -P'%s' --default-character-set=latin1 %s""" % ( DATABASE_HOST_CSA, DATABASE_USER_CSA, DATABASE_PASSWORD_CSA, DATABASE_PORT_CSA, DATABASE_NAME_CSA)
def dump_question():
temp = "%s/question_tmp.sql" % database_cmd.LOG_BASE_PATH
filename = "%s/question.sql" % database_cmd.LOG_BASE_PATH
filename = filename.replace("\\", "/")
cmd = "%s -e \"select * from question where qtime>='%s' and qtime<='%s'\" > %s" % (
database_cmd.MYSQL_CMD_DB2, bdts, edts, temp)
print cmd
call(cmd, shell=True)
linedatas = [line for line in open(temp).readlines()]
data = common.g2unicode(''.join(linedatas))
fileWriter = open(filename, 'w')
fileWriter.write(data)
fileWriter.close()
def dump_answer():
temp = "%s/answer_tmp.sql" % database_cmd.LOG_BASE_PATH
filename = "%s/answer.sql" % database_cmd.LOG_BASE_PATH
filename = filename.replace("\\", "/")
cmd = "%s -e \"select * from answer where atime>='%s' and atime<='%s'\" > %s" % (
database_cmd.MYSQL_CMD_DB2, bdts, edts, temp)
print cmd
call(cmd, shell=True)
linedatas = [line for line in open(temp).readlines()]
data = common.g2unicode(''.join(linedatas))
fileWriter = open(filename, 'w')
fileWriter.write(data)
fileWriter.close()
def dump_answer_evaluation():
filename = "%s/answer_evaluation.sql" % database_cmd.LOG_BASE_PATH
filename = filename.replace("\\", "/")
cmd = "%s -e \"select * from answer_evaluation where eval_time>='%s' and eval_time<='%s'\" > %s" % (
database_cmd.MYSQL_CMD_DB2, bdts, edts, filename)
print cmd
call(cmd, shell=True)
2.启动脚本参数配置:若要读取本地文件(该文件只存在于本机,其他节点上没有),则需要将 master 设置为 local。
#!/bin/bash
# Run relative to the script's own location; quoted so a path containing
# whitespace no longer breaks the cd (the unquoted backtick form did).
cd "$(dirname "$0")"
curdir=`pwd`
# Comma-separated list of hive jars, excluding guava (avoids clashing with
# the guava version bundled in Spark).
HIVE_CLASSPATH=$(find /usr/lib/hive/lib/ -name '*.jar' -not -name 'guava*' -print0 | sed 's/\x0/,/g')
# source /home/env/sparkenv/bin/activate
#./fetch_regex.sh
# Processing date: honour a pre-set $dt from the environment, else today.
if [[ -z "$dt" ]]; then
    dt=`date "+%Y-%m-%d"`
fi
echo $dt
shift
hour=`date "+%H"`
# 10# forces base-10: %H is zero-padded, and "08"/"09" would otherwise be
# rejected by bash arithmetic as invalid octal literals.
if (( 10#$hour < 18 )); then
    hour=13
else
    hour=21
fi
# Kerberos ticket for the sparkuser principal -- needed for HDFS/Hive access.
kinit -kt /home/hadoop/keytab/sparkuser.keytab sparkuser@HADOOP.163.GZ
#SPARK_SUBMIT="/home/spark-1.5.1-bin-without-hadoop/bin/spark-submit"
SPARK_SUBMIT="/home/workspace/spark-2.1.1-bin-hadoop2.6/bin/spark-submit"
#SPARK_SUBMIT="/home/workspace/kukulcan/spark-2.0.2-bin-hadoop2.6/bin/spark-submit"
#SPARK_SUBMIT='/usr/bin/spark-submit'
#MASTER="local[12]"
# local master: the input .sql dumps exist only on this machine, so the job
# cannot run on remote executors.
MASTER="local[2]"
# Hadoop runtime classpath plus lzo and MR1 jars; reused for library paths.
TOTAL_CP=$(/usr/bin/hadoop classpath):/usr/lib/hadoop/lib/:/usr/lib/hadoop/lib/native:/usr/lib/hadoop/lib/hadoop-lzo.jar:/usr/lib/hadoop-0.20-mapreduce/hadoop-core-2.6.0-mr1-cdh5.5.2.jar
export SPARK_LIBRARY_PATH=${TOTAL_CP}
export LD_LIBRARY_PATH=${TOTAL_CP}
export HADOOP_CONF_DIR=/etc/hive/conf
export SPARK_PRINT_LAUNCH_COMMAND=1
export TMPDIR=/home/tmp
export NUM_EXEC=200
export DEBUG=1
# Newline-separated list of extra jars to ship with the job; collapsed to a
# comma-separated list below for --jars / --driver-class-path.
# NOTE(review): the triple quotes are not a heredoc -- bash parses them as an
# empty string glued to a normal double-quoted string; works, but is really
# just ordinary double quoting.
JAR_LIST="""/usr/lib/hadoop/lib/hadoop-lzo.jar
/usr/lib/hadoop-0.20-mapreduce/hadoop-core-2.6.0-mr1-cdh5.5.2.jar
/usr/lib/hive/lib/fastjson.jar
/usr/lib/hive/lib/neop-hive.jar
/usr/lib/hive/lib/commons-beanutils-1.7.0.jar
/usr/lib/hive/lib/commons-beanutils-core-1.8.0.jar
/usr/lib/hive/lib/hive-hwi-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-shims-scheduler-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-ant-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-contrib-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-exec-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-shims-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-jdbc-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-metastore-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-common-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-service-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-jdbc-1.1.0-cdh5.5.2-standalone.jar
/usr/lib/hive/lib/hive-shims-0.23-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-serde-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-shims-common-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-cli-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/neop-hadoop-lzo.jar
"""
###TOTAL_CP_COMMA=/usr/lib/hadoop/lib/hadoop-lzo.jar,${HIVE_CLASSPATH},/usr/lib/hadoop-0.20-mapreduce/hadoop-core-2.6.0-mr1-cdh5.5.2.jar
# Collapse the newline-separated JAR_LIST into a comma-separated list:
# unquoted $JAR_LIST is word-split (newlines become spaces), then sed
# replaces each space with a comma.
TOTAL_CP_COMMA=`echo $JAR_LIST | sed 's/ /,/g'`
SCRIPT=`pwd`
# NOTE(review): this is dirname of the *current directory*, i.e. its parent,
# not the script's own directory as the variable name suggests.  It is only
# used in the echo below and in commented-out lines, so it is left as-is.
SCRIPTPATH=`dirname $SCRIPT`
echo "scriptpath:$SCRIPTPATH"
#--driver-java-options "-Dlog4j.configuration=file://${SCRIPTPATH}/log4j.properties" \
#$SPARK_SUBMIT --master $MASTER --queue root.tech \
# --num-executors $NUM_EXEC \
# --driver-class-path ${TOTAL_CP_COMMA} \
# --files /etc/hive/conf/hive-site.xml \
# --driver-java-options "-Dlog4j.configuration=file://${SCRIPTPATH}/l10.prod.log4j.prop" \
# --conf "spark.executor.extraClassPath=${TOTAL_CP_COMMA}" \
# --conf "spark.executor.extraLibraryPath=${TOTAL_CP}" \
# --conf "spark.eventLog.enabled=true" \
# --conf "spark.eventLog.compree=true" \
# --conf "spark.eventLog.dir=hdfs://neophdfs/user/spark/applicationHistory" \
# --py-files logic.py,mod_score.py,mod_score_l10.py \
# --jars ${TOTAL_CP_COMMA} \
# --executor-memory 1500M --driver-memory 8G calc_l10.py $dt
# --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer"\
# Submit dump.py: it reads the local SQL dumps, converts rows to JSON lines
# and writes them to HDFS.  Comments cannot be interleaved in a
# line-continued command, hence this block up front.
# NOTE(review): the dynamicAllocation min/max executor settings have no
# effect under a local[2] master -- presumably leftovers from a yarn run.
$SPARK_SUBMIT --master $MASTER --queue root.tech \
--driver-class-path ${TOTAL_CP_COMMA} \
--files /etc/hive/conf/hive-site.xml \
--conf "spark.sql.shuffle.partitions=10000" \
--driver-java-options "-Dlog4j.configuration=file:///home/makexu/zqzjhive/zqzj.prod.log4j.prop" \
--conf "spark.executor.extraClassPath=${TOTAL_CP_COMMA}" \
--conf "spark.executor.extraLibraryPath=${TOTAL_CP}" \
--conf "spark.eventLog.enabled=true" \
--conf "spark.eventLog.compress=true" \
--conf "spark.eventLog.dir=hdfs://neophdfs/user/spark/applicationHistory" \
--conf "spark.local.dir=/home/tmp" \
--conf "spark.driver.maxResultSize=4g" \
--py-files zq_hive.py \
--jars ${TOTAL_CP_COMMA} \
--conf "spark.dynamicAllocation.enabled=true" \
--conf "spark.shuffle.service.enabled=true" \
--conf "spark.dynamicAllocation.minExecutors=80" \
--conf "spark.dynamicAllocation.maxExecutors=1000" \
--conf "spark.speculation=true" \
--executor-memory 3000M --driver-memory 16G dump.py -h $hour $dt
#monday=`python -c 'import datetime; print str((datetime.datetime.now() - datetime.timedelta(days=7) + datetime.timedelta(days=-(datetime.datetime.now().weekday()))).strftime("%Y-%m-%d 00:00:00"))[0:10].replace("-","")
#'`
#sudo -u hive kinit -kt /home/hadoop/keytab/sparkuser.keytab sparkuser@HADOOP.163.GZ
#sudo -u sparkuser kinit -kt /home/hadoop/keytab/sparkuser.keytab sparkuser@HADOOP.163.GZ
#
#sudo -u hive hive --auxpath /usr/lib/hive/lib/neop-hive.jar:/usr/lib/hive/lib/fastjson.jar:/usr/lib/hive/lib/neop-hadoop-lzo.jar -e "MSCK REPAIR TABLE sparkuser.l10_weekly"
#
#cd /home/workspace/hbase-importer/app && sudo -u sparkuser ./start-spark-hbase-importer.sh sparkuser.l10_weekly g37:weekly_${monday} "$monday"
#cd $curdir && ./clean_hdfs.sh
3.转换上传代码
def start_save(para):
    """Read the dumped tab-separated .sql files from local disk, turn every
    row into one JSON line per record, and save each table to HDFS under a
    date=YYYYMMDD partition for yesterday.
    """
    src_root = 'file:///home/makexu/zqzjhive/log'
    split_tabs = lambda line: line.split('\t')
    rdd_question = para.sc.textFile('%s/question.sql' % src_root).map(split_tabs)
    rdd_answer = para.sc.textFile('%s/answer.sql' % src_root).map(split_tabs)
    rdd_answer_evaluation = para.sc.textFile('%s/answer_evaluation.sql' % src_root).map(split_tabs)

    # Column layouts of the three dumps, in table column order.
    answer_fields = ('answerid', 'qid', 'atime', 'gid', 'content',
                     'handletime', 'expire_time', 'evaluate_way')
    question_fields = ('qid', 'mid', 'sid', 'pid', 'gid', 'uid', 'level',
                       'ispublish', 'replytype', 'qtime', 'qip', 'fetchtime',
                       'lasthandletime', 'status', 'tagcolor', 'hotspot',
                       'contentxml', 'qtitle', 'replynum', 'repute',
                       'priority', 'is_new', 'hidden')
    evaluation_fields = ('id', 'aid', 'pid', 'gid', 'gm_group_id', 'keyid',
                         'evaluation', 'eval_time', 'answer_time', 'sid')

    def to_json(fields):
        # Build a row converter for one schema.  Rows whose column count
        # does not match yield None, exactly like the original per-table
        # helper functions.
        def convert(cols):
            if len(cols) == len(fields):
                return json.dumps(dict(zip(fields, cols)))
        return convert

    def target(table):
        # HDFS output partition for yesterday's date (YYYY-MM-DD -> YYYYMMDD).
        return '/home/workspace/zq/kefu/%s/date=%s' % (
            table, str(get_date(-1)).replace('-', ''))

    rdd_answer.map(to_json(answer_fields)).saveAsTextFile(target('answer'))
    rdd_question.map(to_json(question_fields)).saveAsTextFile(target('question'))
    rdd_answer_evaluation.map(to_json(evaluation_fields)).saveAsTextFile(target('answer_evaluation'))
if __name__ == '__main__':
print bdts
print edts
dump_question()
dump_answer()
dump_answer_evaluation()
product = 'zq'
sc = None
try:
print 'link...'
conf = SparkConf()
app_name = "mark.zqzj.save.hive" + ".%s" % bdts
sc, hsc = get_context(conf, appName=app_name)
print 'link success'
uid_dict = {}
b_uid_set = sc.broadcast(set(uid_dict.keys()))
param = Param(sc, bdts, product, uid_dict, b_uid_set, hsc)
print time.strftime('%Y-%m-%d %X', time.localtime())
start_save(param)
except:
get_logger().error(traceback.format_exc())
raise
finally:
if sc:
sc.stop()
print 'main quit'
http://www.cnblogs.com/makexu/

浙公网安备 33010602011771号