34. Spark: read local files, convert them to JSON, and save them to Hadoop
0. Directory layout

Path: /home/makexu/zqzjhive. The exported data is written to /home/makexu/zqzjhive/log/. zq_hive_task.sh deletes files older than 7 days and then calls dump.py to generate the previous day's files.
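zq_hive_task.sh itself is not reproduced in this post; the following is only a sketch of the logic described above, written in Python for illustration (the retention rule, file locations and the way dump.py is invoked are assumptions):

# Sketch of zq_hive_task.sh's job (illustrative only): remove exports older than
# 7 days, then regenerate yesterday's dump files.
import os
import time
from subprocess import call

LOG_DIR = '/home/makexu/zqzjhive/log'        # directory holding the exported files
cutoff = time.time() - 7 * 24 * 3600         # anything older than 7 days is removed

for name in os.listdir(LOG_DIR):
    path = os.path.join(LOG_DIR, name)
    if os.path.isfile(path) and os.path.getmtime(path) < cutoff:
        os.remove(path)

# dump.py then produces yesterday's question/answer/answer_evaluation files; in the
# real setup this goes through the spark-submit wrapper shown in section 2
# (exact invocation assumed here)
call('python dump.py', shell=True)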
1. Preparing the files
dump.py first exports the question, answer and answer_evaluation tables from MySQL into tab-separated files for the [bdts, edts] time window:
# Excerpt from dump.py -- the module-level imports (subprocess.call, the author's
# database_cmd and common helpers, and the bdts/edts window) are defined elsewhere.
# MySQL client command line; --default-character-set=latin1 keeps the client from
# re-encoding the data, which is converted to unicode later via common.g2unicode.
database_cmd.MYSQL_CMD_DB2 = MYSQL_CMD_DB2 = """mysql -h'%s' -u'%s' -p'%s' -P'%s' --default-character-set=latin1 %s""" % (
    DATABASE_HOST_CSA, DATABASE_USER_CSA, DATABASE_PASSWORD_CSA, DATABASE_PORT_CSA, DATABASE_NAME_CSA)
def dump_question():
    # Export the question table for [bdts, edts] to a temp file, re-encode it to
    # unicode, and write the final question.sql.
    temp = "%s/question_tmp.sql" % database_cmd.LOG_BASE_PATH
    filename = "%s/question.sql" % database_cmd.LOG_BASE_PATH
    filename = filename.replace("\\", "/")
    cmd = "%s -e \"select * from question where qtime>='%s' and qtime<='%s'\" > %s" % (
        database_cmd.MYSQL_CMD_DB2, bdts, edts, temp)
    print cmd
    call(cmd, shell=True)
    linedatas = open(temp).readlines()
    data = common.g2unicode(''.join(linedatas))
    fileWriter = open(filename, 'w')
    fileWriter.write(data)
    fileWriter.close()
def dump_answer():
    # Same pattern for the answer table, filtered on atime.
    temp = "%s/answer_tmp.sql" % database_cmd.LOG_BASE_PATH
    filename = "%s/answer.sql" % database_cmd.LOG_BASE_PATH
    filename = filename.replace("\\", "/")
    cmd = "%s -e \"select * from answer where atime>='%s' and atime<='%s'\" > %s" % (
        database_cmd.MYSQL_CMD_DB2, bdts, edts, temp)
    print cmd
    call(cmd, shell=True)
    linedatas = open(temp).readlines()
    data = common.g2unicode(''.join(linedatas))
    fileWriter = open(filename, 'w')
    fileWriter.write(data)
    fileWriter.close()
def dump_answer_evaluation():
    # The answer_evaluation table is written straight to its final file; no
    # re-encoding step is applied here.
    filename = "%s/answer_evaluation.sql" % database_cmd.LOG_BASE_PATH
    filename = filename.replace("\\", "/")
    cmd = "%s -e \"select * from answer_evaluation where eval_time>='%s' and eval_time<='%s'\" > %s" % (
        database_cmd.MYSQL_CMD_DB2, bdts, edts, filename)
    print cmd
    call(cmd, shell=True)
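bdts and edts are defined elsewhere in dump.py; assuming they are the start and end of yesterday (which matches the "generate yesterday's files" behaviour described in section 0), they could be built roughly like this:

# Assumed definition of the export window: the whole of yesterday.
import datetime

yesterday = datetime.date.today() - datetime.timedelta(days=1)
bdts = yesterday.strftime('%Y-%m-%d 00:00:00')   # window start
edts = yesterday.strftime('%Y-%m-%d 23:59:59')   # window end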
2. Launch-script configuration: the input files exist only on this machine (the other nodes do not have them), so to read local files the master must be set to local
#!/bin/bash
cd `dirname $0`
curdir=`pwd`
HIVE_CLASSPATH=$(find /usr/lib/hive/lib/ -name '*.jar' -not -name 'guava*' -print0 | sed 's/\x0/,/g')
# source /home/env/sparkenv/bin/activate
#./fetch_regex.sh
if [[ -z "$dt" ]]; then
    dt=`date "+%Y-%m-%d"`
fi
echo $dt
shift
hour=`date "+%H"`
if (( $hour < 18)); then
    hour=13
else
    hour=21
fi
kinit -kt /home/hadoop/keytab/sparkuser.keytab sparkuser@HADOOP.163.GZ
#SPARK_SUBMIT="/home/spark-1.5.1-bin-without-hadoop/bin/spark-submit"
SPARK_SUBMIT="/home/workspace/spark-2.1.1-bin-hadoop2.6/bin/spark-submit"
#SPARK_SUBMIT="/home/workspace/kukulcan/spark-2.0.2-bin-hadoop2.6/bin/spark-submit"
#SPARK_SUBMIT='/usr/bin/spark-submit'
#MASTER="local[12]"
MASTER="local[2]"
TOTAL_CP=$(/usr/bin/hadoop classpath):/usr/lib/hadoop/lib/:/usr/lib/hadoop/lib/native:/usr/lib/hadoop/lib/hadoop-lzo.jar:/usr/lib/hadoop-0.20-mapreduce/hadoop-core-2.6.0-mr1-cdh5.5.2.jar
export SPARK_LIBRARY_PATH=${TOTAL_CP}
export LD_LIBRARY_PATH=${TOTAL_CP}
export HADOOP_CONF_DIR=/etc/hive/conf
export SPARK_PRINT_LAUNCH_COMMAND=1
export TMPDIR=/home/tmp
export NUM_EXEC=200
export DEBUG=1
JAR_LIST="""/usr/lib/hadoop/lib/hadoop-lzo.jar
/usr/lib/hadoop-0.20-mapreduce/hadoop-core-2.6.0-mr1-cdh5.5.2.jar
/usr/lib/hive/lib/fastjson.jar
/usr/lib/hive/lib/neop-hive.jar
/usr/lib/hive/lib/commons-beanutils-1.7.0.jar
/usr/lib/hive/lib/commons-beanutils-core-1.8.0.jar
/usr/lib/hive/lib/hive-hwi-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-shims-scheduler-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-ant-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-contrib-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-exec-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-shims-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-jdbc-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-metastore-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-common-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-service-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-jdbc-1.1.0-cdh5.5.2-standalone.jar
/usr/lib/hive/lib/hive-shims-0.23-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-serde-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-shims-common-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/hive-cli-1.1.0-cdh5.5.2.jar
/usr/lib/hive/lib/neop-hadoop-lzo.jar
"""
###TOTAL_CP_COMMA=/usr/lib/hadoop/lib/hadoop-lzo.jar,${HIVE_CLASSPATH},/usr/lib/hadoop-0.20-mapreduce/hadoop-core-2.6.0-mr1-cdh5.5.2.jar
TOTAL_CP_COMMA=`echo $JAR_LIST | sed 's/ /,/g'`
SCRIPT=`readlink -f "$0"`
SCRIPTPATH=`dirname $SCRIPT`
echo "scriptpath:$SCRIPTPATH"
    #--driver-java-options "-Dlog4j.configuration=file://${SCRIPTPATH}/log4j.properties" \
#$SPARK_SUBMIT --master $MASTER --queue root.tech \
#    --num-executors $NUM_EXEC \
#    --driver-class-path ${TOTAL_CP_COMMA}  \
#    --files /etc/hive/conf/hive-site.xml \
#    --driver-java-options "-Dlog4j.configuration=file://${SCRIPTPATH}/l10.prod.log4j.prop" \
#    --conf "spark.executor.extraClassPath=${TOTAL_CP_COMMA}" \
#    --conf "spark.executor.extraLibraryPath=${TOTAL_CP}" \
#    --conf "spark.eventLog.enabled=true" \
#    --conf "spark.eventLog.compree=true" \
#    --conf "spark.eventLog.dir=hdfs://neophdfs/user/spark/applicationHistory" \
#    --py-files logic.py,mod_score.py,mod_score_l10.py \
#    --jars ${TOTAL_CP_COMMA} \
#    --executor-memory 1500M --driver-memory 8G calc_l10.py $dt
#    --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer"\
$SPARK_SUBMIT --master $MASTER --queue root.tech \
    --driver-class-path ${TOTAL_CP_COMMA}  \
    --files /etc/hive/conf/hive-site.xml \
    --conf "spark.sql.shuffle.partitions=10000" \
    --driver-java-options "-Dlog4j.configuration=file:///home/makexu/zqzjhive/zqzj.prod.log4j.prop" \
    --conf "spark.executor.extraClassPath=${TOTAL_CP_COMMA}" \
    --conf "spark.executor.extraLibraryPath=${TOTAL_CP}" \
    --conf "spark.eventLog.enabled=true" \
    --conf "spark.eventLog.compress=true" \
    --conf "spark.eventLog.dir=hdfs://neophdfs/user/spark/applicationHistory" \
    --conf "spark.local.dir=/home/tmp" \
    --conf "spark.driver.maxResultSize=4g" \
    --py-files zq_hive.py \
    --jars ${TOTAL_CP_COMMA} \
    --conf "spark.dynamicAllocation.enabled=true" \
    --conf "spark.shuffle.service.enabled=true" \
    --conf "spark.dynamicAllocation.minExecutors=80" \
    --conf "spark.dynamicAllocation.maxExecutors=1000" \
    --conf "spark.speculation=true" \
    --executor-memory 3000M --driver-memory 16G dump.py -h $hour $dt
#monday=`python -c 'import datetime; print str((datetime.datetime.now() - datetime.timedelta(days=7) + datetime.timedelta(days=-(datetime.datetime.now().weekday()))).strftime("%Y-%m-%d 00:00:00"))[0:10].replace("-","")
#'`
#sudo -u hive kinit -kt /home/hadoop/keytab/sparkuser.keytab sparkuser@HADOOP.163.GZ
#sudo -u sparkuser kinit -kt /home/hadoop/keytab/sparkuser.keytab sparkuser@HADOOP.163.GZ
#
#sudo -u hive hive --auxpath /usr/lib/hive/lib/neop-hive.jar:/usr/lib/hive/lib/fastjson.jar:/usr/lib/hive/lib/neop-hadoop-lzo.jar -e "MSCK REPAIR TABLE sparkuser.l10_weekly"
#
#cd /home/workspace/hbase-importer/app && sudo -u sparkuser ./start-spark-hbase-importer.sh sparkuser.l10_weekly g37:weekly_${monday} "$monday"
#cd $curdir && ./clean_hdfs.sh
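The constraint stated at the top of this section can be shown in isolation: with a yarn or standalone master, executors on other hosts would try to open the same file:// path and fail, so a driver-local file is only safe to read when everything runs on this one machine. A minimal, self-contained sketch (the app name is a placeholder; the path is the one produced in section 1):

# Minimal illustration (not the production job): a file:// input is only visible on
# this machine, hence the master is local so no remote executor ever reads it.
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster('local[2]').setAppName('local-file-demo')
sc = SparkContext(conf=conf)

rdd = sc.textFile('file:///home/makexu/zqzjhive/log/question.sql')
print(rdd.count())
sc.stop()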
3. Conversion and upload code
def start_save(para):
    # Excerpt from the Spark job; json, get_date, get_context, Param and get_logger
    # are imported/defined elsewhere (dump.py / zq_hive.py).
    # The dump files only exist on this machine, so they are read with file:// URIs,
    # which is exactly why the job runs with --master local.
    rdd_question = para.sc.textFile('file:///home/makexu/zqzjhive/log/question.sql').map(lambda x: x.split('\t'))
    rdd_answer = para.sc.textFile('file:///home/makexu/zqzjhive/log/answer.sql').map(lambda x: x.split('\t'))
    rdd_answer_evaluation = para.sc.textFile('file:///home/makexu/zqzjhive/log/answer_evaluation.sql').map(lambda x: x.split('\t'))
    def save_json_answer(df):
        # One answer row (8 tab-separated columns) -> JSON string; other rows yield None.
        if len(df) == 8:
            o = {
                'answerid': df[0],
                'qid': df[1],
                'atime': df[2],
                'gid': df[3],
                'content': df[4],
                'handletime': df[5],
                'expire_time': df[6],
                'evaluate_way': df[7],
            }
            return json.dumps(o)
    def save_json_question(df):
        # One question row (23 columns expected) -> JSON string.
        if len(df) == 23:
            o = {
                'qid': df[0],
                'mid': df[1],
                'sid': df[2],
                'pid': df[3],
                'gid': df[4],
                'uid': df[5],
                'level': df[6],
                'ispublish': df[7],
                'replytype': df[8],
                'qtime': df[9],
                'qip': df[10],
                'fetchtime': df[11],
                'lasthandletime': df[12],
                'status': df[13],
                'tagcolor': df[14],
                'hotspot': df[15],
                'contentxml': df[16],
                'qtitle': df[17],
                'replynum': df[18],
                'repute': df[19],
                'priority': df[20],
                'is_new': df[21],
                'hidden': df[22]
            }
            return json.dumps(o)
    def save_answer_evaluation(df):
        # One answer_evaluation row (10 columns expected) -> JSON string.
        if len(df) == 10:
            o = {
                'id': df[0],
                'aid': df[1],
                'pid': df[2],
                'gid': df[3],
                'gm_group_id': df[4],
                'keyid': df[5],
                'evaluation': df[6],
                'eval_time': df[7],
                'answer_time': df[8],
                'sid': df[9]
            }
            return json.dumps(o)
    # Write one directory per table, partitioned by date, on the cluster's default
    # filesystem (HDFS); rows that failed to serialize (None) are dropped first.
    rdd_answer.map(save_json_answer).filter(lambda x: x is not None).saveAsTextFile(
        '/home/workspace/zq/kefu/answer/date=%s' % str(get_date(-1)).replace('-', ''))
    rdd_question.map(save_json_question).filter(lambda x: x is not None).saveAsTextFile(
        '/home/workspace/zq/kefu/question/date=%s' % str(get_date(-1)).replace('-', ''))
    rdd_answer_evaluation.map(save_answer_evaluation).filter(lambda x: x is not None).saveAsTextFile(
        '/home/workspace/zq/kefu/answer_evaluation/date=%s' % str(get_date(-1)).replace('-', ''))
if __name__ == '__main__':
    print bdts
    print edts
    dump_question()
    dump_answer()
    dump_answer_evaluation()
    product = 'zq'
    sc = None
    try:
        print 'link...'
        conf = SparkConf()
        app_name = "mark.zqzj.save.hive" + ".%s" % bdts
        # get_context (defined elsewhere) builds the SparkContext and the Hive/SQL context
        sc, hsc = get_context(conf, appName=app_name)
        print 'link success'
        uid_dict = {}
        b_uid_set = sc.broadcast(set(uid_dict.keys()))
        param = Param(sc, bdts, product, uid_dict, b_uid_set, hsc)
        print time.strftime('%Y-%m-%d %X', time.localtime())
        start_save(param)
    except:
        get_logger().error(traceback.format_exc())
        raise
    finally:
        if sc:
            sc.stop()
    print 'main quit'
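Because every record is a JSON string, the saved partitions can be read straight back for a spot check; a sketch assuming Spark 2.x, with the date partition below used only as an example:

# Spot-check sketch: load one date partition of the saved answers as a DataFrame.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('zq.kefu.check').getOrCreate()
df = spark.read.json('/home/workspace/zq/kefu/answer/date=20170101')  # example partition
df.printSchema()
print(df.count())
spark.stop()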
http://www.cnblogs.com/makexu/