Hive 调优代码准备

 1 # coding: utf-8
 2 import random
 3 import datetime
 4 import sys
 5 from imp import reload
 6 
 7 reload(sys)
 8 # lastname和first都是为了来随机构造名称
 9 lastname = u"赵李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许何吕施张孔曹严华金魏陶姜戚谢邹喻柏水窦章云苏潘葛奚范彭郎鲁韦昌马苗"
10 firstname = u"红尘冷暖岁月清浅仓促间遗落一地如诗的句点不甘愿不决绝掬一份刻骨的思念系一根心的挂牵在你回眸抹兰轩的底色悄然"
def create_student_dict(start, output_dir='/usr/local/data/warehouse/student_small/',
                        batch_size=40000):
    """Write one batch of random student records to ``<output_dir>/<start>.txt``.

    Every record is a single tab-separated line with the columns
    s_no, s_name, s_birth, s_age, s_sex, s_score, s_desc.

    Args:
        start: Batch number. It names the output file and fixes the id range
            ``[start * batch_size, (start + 1) * batch_size)``.
        output_dir: Existing directory that receives the file.
        batch_size: Number of records written for this batch.
    """
    import os  # local import: only needed to join the output path

    # Pool of normally distributed scores (mean 100, standard deviation 50).
    # abs() folds the negative tail back so no score is below zero.
    score_pool = [abs(int(random.normalvariate(100, 50))) for _ in range(5000)]
    # 20 appears four times so it is drawn four times as often as any other age.
    age_pool = [20, 20, 20, 20, 21, 22, 23, 24, 25, 26]
    sentence = u"程序猿攻城狮队伍,为祖国贡献一份自己的力量"
    # Given-name characters are taken from indexes 1 .. len(firstname) - 4.
    given_max = len(firstname) - 4

    # One file per batch, named after the batch number, e.g. 1.txt
    filename = str(start) + '.txt'
    print(filename)
    path = os.path.join(output_dir, filename)
    # Explicit UTF-8 so the Chinese text does not depend on the platform locale.
    with open(path, mode='w', encoding='utf-8') as fp:
        for i in range(start * batch_size, (start + 1) * batch_size):
            # random.choice covers every surname, including the first one.
            name = random.choice(lastname) + firstname[random.randint(1, given_max)]
            # Month is limited to 01-09 and day to 10-28, so both are always two digits.
            birth = u"{0}-0{1}-{2}".format(random.randint(1991, 2000),
                                           random.randint(1, 9),
                                           random.randint(10, 28))
            fields = [
                u"xuehao_no_" + str(i),
                name,
                birth,
                str(random.choice(age_pool)),
                random.choice([u'男', u'女']),
                str(random.choice(score_pool)),
                sentence * random.randint(1, 3),
            ]
            # Terminate every record with a newline; otherwise the whole batch
            # ends up on one line and cannot be loaded as separate rows.
            fp.write(u"\t".join(fields) + u"\n")
# Generate the sample data: 50 batches x 40,000 rows = 2,000,000 records in total.
for i in range(1, 51):
    create_student_dict(i)

# coding: utf-8
import random, datetime
import sys
from imp import reload

reload(sys)

def create_student_sc_dict(start, output_dir='/usr/local/data/warehouse/course_small/',
                           batch_size=40000):
    """Write one batch of random course-selection records to ``<output_dir>/<start>.txt``.

    Every record is a single tab-separated line with the columns
    s_no, course, op_datetime, reason.

    Args:
        start: Batch number. It names the output file and fixes the id range
            ``[start * batch_size, (start + 1) * batch_size)``.
        output_dir: Existing directory that receives the file.
        batch_size: Number of records written for this batch.
    """
    import os  # local import: only needed to join the output path

    # A course that is listed more often is picked more often ("more popular").
    courses = [u'数学', u'数学', u'数学', u'数学', u'数学',
               u'语文', u'英语', u'化学', u'物理', u'生物']
    # The date string is the same for the whole batch, so format it once.
    op_date = datetime.datetime.now().strftime("%Y-%m-%d")

    filename = str(start) + '.txt'
    print(start)
    path = os.path.join(output_dir, filename)
    # Explicit UTF-8 so the Chinese text does not depend on the platform locale.
    with open(path, mode='w', encoding='utf-8') as fp:
        for i in range(start * batch_size, (start + 1) * batch_size):
            course = random.choice(courses)
            fields = [
                u"xuehao_no_" + str(i),
                course,
                op_date,
                u"我非常非常非常非常非常喜爱{0}".format(course),
            ]
            # Terminate every record with a newline; otherwise the whole batch
            # ends up on one line and cannot be loaded as separate rows.
            fp.write(u"\t".join(fields) + u"\n")


# Generate the sample data: 50 batches x 40,000 rows = 2,000,000 records in total.
for i in range(1, 51):
    create_student_sc_dict(i)

用以上 Python 代码生成若干数据，作为调优数据样本。

环境：

python version 3.6

posted @ 2020-12-05 21:32 lenomail 阅读(83) 评论(0) 收藏举报

刷新页面返回顶部

lenomail

Hive 调优代码准备

公告