中心环境测试记录

四川公司测试环境

测试方法:
    写速度:time dd bs=2G count=2 oflag=direct if=/dev/zero of=/var/data/block
    读速度:time dd if=/dev/sdb of=/dev/null bs=4k
12块盘raid0 
    读速度: 29698170880 bytes (30 GB) copied, 28.4618 s, 1.0 GB/s
    写速度: 4294959104 bytes (4.3 GB) copied, 12.5802 s, 341 MB/s

中心集群测试环境

测试方法:
    写速度:time dd bs=2G count=2 oflag=direct if=/dev/zero of=/data01/block
    读速度:time dd if=/dev/sdb1 of=/dev/null bs=4k 
单块盘:
    读速度:2687393792字节(2.7 GB)已复制,13.9415 秒,193 MB/秒
    写速度:4294959104字节(4.3 GB)已复制,28.7784 秒,149 MB/秒
8块盘raid0
    读速度:11199823872字节(11 GB)已复制,10.4142 秒,1.1 GB/秒
    写速度:4294959104字节(4.3 GB)已复制,7.61144 秒,564 MB/秒

测试用例

测试用例SQL
test_2_0_1 select pz_id,ydz_ip,count(1) as sl from url_rz_t1 where day<20150302 and pzs in (0,4,7) and pz_id in (963677,146870,429074) group by pz_id,ydz_ip;
test_2_0_2 select concat(md_dz,fd_lj) as url,count(1) as sl from url_rz_t1 where day<20150302 and ydz_ip=3658617496 group by md_dz,fd_lj;
test_2_0_3 select ydz_ip,count(1) as sl from url_rz_t1 where day<20150302 and pzs in (0,3) and pz_id in (146870,992373,342090) and md_dz like 'cdnb.static.youjizz.com%' group by ydz_ip;
test_2_0_4 select a1.yl1,count(1) as sl from url_rz_t1 a1 join zb_test a2 on a1.pz_id=a2.pz_id where day<20150302 group by a1.yl1;
test_2_0_5 select a1.pz_id,a1.ydz_ip,count(1) as sl from url_rz_t1 a1 join zb_test a2 on a1.pz_id=a2.pz_id where day<20150302 group by a1.pz_id,a1.ydz_ip;
test_2_0_6 select t1.md_dz,t1.mddz_ip,t1.ydz_ip,t1.shijian,t1.yl from url_rz_t1 t1 join zb_test t2 on t1.pz_id=t2.pz_id where t1.day>=20150301 and t1.day<20150303;
test_2_0_7 select a3.pz_id,a3.ydz_ip,count(1) as sl from url_rz_t1 a3 join zb_test a2 on a3.pz_id=a2.pz_id where a3.day=20150301 group by a3.ydz_ip,a3.pz_id having count(1)>1 order by a3.ydz_ip,a3.pz_id;
test_2_0_8 select a2.pz_id,a2.ydz_ip,a2.ydz_dk,count(1) from url_rz_t1 a2 join zb_test a1 on a2.pz_id=a1.pz_id where a2.day=20150301 group by a2.pz_id,a2.ydz_ip,a2.ydz_dk;
test_2_0_9 select a2.yl,a2.yl2,count(1),a2.yl1 from url_rz_t1 a2 join zb_test a1 on a2.pz_id=a1.pz_id where a2.day=20150301 group by a2.yl,a2.yl2,a2.yl1;
test_2_0_10 select ydz_ip,count(1) as sl from url_rz_t1 where pz_id in (146870,992373,342090) and pzs in (0,3) and md_dz like 'cdnb.static.youjizz.com%' group by ydz_ip;
test_2_0_11 select a3.pz_id,a3.ydz_ip,count(1) as sl from url_rz_t1 a3 join zb_test a2 on a3.pz_id=a2.pz_id group by a3.ydz_ip,a3.pz_id having count(1)>1 order by a3.ydz_ip,a3.pz_id;
test_2_1 select pz_id,ydz_ip,count(1) as sl from url_rz_extend_t1 where day<20150302 and pz_id in (963677,146870,429074) and pzs in (7,0,4) group by pz_id,ydz_ip;
test_2_2 select concat(md_dz,fd_lj) as url,count(1) as sl from url_rz_extend_t1 where day<20150302 and ydz_ip=3658617496 group by md_dz,fd_lj;
test_2_3 select ydz_ip,count(1) as sl from url_rz_extend_t1 where day<20150302 and pz_id in (146870,992373,342090) and pzs in (0,3) and md_dz like 'cdnb.static.youjizz.com%' group by ydz_ip;
test_2_4 select a1.yl1,count(1) from url_rz_extend_t1 a1 join zb_test a2 on a1.pz_id=a2.pz_id where a1.day<20150302 group by a1.yl1;
test_2_5 select a1.pz_id,a1.ydz_ip,count(1) as sl from url_rz_extend_t1 a1 join zb_test a2 on a1.pz_id=a2.pz_id where a1.day<20150302 group by a1.pz_id,a1.ydz_ip;
test_2_6 select t1.md_dz,t1.mddz_ip,t1.ydz_ip,t1.shijian,t1.yl from url_rz_extend_t1 t1 join zb_test t2 on t1.pz_id=t2.pz_id where t1.day>=20150301 and t1.day<20150303;
test_2_7 select a3.pz_id,ydz_ip,count(1) from url_rz_extend_t1 a3 join zb_test a2 on a3.pz_id=a2.pz_id where a3.day<'20150302' and a3.day>='20150301' group by a3.ydz_ip,a3.pz_id having count(1)>1 order by a3.ydz_ip,a3.pz_id;
test_2_8 select a2.pz_id,a2.ydz_ip,a2.ydz_dk,count(1) from url_rz_extend_t1 a2 join zb_test a1 on a2.pz_id=a1.pz_id where a2.day<'20150302' and a2.day>='20150301' group by a2.pz_id,a2.ydz_ip,a2.ydz_dk;
test_2_9 select a2.yl,a2.yl2,count(*) as sl,a2.yl1 from url_rz_extend_t1 a2 join zb_test a1 on a2.pz_id=a1.pz_id where day=20150301 group by a2.yl,a2.yl2,a2.yl1;
test_2_10 select time, customer_id, router_group, ip, sum(dstpacket) from RG_CUSTOMER_PEERIPDSTPACKET_M group by time, customer_id, router_group, ip;
test_2_11 select time, customer_id, router_group, protocol, port, sum(inbyte), sum(inpacket) from RG_CUSTOMER_DSTPORT_M group by time, customer_id, router_group, protocol, port;
test_2_12 select time, customer_id, router_group, ip, sum(srcbyte) from RG_CUSTOMER_IPSRCBYTE_M group by time, customer_id, router_group, ip;
test_2_13 select kg_iplist.ip,case when server_port=80 and attr like '%UDA%' then 'udp 80' when server_port=443 and attr like '%UDA%' then 'udp 443' when server_port=80 and attr like '%TDA%' then 'tcp 80' when server_port=443 and attr like '%TDA%' then 'tcp 443' else null end as f7, count(distinct client_ip) as dccip,count(client_ip) as ccip from comm_pkg_t1 join kg_iplist on comm_pkg_t1.server_ip=kg_iplist.ip where pd=20150824 and cp=1 and ((server_port=80 and attr like '%UDA%') or (server_port=443 and attr like '%UDA%') or (server_port=80 and attr like '%TDA%') or (server_port=443 and attr like '%TDA%')) group by kg_iplist.ip,case when server_port=80 and attr like '%UDA%' then 'udp 80' when server_port=443 and attr like '%UDA%' then 'udp 443' when server_port=80 and attr like '%TDA%' then 'tcp 80' when server_port=443 and attr like '%TDA%' then 'tcp 443' else null end;
test_2_14 select case when (comm_pkg_t1.server_port=80 or comm_pkg_t1.server_port=443) and comm_pkg_t1.attr like '%TDA%' then 'tcp' when (comm_pkg_t1.server_port=80 or comm_pkg_t1.server_port=443) and comm_pkg_t1.attr like '%UDA%' then 'udp' else null end as f7, count(distinct client_ip) as dccip,count(client_ip) as ccip from comm_pkg_t1 join kg_iplist on comm_pkg_t1.server_ip=kg_iplist.ip where comm_pkg_t1.pd=20150824 and comm_pkg_t1.cp=1 and (((comm_pkg_t1.server_port=80 or comm_pkg_t1.server_port=443) and comm_pkg_t1.attr like '%TDA%') or ((comm_pkg_t1.server_port=80 or comm_pkg_t1.server_port=443) and comm_pkg_t1.attr like '%UDA%')) group by case when (comm_pkg_t1.server_port=80 or comm_pkg_t1.server_port=443) and comm_pkg_t1.attr like '%TDA%' then 'tcp' when (comm_pkg_t1.server_port=80 or comm_pkg_t1.server_port=443) and comm_pkg_t1.attr like '%UDA%' then 'udp' else null end;
test_2_15 select case when server_port=80 and attr like '%UDA%' then 'udp 80' when server_port=443 and attr like '%UDA%' then 'udp 443' when server_port=80 and attr like '%TDA%' then 'tcp 80' when server_port=443 and attr like '%TDA%' then 'tcp 443' else null end as f7, count(distinct client_ip),count(client_ip) from comm_pkg_t1 join kg_iplist on kg_iplist.ip=comm_pkg_t1.server_ip where pd=20150824 and cp=1 and ((server_port=80 and attr like '%UDA%') or (server_port=443 and attr like '%UDA%') or (server_port=80 and attr like '%TDA%') or (server_port=443 and attr like '%TDA%')) group by case when server_port=80 and attr like '%UDA%' then 'udp 80' when server_port=443 and attr like '%UDA%' then 'udp 443' when server_port=80 and attr like '%TDA%' then 'tcp 80' when server_port=443 and attr like '%TDA%' then 'tcp 443' else null end;
test_2_16 select ip_list.ip,comm_pkg_t1.server_port,comm_pkg_t1.client_ip,comm_pkg_t1.client_port,comm_pkg_t1.finder_id,comm_pkg_t1.attr,comm_pkg_t1.pkgs,comm_pkg_t1.bytes from comm_pkg_t1 join ip_list on comm_pkg_t1.server_ip=ip_list.ip where comm_pkg_t1.pd=20150824 and comm_pkg_t1.attr like 'NIL%' and (comm_pkg_t1.finder_id='10903' or comm_pkg_t1.finder_id='7002' or comm_pkg_t1.finder_id='11031' or comm_pkg_t1.finder_id='11032' or comm_pkg_t1.finder_id='10905' or comm_pkg_t1.finder_id='10906' or comm_pkg_t1.finder_id='10907' or comm_pkg_t1.finder_id='10908');
test_2_17 select localtime,createtime,server_ip,server_port,client_ip,client_port,finder_id,attr,pkgs,bytes,pd from comm_pkg_t1 where pd=20150824 and attr like 'NIL%' and (finder_id='10903' or finder_id='7002' or finder_id='11031' or finder_id='11032' or finder_id='10905' or finder_id='10906' or finder_id='10907' or finder_id='10908');
test_2_18 select ip.ip,count(distinct comm_pkg_t1.client_ip) as dccip from comm_pkg_t1 join ip on comm_pkg_t1.server_ip=ip.ip where comm_pkg_t1.pd=20150824 and comm_pkg_t1.cp=1 group by ip.ip;
test_2_19 select list.ip,list.port,count(distinct comm_pkg.client_ip) as dccip from comm_pkg join list on comm_pkg.server_ip=list.ip and comm_pkg.server_port=list.port where comm_pkg.pd=20150824 and comm_pkg.cp=1 group by list.ip,list.port;
test_2_20 select count(distinct client_ip) as dccip,count(client_ip) as ccip from comm_pkg join ip on comm_pkg.server_ip=ip.ip where comm_pkg.pd=20150824 and comm_pkg.cp=1;

公司测试模拟数据

namecountsize(parquet+snappy)partitiondesc
comm_pkg 32917378285 约320亿条 4.3T(hdfs)/4.6T(原始) pd,cp,sp pd(时间),cp(pkgs)值为0或者1,sp(server_port)值为0或者8080,443等查询端口
url_rz 7991716844 约80亿条 299.4G(20150301) day,pzs day(时间),pzs(pz_id)根据id最后一位值建立0-9的分区

Impala测试-四川环境

测试项耗时描述
test_2_0_4 1700s 1亿随机数分组
test_2_0_7 2000s 结果11亿
test_2_20 6581.41s 结果1条
test_2_19 892.7s 结果1598428条
test_2_18   分组数量太大内存溢出
test_2_17 3015.22s/1260s(优化块后的查询时间) 结果8517989条
test_2_16 2107.1s 结果17038222条
test_2_15 1386.7s 结果3条
test_2_14 1339.43s 结果3条
test_2_13 1288.21s 结果948875条

Join字段分区测试

SQL耗时描述
select count(distinct client_ip),count(client_ip) from comm_pkg1 c join ip1 i on c.server_ip=i.ip and c.ipl = i.ipl where c.pd= 20150824; 153.31s c.ipl = i.ipl为join分区字段,根据ip字段最后一位进行分区
select count(distinct client_ip),count(client_ip) from comm_pkg1 c join ip1 i on c.server_ip=i.ip where c.pd= 20150824; 146.96s 不带join分区字段条件

Impala join DAG图例如下;


执行结果如下:
+---------------------------+------------------+
| count(distinct client_ip) | count(client_ip) |
+---------------------------+------------------+
| 10000000                  | 1286308554       |
+---------------------------+------------------+
结论:join分区条件并不会加快查询速度.

遇到问题

当查询时有部分数据块不是本地优先读取,而是通过网络传输,这样带来性能损耗。根据提示运行 INVALIDATE METADATA default.url_rz 后依旧无效。

  1. WARNINGS:Read351.45 MB of data across network that was expected to be local.Block locality metadata for table 'default.url_rz' may be stale.Consider running "INVALIDATE METADATA `default`.`url_rz`".
  2. Read322.53 MB of data across network that was expected to be local.Block locality metadata for table 'default.url_rz' may be stale.Consider running "INVALIDATE METADATA `default`.`url_rz`".
  3. Read329.39 MB of data across network that was expected to be local.Block locality metadata for table 'default.url_rz' may be stale.Consider running "INVALIDATE METADATA `default`.`url_rz`".
  4. Read556.97 MB of data across network that was expected to be local.Block locality metadata for table 'default.url_rz' may be stale.Consider running "INVALIDATE METADATA `default`.`url_rz`".
  5. Read495.47 MB of data across network that was expected to be local.Block locality metadata for table 'default.url_rz' may be stale.Consider running "INVALIDATE METADATA `default`.`url_rz`".
  6. Read497.76 MB of data across network that was expected to be local.Block locality metadata for table 'default.url_rz' may be stale.Consider running "INVALIDATE METADATA `default`.`url_rz`".
  7. Read496.60 MB of data across network that was expected to be local.Block locality metadata for table 'default.url_rz' may be stale.Consider running "INVALIDATE METADATA `default`.`url_rz`".
  8. Read232.36 MB of data across network that was expected to be local.Block locality metadata for table 'default.url_rz' may be stale.Consider running "INVALIDATE METADATA `default`.`url_rz`".
  9. Parquet files should not be split into multiple hdfs-blocks. file=hdfs://logSave/home/etl/export/parquet/url_rz/day=20150301/pzs=7/url_rz_20160512155453_31_e6c8f171-818d-4c82-a08b-e40824f28313.parquet (1 of 5946 similar)
  10. Metadata states that ingroup hdfs://logSave/home/etl/export/parquet/url_rz/day=20150301/pzs=6/url_rz_20160512160805_38_5308e0ce-9d9d-48e1-8ee9-72a1c4852eb9.parquet(0) there are 1 rows, but there is at least one more row in the file. (1 of 192 similar)

最后在 "Parquet files should not be split into multiple hdfs-blocks." 错误信息的引导下,查阅parquet官网文档:其推荐parquet file大小为1G,且一个parquet file对应一个hdfs block时可以获得最佳的读取性能。按此设置后经测试不再报这个错误。如下信息摘取自parquet官网:

Row group size: Larger row groups allow for larger column chunks which makes it possible to do larger sequential IO. Larger groups also require more buffering in the write path (or a two pass write). We recommend large row groups (512MB - 1GB). Since an entire row group might need to be read, we want it to completely fit on one HDFS block. Therefore, HDFS block sizes should also be set to be larger. An optimized read setup would be: 1GB row groups, 1GB HDFS block size, 1 HDFS block per HDFS file.

Spark测试-四川环境

采用spark-thrift-server,使用hive的beeline工具连接执行查询SQL的方式。

测试项耗时描述
test_2_20 38m 结果1条
test_2_19 6.8m 1598428条
test_2_17 15m 结果8517989条
test_2_16 12m 结果17038222条
test_2_15 5.4m 结果3条
test_2_14 4.6m 结果3条
test_2_13 6.2m 结果948875条

优化

    借鉴impala的优化经验,采取了1G parquet + 1G block的策略。在join、group等含大量shuffle操作的查询时性能要突出很多。因为impala目前只用单核执行join、group这类操作(This can happen because currently Impala uses only a single core per node to process join and aggregation queries.),可能是考虑到并发性的需求;而spark则采用并行方式。
    JVM参数-XX:+UseCompressedOops调整指针由8bytes替换为4bytes

遇到问题

beeline bug: https://issues.apache.org/jira/browse/HIVE-6758
解决办法
export HADOOP_CLIENT_OPTS="-Djline.terminal=jline.UnsupportedTerminal"

资料:
http://www.cloudera.com/documentation/archive/impala/2-x/2-1-x/topics/impala_cluster_sizing.html

impala VS spark

中心测试-现场环境

数据入库情况

表名条数大小(HDFS)原始文件大小
url_rz_snappy 43207873470 2.3T 7.3TB
url_rz_extend_snappy 56768636007 562.3 G 9.5TB
RG_CUSTOMER_PEERIPDSTPACKET_M 4258186310 41.7G 160GB
RG_CUSTOMER_DSTPORT_M 2599488956 20.2G 102GB
RG_CUSTOMER_IPSRCBYTE_M 1460522987 11.8G 55GB
comm_pkg_snappy 126683074109 4.4T 18TB
zb_test 1000001 205.6 M 500MB
kg_iplist 2000000 416.1 M 1GB
ip_list 2000000 416.1 M 1GB
ip 2000000 416.1 M 1GB
list 2000000 425.5 M 1GB

ETL入库性能

加载机数量加载方法每台机器读取线程数每台机器发送线程数发送数据量(TB)数据压缩格式数据压缩比率平均单台发送速度最大单台发送速度平均单台磁盘IO利用率最大单台磁盘利用率平均单台CPU利用率最大单台CPU利用率平均单台内存利用率最大单台内存利用率
19 ETL 5 5 16.7 parquet+snappy 30% 241914 254455 10% 17.20% 38.8 41 30g 34g

SQL查询

测试项耗时结果条数结果数据量
2.0.1 25.6s 3309327 63.6M
2.0.2 144.5s 12 624B
2.0.3 21.2s 382209 4.9M
2.0.4 30.4s 19827 576k
2.0.5 43.1s 59091218 1.1G
2.0.6 234.6s 2833856415 662.8G
2.0.7 48.9s 36388200 700.9M
2.0.8 156.9s 1176776386 28.3G
2.0.9 35.5s 55116 2.4M
2.0.10 139.6s 6314620 80.2M
2.0.11 574.5s 703492236 13.3G
2.1 52.5s 747 17.3K
2.2 240.7s 0 只有12条是查询数据1里的结果
2.3 65.6s 78 1.3K
2.4 147.1s 12 259B
2.5 202.1s 256476 5.9M
2.6 254.0s 56768636007 6.8T
2.7 178.3s 256476 5.9M
2.8 231.6s 3329104 94.3M
2.9 256.1s 849 31.9K
2.10 172.0s 4150379957 312.4G
2.11 209.3s 2362301374 185G
2.12 160.8s 1083782662 41G
2.13 568.7s 158,671 3.7M
2.14 548.8s 2 43B
2.15 653.7s 4 51B
2.16 595.9s 2821474 184M
2.17 688.3s 912789298 98.7G
2.18 436.8s 1184826 18.3M
2.19 267.5s 9 192B
2.20 309.0s 1 20B

并发测试

单线程

测试项查询时间方式
2.2 506.2s 并发
2.7 1240.9s 并发
2.11 1254.3s 并发
2.14 1269.1s 并发
2.15 1274.2s 并发

3线程

测试项查询时间方式
2.2 22m 并发
2.2 25m 并发
2.2 35m 并发
2.7 32m 并发
2.7 30m 并发
2.7 36m 并发
2.11 27m 并发
2.11 29m 并发
2.11 25m 并发
2.14 41m 并发
2.14 40m 并发
2.14 40m 并发
2.15 45m 并发
2.15 45m 并发
2.15 46m 并发

稳定测试

执行查询中KILL计算节点 任务会失败并重新分配,任务查询时间延长。
执行查询中KILL namenode节点 对查询基本没有影响
执行查询中KILL master节点 对查询基本没有影响





posted @ 2016-12-14 15:22  jasonbrooke  阅读(831)  评论(0)    收藏  举报