shell编程系列21--文本处理三剑客之awk中数组的用法及模拟生产环境数据统计
shell中的数组的用法:
shell数组中的下标是从0开始的
array=("Allen" "Mike" "Messi" "Jerry" "Hanmeimei" "Wang")
打印元素: echo ${array[2]}
打印元素个数: echo ${#array[@]}
打印某个元素长度: echo ${#array[3]}
给元素赋值: array[3]=ui;
删除元素: unset array[2];unset array # 删除数组
分片访问: echo ${array[@]:1:3}
元素内容替换: ${array[@]/e/E} 只替换第一个e;${array[@]//e/E} 替换所有的e
数组的遍历:
for a in ${array[@]}
do
echo $a
done
awk中数组的用法:
在awk中,使用数组时,不仅可以使用1.2..n作为数组小标,也可以使用字符串作为数组下标
典型常用例子:
1、统计主机上所有的tcp链接状态数,按照每个tcp状态分类
# netstat -an | grep tcp | awk '{arr[$6]++}END{for (i in arr) print i,arr[i]}'
LAST_ACK 40
LISTEN 6
SYN_RECV 105
ESTABLISHED 3751
FIN_WAIT1 33
FIN_WAIT2 9
CLOSING 3
TIME_WAIT 2444
2、计算横向数据综合,计算纵向数据总和
Allen 80 90 96 98
Mike 93 98 92 91
Zhang 78 76 87 92
Jerry 86 89 68 92
Han 85 95 75 90
Li 78 88 98 100
# 代码如下:
[root@localhost shell]# awk -f statics.awk student.txt
Name Yuwen Math English Physical total
Allen 80 90 96 98 364
Mike 93 98 92 91 374
Zhang 78 76 87 92 333
Jerry 86 89 68 92 335
Han 85 95 75 90 345
Li 78 88 98 100 364
every_total 500 536 516 563
[root@localhost shell]# cat statics.awk
BEGIN{
printf "%-30s%-30s%-30s%-30s%-30s%-30s\n","Name","Yuwen","Math","English","Physical","total"
}
{
total=$2+$3+$4+$5
yuwen_sum+=$2
math_sum+=$3
english_sum+=$4
physical_sum+=$5
printf "%-30s%-30d%-30d%-30d%-30d%-30d\n",$1,$2,$3,$4,$5,total
}
END{
printf "%-30s%-30d%-30d%-30d%-30d\n","every_total",yuwen_sum,math_sum,english_sum,physical_sum
}
计算字符串的长度:
[root@localhost shell]# str="test string"
[root@localhost shell]# echo $str
test string
[root@localhost shell]# echo ${#str}
11
# 修改数组元素
array=("Allen" "Mike" "Messi" "Jerry" "Hanmeimei" "Wang")
[root@localhost shell]# array[1]="Jerry"
[root@localhost shell]# echo ${array[@]}
Allen Jerry Messi Jerry Hanmeimei Wang
# 删除第3个元素
[root@localhost shell]# echo ${array[@]}
Allen Jerry Messi Jerry Hanmeimei Wang
[root@localhost shell]#
[root@localhost shell]# unset array[2];
[root@localhost shell]# echo ${array[@]}
Allen Jerry Jerry Hanmeimei Wang
# 在数组中删除下标为1的元素,即Mike被删除,再次删除下标为1的元素,发现数组不变,说明数组虽然删除了元素,下标还是不变保存在内存中
[root@localhost shell]# array=("Allen" "Mike" "Messi" "Jerry" "Hanmeimei" "Wang")
[root@localhost shell]# unset array[1]
[root@localhost shell]# echo ${array[*]}
Allen Messi Jerry Hanmeimei Wang
[root@localhost shell]# unset array[1]
[root@localhost shell]# echo ${array[*]}
Allen Messi Jerry Hanmeimei Wang
# 分片访问,数组为1的开始遍历3个元素
[root@localhost shell]# array=("Allen" "Mike" "Messi" "Jerry" "Hanmeimei" "Wang")
[root@localhost shell]# echo ${array[@]:1:3}
Mike Messi Jerry
# 1到最后
[root@localhost shell]# echo ${array[@]:1}
Mike Messi Jerry Hanmeimei Wang
#替换1个,替换所有
[root@localhost shell]# echo ${array[@]}
Allen Mike Messi Jerry Hanmeimei Wang
[root@localhost shell]# echo ${array[@]/e/E}
AllEn MikE MEssi JErry HanmEimei Wang
[root@localhost shell]# echo ${array[@]//e/E}
AllEn MikE MEssi JErry HanmEimEi Wang
# 遍历数组
[root@localhost shell]# for a in ${array[@]};do echo $a;done
Allen
Mike
Messi
Jerry
Hanmeimei
Wang
计算横向和、纵向和
Allen 80 90 96 98
Mike 93 98 92 91
Zhang 78 76 87 92
Jerry 86 89 68 92
Han 85 95 75 90
Li 78 88 98 100
[root@localhost shell]# awk -f stu.awk student.txt
Name Yuwen Math English Physical total
Allen 80 90 96 98 364
Mike 93 98 92 91 374
Zhang 78 76 87 92 333
Jerry 86 89 68 92 335
Han 85 95 75 90 345
Li 78 88 98 100 364
every_total 500 536 516 563
[root@localhost shell]# cat stu.awk
BEGIN{
printf "%-20s%-20s%-20s%-20s%-20s%-20s\n","Name","Yuwen","Math","English","Physical","total"
}
{ total=$2+$3+$4+$5
yunwen_sum+=$2
math_sum+=$3
english_sum+=$4
physical_sum+=$5
printf "%-20s%-20d%-20d%-20d%-20d%-20d\n",$1,$2,$3,$4,$5,total
}
END{
printf "%-20s%-20d%-20d%-20d%-20d\n","every_total",yunwen_sum,math_sum,english_sum,physical_sum
}
# 模拟生产环境数据脚本
[root@localhost shell]# cat insert.sh
#!/bin/bash
#
function create_random()
{
min=$1
max=$(($2-$min+1))
num=$(date +%s%N)
echo $(($num%$max+$min))
}
INDEX=1
while true
do
for user in Allen Mike Jerry Tracy Hanmeimei Lilei
do
COUNT=$RANDOM
NUM1=`create_random 1 $COUNT`
NUM2=`expr $COUNT - $NUM1`
echo "`date '+%Y-%m-%d %H:%M:%S'` $INDEX Batches: user:$user insert $COUNT records into datebase:product table:detail, insert $NUM1 records successfully,
failed $NUM2 records" >> ./db.log.`date +%Y%m%d`
INDEX=`expr $INDEX + 1`
done
done
数据格式如下:
db.log.20190608
2019-06-08 10:31:40 15459 Batches: user Jerry insert 5504 records into datebase:product table:detail, insert 5253 records successfully,failed 251 records
2019-06-08 10:31:40 15460 Batches: user Tracy insert 25114 records into datebase:product table:detail, insert 13340 records successfully,failed 11774 records
2019-06-08 10:31:40 15461 Batches: user Hanmeimei insert 13840 records into datebase:product table:detail, insert 5108 records successfully,failed 8732 records
2019-06-08 10:31:40 15462 Batches: user Lilei insert 32691 records into datebase:product table:detail, insert 5780 records successfully,failed 26911 records
2019-06-08 10:31:40 15463 Batches: user Allen insert 25902 records into datebase:product table:detail, insert 14027 records successfully,failed 11875 records
...
1、统计每个人分别插入了多少条record进数据库
输出结果示例:
Name totalrecords
allen 493082
mike 349287
[root@localhost shell]# awk -f exam1.awk db.log.20190608
User Total records
Jerry 42168344
Mike 42070170
Lilei 42628926
Hanmeimei 41141251
Tracy 42241015
Allen 42283107
[root@localhost shell]# cat exam1.awk
BEGIN{
printf "%-20s%-20s\n","User","Total records"
}
{
USER[$6]+=$8
}
END{
for(u in USER)
printf "%-20s%-20d\n",u,USER[u]
}
2、统计每个人分别插入成功了多少record,失败了多少record
输出结果:
User Success_record failed_record
jerry 342222 2813413
success $14
failed $17
[root@localhost shell]# cat exam2.awk
BEGIN{
printf "%-30s%-30s%-30s\n","User","Success records","Failed records"
}
{
SUCCESS[$6]+=$14
FAILED[$6]+=$17
}
END{
for(u in SUCCESS)
printf "%-30s%-30d%-30d\n",u,SUCCESS[u],FAILED[u]
}
[root@localhost shell]# awk -f exam2.awk db.log.20190608
User Success records Failed records
Jerry 20641524 21526820
Mike 21119844 20950326
Lilei 21441042 21187884
Hanmeimei 20584580 20556671
Tracy 20837585 21403430
Allen 21024923 21258184
3、将例子1和例子2结合起来,一起输出,输出每个人分别插入多少条数据,多少成功,多少失败,并且要格式化输出,加上标题
输出结果:
User Total success failed
tracy 6134139 34113444 1341
allen 13387878 7897912 14319
代码:
[root@localhost shell]# cat exam3.awk
BEGIN{
printf "%-30s%-30s%-30s%-30s\n","Name","total records","success records","failed records"
}
{
TOTAL_RECORDS[$6]+=$8
SUCCESS[$6]+=$14
FAILED[$6]+=$17
}
END{
for(u in TOTAL_RECORDS)
printf "%-30s%-30d%-30d%-30d\n",u,TOTAL_RECORDS[u],SUCCESS[u],FAILED[u]
}
[root@localhost shell]# awk -f exam3.awk db.log.20190608
Name total records success records failed records
Jerry 42168344 20641524 21526820
Mike 42070170 21119844 20950326
Lilei 42628926 21441042 21187884
Hanmeimei 41141251 20584580 20556671
Tracy 42241015 20837585 21403430
Allen 42283107 21024923 21258184
4、在例子3的基础上,加上结尾,统计全部插入记录数,成功记录数,失败记录数
输出结果:
User Total success failed
tracy 6134139 34113444 1341
allen 13387878 7897912 14319
1413413 3141311341 13411
方法1:
[root@localhost shell]# cat exam4_b.awk
BEGIN{
printf "%-30s%-30s%-30s%-30s\n","Name","total records","success records","failed records"
}
{
TOTAL_RECORDS[$6]+=$8
SUCCESS[$6]+=$14
FAILED[$6]+=$17
}
END{
for(u in TOTAL_RECORDS)
{
# 在统计出的结果数组中进行累加
records_sum+=TOTAL_RECORDS[u]
success_sum+=SUCCESS[u]
failed_sum+=FAILED[u]
printf "%-30s%-30d%-30d%-30d\n",u,TOTAL_RECORDS[u],SUCCESS[u],FAILED[u]
}
printf "%-30s%-30d%-30d%-30d\n","",records_sum,success_sum,failed_sum
}
[root@localhost shell]# awk -f exam4_b.awk db.log.20190608
Name total records success records failed records
Jerry 42168344 20641524 21526820
Mike 42070170 21119844 20950326
Lilei 42628926 21441042 21187884
Hanmeimei 41141251 20584580 20556671
Tracy 42241015 20837585 21403430
Allen 42283107 21024923 21258184
252532813 125649498 126883315
方法2:
[root@localhost shell]# cat exam4.awk
BEGIN{
printf "%-30s%-30s%-30s%-30s\n","Name","total records","success records","failed records"
}
{
RECORDS[$6]+=$8
SUCCESS[$6]+=$14
FAILED[$6]+=$17
# 在原始数据中进行汇总计算
records_sum+=$8
success_sum+=$14
failed_sum+=$17
}
END{
for(u in RECORDS)
printf "%-30s%-30d%-30d%-30d\n",u,RECORDS[u],SUCCESS[u],FAILED[u]
printf "%-30s%-30d%-30d%-30d\n","total",records_sum,success_sum,failed_sum
}
[root@localhost shell]# awk -f exam4.awk db.log.20190608
Name total records success records failed records
Jerry 42168344 20641524 21526820
Mike 42070170 21119844 20950326
Lilei 42628926 21441042 21187884
Hanmeimei 41141251 20584580 20556671
Tracy 42241015 20837585 21403430
Allen 42283107 21024923 21258184
total 252532813 125649498 126883315
5、查找丢失数据的现象,也就是成功+失败的记录数不等于一共插入的记录数,找出这些数据并显示行号和对应行的日志信息
输出结果:
代码:
[root@localhost shell]# awk '{if($8!=$14+$17) print NR,$0}' db.log.20190608
5 2019-06-08 10:29:59 5 Batches: user Hanmeimei insert 15688 records into datebase:product table:detail, insert 3986 records successfully,failed 1102 records
15404 2019-06-08 10:31:39 15404 Batches: user Mike insert 26675 records into datebase:product table:detail, insert 1931 records successfully,failed 24725 records
# 写入文件的方式
[root@localhost shell]# awk -f exam5.awk db.log.20190608
5 2019-06-08 10:29:59 5 Batches: user Hanmeimei insert 15688 records into datebase:product table:detail, insert 3986 records successfully,failed 1102 records
15404 2019-06-08 10:31:39 15404 Batches: user Mike insert 26675 records into datebase:product table:detail, insert 1931 records successfully,failed 24725 records
[root@localhost shell]# cat exam5.awk
BEGIN{
}
{
if($8!=$14+$17)
print NR,$0
}