#!/bin/bash
# Count search terms: analyze the query logs on hosts 171 and 173
sourceDir="/export/manager/kmsearch/log/wordlog"
tmpDateFile="/tmp/search_wordlog_tmp.txt"
tmpSearchWordlog="/tmp/search_wordlog"
# Work out which daily log files to fetch
startDate="2015-05-04"
startTimeStamp=`date -d "$startDate" +%s`
endDate="2015-12-31"
endTimeStamp=`date -d "$endDate" +%s`
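# Walk the range one day (86400 s) at a time and emit one "YYYY-MM-DD.txt"
# filename per day. This is fine in a fixed-offset timezone; across a DST
# jump a flat 86400 s step could skip or repeat a calendar day.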
echo "" > $tmpDateFile
for((i=$startTimeStamp; i<=$endTimeStamp; i=i+86400))
do
dateStr=`date -d @$i "+%Y-%m-%d"`
echo "$dateStr.txt" >> $tmpDateFile
done
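# $tmpDateFile now holds one filename per line, e.g.:
#   2015-05-04.txt
#   2015-05-05.txt
#   ...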
# Download from 171 (create the local target dirs first so scp does not fail)
mkdir -p $tmpSearchWordlog/171 $tmpSearchWordlog/173
echo "downloading from 171..."
dateArr=($(cat $tmpDateFile))  # word-split the file list into a bash array
for tmpStr in "${dateArr[@]}"
do
scp root@10.15.200.171:$sourceDir/$tmpStr $tmpSearchWordlog/171/
done
# Download from 173
echo "downloading from 173..."
for tmpStr in "${dateArr[@]}"
do
scp root@10.15.200.173:$sourceDir/$tmpStr $tmpSearchWordlog/173/
done
# Merge everything into a single file
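# Days missing on one host just trigger a cat error and are skipped;
# the script does not set -e, so the merge keeps going.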
echo "combine all data... "
: > $tmpSearchWordlog/alldata.txt  # truncate the merged file
for tmpStr in "${dateArr[@]}"
do
cat $tmpSearchWordlog/171/$tmpStr >> $tmpSearchWordlog/alldata.txt
cat $tmpSearchWordlog/173/$tmpStr >> $tmpSearchWordlog/alldata.txt
done
# Statistics over all data (single-pass version, kept for reference)
#cat $tmpSearchWordlog/alldata.txt | awk -F ',' '{print $2","$6}' | sort | uniq | awk -F ',' '{print $1}'| sort | uniq -c | sort -rn | head -100 | awk '{print $1"\t"$2" "$3}' > $tmpSearchWordlog/allTop.txt
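# (Same pipeline as the per-chunk version below, run over the whole file and keeping the top 100.)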
#exit
# Split the merged file into 3,000,000-line chunks (<200 MB each)
cd $tmpSearchWordlog || exit 1
find . -name 'part.alldata.txt*' | xargs -r rm -f
split -l3000000 alldata.txt part.alldata.txt
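# split names the chunks part.alldata.txtaa, part.alldata.txtab, ...,
# which the part.alldata.txt* glob below picks up.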
allPartFiles=`find . -name "part.alldata.txt*"`
for tmpStr in $allPartFiles
do
cat $tmpStr | awk -F ',' '{print $2","$6}' | sort | uniq | awk -F ',' '{print $1}'| sort | uniq -c | sort -rn | head -900 | awk '{print $1"\t"$2" "$3}' > ${tmpStr}_Tops.txt &
done
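# Per-chunk pipeline, stage by stage (the field numbers are assumptions
# about the comma-separated log format: $2 = search term, $6 = user/session):
#   awk -F ',' '{print $2","$6}'  -> (term, source) pairs
#   sort | uniq                   -> count each (term, source) pair once
#   awk -F ',' '{print $1}'       -> drop the source, keep the term
#   sort | uniq -c                -> distinct sources per term
#   sort -rn | head -900          -> the chunk's top 900 terms
# Caveat: merging per-chunk top lists only approximates the global top;
# a term just below the 900 cut in many chunks gets undercounted.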
echo 'waiting for all chunk jobs to finish...'
wait  # a fixed sleep 60 would race against slow chunks
# Merge the per-chunk top lists
find . -name 'part.alldata.txt*_Tops.txt' | xargs cat | awk '{print $2"\t"$1}' | tr '[A-Z]' '[a-z]' | sort > partsAllTops.txt
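# partsAllTops.txt is now "term<TAB>count", lowercased. Note that
# awk '{print $2"\t"$1}' keeps only the first whitespace-separated token,
# so multi-word terms are reduced to their first word at this step.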
# Aggregate counts per keyword
awk '{a[$1]+=$2;}END{for(i in a){print i,a[i];}}' partsAllTops.txt | awk '{print $2"\t"$1}' | sort -rn | grep -v 'www.' | grep -v 'http:' > statistic.result
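# statistic.result: "count<TAB>term", highest count first, with obvious
# URL noise (www. / http:) filtered out.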