1. 用 awk 的 match 匹配 URL:待匹配的字符串如果以 / 开头或者以 / 结尾,需要把 / 写成 [/];分号同样需要处理,改用它的 ASCII 码转义 \\x3B
-- Project the four columns the awk program reads, in this order:
-- time ($1), city ($2), user ($3), referer ($4).
-- NOTE(review): relation i is defined outside this snippet. The awk program
-- relies on the streamed fields being whitespace/tab separated - confirm.
j = foreach i generate time,city,user,referer;
-- Classify each row by its referer and report how the location was chosen.
-- Output rows: label, reason, time, user (tab separated).
--   * The more specific /jobs/ pattern is tested FIRST. The channel-page
--     pattern is a prefix of it, so testing the channel page first (as the
--     previous version did) made the result-page branch unreachable.
--   * The city field is split on the semicolon, written as \\x3B because a
--     literal semicolon cannot be used inside the command.
--   * The element count comes from the return value of split.
--   * Empty elements (produced by a trailing semicolon) are skipped in the
--     province check. An empty string used as a regex matches everything and
--     would wrongly report a province.
--   * A trailing "省" is stripped before the lookup, except for "台湾省",
--     which is stored with its suffix in the province list.
k = stream j through `awk '
{
    refer = $4
    tt = ""
    if (match(refer, "http://sou.zhaopin.com/jobs[/]"))
        tt = "职位搜索结果页"
    else if (match(refer, "http://sou.zhaopin.com"))
        tt = "职位搜索频道页"
    if (length(tt) > 0)
    {
        n = split($2, arrcity, "\\x3B")
        if (n > 1)
            print tt"\t选择了多个地点\t"$1"\t"$3
        tmp = "广东,湖北,陕西,四川,辽宁,吉林,江苏,山东,浙江,广西,安徽,河北,山西,内蒙,黑龙江,福建,江西,河南,湖南,海南,贵州,云南,西藏,甘肃,青海,宁夏,新疆,新疆维吾尔自治区,香港特别行政区,澳门特别行政区,台湾省"
        b = "false"
        for (i = 1; i <= n; i++)
        {
            city1 = arrcity[i]
            if (city1 != "台湾省" && substr(city1, length(city1)) == "省")
                city1 = substr(city1, 1, length(city1) - 1)
            if (length(city1) > 0 && match(tmp, city1))
            {
                b = "true"
                break
            }
        }
        if (b == "true")
            print tt"\t直接选择省\t"$1"\t"$3
    }
}'`;
2.使用awk匹配URL,注意里面的"?"符号,需要处理变成"[?]"
-- Project time ($1), site ($2), path ($3), referer ($4) and user ($5).
-- A null referer is replaced by the string NA so the column is never empty.
-- NOTE(review): relation h3 is defined outside this snippet.
b = foreach h3 generate time, site, path, ((referer is null) ? 'NA' : referer) as referer:chararray, user;
-- Emit one row per hit on the resume pages of my.zhaopin.com:
--   * resume_nav.asp?nr=yes            -> time, page URL, NA, user
--   * resume_baseinfo.asp reached from
--     resume_nav.asp?nr=yes            -> time, page URL, referer URL, user
-- Rows for any other site produce no output. The question mark in the URL is
-- written as [?] so it is matched literally instead of acting as a regex
-- quantifier, and the leading slash is written as [/].
i = stream b through `awk '
{
    host = $2
    path = $3
    referer = $4
    if (host != "my.zhaopin.com")
        next
    if (match(path, "[/]myzhaopin/resume_nav.asp[?]nr=yes"))
        print $1"\thttp://my.zhaopin.com/myzhaopin/resume_nav.asp?nr=yes\tNA\t"$5
    if (match(path, "[/]myzhaopin/resume_baseinfo.asp") && match(referer, "http://my.zhaopin.com/myzhaopin/resume_nav.asp[?]nr=yes"))
        print $1"\thttp://my.zhaopin.com/myzhaopin/resume_baseinfo.asp\thttp://my.zhaopin.com/myzhaopin/resume_nav.asp?nr=yes\t"$5
}'`;
3.使用awk匹配城市,区分单城市,单省份,多地点
11.txt文件内容:
530;586;785;
浙江
台湾省;北京;上海
538
(赣州
深圳
上海;(湖北)
青海省
湖北省;陕西
-- Load the sample file. Each line holds one raw location value, which may be
-- a single name, a numeric code, or several values joined by semicolons.
a = load '11.txt' as (city:chararray);
b = foreach a generate city;
-- Classify every location value:
--   * exactly one element that is a known province -> "选择了单个省"
--   * exactly one element containing a character
--     other than ASCII letters and digits          -> "选择了单个城市"
--   * several elements of which more than one is a
--     province or such a name                      -> "多选地点-包括城市和省"
-- Parentheses are removed from each element before it is used as a regex.
-- A trailing "省" is stripped before the lookup, except for "台湾省", which is
-- stored with its suffix in the province list.
-- Fix: the single-element branch now trims the suffix using the length of
-- citytmp. It previously used the length of city1, a variable that is only
-- assigned in the multi-element branch, so it was either unset or left over
-- from an earlier input line and the value was truncated incorrectly.
-- The element count is taken from the return value of split.
c = stream b through `awk '
{
    tmp = "广东,湖北,陕西,四川,辽宁,吉林,江苏,山东,浙江,广西,安徽,河北,山西,内蒙,黑龙江,福建,江西,河南,湖南,海南,贵州,云南,西藏,甘肃,青海,宁夏,新疆,新疆维吾尔自治区,香港特别行政区,澳门特别行政区,台湾省"
    n = split($1, arrcity, "\\x3B")
    if (n == 1)
    {
        citytmp = arrcity[1]
        gsub("([\)]|[\(])*", "", citytmp)
        if (citytmp != "台湾省" && substr(citytmp, length(citytmp)) == "省")
            citytmp = substr(citytmp, 1, length(citytmp) - 1)
        if (match(tmp, citytmp))
            print "选择了单个省\t"$1
        else if (match(citytmp, "[^0-9A-Za-z]+"))
            print "选择了单个城市\t"$1
    }
    else
    {
        b = 0
        for (i = 1; i <= n; i++)
        {
            city1 = arrcity[i]
            gsub("([\)]|[\(])*", "", city1)
            if (city1 != "台湾省" && substr(city1, length(city1)) == "省")
                city1 = substr(city1, 1, length(city1) - 1)
            if (match(tmp, city1) || match(city1, "[^0-9A-Za-z]+"))
                b = b + 1
        }
        if (b > 1)
            print "多选地点-包括城市和省\t"$1
    }
}'`;
dump c;
浙公网安备 33010602011771号