Linux Shell 网页抓取 (web-page scraping with a Linux shell script)

args.txt — the script's sample configuration file; its contents are listed after the script below.
#! /bin/bash 
# Web-page scraper driven by a key=value config file (see args.txt below).
# $1 must name an existing config file; otherwise print usage and fail.
if [ -z "$1" ] || [ ! -e "$1" ]
then
	# Usage goes to stderr, and we exit non-zero so callers can detect the error
	# (the original 'exit' returned 0 even on bad usage).
	echo "Usage: $(basename "$0") <config-file>" >&2
	exit 1
fi


# --- Positional-parameter / loop demos (not part of the scraper itself) ---

# $0: the path the script was invoked as.
echo $0
# $*: all positional parameters; unquoted, so each is word-split.
for num in $*;do 
    echo "$num"
done 
# seq -3 $#: counts from -3 up to the argument count,
# e.g. with no arguments it prints -3 -2 -1 0.
for i in $(seq -3 $#); 
    do   
        echo $i 
    done 
# {0..5}: brace expansion over the literal range 0..5.
for i in {0..5} 
do 
    echo $i 
done 
# $@: all positional parameters again (unquoted here, so same as $*).
echo $@ 
  
# C-style arithmetic for loop: prints 4 5 6.
for((i=4;i<7;i++));do 
echo $i 
done 
  
# NOTE(review): $$ is the current shell's PID, not a count of anything —
# the "all:" label is misleading.
echo "all:$$"
  
# sed expression that strips leading/trailing spaces: it captures the text
# between the blank runs as group 2 and keeps only that group.
trimReg="s/\(^ *\)\(.*[^ ]\)\( *$\)/\2/"
# Scratch file for intermediate results. mktemp creates the file atomically
# with a safe unique name; the original built a name by hand from
# /proc/sys/kernel/random/uuid, which is Linux-only and race-prone.
tmpfile=$(mktemp)
  
  
# parse_config FILE — read "key = value" pairs from FILE into the globals
# url, beginwith, endwith, pagereg, savepath, prefix, proxy.
# IFS= with read -r preserves whitespace and backslashes in values; the
# '|| [ -n "$line" ]' clause still processes a final line that lacks a
# trailing newline. Unknown keys are silently ignored, as before.
parse_config() {
    local line key value
    while IFS= read -r line || [ -n "$line" ]; do
        value=${line#*=}
        key=${line%%=*}
        # Trim surrounding blanks with parameter expansion instead of
        # forking sed twice per line.
        key=${key#"${key%%[![:space:]]*}"}
        key=${key%"${key##*[![:space:]]}"}
        value=${value#"${value%%[![:space:]]*}"}
        value=${value%"${value##*[![:space:]]}"}

        case "$key" in
            url)       url=$value ;;
            beginwith) beginwith=$value ;;
            endwith)   endwith=$value ;;
            pagereg)   pagereg=$value ;;
            savepath)  savepath=$value ;;
            prefix)    prefix=$value ;;
            proxy)     proxy=$value ;;
        esac
    done < "$1"
}

parse_config "$1"
  
# Dump the parsed configuration for debugging, one "key:value" per line.
printf '%s\n' \
    "url:$url" \
    "beginwith:$beginwith" \
    "pagereg:$pagereg" \
    "endwith:$endwith" \
    "prefix:$prefix" \
    "proxy:$proxy" \
    "savepath:$savepath" \
    "tmpfile:$tmpfile"
# Fetch the target page, optionally through an HTTP proxy, and transcode it
# from GBK to UTF-8 (the target site serves GBK).
if [ -z "$proxy" ]
then
content=$(curl -s "$url" | iconv -f gbk -t utf-8)
else
content=$(curl -x "$proxy" -s "$url" | iconv -f gbk -t utf-8)
fi 
# ${#var} measures the string in-process; no need to fork expr.
length=${#content}
echo "download:$length byte(s)"
# Keep only the region between the beginwith and endwith markers. The
# markers are quoted inside the pattern so they match literally rather
# than being interpreted as glob patterns.
content=${content#*"${beginwith}"} 
content=${content%%"${endwith}"*}
length=${#content}
echo "after filter:$length byte(s)"

# add_prefix PREFIX FILE — print FILE with PREFIX glued onto each line.
# awk -v is used instead of sed "s/^/$prefix/g" because a URL prefix
# almost always contains '/', which would terminate the sed expression.
add_prefix() {
	awk -v p="$1" '{ print p $0 }' "$2"
}

# Extract every match of the page regex from the downloaded content.
# "$content" is quoted so its newlines survive and no globbing occurs.
printf '%s\n' "$content" | grep -Po "$pagereg" | uniq > "$savepath"

# Collapse duplicate lines (uniq above only merges adjacent duplicates).
awk '{ seen[$0]++ } END { for (m in seen) print m }' "$savepath" > "$tmpfile"

# Prepend the configured prefix, if any, to every extracted link.
if [ -n "$prefix" ]
then
	add_prefix "$prefix" "$tmpfile" > "$savepath"
else
	cp "$tmpfile" "$savepath"
fi

rm -f "$tmpfile"

  
# --- String-manipulation demos ---
str="0000012345456789000000"
echo $str
# Alternatives: str=$(expr substr "$str" 1 2)  or  str=${str:2:3}
# ${str#*0} removes the shortest leading match of *0, i.e. a single '0'.
str=${str#*0}
echo $str
# Trim the surrounding blanks with the sed expression defined earlier.
str="  s =  "
str=$(echo $str | sed -e "${trimReg}")
echo [$str]
echo $str | sed -e "${trimReg}"

 

url = focus.news.163.com
beginwith = <ul class="focuslist-1" id="focusTab-1">
pagereg = (?<=href=")http://focus\.news\.163\.com/[\d]+.+?(?=")
endwith =  <div class="con-4 area clearfix">
savepath = 163.txt

 

posted on 2011-11-12 17:02  yangyh  阅读(3430)  评论(0编辑  收藏  举报