python获取网页所有链接 - lexus

公告

http://hongyan.cqupt.edu.cn/blog/blog.php?do=showone&tid=36247

#! /usr/bin/env python
#coding=utf-8
import urllib2,re,redis
#a=urllib2.urlopen("http://www.cnblogs.com/lexus").read()
#print(a)
#f=open("a.html","w")
#f.write(a)
#f.close()
# Construct and compile the regular expressions once, at import time.
# Matches one whole <a ...href=...>...</a> element on a single line.
A_TAG_RE = re.compile(r'<a.+?href=.+?>.+?</a>')
# Matches the text between the first '>' and the closing </a>.
LINK_TEXT_RE = re.compile(r'(?<=>).*?(?=</a>)')
# Matches the value of a double-quoted href attribute.
HREF_RE = re.compile(r'(?<=href\=\").*?(?=\")')

# Only links whose href contains this string are reported.
SITE_PREFIX = r"http://www.cnblogs.com/lexus"

# Upper bound on the number of <a> tags examined.  The original loop
# counter let 1001 tags through before breaking; that limit is kept.
MAX_TAGS = 1001


def find_links(html, prefix=SITE_PREFIX, max_tags=MAX_TAGS):
    """Return (text, href) pairs for anchors in html that point at prefix.

    html     -- page source to scan.
    prefix   -- substring that an href must contain to be reported.
    max_tags -- only the first max_tags <a> elements are examined.

    Anchors without a double-quoted href (for example href='...') are
    skipped; previously they raised AttributeError because the empty
    result list was treated as a string.
    """
    links = []
    # Find the <a></a> tags one by one.
    for tag in A_TAG_RE.findall(html)[:max_tags]:
        hrefs = HREF_RE.findall(tag)
        if not hrefs:
            continue
        href = hrefs[0]
        if prefix not in href:
            continue
        texts = LINK_TEXT_RE.findall(tag)
        text = texts[0] if texts else ""
        links.append((text, href))
    return links


def main():
    """Print every matching link found in a.html, then the match count."""
    # The context manager closes the file even if read() fails.
    with open("a.html", "r") as page:
        html = page.read()

    links = find_links(html)
    for text, href in links:
        # Single-argument print() behaves the same on Python 2 and 3 and
        # reproduces the layout of: print sname, "\n", shref, "\n\n\n"
        print("%s \n%s \n\n\n" % (text, href))

    print("=" * 10)
    print(len(links))


if __name__ == "__main__":
    main()

posted on 2011-12-12 00:55 lexus 阅读(847) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

浙江省高等学校教师教育理论培训

公告