昨天晚上看看python,比较高兴,今天照着教程写了个小例子
主要用到了python中的正则表达式re和网络urllib2,
下面我附上全部代码
其实downURL这个的功能是最核心的,因为整个爬虫扒取网页的主要功能就是由它实现的
- #coding=utf-8
- import urllib2
- import re
def downURL(url, filename):
    """Download the page at `url` and save its raw bytes to `filename`.

    Returns 1 on success, 0 if the URL could not be opened.
    Errors are reported as a best-effort message, matching the original
    contract (callers check the 0/1 return value, no exception escapes).
    """
    try:
        fp = urllib2.urlopen(url)
    except Exception:  # narrowed from a bare except: (still best-effort)
        print('download exception')
        return 0
    try:
        op = open(filename, "wb")  # the fetched page is stored here
        try:
            while 1:
                s = fp.read()
                if not s:
                    break
                op.write(s)
        finally:
            op.close()  # close the output file even if a read/write raises
    finally:
        fp.close()  # never leak the HTTP connection
    return 1
def extract_page_urls(html):
    """Return every qiushibaike hot-page URL found in the string `html`.

    The pattern escapes the dots (the original `.` matched any character)
    and adds `\\d*` so page-numbered URLs are captured too -- the original
    matched only the literal bare prefix, so the spider could never
    discover a new page.  `\\d*` keeps the bare prefix matching as well,
    staying backward compatible.
    """
    pattern = re.compile(r"http://m\.qiushibaike\.com/hot/page/\d*")
    return pattern.findall(html)


def getURL(url):
    """Fetch `url` and mine it for hot-page URLs via a regular expression.

    Returns a list of matched URLs; returns [] when the fetch fails or
    nothing matches (the original could raise NameError on an empty
    response because `urls` was assigned only inside the read loop).
    """
    try:
        fp = urllib2.urlopen(url)
    except Exception:  # narrowed from a bare except:
        print('get url exception')
        return []
    urls = []  # accumulate across reads; the original overwrote each time
    try:
        while 1:
            s = fp.read()
            if not s:
                break
            urls.extend(extract_page_urls(s))
    finally:
        fp.close()  # never leak the HTTP connection
    return urls
- def spider(startURL,times):#爬虫
- urls = []
- urls.append(startURL)
- i=0;
- while 1:
- if i>times:
- break
- if len(urls)>0:
- url = urls.pop(0)
- print url,len(urls)
- downURL(url,str(i)+'.htm')#趴下来的网页
- i=i+1
- if len(urls)<times:
- urllist = getURL(url)
- for url in urllist:
- if urls.count(url) == 0:
- urls.append(url)
- else:
- break
- return 1
if __name__ == "__main__":
    # Crawl the hot-pages listing; times=2 means at most 3 pages are saved.
    spider("http://m.qiushibaike.com/hot/page/", 2)
浙公网安备 33010602011771号