Check which technologies the target website is built with
import builtwith
print builtwith.parse("http://example.webscraping.com")
The most basic crawler: a simple download function
import urllib2

def download(url):
    print "Downloading", url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
    return html

print download("http://example.webscraping.com")
Add retries for server errors (recursive)
import urllib2

def download(url, num_retries=2):
    print "Downloading", url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
        if num_retries > 0 and hasattr(e, "code") and 500 <= e.code < 600:
            # recursively retry 5xx HTTP errors
            return download(url, num_retries - 1)
    return html

print download("http://example.webscraping.com", 2)
Setting a user agent
import urllib2

def download(url, user_agent="wswp", num_retries=2):
    print "Downloading", url
    headers = {"User-agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
        if num_retries > 0 and hasattr(e, "code") and 500 <= e.code < 600:
            # recursively retry 5xx HTTP errors
            return download(url, user_agent, num_retries - 1)
    return html

print download("http://www.meetup.com")
Using the download function above: a sitemap crawler
Note: the script could not extract the <loc> tags (possibly because the tags do not exist in the page)
import re

def crawl_sitemap(url):
    # download the sitemap
    site_map = download(url)
    print "site_map", site_map
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', site_map)
    print 'links', links
    # download each link
    for link in links:
        html = download(link)

crawl_sitemap("http://example.webscraping.com/sitemap.xml")
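If the sitemap request itself fails, download returns None and re.findall would then raise a TypeError. A minimal defensive variant of the same idea (crawl_sitemap_safe is a hypothetical name, not from the original notes):

def crawl_sitemap_safe(url):
    # download the sitemap; give up early if the request failed
    site_map = download(url)
    if site_map is None:
        print "sitemap download failed", url
        return
    # extract and download each <loc> link
    for link in re.findall('<loc>(.*?)</loc>', site_map):
        html = download(link)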
Iterate over the IDs until an error occurs
import itertools  # itertools.count gives an infinite iterator

for page in itertools.count(1):
    url = "http://example.webscraping.com/view/-%d" % page
    html = download(url)
    if html is None:
        break
    else:
        pass  # success - the page exists and can be scraped
If an ID in the middle of the range has been deleted, the loop above would exit prematurely at that gap.
To handle this, the script only stops after 5 consecutive download errors.
import itertools

max_error = 5  # maximum number of consecutive errors allowed
num_error = 0  # current number of consecutive errors

for page in itertools.count(1):
    url = "http://example.webscraping.com/view/-%d" % page
    html = download(url)
    if html is None:
        num_error += 1
        if num_error == max_error:
            break  # stop after 5 consecutive errors
    else:
        num_error = 0  # reset the counter when the errors are not consecutive
Link crawler
import re

def get_link(html):
    """Return a list of links found in html."""
    # re.IGNORECASE makes the match case-insensitive
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)

def link_crawler(seed_url, link_regex):
    """Crawl from seed_url, following links that match link_regex."""
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_link(html):
            if re.match(link_regex, link):
                crawl_queue.append(link)
Proxy support (the Python HTTP module requests makes this easier; here it is done with urllib2)
import urllib2
import urlparse

proxy = ""  # proxy address, e.g. "http://host:port"
opener = urllib2.build_opener()
# url and request are as defined in the download function above
proxy_params = {urlparse.urlparse(url).scheme: proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))
response = opener.open(request)
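For comparison, a minimal sketch of the same idea using the requests module (download_requests and the 10-second timeout are assumptions, not part of the original notes):

import requests

def download_requests(url, user_agent="wswp", proxy=None):
    headers = {"User-agent": user_agent}
    # requests expects a dict mapping scheme to proxy address
    proxies = {"http": proxy, "https": proxy} if proxy else None
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        response.raise_for_status()  # treat 4xx/5xx responses as errors
        return response.text
    except requests.RequestException as e:
        print "download error", e
        return None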
A newer version of the download function, with proxy support
def download(url, user_agent="wswp", proxy=None, num_retries=2):
    print "Downloading", url
    headers = {"User-agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, "code") and 500 <= e.code < 600:
                # retry 5xx HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html
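A quick usage sketch (the proxy address below is only a placeholder, not from the original notes):

html = download("http://example.webscraping.com")  # direct request
html = download("http://example.webscraping.com", proxy="http://127.0.0.1:8080")  # via a proxy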
Download throttling (add a delay between two downloads to the same domain)
import time
import datetime
import urlparse

class Throttle:
    """Add a delay between downloads to the same domain."""
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # domain has been accessed recently, so we need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = datetime.datetime.now()
Example: applying the delay before each download
throttle = Throttle(delay)
throttle.wait(url)
result = download(url, user_agent=user_agent, proxy=proxy, num_retries=num_retries)
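Put together as a runnable sketch (the 1-second delay and the example URLs are illustrative, not from the original notes):

throttle = Throttle(1)  # at least 1 second between requests to the same domain
for url in ["http://example.webscraping.com/view/-1",
            "http://example.webscraping.com/view/-2"]:
    throttle.wait(url)   # sleeps if this domain was accessed too recently
    html = download(url)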
"""爬虫陷阱(有些网站会动态生成内容如:下一月,下一年这种无限递归)
方法:添加深度限制,修改seen变量
(该变量原本只记录访问过的链接,现在修改成为一个字典,增加了页面深度记录)
"""
def link_crawler(... , max_depth=2):
max_depth = 2
seen = {}
...
depth = seen[url]
if depth != max_depth:
for link in links:
if link not in seen:
seen[link] = depth + 1
crawl_queue.append(link)
"""禁用该功能把max_depth设成负数就永远不会相等
"""
Calling the final version
seed_url = "http://example.webscraping.com/index"
link_regex = "/(index|view)"
link_crawler(seed_url, link_regex, user_agent="BadCrawler")  # this user agent is blocked, so the crawl cannot run
link_crawler(seed_url, link_regex, max_depth=1)  # default user agent, depth limited to 1