Spider introduction and basic methods (Chapter 1)

Identifying the technologies a website is built with

import builtwith
# builtwith reports which technologies the site appears to be built with
print builtwith.parse("http://example.webscraping.com")

 

The most basic crawler

import urllib2
def download(url):
    print "down", url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
    return html

print download("http://example.webscraping.com")

 

Adding retries via recursion

import urllib2
def download(url, num_retries):
    print "down", url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
        if num_retries > 0 and hasattr(e, "code") and 500 <= e.code < 600:
            # recursively retry 5xx HTTP errors
            return download(url, num_retries-1)
    return html

print download("http://example.webscraping.com",2)


Setting a user agent

import urllib2
def download(url, user_agent="wswp", num_retries=2):
    print "down", url
    headers = {"User-agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
        if num_retries > 0 and hasattr(e, "code") and 500 <= e.code < 600:
            # recursively retry 5xx HTTP errors
            return download(url, user_agent, num_retries-1)
    return html
print download("http://www.meetup.com")

 


Using the download function above to crawl a sitemap.
The script may fail to read the <loc> tags (possible cause: the tags do not exist, e.g. the sitemap could not be downloaded).

import re
def crawl_sitemap(url):
    # download the sitemap
    site_map = download(url)
    print "site_map", site_map
    if site_map is None:
        return
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', site_map)
    print 'links', links
    # download each link
    for link in links:
        html = download(link)

crawl_sitemap("http://example.webscraping.com/sitemap.xml")
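
If the regex turns out to be brittle, here is a minimal alternative sketch (not in the original notes) that parses the same sitemap with the standard-library xml.etree.ElementTree module; the namespace string is the standard sitemap namespace and is an assumption about this particular feed:

import xml.etree.ElementTree as ET

def crawl_sitemap_xml(url):
    # download the sitemap and stop if the request failed
    site_map = download(url)
    if site_map is None:
        return
    # parse the XML instead of matching it with a regex
    tree = ET.fromstring(site_map)
    # standard sitemap namespace (assumed)
    ns = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
    for loc in tree.iter(ns + "loc"):
        html = download(loc.text)

crawl_sitemap_xml("http://example.webscraping.com/sitemap.xml")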

 

 

Iterating over IDs until an error occurs

import itertools  # provides count(), an infinite iterator
for page in itertools.count(1):
    url = "http://example.webscraping.com/view/-%d" % page
    html = download(url)
    if html is None:
        break

 

 

If some IDs in the middle have been deleted, the loop above exits too early.
To handle such gaps, the script below only stops after 5 consecutive failed downloads.

import itertools
max_error = 5  # maximum number of consecutive errors allowed
num_error = 0  # current count of consecutive errors
for page in itertools.count(1):
    url = "http://example.webscraping.com/view/-%d" % page
    html = download(url)
    if html is None:
        num_error += 1
        if num_error == max_error:
            break    # stop after 5 consecutive errors
    else:
        num_error = 0  # reset the count after a successful download

 

 

Link crawler

import re


def get_link(html):
    """Return a list of links found in html.
    """
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)   # re.IGNORECASE: match case-insensitively
    return webpage_regex.findall(html)


def link_crawler(seed_url, link_regex):
    """Crawl from seed_url, following links that match link_regex.
    """
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if html is None:
            continue
        for link in get_link(html):
            if re.match(link_regex, link):
                crawl_queue.append(link)
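
Note that the extracted href values are usually relative paths (e.g. /index/1), which download cannot fetch directly, and already-visited links would be queued again forever. A minimal sketch of both fixes, using urlparse.urljoin and a seen set (these additions are not in the snippet above):

import urlparse

def link_crawler(seed_url, link_regex):
    """Crawl from seed_url, following links that match link_regex."""
    crawl_queue = [seed_url]
    seen = set([seed_url])  # remember visited links to avoid loops
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if html is None:
            continue
        for link in get_link(html):
            if re.match(link_regex, link):
                # resolve relative links against the seed URL
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)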

 

 

Proxy support (the Python requests module handles this more simply; the urllib2 approach is shown below)

import urllib2
import urlparse

url = "http://example.webscraping.com"
proxy = ""  # set this to your proxy address, e.g. "http://host:port"
request = urllib2.Request(url)
opener = urllib2.build_opener()
proxy_params = {urlparse.urlparse(url).scheme: proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))
response = opener.open(request)
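
Since the heading mentions requests, here is a minimal sketch of the same proxy setup with the requests library (requests must be installed separately; the proxy address below is only a placeholder):

import requests

proxy = "http://127.0.0.1:8080"  # placeholder proxy address
proxies = {"http": proxy, "https": proxy}
response = requests.get("http://example.webscraping.com", proxies=proxies)
html = response.text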

 

 

A new version of the download function (with proxy support)

def download(url, user_agent="wswp", proxy=None, num_retries=2):
    print "DownLoading", url
    headers = {"User-agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, "code") and 500 <= e.code < 600:
                # retry 5xx HTTP errors
                html = download(url, user_agent, proxy, num_retries-1)
    return html
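
A quick usage sketch of this version (the proxy address is only a placeholder):

# without a proxy
html = download("http://example.webscraping.com")
# through a proxy (placeholder address)
html = download("http://example.webscraping.com", proxy="http://127.0.0.1:8080")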

 

 

Throttling downloads (adding a delay between consecutive downloads)

import time
import datetime
import urlparse
class Throttle:
    """Add a delay between downloads to the same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds

            if sleep_secs > 0:
                # domain has been accessed recently,
                # so we need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = datetime.datetime.now()

 

 

Using the throttle in practice

throttle = Throttle(delay)
throttle.wait(url)
result = download(url, user_agent=user_agent, proxy=proxy, num_retries=num_retries)
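
A more complete sketch of the same idea (the 5-second delay and the URL list are example values): create one Throttle for the whole crawl and call wait() before each download so requests to the same domain are spaced out:

throttle = Throttle(5)  # wait at least 5 seconds between hits to one domain
urls = [
    "http://example.webscraping.com/view/-1",
    "http://example.webscraping.com/view/-2",
]
for url in urls:
    throttle.wait(url)
    html = download(url, num_retries=2)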

"""爬虫陷阱(有些网站会动态生成内容如:下一月,下一年这种无限递归)
方法:添加深度限制,修改seen变量
(该变量原本只记录访问过的链接,现在修改成为一个字典,增加了页面深度记录)
"""
def link_crawler(... , max_depth=2):
    ...
    seen = {seed_url: 0}  # map each URL to the depth at which it was found
    ...
    depth = seen[url]
    if depth != max_depth:
        for link in links:
            if link not in seen:
                seen[link] = depth + 1
                crawl_queue.append(link)

    """禁用该功能把max_depth设成负数就永远不会相等
    """

 

 

Calling the final version

seed_url = "http://example.webscraping.com/index"
link_regex = "/(index|view)"
link_crawler(seed_url, link_regex, user_agent="BadCrawler")  # the "BadCrawler" user agent is blocked, so this crawl fails
link_crawler(seed_url, link_regex, max_depth=1)  # uses the default user agent, with a maximum depth of 1

 


