Check which technologies the target website is built with
import builtwith
print builtwith.parse("http://example.webscraping.com")
The most basic crawler: a simple download function
import urllib2

def download(url):
    print "Downloading", url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
    return html

print download("http://example.webscraping.com")
Add retries for server errors (recursive)
import urllib2

def download(url, num_retries=2):
    print "Downloading", url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
        if num_retries > 0 and hasattr(e, "code") and 500 <= e.code < 600:
            # recursively retry 5xx HTTP errors
            return download(url, num_retries - 1)
    return html

print download("http://example.webscraping.com", 2)
Setting a user agent
import urllib2

def download(url, user_agent="wswp", num_retries=2):
    print "Downloading", url
    headers = {"User-agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
        if num_retries > 0 and hasattr(e, "code") and 500 <= e.code < 600:
            # recursively retry 5xx HTTP errors
            return download(url, user_agent, num_retries - 1)
    return html

print download("http://www.meetup.com")
Using the download function above: a sitemap crawler
Note: the script could not extract the <loc> tags (possibly because the tags do not exist in the page)
import re

def crawl_sitemap(url):
    # download the sitemap
    site_map = download(url)
    print "site_map", site_map
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', site_map)
    print 'links', links
    # download each link
    for link in links:
        html = download(link)

crawl_sitemap("http://example.webscraping.com/sitemap.xml")
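If the sitemap request itself fails, download returns None and re.findall would then raise a TypeError. A minimal defensive variant of the same idea (crawl_sitemap_safe is a hypothetical name, not from the original notes):

def crawl_sitemap_safe(url):
    # download the sitemap; give up early if the request failed
    site_map = download(url)
    if site_map is None:
        print "sitemap download failed", url
        return
    # extract and download each <loc> link
    for link in re.findall('<loc>(.*?)</loc>', site_map):
        html = download(link)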
Iterate over the IDs until an error occurs
import itertools  # itertools.count gives an infinite iterator

for page in itertools.count(1):
    url = "http://example.webscraping.com/view/-%d" % page
    html = download(url)
    if html is None:
        break
    else:
        pass  # success - the page exists and can be scraped
If an ID in the middle of the range has been deleted, the loop above would exit prematurely at that gap.
To handle this, the script only stops after 5 consecutive download errors.
import itertools

max_error = 5  # maximum number of consecutive errors allowed
num_error = 0  # current number of consecutive errors

for page in itertools.count(1):
    url = "http://example.webscraping.com/view/-%d" % page
    html = download(url)
    if html is None:
        num_error += 1
        if num_error == max_error:
            break  # stop after 5 consecutive errors
    else:
        num_error = 0  # reset the counter when the errors are not consecutive
Link crawler
import re

def get_link(html):
    """Return a list of links found in html."""
    # re.IGNORECASE makes the match case-insensitive
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)

def link_crawler(seed_url, link_regex):
    """Crawl from seed_url, following links that match link_regex."""
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_link(html):
            if re.match(link_regex, link):
                crawl_queue.append(link)
Proxy support (the Python HTTP module requests makes this easier; here it is done with urllib2)
import urllib2
import urlparse

proxy = ""  # proxy address, e.g. "http://host:port"
opener = urllib2.build_opener()
# url and request are as defined in the download function above
proxy_params = {urlparse.urlparse(url).scheme: proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))
response = opener.open(request)
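For comparison, a minimal sketch of the same idea using the requests module (download_requests and the 10-second timeout are assumptions, not part of the original notes):

import requests

def download_requests(url, user_agent="wswp", proxy=None):
    headers = {"User-agent": user_agent}
    # requests expects a dict mapping scheme to proxy address
    proxies = {"http": proxy, "https": proxy} if proxy else None
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        response.raise_for_status()  # treat 4xx/5xx responses as errors
        return response.text
    except requests.RequestException as e:
        print "download error", e
        return None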
A newer version of the download function, with proxy support
def download(url, user_agent="wswp", proxy=None, num_retries=2):
    print "Downloading", url
    headers = {"User-agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, "code") and 500 <= e.code < 600:
                # retry 5xx HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html
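A quick usage sketch (the proxy address below is only a placeholder, not from the original notes):

html = download("http://example.webscraping.com")  # direct request
html = download("http://example.webscraping.com", proxy="http://127.0.0.1:8080")  # via a proxy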
Download throttling (add a delay between two downloads to the same domain)
import time
import datetime
import urlparse

class Throttle:
    """Add a delay between downloads to the same domain."""
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # domain has been accessed recently, so we need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = datetime.datetime.now()
Example: applying the delay before each download
throttle = Throttle(delay)
throttle.wait(url)
result = download(url, user_agent=user_agent, proxy=proxy, num_retries=num_retries)
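Put together as a runnable sketch (the 1-second delay and the example URLs are illustrative, not from the original notes):

throttle = Throttle(1)  # at least 1 second between requests to the same domain
for url in ["http://example.webscraping.com/view/-1",
            "http://example.webscraping.com/view/-2"]:
    throttle.wait(url)   # sleeps if this domain was accessed too recently
    html = download(url)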
"""爬虫陷阱(有些网站会动态生成内容如:下一月,下一年这种无限递归)
方法:添加深度限制,修改seen变量
(该变量原本只记录访问过的链接,现在修改成为一个字典,增加了页面深度记录)
"""
def link_crawler(... , max_depth=2):
max_depth = 2
seen = {}
...
depth = seen[url]
if depth != max_depth:
for link in links:
if link not in seen:
seen[link] = depth + 1
crawl_queue.append(link)
"""禁用该功能把max_depth设成负数就永远不会相等
"""
Calling the final version
seed_url = "http://example.webscraping.com/index"
link_regex = "/(index|view)"
link_crawler(seed_url, link_regex, user_agent="BadCrawler")  # this user agent is blocked, so the crawl cannot run
link_crawler(seed_url, link_regex, max_depth=1)  # default user agent, depth limited to 1