urllib2 标准代码

import itertools
import re
import urllib2
import urlparse
def downloadHtml(url, user_agent=None, num_retries=2):
    """Download the page at ``url`` and return its body, or None on failure.

    Args:
        url: Address of the page to fetch.
        user_agent: Value for the ``User-agent`` request header. When None,
            no header is set here and urllib2 applies its own default.
        num_retries: How many more attempts to make after an HTTP 5xx error.

    Returns:
        The response body as returned by ``read()``, or None when the
        download failed and no retry succeeded.
    """
    print('Downloading: %s' % (url,))
    # Only send the header when a value was supplied; passing None through
    # would put the literal text "None" on the wire as the user agent.
    headers = {}
    if user_agent is not None:
        headers['User-agent'] = user_agent
    req = urllib2.Request(url, headers=headers)
    try:
        response = urllib2.urlopen(req)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
    except urllib2.URLError as e:
        # Fall back to the exception itself if it carries no ``reason``.
        print('Download error: %s' % (getattr(e, 'reason', e),))
        html = None
        # Retry only on server-side (5xx) errors; errors without an HTTP
        # status code and 4xx responses are returned as a failure at once.
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return downloadHtml(url, user_agent, num_retries - 1)
    return html
def download_id(url_template='http://xxxx/%s', max_errors=5):
    """Download pages with consecutive numeric IDs until too many fail in a row.

    Page IDs start at 1 and increase by one. A successful download resets
    the failure counter, so only an unbroken run of failures stops the loop.

    Args:
        url_template: Format string with one ``%s`` placeholder that receives
            the page ID.
        max_errors: Number of consecutive failed downloads after which the
            function stops.
    """
    error_count = 0
    for page_id in itertools.count(1):
        html = downloadHtml(url_template % page_id)
        if html is not None:
            error_count = 0
            continue
        error_count += 1
        if error_count >= max_errors:
            break

def get_links(html):
    """Return the ``href`` targets of the ``<a>`` tags found in ``html``.

    Args:
        html: Page markup as a string; None or an empty string yields no links.

    Returns:
        A list of the raw ``href`` values in document order. Relative links
        are returned as written; they are not resolved here.
    """
    if not html:
        return []
    # An empty pattern would match the empty string at every position and
    # never yield a link, so match the quoted href value of anchor tags.
    reg = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.I | re.S)
    return reg.findall(html)

def link_crawler(seed_url, link_regex):
    """Crawl from ``seed_url``, following links that match ``link_regex``.

    Args:
        seed_url: URL to start from; matching links are also resolved
            against it to make them absolute.
        link_regex: Pattern applied with ``re.match`` to each link as it
            appears in the page, before the link is made absolute.
    """
    crawl_queue = [seed_url]
    # Every URL ever queued, so no page is downloaded twice.
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = downloadHtml(url)
        if html is None:
            # Download failed; there is nothing to extract links from.
            continue
        for link in get_links(html):
            if not re.match(link_regex, link):
                continue
            link = urlparse.urljoin(seed_url, link)
            if link not in seen:
                seen.add(link)
                crawl_queue.append(link)

 

  

posted @ 2017-08-21 13:25  howhy  阅读(219)  评论(0)    收藏  举报