【Python Crawler】A cnblogs Blog Backup Tool

A small exercise in writing a concurrent crawler.

Paste the code into a local file, save it with a .py extension, and run it. The command-line argument is the cnblogs user you want to crawl; it defaults to this blog's user.
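
For example, assuming the file is saved locally as cnblogs_backup.py (the file name itself is arbitrary) and the Python 2 dependencies pyquery and threadpool are installed, a typical run would look like:

    python cnblogs_backup.py kirai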

The output is a directory named after the user, containing the downloaded blog posts.

It is only meant for learning Python multithreaded programming; it will later be rewritten as a parallel crawler.

The crawler code is as follows:

# -*- coding:utf-8 -*-
from multiprocessing.managers import BaseManager
from pyquery import PyQuery
import os, sys, urllib
import re, random, logging, time
import Queue, threading, multiprocessing, threadpool

USER_NAME = 'kirai'
TOTAL_PAGE_NUMBER = 0
INT_REGEXP = re.compile('([\d]+)')
BASE_URL = 'http://www.cnblogs.com/'+USER_NAME+'/p/?page='
ARTICLE_REGEXP = re.compile('href=\"(http://www.cnblogs.com/'+USER_NAME+'/p/[\d]+\.html)\"')
THREAD_NUMBER = multiprocessing.cpu_count() * 2
ARTICLE_URLS_MUTEX = threading.Lock()
ARTICLE_URLS = []

# list subclass whose extend() returns self, so calls can be chained inside reduce()
class ListWithLinkExtend(list):
    def extend(self, value):
        super(ListWithLinkExtend, self).extend(value)
        return self

def get_total_page_number():
    # read the pager text on the first list page and take its first integer as the page count
    doc = PyQuery(url=BASE_URL)
    return int(INT_REGEXP.findall(
        doc.find('.pager .Pager').text())[0].encode('ascii'))

def get_page_url():
    # build the URL of every list page: BASE_URL plus a page number from 1 to TOTAL_PAGE_NUMBER
    return [BASE_URL + str(page) for page in range(1, TOTAL_PAGE_NUMBER + 1)]

def get_article_url(idx):
    url = PAGE_URLS[idx]
    doc = PyQuery(url=url)
    article_urls = ARTICLE_REGEXP.findall(str(doc.find('.PostList .postTitl2')))
    return article_urls

def handle_result(request, result):
    global ARTICLE_URLS_MUTEX, ARTICLE_URLS
    try:
        ARTICLE_URLS_MUTEX.acquire()
        ARTICLE_URLS.append(result)
    finally:
        ARTICLE_URLS_MUTEX.release()

# stub for the planned parallel/distributed version; cluster_process() is not called anywhere yet
class KiraiManager(BaseManager):
    pass

def cluster_process():
    global ARTICLE_URLS
    # list : urls
    task_queue = Queue.Queue()
    # str : path
    result_queue = Queue.Queue()
    KiraiManager.register('get_task_queue', callable=lambda: task_queue)
    KiraiManager.register('get_result_queue', callable=lambda: result_queue)
    manager = KiraiManager(address=('', 6969), authkey='whosyourdaddy')
    manager.start()
    manager.shutdown()
    # article_flag, article_urls = get_article_url()

# a simple way.
def get_article(url):
    html = urllib.urlopen(url).read()
    return html, INT_REGEXP.findall(url)[0]

def save_article(request, result):
    content = result[0]
    file_name = result[1]
    path = './' + USER_NAME + '/' + file_name + '.html'
    with open(path, 'w') as fp:
        fp.write(content)

def thread_process():
    global ARTICLE_URLS
    if not os.path.isdir(USER_NAME):
        os.mkdir(USER_NAME)
    thread_pool = threadpool.ThreadPool(THREAD_NUMBER)
    requests = threadpool.makeRequests(get_article, ARTICLE_URLS, save_article)
    for req in requests:
        thread_pool.putRequest(req)
    thread_pool.wait()

def __main__(argv):
    global ARTICLE_URLS, TOTAL_PAGE_NUMBER, USER_NAME, BASE_URL, ARTICLE_REGEXP, PAGE_URLS
    if len(argv) == 2:
        USER_NAME = argv[1]
    BASE_URL = 'http://www.cnblogs.com/'+USER_NAME+'/p/?page='
    ARTICLE_REGEXP = re.compile('href=\"(http://www.cnblogs.com/'+USER_NAME+'/p/[\d]+\.html)\"')
    TOTAL_PAGE_NUMBER = get_total_page_number()
    PAGE_URLS = get_page_url()
    thread_pool = threadpool.ThreadPool(THREAD_NUMBER)
    requests = threadpool.makeRequests(
        get_article_url,
        [i for i in range(0, TOTAL_PAGE_NUMBER)],
        handle_result)
    for req in requests:
        thread_pool.putRequest(req)
    thread_pool.wait()
    # flatten the per-page URL lists collected by handle_result into one flat list
    ARTICLE_URLS = list(reduce(
        lambda a, b: ListWithLinkExtend(a).extend(ListWithLinkExtend(b)),
        ARTICLE_URLS))
    thread_process()

if __name__ == '__main__':
    __main__(sys.argv)

A brief explanation of the global variables:

USER_NAME: the cnblogs user name to crawl; defaults to kirai.

TOTAL_PAGE_NUMBER: updated at runtime with the total number of pages in the blog's essay list.

INT_REGEXP: a regular expression for matching numbers.

BASE_URL: the starting URL of the essay-list pages.

ARTICLE_REGEXP: a regular expression that extracts the URLs of individual blog posts from each pyquery-processed list page.
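
For illustration only, here is roughly what the two regular expressions pick out of a list page; the href below is a made-up fragment shaped like a real post link:

import re

INT_REGEXP = re.compile('([\d]+)')
ARTICLE_REGEXP = re.compile('href=\"(http://www.cnblogs.com/kirai/p/[\d]+\.html)\"')

html = '<a href="http://www.cnblogs.com/kirai/p/6204051.html">some title</a>'
print(ARTICLE_REGEXP.findall(html))  # ['http://www.cnblogs.com/kirai/p/6204051.html']
print(INT_REGEXP.findall(html)[0])   # '6204051'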

THREAD_NUMBER: the number of threads; by default twice the number of CPU cores on the machine.

ARTICLE_URLS_MUTEX: a lock guarding ARTICLE_URLS, so that only one thread appends to it at a time.

ARTICLE_URLS: holds the URLs of all articles to download.
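
The multithreading itself is just the request/callback pattern of the third-party threadpool package, used twice above: once in __main__ to collect article URLs and once in thread_process to download them. makeRequests pairs a worker function with a list of arguments and a result callback, each request is pushed onto the pool with putRequest, and wait() blocks until everything has finished. A minimal standalone sketch of that pattern, assuming threadpool is installed (pip install threadpool):

import threadpool

def work(n):
    # worker function: called in a pool thread, once per item in the argument list
    return n * n

def collect(request, result):
    # result callback: receives the request object and the worker's return value
    print(result)

pool = threadpool.ThreadPool(4)  # four worker threads
requests = threadpool.makeRequests(work, [1, 2, 3, 4], collect)
for req in requests:
    pool.putRequest(req)
pool.wait()  # block until every request has been processed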

Reference: http://www.cnblogs.com/kirai/p/6204051.html

 

To be continued.
