[Python Crawler] A cnblogs Blog Backup Tool
A small exercise in concurrent crawling.
Paste the code into a local file, save it with a .py extension, and run it; the command-line argument is the cnblogs user whose posts you want to back up. It defaults to this blog's user.
The output is a directory named after the user, containing the downloaded posts.
This is only meant for practising Python multithreaded programming; it will later be rewritten as a parallel crawler.
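For example, assuming the script is saved locally as cnblogs_backup.py (the file name is not fixed, pick whatever you like), backing up this blog would look like:

python cnblogs_backup.py kirai

Note that the code targets Python 2 (it uses Queue, urllib.urlopen and file()) and needs the pyquery and threadpool packages installed; the posts end up as kirai/<post id>.html under the current directory.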
The crawler code is as follows:
# -*- coding:utf-8 -*-
from multiprocessing.managers import BaseManager
from pyquery import PyQuery
import os, sys, urllib
import re, random, logging, time
import Queue, threading, multiprocessing, threadpool

USER_NAME = 'kirai'
TOTAL_PAGE_NUMBER = 0
INT_REGEXP = re.compile('([\d]+)')
BASE_URL = 'http://www.cnblogs.com/'+USER_NAME+'/p/?page='
ARTICLE_REGEXP = re.compile('href=\"(http://www.cnblogs.com/'+USER_NAME+'/p/[\d]+.html)\"')
THREAD_NUMBER = multiprocessing.cpu_count() * 2
ARTICLE_URLS_MUTEX = threading.Lock()
ARTICLE_URLS = []

# list.extend() returns None; this subclass returns self so the calls can be
# chained inside reduce() when flattening the per-page URL lists.
class ListWithLinkExtend(list):
    def extend(self, value):
        super(ListWithLinkExtend, self).extend(value)
        return self

# Read the pager of the first list page to get the total number of pages.
def get_total_page_number():
    doc = PyQuery(url=BASE_URL)
    return int(INT_REGEXP.findall(
        doc.find('.pager .Pager').text())[0].encode('ascii'))

# Build the URL of every list page.
def get_page_url():
    global TOTAL_PAGE_NUMBER
    return map(lambda page: BASE_URL+str(page),
               [i for i in range(1, TOTAL_PAGE_NUMBER+1)])

# Extract the article URLs contained in the idx-th list page.
def get_article_url(idx):
    url = PAGE_URLS[idx]
    doc = PyQuery(url=url)
    article_urls = ARTICLE_REGEXP.findall(str(doc.find('.PostList .postTitl2')))
    return article_urls

# threadpool callback: collect one page's URL list under the lock.
def handle_result(request, result):
    global ARTICLE_URLS_MUTEX, ARTICLE_URLS
    try:
        ARTICLE_URLS_MUTEX.acquire()
        ARTICLE_URLS.append(result)
    finally:
        ARTICLE_URLS_MUTEX.release()

# Skeleton of the planned distributed version (not called yet).
class KiraiManager(BaseManager):
    pass

def cluster_process():
    global ARTICLE_URLS
    # list : urls
    task_queue = Queue.Queue()
    # str : path
    result_queue = Queue.Queue()
    KiraiManager.register('get_task_queue', callable=lambda: task_queue)
    KiraiManager.register('get_result_queue', callable=lambda: result_queue)
    manager = KiraiManager(address=('', 6969), authkey='whosyourdaddy')
    manager.start()
    manager.shutdown()
    # article_flag, article_urls = get_article_url()

# a simple way.
def get_article(url):
    html = urllib.urlopen(url).read()
    return html, INT_REGEXP.findall(url)[0]

# threadpool callback: write one downloaded post to <USER_NAME>/<id>.html.
def save_article(request, result):
    content = result[0]
    file_name = result[1]
    path = './' + USER_NAME + '/' + file_name + '.html'
    try:
        fp = file(path, 'w')
        fp.writelines(content)
    finally:
        fp.close()

# Download every collected article URL with a thread pool.
def thread_process():
    global ARTICLE_URLS
    os.mkdir(USER_NAME)
    thread_pool = threadpool.ThreadPool(THREAD_NUMBER)
    requests = threadpool.makeRequests(get_article, ARTICLE_URLS, save_article)
    [thread_pool.putRequest(req) for req in requests]
    thread_pool.wait()

def __main__(argv):
    global ARTICLE_URLS, TOTAL_PAGE_NUMBER, USER_NAME, BASE_URL, ARTICLE_REGEXP, PAGE_URLS
    if len(argv) == 2:
        USER_NAME = argv[1]
        BASE_URL = 'http://www.cnblogs.com/'+USER_NAME+'/p/?page='
        ARTICLE_REGEXP = re.compile('href=\"(http://www.cnblogs.com/'+USER_NAME+'/p/[\d]+.html)\"')
    TOTAL_PAGE_NUMBER = get_total_page_number()
    PAGE_URLS = get_page_url()
    # First pass: scan every list page concurrently and collect article URLs.
    thread_pool = threadpool.ThreadPool(THREAD_NUMBER)
    requests = threadpool.makeRequests(
        get_article_url,
        [i for i in range(0, TOTAL_PAGE_NUMBER)],
        handle_result)
    [thread_pool.putRequest(req) for req in requests]
    thread_pool.wait()
    # Flatten the list of per-page URL lists into one flat list.
    ARTICLE_URLS = list(reduce(
        lambda a, b: ListWithLinkExtend(a).extend(ListWithLinkExtend(b)),
        ARTICLE_URLS))
    # Second pass: download and save every article.
    thread_process()

if __name__ == '__main__':
    __main__(sys.argv)
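The same threadpool pattern appears twice: page indices are fed through get_article_url with handle_result as the completion callback, then the collected article URLs are fed through get_article with save_article as the callback. If the request/callback mechanics of the threadpool package are unfamiliar, here is a minimal, self-contained sketch of that pattern (square and collect are placeholder names used only for this illustration):

import threadpool

def square(n):                 # worker function, runs in a pool thread
    return n * n

def collect(request, result):  # callback, invoked when a request finishes
    print(result)

pool = threadpool.ThreadPool(4)
for req in threadpool.makeRequests(square, [1, 2, 3, 4], collect):
    pool.putRequest(req)
pool.wait()                    # block until all requests have completed

makeRequests builds one work request per item of the argument list, and the callback always receives the finished request object plus the worker's return value, which is why handle_result and save_article both take (request, result).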
A brief explanation of the global variables:
USER_NAME: the user whose blog is crawled; defaults to kirai.
TOTAL_PAGE_NUMBER: updated to the total number of pages in the post list.
INT_REGEXP: a regular expression for matching numbers.
BASE_URL: the base URL of the post-list pages.
ARTICLE_REGEXP: a regular expression that extracts the article page URLs from each post-list page after it has been processed with pyquery.
THREAD_NUMBER: the number of worker threads, defaulting to twice the number of CPU cores on this machine.
ARTICLE_URLS_MUTEX: the lock protecting ARTICLE_URLS, so only one thread touches it at a time.
ARTICLE_URLS: holds all of the collected article URLs (see the sketch after this list for how the per-page sub-lists are flattened into it).
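Because handle_result appends one whole list of URLs per list page, ARTICLE_URLS ends up as a list of lists and has to be flattened before thread_process runs. The built-in list.extend returns None, which would break a reduce chain, and that is the only reason ListWithLinkExtend exists. A rough illustration with made-up URLs:

# list.extend() returns None, so a plain reduce(lambda a, b: a.extend(b), ...)
# would blow up after the first step; returning self makes the calls chainable.
class ListWithLinkExtend(list):
    def extend(self, value):
        super(ListWithLinkExtend, self).extend(value)
        return self

pages = [['url1', 'url2'], ['url3'], ['url4', 'url5']]  # one sub-list per list page
flat = list(reduce(
    lambda a, b: ListWithLinkExtend(a).extend(ListWithLinkExtend(b)),
    pages))
print(flat)  # ['url1', 'url2', 'url3', 'url4', 'url5']

A plain sum(pages, []) or a nested list comprehension would flatten just as well; the subclass is mainly part of the exercise.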
Reference: http://www.cnblogs.com/kirai/p/6204051.html
To be continued.