# spider_main.py

# coding=UTF-8

import html_download
import html_outputer
import html_parser
import url_maneger


class SpiderMain(object):
    """Crawl scheduler that drives the four crawler components.

    The instance holds a URL manager, a downloader, a parser and an
    outputer (built from the project modules ``url_maneger``,
    ``html_download``, ``html_parser`` and ``html_outputer``) and runs the
    fetch -> parse -> collect loop over them.
    """

    def __init__(self):
        """Create the four components used by :meth:`crawl`."""
        self.urls = url_maneger.urlManeger()
        self.downloader = html_download.htmlDownload()
        self.parser = html_parser.htmlParser()
        self.outputer = html_outputer.htmlOutputer()

    def crawl(self, root_url):
        """Crawl starting from ``root_url`` and emit the collected data.

        URLs are pulled from the URL manager one at a time; each page is
        downloaded and parsed, the links found on it are queued, and the
        parsed data is handed to the outputer. A page that fails at any of
        these steps is reported as ``error page`` and skipped.

        The loop ends when the URL manager has no new URL left, or once the
        counter (which starts at 1 and is bumped after every successfully
        processed page) reaches 10000. ``self.outputer.output()`` is called
        once after the loop.

        Args:
            root_url: URL the crawl starts from.
        """
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                # Take the next pending URL.
                new_url = self.urls.get_new_url()
                print('正在爬取:' + new_url)
                # Download the page content.
                html_cont = self.downloader.download(new_url)
                # Parse it into newly discovered URLs and the page data.
                new_urls, data = self.parser.parse(new_url, html_cont)
                # Queue the discovered URLs for later iterations.
                self.urls.add_new_urls(new_urls)
                # Hand the parsed data to the outputer.
                self.outputer.collect_data(data)

                # Only successfully processed pages advance the counter.
                count = count + 1
            except Exception:
                # Best-effort crawl: skip pages that fail. ``Exception``
                # (not a bare ``except``) is caught so that
                # KeyboardInterrupt / SystemExit still stop the crawl.
                print('error page')

            if count == 10000:
                break
        self.outputer.output()


if __name__ == "__main__":
    # Script entry point: build the spider and start crawling from the
    # Baidu Baike "python" entry.
    SpiderMain().crawl('http://baike.baidu.com/item/python')
# Posted @ 2017-08-11 10:08 by 岑忠满 | views (356) | comments (0) | edit | bookmark | report