Django:
# 创建project
django-admin startproject mysite
cd mysite
# 创建app
python manage.py startapp app01
python manage.py startapp app02
# 启动项目
python manage.py runserver
scrapy:
# 创建project 项目名称
scrapy startproject xdb
cd xdb
#创建爬虫 爬虫名称 爬虫地址
scrapy genspider chouti chouti.com
scrapy genspider cnblogs cnblogs.com
# 启动爬虫
scrapy crawl chouti
scrapy crawl chouti --nolog
"""
源码内容:
1. 判断当前XdbPipeline类中是否有from_crawler
有:obj = XdbPipeline.from_crawler(...)
否:obj = XdbPipeline()
2. obj.open_spider()
3. obj.process_item()|obj.process_item()|obj.process_item()|
4. obj.close_spider()
"""
from scrapy.exceptions import DropItem
class XdbPipeline(object):
    """Item pipeline that appends each item's 'href' to a text file.

    Lifecycle (driven by Scrapy):
      1. If the class defines from_crawler, the object is built via
         XdbPipeline.from_crawler(...); otherwise via XdbPipeline().
      2. obj.open_spider() is called once when the spider starts.
      3. obj.process_item() is called once per scraped item.
      4. obj.close_spider() is called once when the spider finishes.
    """

    def __init__(self, path):
        # File handle is created lazily in open_spider().
        self.f = None
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor Scrapy uses to build the pipeline.

        :param crawler: crawler whose settings provide HREF_FILE_PATH.
        :return: a new XdbPipeline instance.
        """
        path = crawler.settings.get('HREF_FILE_PATH')
        return cls(path)

    def open_spider(self, spider):
        """Called when the spider starts executing; open the output file.

        :param spider: the running spider (unused).
        """
        # Explicit utf-8 so scraped links are written consistently
        # regardless of the platform's default encoding.
        self.f = open(self.path, 'a+', encoding='utf-8')

    def process_item(self, item, spider):
        """Write the item's href and pass the item on.

        Returning the item hands it to the next pipeline's process_item.
        To discard an item so later pipelines are skipped, you must
        *raise* DropItem() — returning a DropItem instance (as the
        original unreachable line did) has no dropping effect in Scrapy.

        :param item: scraped item with an 'href' field.
        :param spider: the running spider (unused).
        :return: the item, for the next pipeline in ITEM_PIPELINES.
        """
        self.f.write(item.get('href') + '\n')
        return item

    def close_spider(self, spider):
        """Called when the spider is closed; release the file handle.

        :param spider: the running spider (unused).
        """
        self.f.close()
持久化:pipelines
pipelines.py
class XdbPipeline(object):
    """Persist each scraped item's 'href' field to a text file."""

    def __init__(self, path):
        # Target file path; the handle itself is opened in open_spider().
        self.path = path
        self.f = None

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy builds the pipeline through this hook so the output
        # location can come from the project settings.
        file_path = crawler.settings.get('HREF_FILE_PATH')
        return cls(file_path)

    def open_spider(self, spider):
        # Open in append mode once, when the spider starts.
        self.f = open(self.path, 'a+')

    def process_item(self, item, spider):
        # Record the link, then hand the item to the next pipeline.
        href = item.get('href')
        self.f.write(href + '\n')
        return item

    def close_spider(self, spider):
        # Release the file handle when the spider shuts down.
        self.f.close()
settings.py
# settings.py: register the pipeline so Scrapy runs it for every yielded item.
ITEM_PIPELINES = {
'xdb.pipelines.XdbPipeline': 300, # lower number = higher priority; valid range 0--1000
}
items.py
import scrapy
class XdbItem(scrapy.Item):
    """Item holding one scraped link: its anchor text and URL."""
    text = scrapy.Field()  # anchor text of the link
    href = scrapy.Field()  # URL the link points to
chouti.py
import scrapy

# FIX: the original line read "xdb.items import XdbItem" — the 'from'
# keyword was missing, which is a SyntaxError.
from xdb.items import XdbItem


class ChoutiSpider(scrapy.Spider):
    """Spider collecting link text and URLs from chouti.com's front page."""

    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']

    def parse(self, response):
        """Extract every link in the page's link list and yield it as an item.

        :param response: downloaded page response.
        :return: generator of XdbItem(text=..., href=...).
        """
        content_list = response.xpath('//div[@class="link-con"]//div[@class="link-detail"]')
        for item in content_list:
            # extract_first() returns None when the node is missing, so
            # downstream pipelines should tolerate None values.
            text = item.xpath('./a/text()').extract_first()
            href = item.xpath('./a/@href').extract_first()
            yield XdbItem(text=text, href=href)