scrapy 框架的编写

①编辑spiders目录下由 scrapy genspider [-t template] <name> <domain> 命令创建的脚本

初始文件

 1 # -*- coding: utf-8 -*-
 2 import scrapy
 3 
 4 
 5 class StartSpiderSpider(scrapy.Spider):
 6     name = 'start_spider'
 7     allowed_domains = ['www.tipdm.com']  
 8     start_urls = ['http://www.tipdm.com/tipdm/tddt/']
 9
10     def parse(self, response):
11         pass

 编辑后的文件

 1 # -*- coding: utf-8 -*-
 2 import scrapy
 3 from scrapy.http import Request
 4 from qunae.items import QunaeItem
 5 
 6 class StartSpider(scrapy.Spider):
 7     name = 'start'
 8     allowed_domains = ['www.tipdm.com']   #网址域名
 9     start_urls = ['http://www.tipdm.com/tipdm/tddt/']  #网址首页
10     def parse(self, response):
11         # 网页解析
12         last_page_num = response.xpath("//div[@class='fpage']/div/a[last()]/text()").extract()
13         # 网址拼接
14         append_urls = ['http://www.tipdm.com/tipdm/tddt/index_%d.html'%i \
15         for i in range(2,int(last_page_num[0])+1)]
16         append_urls.append('http://www.tipdm.com/tipdm/tddt')
17         # 回调
18         for url in append_urls:
19             yield Request(url, callback=self.parse_url, dont_filter=True)
20 
21     def parse_url(self, response):
22         # 网页解析
23         urls = response.xpath("//div[@class='item clearfix']/div[1]/h1/a/@href").extract()
24         # 回调
25         for page_url in urls:
26             text_url = "http://www.tipdm.com"+page_url
27             yield Request(text_url, callback=self.parse_text, dont_filter=True)
28 
29     def parse_text(self,response):
30         item = QunaeItem()
31         item['title'] = response.xpath("//div[@class='artTitle']/h1/text()").extract()
32         text = response.xpath("//div[@class='artCon']//p/text()").extract()
33         texts = " "
34         for strings in text:
35             texts = texts + strings + " \n"
36         item['text'] = [texts.strip()]
37         item['time'] = response.xpath("//span[@class='date']/text()").extract()
38         item['view_count'] = response.xpath("//span[@class='view']/text()").extract()
39         yield item

②items文件编辑

 初始文件

 1 # -*- coding: utf-8 -*-
 2 
 3 # Define here the models for your scraped items
 4 #
 5 # See documentation in:
 6 # https://docs.scrapy.org/en/latest/topics/items.html
 7 
 8 import scrapy
 9 
10 
11 class SsItem(scrapy.Item):
12     # define the fields for your item here like:
13     # name = scrapy.Field()
14     pass

编辑后的文件

1 import scrapy
2 class QunaeItem(scrapy.Item):
3     # define the fields for your item here like:
4     # name = scrapy.Field()
5     title = scrapy.Field()
6     text = scrapy.Field()
7     time = scrapy.Field()
8     view_count = scrapy.Field()

③start文件编辑

初始文件

 1 # -*- coding: utf-8 -*-
 2 import scrapy
 3 
 4 
 5 class StartSpiderSpider(scrapy.Spider):
 6     name = 'start_spider'
 7     allowed_domains = ['https://travel.qunar.com']
 8     start_urls = ['http://https://travel.qunar.com/']
 9 
10     def parse(self, response):
11         pass

编辑后的文件

 1 import scrapy
 2 from scrapy.http import Request
 3 from qunae.items import QunaeItem
 4 
 5 class StartSpider(scrapy.Spider):
 6     name = 'start'
 7     allowed_domains = ['www.tipdm.com']   #网址域名
 8     start_urls = ['http://www.tipdm.com/tipdm/tddt/']  #网址首页
 9     def parse(self, response):
10         # 网页解析
11         last_page_num = response.xpath("//div[@class='fpage']/div/a[last()]/text()").extract()
12         # 网址拼接
13         append_urls = ['http://www.tipdm.com/tipdm/tddt/index_%d.html'%i \
14         for i in range(2,int(last_page_num[0])+1)]
15         append_urls.append('http://www.tipdm.com/tipdm/tddt')
16         # 回调
17         for url in append_urls:
18             yield Request(url, callback=self.parse_url, dont_filter=True)
19 
20     def parse_url(self, response):
21         # 网页解析
22         urls = response.xpath("//div[@class='item clearfix']/div[1]/h1/a/@href").extract()
23         # 回调
24         for page_url in urls:
25             text_url = "http://www.tipdm.com"+page_url
26             yield Request(text_url, callback=self.parse_text, dont_filter=True)
27 
28     def parse_text(self,response):
29         item = QunaeItem()
30         item['title'] = response.xpath("//div[@class='artTitle']/h1/text()").extract()
31         text = response.xpath("//div[@class='artCon']//p/text()").extract()
32         texts = " "
33         for strings in text:
34             texts = texts + strings + " \n"
35         item['text'] = [texts.strip()]
36         item['time'] = response.xpath("//span[@class='date']/text()").extract()
37         item['view_count'] = response.xpath("//span[@class='view']/text()").extract()
38         yield item

④pipelines文件编辑

初始文件

 1 # -*- coding: utf-8 -*-
 2 
 3 # Define your item pipelines here
 4 #
 5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 7 
 8 
 9 class SsPipeline(object):
10     def process_item(self, item, spider):
11         return item

编辑后的文件

 1 import pandas as pd
 2 from sqlalchemy import create_engine  #数据库连接
 3 
 4 
 5 class QunaePipeline(object):
 6     def __init__(self):
 7         self.engine = create_engine('mysql+pymysql://root:123456@localhost/qunae?charset=utf8')
 8     def process_item(self, item, spider):
 9         data = pd.DataFrame(dict(item))
10         data.to_csv('qunaes.csv',mode='a',index=False,sep=',',header=False,encoding='utf-8')
11         data.to_sql('qunae_data',self.engine,if_exists='replace',index=False)
12         return item

⑤settings文件编辑

 1 # -*- coding: utf-8 -*-
 2 
 3 # Scrapy settings for ss project
 4 #
 5 # For simplicity, this file contains only settings considered important or
 6 # commonly used. You can find more settings consulting the documentation:
 7 #
 8 #     https://docs.scrapy.org/en/latest/topics/settings.html
 9 #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
10 #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
11 
12 BOT_NAME = 'ss'
13 
14 SPIDER_MODULES = ['ss.spiders']
15 NEWSPIDER_MODULE = 'ss.spiders'
16 
17 
18 # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 #USER_AGENT = 'ss (+http://www.yourdomain.com)'
20 
21 # Obey robots.txt rules
22 ROBOTSTXT_OBEY = True
23 
24 # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 #CONCURRENT_REQUESTS = 32
26 
27 # Configure a delay for requests for the same website (default: 0)
28 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
29 # See also autothrottle settings and docs
30 #DOWNLOAD_DELAY = 3
31 # The download delay setting will honor only one of:
32 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 #CONCURRENT_REQUESTS_PER_IP = 16
34 
35 # Disable cookies (enabled by default)
36 #COOKIES_ENABLED = False
37 
38 # Disable Telnet Console (enabled by default)
39 #TELNETCONSOLE_ENABLED = False
40 
41 # Override the default request headers:
42 #DEFAULT_REQUEST_HEADERS = {
43 #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 #   'Accept-Language': 'en',
45 #}
46 
47 # Enable or disable spider middlewares
48 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
49 #SPIDER_MIDDLEWARES = {
50 #    'ss.middlewares.SsSpiderMiddleware': 543,
51 #}
52 
53 # Enable or disable downloader middlewares
54 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
55 #DOWNLOADER_MIDDLEWARES = {
56 #    'ss.middlewares.SsDownloaderMiddleware': 543,
57 #}
58 
59 # Enable or disable extensions
60 # See https://docs.scrapy.org/en/latest/topics/extensions.html
61 #EXTENSIONS = {
62 #    'scrapy.extensions.telnet.TelnetConsole': None,
63 #}
64 
65 # Configure item pipelines
66 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
67 #ITEM_PIPELINES = {
68 #    'ss.pipelines.SsPipeline': 300,
69 #}
70 
71 # Enable and configure the AutoThrottle extension (disabled by default)
72 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
73 #AUTOTHROTTLE_ENABLED = True
74 # The initial download delay
75 #AUTOTHROTTLE_START_DELAY = 5
76 # The maximum download delay to be set in case of high latencies
77 #AUTOTHROTTLE_MAX_DELAY = 60
78 # The average number of requests Scrapy should be sending in parallel to
79 # each remote server
80 #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 # Enable showing throttling stats for every response received:
82 #AUTOTHROTTLE_DEBUG = False
83 
84 # Enable and configure HTTP caching (disabled by default)
85 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 #HTTPCACHE_ENABLED = True
87 #HTTPCACHE_EXPIRATION_SECS = 0
88 #HTTPCACHE_DIR = 'httpcache'
89 #HTTPCACHE_IGNORE_HTTP_CODES = []
90 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
编辑后的文件
 1 # -*- coding: utf-8 -*-
 2 
 3 # Scrapy settings for qunae project
 4 #
 5 # For simplicity, this file contains only settings considered important or
 6 # commonly used. You can find more settings consulting the documentation:
 7 #
 8 #     https://docs.scrapy.org/en/latest/topics/settings.html
 9 #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
10 #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
11 
12 BOT_NAME = 'qunae'
13 
14 SPIDER_MODULES = ['qunae.spiders']
15 NEWSPIDER_MODULE = 'qunae.spiders'
16 
17 
18 # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 #USER_AGENT = 'qunae (+http://www.yourdomain.com)'
20 
21 # Obey robots.txt rules
22 ROBOTSTXT_OBEY = True
23 
24 # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 #CONCURRENT_REQUESTS = 32
26 
27 # Configure a delay for requests for the same website (default: 0)
28 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
29 # See also autothrottle settings and docs
30 #DOWNLOAD_DELAY = 3
31 # The download delay setting will honor only one of:
32 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 #CONCURRENT_REQUESTS_PER_IP = 16
34 
35 # Disable cookies (enabled by default)
36 #COOKIES_ENABLED = False
37 
38 # Disable Telnet Console (enabled by default)
39 #TELNETCONSOLE_ENABLED = False
40 
41 # Override the default request headers:
42 #DEFAULT_REQUEST_HEADERS = {
43 #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 #   'Accept-Language': 'en',
45 #}
46 
47 # Enable or disable spider middlewares
48 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
49 #SPIDER_MIDDLEWARES = {
50 #    'qunae.middlewares.QunaeSpiderMiddleware': 543,
51 #}
52 
53 # Enable or disable downloader middlewares
54 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
55 #DOWNLOADER_MIDDLEWARES = {
56 #    'qunae.middlewares.QunaeDownloaderMiddleware': 543,
57 #}
58 
59 # Enable or disable extensions
60 # See https://docs.scrapy.org/en/latest/topics/extensions.html
61 #EXTENSIONS = {
62 #    'scrapy.extensions.telnet.TelnetConsole': None,
63 #}
64 
65 # Configure item pipelines
66 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
67 #打开了这个才能下载定义的数据
68 ITEM_PIPELINES = {
69    'qunae.pipelines.QunaePipeline': 300,
70 }
71 
72 # Enable and configure the AutoThrottle extension (disabled by default)
73 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
74 #AUTOTHROTTLE_ENABLED = True
75 # The initial download delay
76 #AUTOTHROTTLE_START_DELAY = 5
77 # The maximum download delay to be set in case of high latencies
78 #AUTOTHROTTLE_MAX_DELAY = 60
79 # The average number of requests Scrapy should be sending in parallel to
80 # each remote server
81 #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 # Enable showing throttling stats for every response received:
83 #AUTOTHROTTLE_DEBUG = False
84 
85 # Enable and configure HTTP caching (disabled by default)
86 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 #表示网页已缓存,调用时无需再次爬取网页内容
88 HTTPCACHE_ENABLED = True
89 #HTTPCACHE_EXPIRATION_SECS = 0
90 HTTPCACHE_DIR = 'E:/qunaes/qunae/ss'
91 #HTTPCACHE_IGNORE_HTTP_CODES = []
92 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
⑥终端运行scrapy框架
打开终端——》进入scrapy框架有scrapy.cfg文件的目录——》运行scrapy crawl start(注意:crawl后接的是spider中定义的name属性值'start',而非文件名;若没有编写将数据保存在数据库,可以运行scrapy crawl start -o books.csv)

posted on 2020-03-12 16:09  LiErRui  阅读(242)  评论(0)    收藏  举报

导航