scrapy 框架的编写
①编写 spiders 目录下由 scrapy genspider [-t template] <name> <domain> 命令创建的脚本
初始文件
# -*- coding: utf-8 -*-
import scrapy


class StartSpiderSpider(scrapy.Spider):
    """Skeleton spider exactly as emitted by `scrapy genspider` -- no logic yet."""

    name = 'start_spider'                              # id used by `scrapy crawl`
    allowed_domains = ['www.tipdm.com']                # off-site requests are dropped
    start_urls = ['http://www.tipdm.com/tipdm/tddt/']  # first page to fetch

    def parse(self, response):
        # Default callback for start_urls; intentionally left as a stub.
        pass
编辑后的文件
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from qunae.items import QunaeItem


class StartSpider(scrapy.Spider):
    """Crawl the www.tipdm.com news list and yield one QunaeItem per article."""

    name = 'start'
    allowed_domains = ['www.tipdm.com']                # domain filter
    start_urls = ['http://www.tipdm.com/tipdm/tddt/']  # list page 1

    def parse(self, response):
        """Read the pager to discover every list page and schedule them all."""
        last_page_num = response.xpath(
            "//div[@class='fpage']/div/a[last()]/text()").extract()
        # BUGFIX: an empty xpath result used to raise IndexError on
        # last_page_num[0]; fall back to crawling page 1 only.
        if not last_page_num:
            self.logger.warning('pager not found on %s', response.url)
            yield Request('http://www.tipdm.com/tipdm/tddt',
                          callback=self.parse_url, dont_filter=True)
            return
        # Pages 2..last follow the index_<n>.html pattern; page 1 is the bare path.
        append_urls = ['http://www.tipdm.com/tipdm/tddt/index_%d.html' % i
                       for i in range(2, int(last_page_num[0]) + 1)]
        append_urls.append('http://www.tipdm.com/tipdm/tddt')
        for url in append_urls:
            yield Request(url, callback=self.parse_url, dont_filter=True)

    def parse_url(self, response):
        """Extract article links from one list page and schedule the detail pages."""
        urls = response.xpath(
            "//div[@class='item clearfix']/div[1]/h1/a/@href").extract()
        for page_url in urls:
            text_url = "http://www.tipdm.com" + page_url  # hrefs are site-relative
            yield Request(text_url, callback=self.parse_text, dont_filter=True)

    def parse_text(self, response):
        """Scrape title, body, date and view count from one article page."""
        item = QunaeItem()
        item['title'] = response.xpath(
            "//div[@class='artTitle']/h1/text()").extract()
        text = response.xpath("//div[@class='artCon']//p/text()").extract()
        # Join paragraphs with " \n" (equivalent to the previous accumulate-and-
        # strip loop, without quadratic string concatenation).
        item['text'] = [" \n".join(text).strip()]
        item['time'] = response.xpath("//span[@class='date']/text()").extract()
        item['view_count'] = response.xpath(
            "//span[@class='view']/text()").extract()
        yield item
②items文件编辑
初始文件
# -*- coding: utf-8 -*-
# Item model definitions for the project.
# Reference: https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class SsItem(scrapy.Item):
    """Placeholder item generated by `scrapy startproject`; no fields defined yet."""

    # Declare fields here, e.g.:  name = scrapy.Field()
    pass
编辑后的文件
import scrapy


class QunaeItem(scrapy.Item):
    """Container for one scraped article."""

    title = scrapy.Field()       # article headline
    text = scrapy.Field()        # article body
    time = scrapy.Field()        # publication date
    view_count = scrapy.Field()  # page view counter
③start文件编辑
初始文件
# -*- coding: utf-8 -*-
import scrapy


class StartSpiderSpider(scrapy.Spider):
    """Skeleton spider for travel.qunar.com.

    NOTE: `scrapy genspider` was originally given a full URL as the <domain>
    argument, which produced the invalid values
    allowed_domains=['https://travel.qunar.com'] and
    start_urls=['http://https://travel.qunar.com/'].  A scheme inside
    allowed_domains breaks the offsite filter, and 'http://https://...' is not
    a fetchable URL; both are corrected below.
    """

    name = 'start_spider'
    allowed_domains = ['travel.qunar.com']      # bare domain only, no scheme
    start_urls = ['https://travel.qunar.com/']  # single well-formed URL

    def parse(self, response):
        # Default callback for start_urls; intentionally left as a stub.
        pass
编辑后的文件
import scrapy
from scrapy.http import Request
from qunae.items import QunaeItem


class StartSpider(scrapy.Spider):
    """Crawl the www.tipdm.com news list and yield one QunaeItem per article."""

    name = 'start'
    allowed_domains = ['www.tipdm.com']                # domain filter
    start_urls = ['http://www.tipdm.com/tipdm/tddt/']  # list page 1

    def parse(self, response):
        """Read the pager to discover every list page and schedule them all."""
        last_page_num = response.xpath(
            "//div[@class='fpage']/div/a[last()]/text()").extract()
        # BUGFIX: an empty xpath result used to raise IndexError on
        # last_page_num[0]; fall back to crawling page 1 only.
        if not last_page_num:
            self.logger.warning('pager not found on %s', response.url)
            yield Request('http://www.tipdm.com/tipdm/tddt',
                          callback=self.parse_url, dont_filter=True)
            return
        # Pages 2..last follow the index_<n>.html pattern; page 1 is the bare path.
        append_urls = ['http://www.tipdm.com/tipdm/tddt/index_%d.html' % i
                       for i in range(2, int(last_page_num[0]) + 1)]
        append_urls.append('http://www.tipdm.com/tipdm/tddt')
        for url in append_urls:
            yield Request(url, callback=self.parse_url, dont_filter=True)

    def parse_url(self, response):
        """Extract article links from one list page and schedule the detail pages."""
        urls = response.xpath(
            "//div[@class='item clearfix']/div[1]/h1/a/@href").extract()
        for page_url in urls:
            text_url = "http://www.tipdm.com" + page_url  # hrefs are site-relative
            yield Request(text_url, callback=self.parse_text, dont_filter=True)

    def parse_text(self, response):
        """Scrape title, body, date and view count from one article page."""
        item = QunaeItem()
        item['title'] = response.xpath(
            "//div[@class='artTitle']/h1/text()").extract()
        text = response.xpath("//div[@class='artCon']//p/text()").extract()
        # Join paragraphs with " \n" (equivalent to the previous accumulate-and-
        # strip loop, without quadratic string concatenation).
        item['text'] = [" \n".join(text).strip()]
        item['time'] = response.xpath("//span[@class='date']/text()").extract()
        item['view_count'] = response.xpath(
            "//span[@class='view']/text()").extract()
        yield item
④pipelines文件编辑
初始文件
# -*- coding: utf-8 -*-
# Item pipeline definitions.
# Pipelines must be enabled via the ITEM_PIPELINES setting.
# Reference: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class SsPipeline(object):
    """Default pass-through pipeline generated by `scrapy startproject`."""

    def process_item(self, item, spider):
        # No transformation: hand the item on to the next pipeline unchanged.
        return item
编辑后的文件
import pandas as pd
from sqlalchemy import create_engine  # MySQL connection factory


class QunaePipeline(object):
    """Persist every scraped item to a local CSV file and a MySQL table."""

    def __init__(self):
        # One engine shared for the whole crawl.
        self.engine = create_engine(
            'mysql+pymysql://root:123456@localhost/qunae?charset=utf8')

    def process_item(self, item, spider):
        data = pd.DataFrame(dict(item))
        # Append rows to the CSV (header=False because the file accumulates
        # rows across calls/runs).
        data.to_csv('qunaes.csv', mode='a', index=False, sep=',',
                    header=False, encoding='utf-8')
        # BUGFIX: if_exists='replace' dropped and recreated the table on every
        # item, so only the last scraped item ever survived in MySQL;
        # 'append' accumulates all rows.
        data.to_sql('qunae_data', self.engine, if_exists='append', index=False)
        return item
⑤setting
# -*- coding: utf-8 -*-
# Scrapy settings for the ss project (generated template, unmodified).
#
# Full reference:
#   https://docs.scrapy.org/en/latest/topics/settings.html
#   https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#   https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ss'

SPIDER_MODULES = ['ss.spiders']
NEWSPIDER_MODULE = 'ss.spiders'

# Identify the crawler to sites via the user agent:
#USER_AGENT = 'ss (+http://www.yourdomain.com)'

# Respect robots.txt rules.
ROBOTSTXT_OBEY = True

# ---- Everything below ships commented out; Scrapy defaults apply. ----

#CONCURRENT_REQUESTS = 32
#DOWNLOAD_DELAY = 3
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
#COOKIES_ENABLED = False
#TELNETCONSOLE_ENABLED = False
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}
#SPIDER_MIDDLEWARES = {
#    'ss.middlewares.SsSpiderMiddleware': 543,
#}
#DOWNLOADER_MIDDLEWARES = {
#    'ss.middlewares.SsDownloaderMiddleware': 543,
#}
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
#ITEM_PIPELINES = {
#    'ss.pipelines.SsPipeline': 300,
#}
#AUTOTHROTTLE_ENABLED = True
#AUTOTHROTTLE_START_DELAY = 5
#AUTOTHROTTLE_MAX_DELAY = 60
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
#AUTOTHROTTLE_DEBUG = False
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
编辑后的文件
# -*- coding: utf-8 -*-
# Scrapy settings for the qunae project.
#
# Full reference:
#   https://docs.scrapy.org/en/latest/topics/settings.html
#   https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#   https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'qunae'

SPIDER_MODULES = ['qunae.spiders']
NEWSPIDER_MODULE = 'qunae.spiders'

# Identify the crawler to sites via the user agent:
#USER_AGENT = 'qunae (+http://www.yourdomain.com)'

# Respect robots.txt rules.
ROBOTSTXT_OBEY = True

#CONCURRENT_REQUESTS = 32
#DOWNLOAD_DELAY = 3
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
#COOKIES_ENABLED = False
#TELNETCONSOLE_ENABLED = False
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}
#SPIDER_MIDDLEWARES = {
#    'qunae.middlewares.QunaeSpiderMiddleware': 543,
#}
#DOWNLOADER_MIDDLEWARES = {
#    'qunae.middlewares.QunaeDownloaderMiddleware': 543,
#}
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Item pipelines must be enabled here, otherwise scraped items are discarded
# instead of being written out by QunaePipeline.
ITEM_PIPELINES = {
    'qunae.pipelines.QunaePipeline': 300,
}

#AUTOTHROTTLE_ENABLED = True
#AUTOTHROTTLE_START_DELAY = 5
#AUTOTHROTTLE_MAX_DELAY = 60
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
#AUTOTHROTTLE_DEBUG = False

# HTTP cache: responses are stored on disk so repeated runs reuse cached
# pages instead of re-downloading them.
HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
# BUGFIX: was the machine-specific absolute path 'E:/qunaes/qunae/ss', which
# breaks on any other machine; a relative path is resolved inside the
# project's .scrapy data directory.
HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
⑥终端运行scrapy框架
打开终端——》进入scrapy框架有scrapy.cfg文件的目录——》运行scrapy crawl start(注意:爬虫名是spider中name的值'start',而不是文件名'start_spider';若没有编写将数据保存到数据库的pipeline,可以运行scrapy crawl start -o books.csv直接导出数据)
浙公网安备 33010602011771号