如何使用Scrapy框架实现网络爬虫
现在用下面这个案例来演示如何爬取安居客上深圳的租房信息。我们采取这样的策略:首先爬取所有租房信息的链接地址,然后再根据爬取到的地址获取我们所需要的页面信息。访问次数多了,会被重定向到输入验证码的页面,这个问题后面有几种策略可以解决。
如果还不知道怎么去安装部署scrapy的参考我的另外一篇文章《快速部署网络爬虫框架scrapy》
1. 创建项目:
进入项目路径,使用命令 scrapy startproject anjuke_urls
进入项目路径,使用命令 scrapy startproject anjuke_zufang
2. 创建爬虫文件:
进入项目anjuke_urls的spider路径,使用命令 scrapy genspider anjuke_urls https://sz.zu.anjuke.com/
3. 爬虫anjuke_urls代码:
anjuke_urls.py
# -*- coding: utf-8 -*-
import scrapy

from ..items import AnjukeUrlsItem


class AnjukeGeturlsSpider(scrapy.Spider):
    """Collect the link of every rental listing reachable from the start page.

    Each listing link found on an index page is emitted as an
    ``AnjukeUrlsItem``; the "next page" anchor is followed until it no
    longer appears.
    """

    name = 'anjuke_getUrls'

    start_urls = ['https://sz.zu.anjuke.com/']

    def parse(self, response):
        """Yield one item per listing link, then a request for the next page.

        :param response: the downloaded index page.
        :returns: a generator of ``AnjukeUrlsItem`` objects, optionally
            followed by one ``scrapy.Request`` for the next index page.
        """
        # The container's class attribute is matched both with and without
        # a trailing space, exactly as the two alternatives spell it.
        links = response.xpath(
            "//div[@class='zu-itemmod']/a/@href"
            " | //div[@class='zu-itemmod ']/a/@href"
        ).extract()

        for link in links:
            # Build a fresh item for every link: re-yielding one mutated
            # instance would make all emitted items alias the same object.
            item = AnjukeUrlsItem()
            item['url'] = link
            yield item

        # extract_first() returns None when there is no next-page href,
        # which avoids the IndexError of extract()[0].
        next_href = response.xpath("//a[@class = 'aNxt']/@href").extract_first()
        if next_href:
            # urljoin() keeps the request valid even for a relative href.
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse)
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class AnjukeUrlsItem(scrapy.Item):
    """Item holding the link of a single rental listing."""

    # Link of the rental listing
    url = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class AnjukeUrlsPipeline(object):
    """Write the ``url`` of every item to a text file, one URL per line."""

    # Raw string so the backslashes of the Windows path are taken literally
    # instead of relying on unrecognised escape sequences such as "\P".
    # Override this attribute to write somewhere else.
    link_path = r'G:\Python\网络爬虫\anjuke\data\link.txt'

    def open_spider(self, spider):
        """Open the link file for writing (created if missing, truncated otherwise)."""
        self.linkFile = open(self.link_path, 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append ``item['url']`` plus a newline to the file and pass the item on.

        :param item: mapping-like object with a ``url`` key.
        :param spider: the spider that produced the item (unused).
        :returns: the same ``item``, unchanged.
        """
        # write(), not writelines(): the argument is one string, not a
        # sequence of lines.
        self.linkFile.write(item['url'] + "\n")
        return item

    def close_spider(self, spider):
        """Close the link file."""
        self.linkFile.close()
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for anjuke_urls project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'anjuke_urls'

SPIDER_MODULES = ['anjuke_urls.spiders']
NEWSPIDER_MODULE = 'anjuke_urls.spiders'


# User-agent header sent with every request (a desktop Chrome string)
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'anjuke_urls.middlewares.AnjukeUrlsSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'anjuke_urls.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'anjuke_urls.pipelines.AnjukeUrlsPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class AnjukeUrlsSpiderMiddleware(object):
    """Pass-through spider middleware that logs when a spider is opened."""

    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect it to the ``spider_opened`` signal."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Called for each response that goes through the spider middleware
        and into the spider. Always returns None here.
        """
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        """Called with the results returned from the spider after it has
        processed the response; re-yields every result unchanged.
        """
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Called when a spider or process_spider_input() method (from other
        spider middleware) raises an exception. Does nothing here.
        """
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        """Called with the start requests of the spider; re-yields every
        request unchanged.
        """
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
4. 爬虫anjuke_zufang代码:
anjuke_zufang.py
# -*- coding: utf-8 -*-
import scrapy

from ..items import AnjukeZufangItem


class AnjukeZufangSpider(scrapy.Spider):
    """Scrape rental details from every listing URL stored in the link file.

    The start URLs are read from ``links_path`` (one URL per line) when the
    spider is instantiated.
    """

    name = 'anjuke_zufang'
    custom_settings = {'DOWNLOAD_DELAY': 3}

    # Raw string so the backslashes of the Windows path are taken literally.
    # Override this attribute to read the links from somewhere else.
    links_path = r'G:\Python\网络爬虫\anjuke\data\link.txt'

    # (item field, XPath of its text node, strip surrounding whitespace?)
    FIELDS = (
        ('roomRent', '//span[@class = "f26"]/text()', False),
        ('rentMode', '//div[@class="pinfo"]/div/div/div[1]/dl[2]/dd/text()', True),
        ('roomLayout', '//div[@class="pinfo"]/div/div/div[1]/dl[3]/dd/text()', True),
        ('roomSize', '//div[@class="pinfo"]/div/div/div[2]/dl[3]/dd/text()', False),
        ('LeaseMode', '//div[@class="pinfo"]/div/div/div[1]/dl[4]/dd/text()', False),
        ('apartmentName', '//div[@class="pinfo"]/div/div/div[1]/dl[5]/dd/a/text()', False),
        ('location1', '//div[@class="pinfo"]/div/div/div[1]/dl[6]/dd/a/text()', False),
        ('location2', '//div[@class="pinfo"]/div/div/div[1]/dl[6]/dd/a[2]/text()', False),
        ('floor', '//div[@class="pinfo"]/div/div/div[2]/dl[5]/dd/text()', False),
        ('orientation', '//div[@class="pinfo"]/div/div/div[2]/dl[4]/dd/text()', True),
        ('decorationSituation', '//div[@class="pinfo"]/div/div/div[2]/dl[2]/dd/text()', False),
        ('intermediaryName', '//h2[@class="f16"]/text()', False),
        ('intermediaryPhone', '//p[@class="broker-mobile"]/text()', False),
        ('intermediaryCompany', '//div[@class="broker-company"]/p[1]/a/text()', False),
        ('intermediaryStore', '//div[@class="broker-company"]/p[2]/a/text()', False),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider and load ``start_urls`` from ``links_path``.

        Accepts and forwards the arguments Scrapy passes to spiders so the
        base-class initialisation still runs.
        """
        super().__init__(*args, **kwargs)
        # A per-instance list (not a shared class-level one), built inside a
        # ``with`` block so the file is always closed. strip() removes the
        # line terminator without eating a real character when the last
        # line has no trailing newline; blank lines are skipped.
        with open(self.links_path, encoding='utf-8') as links:
            self.start_urls = [line.strip() for line in links if line.strip()]

    def parse(self, response):
        """Extract every field listed in ``FIELDS`` and yield one item.

        If any field is missing from the page, a warning is logged and no
        item is produced for that response.

        :param response: the downloaded listing page.
        :returns: a generator yielding at most one ``AnjukeZufangItem``.
        """
        item = AnjukeZufangItem()
        for field, xpath, strip in self.FIELDS:
            text = response.xpath(xpath).extract_first()
            if text is None:
                # Previously extract()[0] raised IndexError here; skip the
                # page explicitly instead of failing with a traceback.
                self.logger.warning(
                    'Skipping %s: field %r not found on the page',
                    response.url, field,
                )
                return
            item[field] = text.strip() if strip else text
        item['link'] = response.url

        yield item
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class AnjukeZufangItem(scrapy.Item):
    """Details scraped from a single rental listing page."""

    # Rent
    roomRent = scrapy.Field()
    # Deposit / payment terms
    rentMode = scrapy.Field()
    # Room layout
    roomLayout = scrapy.Field()
    # Floor area
    roomSize = scrapy.Field()
    # Lease type
    LeaseMode = scrapy.Field()
    # Residential community
    apartmentName = scrapy.Field()
    # Location (two parts)
    location1 = scrapy.Field()
    location2 = scrapy.Field()
    # Floor
    floor = scrapy.Field()
    # Orientation
    orientation = scrapy.Field()
    # Decoration / furnishing
    decorationSituation = scrapy.Field()
    # Agent name
    intermediaryName = scrapy.Field()
    # Agent phone number
    intermediaryPhone = scrapy.Field()
    # Agency company
    intermediaryCompany = scrapy.Field()
    # Agency branch
    intermediaryStore = scrapy.Field()
    # Link of the listing
    link = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import csv
import sqlite3


class AnjukeZufangPipeline(object):
    """Write every scraped listing as one comma-separated line of text."""

    # Raw string so the backslashes of the Windows path are taken literally.
    # Override this attribute to write somewhere else.
    output_path = r'G:\Python\网络爬虫\anjuke\data\租房信息.txt'

    def open_spider(self, spider):
        """Open the output file for writing (created if missing, truncated otherwise)."""
        self.file = open(self.output_path, 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append one row for ``item`` to the output file and pass the item on.

        The two location parts are merged into a single column separated by
        a space; all other fields get a column of their own.

        :param item: mapping-like object providing every listing field.
        :param spider: the spider that produced the item (unused).
        :returns: the same ``item``, unchanged.
        """
        row = [
            item['roomRent'],
            item['rentMode'],
            item['roomLayout'],
            item['roomSize'],
            item['LeaseMode'],
            item['apartmentName'],
            item['location1'] + " " + item['location2'],
            item['floor'],
            item['orientation'],
            item['decorationSituation'],
            item['intermediaryName'],
            item['intermediaryPhone'],
            item['intermediaryCompany'],
            item['intermediaryStore'],
            item['link'],
        ]
        # csv.writer quotes a value only when it contains a comma, quote or
        # newline, so ordinary rows look the same as plain concatenation
        # while such values no longer shift the columns.
        csv.writer(self.file, lineterminator='\n').writerow(row)

        return item

    def close_spider(self, spider):
        """Close the output file.

        This is the hook name Scrapy invokes on item pipelines; the former
        spelling ``spoder_closed`` was never called, leaving the file open.
        """
        self.file.close()

    # Keep the old (misspelled) name available for any existing caller.
    spoder_closed = close_spider
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for anjuke_zufang project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'anjuke_zufang'

SPIDER_MODULES = ['anjuke_zufang.spiders']
NEWSPIDER_MODULE = 'anjuke_zufang.spiders'


# User-agent header sent with every request (a desktop Chrome string)
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'anjuke_zufang.middlewares.AnjukeZufangSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'anjuke_zufang.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'anjuke_zufang.pipelines.AnjukeZufangPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class AnjukeZufangSpiderMiddleware(object):
    """Pass-through spider middleware that logs when a spider is opened."""

    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect it to the ``spider_opened`` signal."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Called for each response that goes through the spider middleware
        and into the spider. Always returns None here.
        """
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        """Called with the results returned from the spider after it has
        processed the response; re-yields every result unchanged.
        """
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Called when a spider or process_spider_input() method (from other
        spider middleware) raises an exception. Does nothing here.
        """
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        """Called with the start requests of the spider; re-yields every
        request unchanged.
        """
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
5. 依次运行爬虫:
进入爬虫anjuke_urls项目,运行scrapy crawl anjuke_getUrls
进入爬虫anjuke_zufang项目,运行scrapy crawl anjuke_zufang
    兴趣是最好的老师,知识改变格局,转载请注明出处!
 
                    
                     
                    
                 
                    
                
 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号