Scrapy Notes (2)
Notes on a Scrapy crawler project for Zhihu.
1. Create the project from the command line
scrapy startproject zhihu
cd zhihu
scrapy genspider zhihu www.zhihu.com   (this only generates a temporary scaffold spider, not the final code)
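For reference, these commands produce Scrapy's standard project layout (zhihu.py is the spider file created by genspider):

zhihu/
    scrapy.cfg
    zhihu/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            zhihu.py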
Launching the crawler directly with scrapy crawl zhihu fails with server errors such as 500 or 501.
Fix: add a User-Agent for the crawler in settings.py:
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
}
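Headers listed in DEFAULT_REQUEST_HEADERS are merged into every request the spider sends, so the API calls made later also carry both headers automatically.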
Trying again then produces a 401 error.
Fix: inspecting the requests the browser makes shows that one more header is needed: 'authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'
With both headers in place, requests to Zhihu succeed.
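A quick way to double-check the two headers outside Scrapy is to hit the profile endpoint directly; a minimal sketch with the requests library (the endpoint and token are the ones used in the spider below, the status check is illustrative):

import requests

headers = {
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
}
# Fetch the seed user's profile; without the authorization header this returns 401.
resp = requests.get('https://www.zhihu.com/api/v4/members/undo-59-76', headers=headers)
print(resp.status_code)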
2. Define the item for the data we need

import scrapy

class UserItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    headline = scrapy.Field()
    url_token = scrapy.Field()

3. middlewares.py does not need any changes for now; it keeps the generated default, for example:
def spider_opened(self, spider):
    spider.logger.info('Spider opened: %s' % spider.name)
4. Write the main code, the zhihu spider

# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy import Spider, Request

from zhihu.items import UserItem


class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['http://www.zhihu.com/']

    # start_user holds the seed account (a well-followed user) that the crawl starts from.
    # start_user = "-LKs-"
    start_user = 'undo-59-76'

    # user_url is the endpoint for a user's profile; user_query holds its include parameters.
    user_url = "https://www.zhihu.com/api/v4/members/{user}?include={include}"
    user_query = "locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,avatar_hue,answer_count,articles_count,pins_count,question_count,columns_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_bind_phone,is_force_renamed,is_bind_sina,is_privacy_protected,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics"

    # follows_url is the endpoint for the followee list; follows_query holds its query parameters.
    # offset and limit control pagination: offset=0, limit=20 is the first page.
    follows_url = "https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}"
    follows_query = "data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics"

    # followers_url is the endpoint for the follower list; followers_query holds its query parameters.
    followers_url = "https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}"
    followers_query = "data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics"

    def start_requests(self):
        '''
        Request the seed user's profile, followee list and follower list.
        '''
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query), callback=self.parse_user)
        yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, offset=0, limit=20), callback=self.parse_follows)
        yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, offset=0, limit=20), callback=self.parse_followers)

    def parse_user(self, response):
        '''
        The response is JSON, so parse it directly with json.loads.
        '''
        result = json.loads(response.text)
        item = UserItem()
        # Copy every field defined on the item that also appears in the response.
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
                print(item[field])
        # Yield the item, then recurse: request this user's followee and follower lists too.
        yield item
        yield Request(self.follows_url.format(user=result.get("url_token"), include=self.follows_query, offset=0, limit=20), callback=self.parse_follows)
        yield Request(self.followers_url.format(user=result.get("url_token"), include=self.followers_query, offset=0, limit=20), callback=self.parse_followers)

    def parse_follows(self, response):
        '''
        Callback for the followee list. The JSON has two fields, data and paging,
        where paging carries the pagination info.
        '''
        results = json.loads(response.text)
        if 'data' in results.keys():
            for result in results.get('data'):
                yield Request(self.user_url.format(user=result.get("url_token"), include=self.user_query), callback=self.parse_user)
        # If paging exists and paging['is_end'] is False, this is not the last page:
        # follow paging['next'] and let this callback handle the next page as well.
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get("next")
            yield Request(next_page, callback=self.parse_follows)

    def parse_followers(self, response):
        '''
        Callback for the follower list; the handling mirrors parse_follows.
        '''
        results = json.loads(response.text)
        if 'data' in results.keys():
            for result in results.get('data'):
                yield Request(self.user_url.format(user=result.get("url_token"), include=self.user_query), callback=self.parse_user)
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get("next")
            yield Request(next_page, callback=self.parse_followers)
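For reference, the followee/follower responses these callbacks parse have roughly the following shape (the field names are taken from the code above; the values are placeholders, and real responses contain more fields):

results = {
    "data": [
        {"url_token": "...", "name": "...", "headline": "..."},
    ],
    "paging": {
        "is_end": False,
        "next": "https://www.zhihu.com/api/v4/members/<user>/followees?offset=20&limit=20",
    },
}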
5. Store the scraped data
Process the scraped items in pipelines.py:

import json

import pymysql
from scrapy.exceptions import DropItem


# Pipeline that writes items to a JSON lines file.
class JsonFilePipeline(object):

    def __init__(self):
        self.file = open('items.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        print(line)
        self.file.write(line)
        return item


# Pipeline that drops duplicate items.
class DuplicatesPipeline(object):

    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item['url_token'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['url_token'])
            return item


# Pipeline that writes items into a MySQL database.
class MysqlPipeline(object):

    def open_spider(self, spider):
        # Runs when the spider starts: open the connection and a cursor.
        print('spider opened, connecting to MySQL')
        self.MysqlHandle = self.dbHandle()
        self.Cursor = self.MysqlHandle.cursor()

    def close_spider(self, spider):
        # Runs when the spider closes: release the cursor, then the connection.
        print('spider closed, closing MySQL connection')
        self.Cursor.close()
        self.MysqlHandle.close()

    def dbHandle(self):
        conn = pymysql.connect(
            host='localhost',
            user='root',
            passwd='passwd',
            db="databases",
            charset='utf8',
            use_unicode=False
        )
        return conn

    def process_item(self, item, spider):
        sql = 'insert ignore into zhihu(url_token,name,headline) values (%s,%s,%s)'
        try:
            self.Cursor.execute(sql, (item['url_token'], item['name'], item['headline']))
            self.MysqlHandle.commit()
        except Exception as e:
            print(e)
            self.MysqlHandle.rollback()
        return item
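Note that INSERT IGNORE only skips duplicate rows when the table has a primary or unique key, so the zhihu table needs one on url_token. A minimal sketch of a matching table definition (the column types and lengths are assumptions, not from the original project), created via pymysql:

import pymysql

ddl = """
CREATE TABLE IF NOT EXISTS zhihu (
    url_token VARCHAR(64) NOT NULL,
    name      VARCHAR(128),
    headline  VARCHAR(512),
    PRIMARY KEY (url_token)
) DEFAULT CHARSET=utf8
"""

# Connection parameters mirror those used in MysqlPipeline above.
conn = pymysql.connect(host='localhost', user='root', passwd='passwd', db='databases', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute(ddl)
    conn.commit()
finally:
    conn.close()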
The corresponding configuration in settings.py:
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # save items to a file
    # 'zhihu.pipelines.JsonFilePipeline': 300,
    # save items to the MySQL database
    'zhihu.pipelines.MysqlPipeline': 800,
    # drop duplicates
    'zhihu.pipelines.DuplicatesPipeline': 300,
}
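Pipelines run in ascending order of these numbers, so DuplicatesPipeline (300) filters out duplicate users before MysqlPipeline (800) writes to the database.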
6. Options that can be set in settings.py (to be expanded)
# logging settings
LOG_ENABLED = False
#LOG_LEVEL = 'ERROR'
#LOG_FILE = 'log.txt'
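A few other settings commonly tuned for this kind of crawl (the values below are suggestions, not part of the original notes):

ROBOTSTXT_OBEY = False      # the new-project default is True, which can block the API requests
DOWNLOAD_DELAY = 0.5        # throttle requests to reduce the chance of being banned
CONCURRENT_REQUESTS = 16    # the default; lower it if Zhihu starts rejecting requests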
7. Incremental crawling in Scrapy (pausing and resuming)
To enable persistence for a spider, run it with:
scrapy crawl somespider -s JOBDIR=crawls/somespider-1
You can then stop the spider safely at any time (press Ctrl-C or send a signal) and resume it later with the same command:
scrapy crawl somespider -s JOBDIR=crawls/somespider-1
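Persistence depends on a graceful shutdown: press Ctrl-C once and let Scrapy finish its in-flight requests; pressing it a second time forces an unclean stop. Also note that a JOBDIR must not be shared between different spiders, or between separate jobs of the same spider.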