5. Scrapy Framework (8): Scrapy Spiders in Practice
- Mini-program community spider: saves data to a JSON file; uses CrawlSpider.
- Douban simulated-login spider: sends a POST request to simulate logging in.
- Image download spider.
- Autohome BMW 5 Series spider.
- BOSS Zhipin spider: BOSS Zhipin has aggressive anti-crawling; visiting several job-listing pages from the same IP gets that IP banned, so proxy IPs are used instead (a minimal proxy-middleware sketch follows this list).
- Jianshu full-site spider: saves data to a MySQL database and integrates selenium + chromedriver into Scrapy.
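For the BOSS Zhipin case, a minimal sketch of what a proxy-switching downloader middleware could look like. The class name, module path, and proxy addresses below are placeholders, not part of the original notes; the only Scrapy-specific mechanism used is setting request.meta['proxy'], which the built-in HttpProxyMiddleware honors.
import random


class RandomProxyMiddleware(object):
    # placeholder proxy pool; a real project would rotate purchased or fetched proxy IPs
    PROXIES = [
        'http://113.124.87.1:9999',
        'http://175.42.123.2:9999',
    ]

    def process_request(self, request, spider):
        # attach a random proxy to every outgoing request
        request.meta['proxy'] = random.choice(self.PROXIES)


# enable it in settings.py (hypothetical project path):
# DOWNLOADER_MIDDLEWARES = {
#     'bosszhipin.middlewares.RandomProxyMiddleware': 543,
# }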
1. Page Parsing
- Create the project in a terminal:
scrapy startproject jianshu_spider
cd jianshu_spider
scrapy genspider -t crawl js "jianshu.com"
- Open the project in PyCharm and edit settings.py: set ROBOTSTXT_OBEY to False and enable the default request headers:
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
- Create a start.py file to launch the project:
from scrapy import cmdline

cmdline.execute("scrapy crawl js".split())
- Analyze the pages, decide which fields to scrape, and create an ArticleItem class in items.py:
import scrapy


class ArticleItem(scrapy.Item):
    title = scrapy.Field()        # title
    content = scrapy.Field()      # body content
    article_id = scrapy.Field()   # article id
    origin_url = scrapy.Field()   # original URL
    author = scrapy.Field()       # author
    avatar = scrapy.Field()       # author avatar
    pub_time = scrapy.Field()     # publish time
- Write the spider, js.py:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu_spider.items import ArticleItem


class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    rules = (
        # article detail pages look like /p/<12-character id>
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        title = response.xpath("//h1[@class='title']/text()").get()
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//span[@class='name']/a//text()").get()
        pub_time = response.xpath("//span[@class='publish-time']//text()").get()
        # e.g. https://www.jianshu.com/p/66aeb8473df1?/u/052e3bd4d2bc?utm_campaign=maleskine&utm_content=user&utm_medium=seo_notes&utm_source=recommendation
        url = response.url
        url1 = url.split("?")[0]
        article_id = url1.split('/')[-1]
        content = response.xpath("//div[@class='show-content']").get()
        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            pub_time=pub_time,
            article_id=article_id,
            origin_url=response.url,
            content=content
        )
        yield item
2. Saving Data to a MySQL Database
- Enable ITEM_PIPELINES in settings.py:
ITEM_PIPELINES = {
    'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
}
# download delay of 3 seconds
DOWNLOAD_DELAY = 3
- Create a jianshu database in MySQL and a new article table (a hedged DDL sketch follows).
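The original notes do not show the table definition, so the column types below are assumptions chosen to match the fields of ArticleItem; this sketch creates the database and table through pymysql, but the same DDL can be run in any MySQL client.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='root', charset='utf8')
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS jianshu DEFAULT CHARACTER SET utf8")
cursor.execute("USE jianshu")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS article (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        content LONGTEXT,
        author VARCHAR(255),
        avatar VARCHAR(255),
        pub_time VARCHAR(50),
        origin_url VARCHAR(255),
        article_id VARCHAR(50)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()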
- Write pipelines.py and connect to the database:
import pymysql
from pymysql import cursors


class JianshuSpiderPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'database': 'jianshu',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()
        self._sql = None

    def process_item(self, item, spider):
        # synchronous insert: blocks until MySQL commits the row
        self.cursor.execute(self.sql, (
            item['title'], item['content'], item['author'], item['avatar'],
            item['pub_time'], item['origin_url'], item['article_id']))
        self.conn.commit()
        return item

    @property
    def sql(self):
        # build the INSERT statement once and cache it
        if not self._sql:
            self._sql = """
                insert into article(title,content,author,avatar,pub_time,origin_url,article_id)
                values(%s,%s,%s,%s,%s,%s,%s)
            """
        return self._sql
- The pipeline above blocks on every insert. Rewrite pipelines.py to store items asynchronously with Twisted's adbapi, handling insert errors in an errback:
import pymysql
from twisted.enterprise import adbapi
from pymysql import cursors


class JianshuTwistedPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        # adbapi runs the blocking pymysql calls in a thread pool
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        self._sql = None

    @property
    def sql(self):
        # build the INSERT statement once and cache it
        if not self._sql:
            self._sql = """
                insert into article(title,content,author,avatar,pub_time,origin_url,article_id)
                values(%s,%s,%s,%s,%s,%s,%s)
            """
        return self._sql

    def process_item(self, item, spider):
        # schedule the insert without blocking the crawl
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        cursor.execute(self.sql, (
            item['title'], item['content'], item['author'], item['avatar'],
            item['pub_time'], item['origin_url'], item['article_id']))

    def handle_error(self, error, item, spider):
        print('=' * 10 + "error" + '=' * 10)
        print(error)
        print('=' * 10 + "error" + '=' * 10)
Update ITEM_PIPELINES in settings.py accordingly:
ITEM_PIPELINES = {
# 'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
'jianshu_spider.pipelines.JianshuTwistedPipeline': 300,
}
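A quick way to sanity-check that items are actually landing in MySQL is to query the table directly once the crawl has been running for a while. This is a throwaway sketch, assuming the same connection parameters as the pipeline above.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='root', database='jianshu', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("SELECT COUNT(*), MAX(pub_time) FROM article")
    print(cursor.fetchone())  # (row count, latest publish time)
conn.close()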
3. Crawling Ajax-Loaded Data
Integrating selenium + chromedriver into Scrapy
- Add read count, like count, word count, subjects, and comment count columns to the article table.
- Add the corresponding fields in items.py:
import scrapy


class ArticleItem(scrapy.Item):
    title = scrapy.Field()          # title
    content = scrapy.Field()        # body content
    article_id = scrapy.Field()     # article id
    origin_url = scrapy.Field()     # original URL
    author = scrapy.Field()         # author
    avatar = scrapy.Field()         # author avatar
    pub_time = scrapy.Field()       # publish time
    read_count = scrapy.Field()     # read count
    like_count = scrapy.Field()     # like count
    word_count = scrapy.Field()     # word count
    comment_count = scrapy.Field()  # comment count
    subjects = scrapy.Field()       # subjects/collections
- Write middlewares.py; it renders each request in Chrome so the Ajax-loaded content is present before Scrapy parses the page:
from scrapy.http import HtmlResponse
from selenium import webdriver
import time


class SeleniumDownloadMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Chrome(executable_path="/Users/ren/Applications/chromedriver")

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(1)
        try:
            # keep clicking the "show more" button until it disappears,
            # so every Ajax-loaded subject/collection is rendered
            while True:
                show_more = self.driver.find_elements_by_class_name('show-more')
                if not show_more:
                    break
                show_more[0].click()
                time.sleep(0.3)
        except:
            pass
        source = self.driver.page_source
        # hand the rendered page back to Scrapy instead of downloading it again
        response = HtmlResponse(url=self.driver.current_url, body=source,
                                request=request, encoding='utf-8')
        return response
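The middleware above never quits the browser, so chromedriver keeps running after the crawl ends. One way to close it, not shown in the original notes, is a sketch that hooks Scrapy's spider_closed signal via from_crawler; process_request stays exactly as in the version above.
from scrapy import signals
from selenium import webdriver


class SeleniumDownloadMiddleware(object):
    def __init__(self):
        # same driver set-up as above
        self.driver = webdriver.Chrome(executable_path="/Users/ren/Applications/chromedriver")

    @classmethod
    def from_crawler(cls, crawler):
        # let Scrapy build the middleware and register the close hook
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        # quit chromedriver so the browser does not linger after the crawl
        self.driver.quit()

    # process_request is unchanged from the version above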
- Update js.py to extract the new fields:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu_spider.items import ArticleItem


class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        title = response.xpath("//h1[@class='title']/text()").get()
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//span[@class='name']/a//text()").get()
        pub_time = response.xpath("//span[@class='publish-time']//text()").get()
        # e.g. https://www.jianshu.com/p/66aeb8473df1?/u/052e3bd4d2bc?utm_campaign=maleskine&utm_content=user&utm_medium=seo_notes&utm_source=recommendation
        url = response.url
        url1 = url.split("?")[0]
        article_id = url1.split('/')[-1]
        content = response.xpath("//div[@class='show-content']").get()
        word_count = response.xpath("//span[@class='wordage']/text()").get()
        comment_count = response.xpath("//span[@class='comments-count']/text()").get()
        read_count = response.xpath("//span[@class='views-count']/text()").get()
        like_count = response.xpath("//span[@class='likes-count']/text()").get()
        subjects = ",".join(response.xpath("//div[@class='include-collection']/a/div/text()").getall())
        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            pub_time=pub_time,
            article_id=article_id,
            origin_url=response.url,
            content=content,
            word_count=word_count,
            read_count=read_count,
            like_count=like_count,
            comment_count=comment_count,
            subjects=subjects
        )
        yield item
- Register the downloader middleware in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'jianshu_spider.middlewares.SeleniumDownloadMiddleware': 543,
}