目标
爬取当当网中python图书中的 图书标题、图片、作者、出版时间、价格、评论条数
链接地址 http://search.dangdang.com/?key=python&act=input&page_index=1
目的:练习管道的使用
"""
遇到的问题主要是在通过管道下载不了图片,原因是获取的图片链接没有http: 导致无法获取,需要拼接字符串
"""
有兴趣的话可以看官方文档:https://docs.scrapy.org/en/latest/
items.py
import scrapy
# 图书标题、图片、作者、出版时间、价格、评论条数
class DangdangItem(scrapy.Item):
    """One book scraped from dangdang.com's Python search results.

    Fields: book title, cover image, author, publish date, price,
    comment count, plus the list form of the image URL consumed by
    the images pipeline.
    """
    title = scrapy.Field()    # book title (from the <a> title attribute)
    pic = scrapy.Field()      # cover image URL — protocol-relative, no "http:" prefix
    author = scrapy.Field()   # first listed author
    publish = scrapy.Field()  # publication date text
    price = scrapy.Field()    # price text
    comment = scrapy.Field()  # comment-count text (from the review link)
    image_urls = scrapy.Field()  # [pic] — input field for the images pipeline
dang.py(爬虫类)
import scrapy
from dangdang.items import DangdangItem
class DangSpider(scrapy.Spider):
    """Crawl dangdang.com's Python book search results page by page.

    Yields one DangdangItem per book, then a Request for the next
    results page until there is no "next" link.
    """
    name = 'dang'
    allowed_domains = ['dangdang.com']
    start_urls = ['http://search.dangdang.com/?key=python&act=input&page_index=1', ]

    def parse(self, response):
        # Each book is an <li> under the results list <ul id="component_59">.
        for li in response.css('ul#component_59 li'):
            item = DangdangItem()
            item['title'] = li.css('a::attr(title)').get()
            # Lazy-loaded covers carry the real URL in data-original; when that
            # attribute is absent (presumably non-lazy entries such as the first
            # result — confirm against the live page), fall back to src so pic
            # is not None.
            item['pic'] = (li.css('a.pic img::attr(data-original)').get()
                           or li.css('a.pic img::attr(src)').get())
            # Several <span>s share the author <p>; XPath indices are 1-based.
            item['author'] = li.xpath('./p[@class="search_book_author"]/span[1]/a[1]/text()').get()
            item['publish'] = li.xpath('./p[@class="search_book_author"]/span[2]/text()').get()
            item['price'] = li.css('p.price span::text').get()
            item['comment'] = li.css('p.search_star_line a::text').get()
            # Guard: never hand [None] to the images pipeline.
            item['image_urls'] = [item['pic']] if item['pic'] else []
            yield item

        # Follow the "next page" link, resolved to an absolute URL.
        next_url = response.css('div.paging li.next a::attr(href)').get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse)
"""
用xpath匹配数据时,一个标签内出现多个span标签,而获取的标签是其中一个,可以使用span[索引]取值
注意:索引是从一开始的
"""
pipelines.py
class DangdangPipeline:
    """Default no-op pipeline: passes every item through unchanged."""
    def process_item(self, item, spider):
        return item
import pymysql
# 连接mysql数据库,存入数据(还可以用mongodb,形式上都差不多)
class MysqlPipeline:
    """Persist each scraped book into the MySQL table `dang`.

    Connection parameters come from the project settings via from_crawler;
    the connection is opened once per crawl and closed when the spider stops.
    (A MongoDB pipeline would follow the same shape.)
    """

    def __init__(self, host, user, password, database, port):
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        # Alternate constructor: read the MYSQL_* options from settings.py.
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASS'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        # Connect once per crawl, not once per item.
        self.db = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            charset='utf8',
            port=self.port,
        )
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        # Release the cursor before closing the connection.
        self.cursor.close()
        self.db.close()

    def process_item(self, item, spider):
        # Parameterized query: the original interpolated values with "%" into
        # the SQL string, which breaks on quotes in titles and is open to SQL
        # injection. Let the driver escape the values instead.
        sql = ("insert into dang(title,author,pic,publish,comment,price) "
               "values(%s,%s,%s,%s,%s,%s)")
        self.cursor.execute(sql, (
            item['title'], item['author'], item['pic'],
            item['publish'], item['comment'], item['price'],
        ))
        self.db.commit()
        return item
"""
scrapy框架自带管道下载图片
出现的问题:获取的图片链接不带http: 需要手动拼接 解决方案如下
"""
# import scrapy
# from scrapy.pipelines.images import ImagesPipeline
#
# class PhotoPipeline(ImagesPipeline):
#
# def get_media_requests(self, item, info):
#
# try:
# for url in item['image_urls']:
# url = 'http:' + url
# yield scrapy.Request(url)
# except Exception as e:
# print('错误信息======================',e)
# 自定义图片储存
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class MyImagesPipeline(ImagesPipeline):
    """Download each book cover and record its saved path back onto the item."""

    def get_media_requests(self, item, info):
        # Scraped cover links are protocol-relative ("//img...") — prepend the
        # scheme only when it is actually missing so absolute URLs survive.
        # The original wrapped this in a broad try/except that printed and
        # swallowed every error; skip bad entries explicitly instead.
        for url in item.get('image_urls') or []:
            if not url:
                continue
            if url.startswith('//'):
                url = 'http:' + url
            yield scrapy.Request(url)

    def file_path(self, request, response=None, info=None, *, item=None):
        # Name the file after the last URL path segment; without this method
        # Scrapy generates a hash-based filename. The keyword-only `item`
        # parameter keeps compatibility with Scrapy >= 2.4, which passes it.
        return request.url.split("/")[-1]

    def item_completed(self, results, item, info):
        # `results` is a list of (success, info-dict) pairs, one per request.
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['pic'] = image_paths[0]
        return item
settings.py
# Pipeline order: pass-through cleanup first, then image download, then the
# MySQL insert (the images pipeline rewrites item['pic'] before it is stored).
ITEM_PIPELINES = {
    'dangdang.pipelines.DangdangPipeline': 300,
    'dangdang.pipelines.MyImagesPipeline': 301,  # custom image download
    # 'dangdang.pipelines.PhotoPipeline': 100,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,  # stock Scrapy image pipeline
    'dangdang.pipelines.MysqlPipeline': 302,
}

# Directory where downloaded cover images are written.
IMAGES_STORE = "图片路径"

# MySQL connection options consumed by MysqlPipeline.from_crawler.
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'dangdang'
MYSQL_USER = 'root'
MYSQL_PASS = 'shuai'
MYSQL_PORT = 3306
"""
图片访问不了,可能是防盗链的问题,需要在该文件(settings.py)中配置请求头和 Referer
"""