管道学习

目标

爬取当当网中python图书中的 图书标题、图片、作者、出版时间、价格、评论条数
链接地址 http://search.dangdang.com/?key=python&act=input&page_index=1

目的:练习管道的使用
"""
遇到的问题主要是在通过管道下载不了图片,原因是获取的图片链接没有http: 导致无法获取,需要拼接字符串
"""
有兴趣的话可以看官方文档:https://docs.scrapy.org/en/latest/

items.py

import scrapy

# 图书标题、图片、作者、出版时间、价格、评论条数
class DangdangItem(scrapy.Item):
    """Container for one book scraped from the dangdang search results."""
    title = scrapy.Field()        # book title (from the <a title=...> attribute)
    pic = scrapy.Field()          # cover image URL; MyImagesPipeline later overwrites it with the local path
    author = scrapy.Field()       # first listed author
    publish = scrapy.Field()      # publish date text
    price = scrapy.Field()        # displayed price text
    comment = scrapy.Field()      # comment-count text
    image_urls = scrapy.Field()   # list of image URLs consumed by the images pipeline

dang.py(爬虫类)

import scrapy
from dangdang.items import DangdangItem


class DangSpider(scrapy.Spider):
    """Crawl dangdang.com python-book search results.

    For each book, extracts title, cover image, author, publish date,
    price and comment count, then follows the "next page" link.
    """
    name = 'dang'
    allowed_domains = ['dangdang.com']
    start_urls = ['http://search.dangdang.com/?key=python&act=input&page_index=1',]

    def parse(self, response):
        # Each <li> under ul#component_59 is one book entry.
        lis = response.selector.css('ul#component_59 li')

        for li in lis:
            item = DangdangItem()
            item['title'] = li.css('a::attr(title)').get()
            # Lazy-loaded covers keep the real URL in data-original.
            item['pic'] = li.css('a.pic img::attr(data-original)').get()
            item['author'] = li.xpath('./p[@class="search_book_author"]/span[1]/a[1]/text()').get()
            item['publish'] = li.xpath('./p[@class="search_book_author"]/span[2]/text()').get()
            item['price'] = li.css('p.price span::text').get()
            item['comment'] = li.css('p.search_star_line a::text').get()
            # Guard against a missing cover so the images pipeline never
            # receives [None].
            item['image_urls'] = [item['pic']] if item['pic'] else []
            yield item

        # BUG FIX: the pagination code was indented inside the item loop,
        # yielding the same next-page request once per book.  Hoisted out
        # so it runs exactly once per page.
        next_url = response.selector.css('div.paging li.next a::attr(href)').get()
        if next_url:  # only follow when a next page exists
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse)
              
"""
用xpath匹配数据时,一个标签内出现多个span标签,而获取的标签是其中一个,可以使用span[索引]取值
注意:索引是从一开始的
"""

pipelines.py

class DangdangPipeline:
    """Default pass-through pipeline: forwards every item untouched."""

    def process_item(self, item, spider):
        # Nothing to do here; the real work happens in the image and
        # MySQL pipelines registered after this one.
        return item

import pymysql

# 连接mysql数据库,存入数据(还可以用mongodb,形式上都差不多)
class MysqlPipeline:
    """Persist scraped book items into a MySQL table via pymysql.

    Connection parameters (MYSQL_HOST / MYSQL_DATABASE / MYSQL_USER /
    MYSQL_PASS / MYSQL_PORT) are read from the crawler settings by
    ``from_crawler``.
    """

    def __init__(self, host, user, password, database, port):
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        # Alternate constructor used by Scrapy: pull DB config from settings.py.
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASS'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        # One connection per spider run; released again in close_spider().
        self.db = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            charset='utf8',
            port=self.port,
        )
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        # FIX: close the cursor as well as the connection.
        self.cursor.close()
        self.db.close()

    def process_item(self, item, spider):
        # SECURITY FIX: pass values as driver-side parameters instead of
        # interpolating them with Python "%" formatting, which was open to
        # SQL injection and broke on values containing quotes.
        sql = (
            "insert into dang(title,author,pic,publish,comment,price) "
            "values(%s,%s,%s,%s,%s,%s)"
        )
        self.cursor.execute(
            sql,
            (item['title'], item['author'], item['pic'],
             item['publish'], item['comment'], item['price']),
        )
        self.db.commit()
        return item


"""
scrapy框架自带管道下载图片
出现的问题:获取的图片链接不带http: 需要手动拼接 解决方案如下
"""
# import scrapy
# from scrapy.pipelines.images import ImagesPipeline
#
# class PhotoPipeline(ImagesPipeline):
#
#     def get_media_requests(self, item, info):
#
#         try:
#             for url in item['image_urls']:
#                 url = 'http:' + url
#                 yield scrapy.Request(url)
#         except Exception as e:
#             print('错误信息======================',e)


# 自定义图片储存
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

class MyImagesPipeline(ImagesPipeline):
    """Image-download pipeline that fixes scheme-less dangdang image URLs.

    The site serves cover links as protocol-relative ("//img.../x.jpg"),
    which scrapy.Request cannot fetch, so the scheme is prepended here.
    """

    def get_media_requests(self, item, info):
        # Queue one download request per image URL found on the item.
        for url in item.get('image_urls', []):
            if not url:
                continue  # skip missing covers instead of crashing
            # BUG FIX: only prepend the scheme when it is actually absent;
            # the old blanket "'http:' + url" corrupted absolute URLs.  The
            # previous broad try/except also silently swallowed every error.
            if url.startswith('//'):
                url = 'http:' + url
            yield scrapy.Request(url)

    def file_path(self, request, response=None, info=None):
        # Save each image under its original basename; without this method
        # Scrapy generates an opaque hash-based file name instead.
        return request.url.split('/')[-1]

    def item_completed(self, results, item, info):
        # results is a list of (success, info_dict) pairs, one per request.
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        # Replace the remote URL with the local storage path.
        item['pic'] = image_paths[0]
        return item

settings.py

# Pipeline registry — lower number runs earlier.
ITEM_PIPELINES = {
    'dangdang.pipelines.DangdangPipeline': 300,
    'dangdang.pipelines.MyImagesPipeline': 301,   # custom image download
    'dangdang.pipelines.MysqlPipeline': 302,      # MySQL persistence
    # Alternatives kept for reference:
    # 'dangdang.pipelines.PhotoPipeline': 100,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,  # stock Scrapy image pipeline
}

# Directory where downloaded images are stored.
IMAGES_STORE = "图片路径"

# MySQL connection settings consumed by MysqlPipeline.from_crawler().
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'dangdang'
MYSQL_USER = 'root'
MYSQL_PASS = 'shuai'
MYSQL_PORT = 3306

"""
图片访问不了,可能是防盗链的问题,需要在该文件夹下配置请求头和referer
"""
posted @ 2022-02-22 23:53  程序员少帅  阅读(38)  评论(0)    收藏  举报