第四次作业

作业①

爬取当当网图书数据

要求：熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法；Scrapy+Xpath+MySQL数据库存储技术路线爬取当当网站图书数据

候选网站：http://www.dangdang.com/

关键词：学生自由选择

输出信息：MYSQL的输出信息如下

代码:
spider.py

import scrapy
from dangdang.items import DangdangItem

class SpiderSpider(scrapy.Spider):
    """Crawl Dangdang search results and yield one ``DangdangItem`` per book.

    The crawl starts from the search URL in ``start_urls`` and follows the
    ``page_index`` query parameter until the page count shown by the pager
    is reached.
    """

    name = 'spider'

    start_urls = ['http://search.dangdang.com/?key=python%B1%E0%B3%CC&act=input']
    id = 0  # running sequence number, incremented for every book on every page

    def parse(self, response):
        """Yield the books listed in ``response``, then the next-page request.

        A field whose node is missing from the page (publication date and
        detail text are sometimes absent) is stored as ``None``.
        """
        author_info = "./p[@class='search_book_author']"
        # Each <li> under the result list describes one book.
        for li in response.xpath('//*[@id="component_59"]/li'):
            self.id += 1
            item = DangdangItem()
            item['Id'] = self.id
            item['Title'] = li.xpath("./a[1]/@title").extract_first()
            item['Author'] = li.xpath(author_info + "/span[1]/a[1]/@title").extract_first()
            item['Publisher'] = li.xpath(author_info + "/span[3]/a/@title").extract_first()
            item['Date'] = li.xpath(author_info + "/span[2]/text()").extract_first()
            item['Price'] = li.xpath("./p[@class='price']/span[1]/text()").extract_first()
            item['Detail'] = li.xpath("./p[@class='detail']/text()").extract_first()
            yield item

        # Pagination is a property of the page, not of a book, so it is
        # evaluated once per response instead of once per <li>.
        cur_page = response.xpath('//*[@id="t__cp"]/@value').extract_first()
        all_page = response.xpath(
            '//*[@id="12810"]/div[5]/div[2]/div/ul/li[last()-2]/a/text()').extract_first()
        try:
            cur_page, all_page = int(cur_page), int(all_page)
        except (TypeError, ValueError):
            # Pager missing or not numeric: treat this page as the last one
            # instead of crashing on int(None).
            return
        if cur_page < all_page:
            new_url = self.start_urls[0] + "&page_index=" + str(cur_page + 1)
            yield scrapy.Request(url=new_url, callback=self.parse)

items.py

import scrapy

class DangdangItem(scrapy.Item):
    """Container for one book scraped from a Dangdang search result."""

    Id = scrapy.Field()         # sequence number assigned by the spider
    Title = scrapy.Field()
    Author = scrapy.Field()
    Publisher = scrapy.Field()
    Date = scrapy.Field()       # may be None: some books list no date
    Price = scrapy.Field()
    Detail = scrapy.Field()     # may be None: some books have no detail text

pipelines.py

import pymysql

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class mysqlPipeline:
    """Item pipeline that inserts every scraped book into the ``dangdang`` table.

    One connection and one cursor are kept for the lifetime of the spider.
    """

    conn = None
    cursor = None

    def open_spider(self, spider):
        """Connect to the local MySQL server when the spider starts."""
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='root', db='spider',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` as one row and return it for later pipelines.

        A failed insert is printed and rolled back; the item is still
        returned so that one bad row does not stop the crawl.
        """
        # Reuse a single cursor; creating one per item left every cursor
        # except the last one unclosed.
        if self.cursor is None:
            self.cursor = self.conn.cursor()

        try:
            # Values are passed separately so the driver escapes them.
            self.cursor.execute('insert into dangdang values(%s,%s,%s,%s,%s,%s,%s)',
                                (item["Id"], item["Title"], item['Author'], item['Publisher'], item['Date'],
                                 item['Price'], item['Detail']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()

        return item

    def close_spider(self, spider):
        """Release the cursor and the connection, if they were ever opened."""
        # Guard against None so a crawl that produced no items (or never
        # connected) does not raise here and leak the connection.
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.conn is not None:
            self.conn.close()
            self.conn = None

settings.py

# Project identity and where Scrapy looks for spiders.
BOT_NAME = "dangdang"
NEWSPIDER_MODULE = "dangdang.spiders"
SPIDER_MODULES = ["dangdang.spiders"]

# Only log messages of level ERROR and above.
LOG_LEVEL = "ERROR"

# Do not obey robots.txt.
ROBOTSTXT_OBEY = False

# Browser-like User-Agent sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/86.0.4240.111 Safari/537.36"
)

# Enable the MySQL pipeline.
ITEM_PIPELINES = {"dangdang.pipelines.mysqlPipeline": 300}

运行结果部分展示:

心得体会

加深scrapy的了解以及MySQL数据库的操作

问题与思考

关于xpath中的tbody，在用xpath解析网页的时候，会遇到tbody标签。tbody标签有的时候可以解析，有的时候不可以解析，遇到tbody标签时要看网页源代码，如果源代码有tbody标签，就要加上tbody标签才能解析。如果源代码没有tbody标签，那么tbody标签是浏览器对html文本进行一定的规范化而强行加上去的，这时如果xpath中有tbody则无法解析出来，此时去掉xpath中的tbody即可。

作业②

爬取股票信息

要求：熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法；Scrapy+Xpath+MySQL数据库存储技术路线爬取股票相关信息

候选网站：东方财富网：https://www.eastmoney.com/

新浪股票：http://finance.sina.com.cn/stock/

输出信息：MYSQL数据库存储和输出格式如下，表头应是英文命名例如：序号id，股票代码：bStockNo……，由同学们自行定义设计表头：

代码：
stock.py

import scrapy
from selenium import webdriver
from stocks.items import StocksItem


class StockSpider(scrapy.Spider):
    """Scrape the stock table of an Eastmoney board page.

    The spider owns a Selenium Chrome instance (``self.bro``) that the
    project's downloader middleware reads via ``spider.bro``.
    """

    name = 'stock'
    # Adding other boards scrambles the row order, so only one board is crawled.
    start_urls = ['http://quote.eastmoney.com/center/gridlist.html#hs_a_board']

    # Item field -> XPath of its text, relative to one table row (<tr>).
    field_xpaths = {
        'id': "./td[1]/text()",
        'no': "./td[2]/a/text()",
        'name': "./td[3]/a/text()",
        'latest_price': "./td[5]/span/text()",
        'range': "./td[6]/span/text()",
        'amount': "./td[7]/span/text()",
        'trading': "./td[8]/text()",
        'transaction': "./td[9]/text()",
    }

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start the shared Chrome browser.

        Arguments are forwarded to ``scrapy.Spider`` so that spider
        arguments (e.g. ``-a key=value``) keep working.
        """
        super().__init__(*args, **kwargs)
        options = webdriver.ChromeOptions()
        # Experimental option: exclude the 'enable-logging' switch
        # (presumably to quiet ChromeDriver's console output).
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.bro = webdriver.Chrome(options=options)

    def parse(self, response):
        """Yield one ``StocksItem`` per ``<tr>`` of the stock table.

        A cell that is missing from a row is stored as ``None``.
        """
        for tr in response.xpath('//*[@id="table_wrapper-table"]/tbody/tr'):
            item = StocksItem()
            for field, path in self.field_xpaths.items():
                item[field] = tr.xpath(path).extract_first()
            yield item

    def closed(self, spider):
        """Quit the browser when the spider is closed.

        NOTE(review): Scrapy documents this hook as ``closed(reason)``; the
        argument is kept under its original name and is unused.
        """
        self.bro.quit()

items.py

import scrapy


class StocksItem(scrapy.Item):
    """Container for one row of the scraped stock table."""

    id = scrapy.Field()
    no = scrapy.Field()
    name = scrapy.Field()
    latest_price = scrapy.Field()
    range = scrapy.Field()
    amount = scrapy.Field()
    trading = scrapy.Field()
    transaction = scrapy.Field()

pipelines.py

import pymysql


class StocksPipeline:
    """Item pipeline that inserts every scraped stock row into the ``stocks`` table.

    One connection and one cursor are kept for the lifetime of the spider.
    """

    conn = None
    cursor = None

    def open_spider(self, spider):
        """Connect to the local MySQL server when the spider starts."""
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='root', db='spider',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` as one row and return it for later pipelines.

        A failed insert is printed and rolled back; the item is still
        returned so that one bad row does not stop the crawl.
        """
        # Reuse a single cursor instead of leaking a new one per item.
        if self.cursor is None:
            self.cursor = self.conn.cursor()

        try:
            # Scraped text is untrusted: pass the values as parameters so
            # the driver escapes them, rather than formatting them into the
            # SQL string. A missing value is therefore stored as NULL, not
            # as the literal text "None".
            self.cursor.execute('insert into stocks values(%s,%s,%s,%s,%s,%s,%s,%s)',
                                (item["id"], item["no"], item['name'], item['latest_price'], item['range'],
                                 item['amount'], item['trading'], item['transaction']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()

        return item

    def close_spider(self, spider):
        """Release the cursor and the connection, if they were ever opened."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.conn is not None:
            self.conn.close()
            self.conn = None

middlewares.py

from scrapy.http import HtmlResponse
from time import sleep
from scrapy.selector import Selector

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class StocksDownloaderMiddleware:
    """Downloader middleware that replaces responses with Selenium-rendered HTML.

    The table on the target page is loaded dynamically, so the downloaded
    response is discarded and rebuilt from the page source of the browser
    owned by the spider (``spider.bro``).
    """

    # Number of result pages to render and stitch into one response.
    max_pages = 3

    def process_request(self, request, spider):
        """Let every request continue through the normal download path."""
        return None

    def process_response(self, request, response, spider):
        """Return a new ``HtmlResponse`` holding the first ``max_pages`` pages.

        The page is opened in ``spider.bro``, scrolled to the bottom, and its
        source captured; the pager's "next" link is then clicked until
        ``max_pages`` pages were captured or the pager can no longer be read.
        """
        bro = spider.bro  # browser instance created by the spider
        bro.get(request.url)
        sleep(2)

        pages = []
        # Bounded loop: at most max_pages captures even if a click fails to
        # advance the pager.
        for _ in range(self.max_pages):
            bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            sleep(2)  # give the dynamic table time to render
            page_text = bro.page_source
            pages.append(page_text)

            current = Selector(text=page_text).xpath(
                '//*[@id="main-table_paginate"]/span[1]/a[@class="paginate_button current"]/text()').extract_first()
            try:
                current = int(current)
            except (TypeError, ValueError):
                # Pager missing or not numeric: keep what was captured
                # instead of crashing on int(None).
                break
            if current >= self.max_pages:
                break
            # find_element("xpath", ...) is used because the
            # find_element_by_xpath shortcut is removed in Selenium 4.
            bro.find_element('xpath', '//*[@id="main-table_paginate"]/a[2]').click()

        # The captured pages are concatenated into a single body.
        html = "".join(pages)
        # The body is encoded as UTF-8 here, so the encoding is declared
        # explicitly rather than left to detection.
        new_response = HtmlResponse(url=request.url, body=html.encode('utf-8'), encoding='utf-8',
                                    request=request)

        return new_response

    def process_exception(self, request, exception, spider):
        """Do nothing, leaving exception handling to other components."""
        pass

settings.py

# Project identity and where Scrapy looks for spiders.
BOT_NAME = "stocks"
NEWSPIDER_MODULE = "stocks.spiders"
SPIDER_MODULES = ["stocks.spiders"]

# Do not obey robots.txt; only log ERROR and above.
ROBOTSTXT_OBEY = False
LOG_LEVEL = "ERROR"

# Browser-like User-Agent sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/86.0.4240.111 Safari/537.36"
)

# Enable the downloader middleware.
DOWNLOADER_MIDDLEWARES = {"stocks.middlewares.StocksDownloaderMiddleware": 543}

# Enable the MySQL pipeline.
ITEM_PIPELINES = {"stocks.pipelines.StocksPipeline": 300}

运行结果部分展示：

心得体会

了解scrapy爬虫框架与selenium搭配使用

问题与思考

现在不少大网站对selenium采取了监测机制。比如正常情况下我们用浏览器访问淘宝等网站时，window.navigator.webdriver的值为undefined，而使用selenium访问时该值为true。只需要设置Chromedriver的启动参数即可解决问题：在启动Chromedriver之前，为Chrome开启实验性功能参数excludeSwitches，它的值为['enable-automation']。

scrapy框架+selenium的使用：

当引擎将url对应的请求提交给下载器后，下载器进行网页数据的下载，然后将下载到的页面数据封装到response中，提交给引擎，引擎再将response转交给Spiders。Spiders接收到的response对象中存储的页面数据里是没有动态加载的数据的。要想获取动态加载的数据，则需要在下载中间件中对下载器提交给引擎的response响应对象进行拦截，并且对其内部存储的页面数据进行篡改，修改成携带了动态加载出的数据，然后将被篡改的response对象最终交给Spiders进行解析操作。
selenium在scrapy中的使用流程：
重写爬虫文件的构造方法，在该方法中使用selenium实例化一个浏览器对象（因为浏览器对象只需要被实例化一次）
重写爬虫文件的closed(self,spider)方法，在其内部关闭浏览器对象。该方法是在爬虫结束时被调用
重写下载中间件的process_response方法，让该方法对响应对象进行拦截，并篡改response中存储的页面数据
在配置文件中开启下载中间件

作业③

爬取外汇网站数据

要求：熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法；使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取外汇网站数据。

候选网站：招商银行网：http://fx.cmbchina.com/hq/

输出信息：MYSQL数据库存储和输出格式

代码：
spider.py

import scrapy
from cmbchina.items import CmbchinaItem

class SpiderSpider(scrapy.Spider):
    """Scrape the exchange-rate table published at fx.cmbchina.com/hq/."""

    name = 'spider'

    start_urls = ['http://fx.cmbchina.com/hq/']
    id = 0  # running sequence number assigned to the yielded items

    # Item field -> 1-based index of the <td> that holds its text.
    columns = {
        'Currency': 1,
        'TSP': 4,
        'CSP': 5,
        'TBP': 6,
        'CBP': 7,
        'Time': 8,
    }

    def parse(self, response):
        """Yield one ``CmbchinaItem`` per table row after the first.

        Cell text is stripped of surrounding whitespace; a cell without
        text yields an empty string.
        """
        tr_list = response.xpath('//*[@id="realRateInfo"]/table//tr')
        # The first row is skipped (presumably the table header).
        for tr in tr_list[1:]:
            self.id += 1
            item = CmbchinaItem()
            item['Id'] = str(self.id)
            for field, column in self.columns.items():
                # default='' keeps .strip() from failing on None when a
                # cell has no text node.
                item[field] = tr.xpath("./td[%d]/text()" % column).extract_first(default='').strip()
            yield item

items.py



# The class below uses scrapy.Item and scrapy.Field, so the module needs
# this import; without it the module raises NameError when loaded.
import scrapy


class CmbchinaItem(scrapy.Item):
    """Container for one row of the scraped exchange-rate table."""

    Id = scrapy.Field()        # sequence number assigned by the spider
    Currency = scrapy.Field()
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    Time = scrapy.Field()

pipelines.py

import pymysql

class CmbchinaPipeline:
    """Item pipeline that inserts every scraped rate row into the ``cmbchina`` table.

    One connection and one cursor are kept for the lifetime of the spider.
    """

    conn = None
    cursor = None

    def open_spider(self, spider):
        """Connect to the local MySQL server when the spider starts."""
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='root', db='spider',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` as one row and return it for later pipelines.

        A failed insert is printed and rolled back; the item is still
        returned so that one bad row does not stop the crawl.
        """
        # Reuse a single cursor instead of leaking a new one per item.
        if self.cursor is None:
            self.cursor = self.conn.cursor()

        try:
            # Scraped text is untrusted: pass the values as parameters so
            # the driver escapes them, rather than formatting them into the
            # SQL string.
            self.cursor.execute('insert into cmbchina values(%s,%s,%s,%s,%s,%s,%s)',
                                (item['Id'], item['Currency'], item['TSP'], item['CSP'], item['TBP'],
                                 item['CBP'], item['Time']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()

        return item

    def close_spider(self, spider):
        """Release the cursor and the connection, if they were ever opened."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.conn is not None:
            self.conn.close()
            self.conn = None

settings.py

# Project identity and where Scrapy looks for spiders.
BOT_NAME = "cmbchina"
NEWSPIDER_MODULE = "cmbchina.spiders"
SPIDER_MODULES = ["cmbchina.spiders"]

# Browser-like User-Agent sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/86.0.4240.111 Safari/537.36"
)

# Do not obey robots.txt; only log ERROR and above.
ROBOTSTXT_OBEY = False
LOG_LEVEL = "ERROR"

# Enable the MySQL pipeline.
ITEM_PIPELINES = {"cmbchina.pipelines.CmbchinaPipeline": 300}

心得体会

加深scrapy的了解

posted on 2020-10-27 23:12 无名狼狈阅读(89) 评论(0) 收藏举报

刷新页面返回顶部

无名狼狈

第四次作业

作业①

爬取当当网图书数据

心得体会

问题与思考

作业②

爬取股票信息

心得体会

问题与思考

scrapy框架+selenium的使用：

作业③

爬取外汇网站数据

心得体会

导航

公告