Data Collection and Fusion: Assignment 4

Task ①:

Requirement: become proficient with serializing and outputting Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL database storage route to crawl book data from the Dangdang website.

Code

myspider

import scrapy
from dangdang.items import BookItem
from bs4 import UnicodeDammit   # only bs4's encoding detector is used here

class MySpider(scrapy.Spider):
    name = "mySpider"
    key = 'python'
    source_url='http://search.dangdang.com/'

    def start_requests(self):
        url = MySpider.source_url+"?key="+MySpider.key
        yield scrapy.Request(url=url,callback=self.parse)

    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector=scrapy.Selector(text=data)
            lis=selector.xpath("//li['@ddt-pit'][starts-with(@class,'line')]")
            for li in lis:
                title=li.xpath("./a[position()=1]/@title").extract_first()
                price =li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date =li.xpath("./p[@class='search_book_author']/span[position()=last()- 1]/text()").extract_first()
                publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
                detail = li.xpath("./p[@class='detail']/text()").extract_first()
                # detail is sometimes missing, so it may be None
                item=BookItem()
                item["title"]=title.strip() if title else ""
                item["author"]=author.strip() if author else ""
                item["date"] = date.strip()[1:] if date else ""
                item["publisher"] = publisher.strip() if publisher else ""
                item["price"] = price.strip() if price else ""
                item["detail"] = detail.strip() if detail else ""
                yield item
            # link is None on the last page
            # follow the next page
            link=selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
            if link:
                url=response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)

        except Exception as err:
            print(err)

items

import scrapy
class BookItem(scrapy.Item):
        title = scrapy.Field()
        author = scrapy.Field()
        date = scrapy.Field()
        publisher = scrapy.Field()
        detail = scrapy.Field()
        price = scrapy.Field()

pipelines

import pymysql
class BookPipeline(object):
    # connect to the database
    def open_spider(self,spider):
        print("opened")
        try:
            self.con=pymysql.connect(host="127.0.0.1",port=3306,user="root",passwd="root",db="db",charset="utf8")
            self.cursor=self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from books")
            self.opened=True
            self.count=0
        except Exception as err:
            print(err)
            self.opened=False
    # close the connection
    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened=False
        print("closed")
        print("总共爬取",self.count,"本书籍")
    def process_item(self, item, spider):
        try:
            print(item["title"])
            print(item["author"])
            print(item["publisher"])
            print(item["date"])
            print(item["price"])
            print(item["detail"])
            print()
            # insert into the table
            if self.opened:
                self.cursor.execute("insert into books (bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) values(%s,%s,%s,%s,%s,%s)",(item["title"],item["author"],item["publisher"],item["date"],item["price"],item["detail"]))
                self.count+=1
        except Exception as err:
            print(err)
        return item
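
The pipeline above assumes that the database db and a books table already exist. A minimal one-off pymysql sketch of an equivalent table; the column names follow the INSERT statement above, while the column types and lengths are my own assumptions:

import pymysql

# One-off helper: create the table BookPipeline writes into.
# Column names match the INSERT above; types and lengths are assumptions.
con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                      passwd="root", db="db", charset="utf8")
cursor = con.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS books (
        bTitle     VARCHAR(512),
        bAuthor    VARCHAR(256),
        bPublisher VARCHAR(256),
        bDate      VARCHAR(32),
        bPrice     VARCHAR(32),
        bDetail    TEXT
    ) DEFAULT CHARSET=utf8
""")
con.commit()
con.close()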

settings

ITEM_PIPELINES = {
    'dangdang.pipelines.BookPipeline': 300,
}

Run results

Reflections

(1) This task is mainly about MySQL storage and XPath: I learned how to connect to the database and insert rows into a table.
(2) By comparison, XPath reads like a file path, which makes it easier to understand and use.
(3) After these exercises I am considerably more comfortable with Scrapy.

Task ②

Requirement: become proficient with serializing and outputting Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL database storage route to crawl stock quote data.

Approach

(1) Option 1: fetch the JSON quote feed, extract the fields with re regular expressions, and store them in the database (a small change to the code from the previous assignment).

Only the pipelines file from the previous assignment needs to change, as follows:

import pymysql
class SharesPipeline(object):
    def open_spider(self,spider):
        print("opened")
        print("{:4}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}\t{:12}\t{:12}\t{:8}\t{:8}\t{:8}\t{:8}".format("id","StockNo","StockName","StockQuote","Changerate","Chg","Volume","Turnover","StockAmplitude","highest","lowest","Pricetoday","PrevClose"))
        try:
            self.con=pymysql.connect(host="127.0.0.1",port=3306,user="root",passwd="root",db="db",charset="utf8")
            self.cursor=self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from stocks")
            self.opened=True
            self.count=0
        except Exception as err:
            print(err)
            self.opened=False
    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened=False
        print("closed")
        print("总共爬取",self.count,"条记录")
    def process_item(self, item, spider):
        try:
            print("{:4}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}\t{:12}\t{:12}\t{:8}\t{:8}\t{:8}\t{:8}".format(self.count+1,item["no"],item["name"],item["latest"],item["zdf"],item["zde"],item["cjl"],item["cje"],item["zf"],item["highest"],item["lowest"],item["today"],item["yesterday"]))
            if self.opened:
                self.cursor.execute(
                    "insert into stocks (id,StockNo,StockName,StockQuote,Changerate,Chg,Volume,Turnover,StockAmplitude,Highest,Lowest,Pricetoday,PrevClose) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (self.count+1,item["no"], item["name"], item["latest"], item["zdf"], item["zde"], item["cjl"], item["cje"],
                    item["zf"], item["highest"], item["lowest"], item["today"], item["yesterday"]))
                self.count += 1
        except Exception as err:
            print(err)
        return item
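
The spider itself is reused unchanged from the previous assignment and is not reproduced here. For context, a minimal self-contained sketch of the kind of re extraction it relies on; the sample string and the f-style field names below are illustrative assumptions, not the real feed format:

import re

# Illustrative only: a JSON-like fragment of the kind a stock quote API returns.
sample = '{"f12":"600000","f14":"浦发银行","f2":10.52,"f3":1.06}'

# One pattern pulls out every quoted key together with its (quoted or numeric) value.
pairs = dict(re.findall(r'"(\w+)":"?([^",}]+)"?', sample))
print(pairs["f12"], pairs["f14"], pairs["f2"], pairs["f3"])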

(2) Option 2: combine Scrapy with Selenium and extract the data with XPath (since the assignment requires both Scrapy and XPath).

Ⅰ. For combining Scrapy with Selenium I followed this blog post: https://www.cnblogs.com/xiao-apple36/p/12635470.html

  • Override the spider's constructor and instantiate a Selenium browser object inside it.
  • Override the spider's closed(self, spider) method and quit the browser there; this method is called when the spider finishes.
  • Override the downloader middleware's process_response method so that it intercepts the response and replaces the page data it carries.
  • Enable the downloader middleware in the settings file.

Ⅱ. XPath extraction

Some fields sit in an a element under the td, some in a span under the td, and some directly in the td itself.

Code

Create the table
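
A minimal pymysql sketch of an equivalent table definition; the column names follow the INSERT statement used in SharesPipeline, and the types (INT id, VARCHAR widths) are my assumptions:

import pymysql

con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                      passwd="root", db="db", charset="utf8")
cursor = con.cursor()
# id is INT so that rows sort numerically rather than as strings (see reflection (4) below).
cursor.execute("""
    CREATE TABLE IF NOT EXISTS stocks (
        id             INT PRIMARY KEY,
        StockNo        VARCHAR(16),
        StockName      VARCHAR(64),
        StockQuote     VARCHAR(16),
        Changerate     VARCHAR(16),
        Chg            VARCHAR(16),
        Volume         VARCHAR(32),
        Turnover       VARCHAR(32),
        StockAmplitude VARCHAR(16),
        Highest        VARCHAR(16),
        Lowest         VARCHAR(16),
        Pricetoday     VARCHAR(16),
        PrevClose      VARCHAR(16)
    ) DEFAULT CHARSET=utf8
""")
con.commit()
con.close()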

myspider

import scrapy
from shares.items import gupiaoItem
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import UnicodeDammit

chrome_options = Options()
chrome_options.add_argument("--headless")      # run Chrome without a visible window
chrome_options.add_argument("--disable-gpu")
class MyspiderSpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['http://quote.eastmoney.com/center/gridlist.html#hs_a_board']
    def __init__(self):
        # a single shared browser instance; the downloader middleware renders pages with it
        self.browser = webdriver.Chrome(options=chrome_options)
        super().__init__()
    def start_requests(self):
        url = 'http://quote.eastmoney.com/center/gridlist.html#hs_a_board'
        yield scrapy.Request(url, callback=self.parse)
    def close(self, spider):
        self.browser.quit()
    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector=scrapy.Selector(text=data)
            tdd=selector.xpath("//div[@class='listview full']/table/tbody/tr")
            for td in tdd:
                id = td.xpath("./td[position()=1]/text()").extract()
                Volume = td.xpath("./td[position()=8]/text()").extract()
                Turnover = td.xpath("./td[position()=9]/text()").extract()
                StockAmplitude = td.xpath("./td[position()=10]/text()").extract()
                PrevClose = td.xpath("./td[position()=14]/text()").extract()
                no_name = td.xpath("./td/a/text()").extract()
                number = td.xpath("./td/span/text()").extract()
                item=gupiaoItem()
                item["id"] = id[0].strip()
                item["StockNo"] = no_name[0].strip()
                item["StockName"] = no_name[1].strip()
                item["StockQuote"] = number[0].strip()
                item["Changerate"] = number[1].strip()
                item["Chg"] = number[2].strip()
                item["Volume"] =Volume[0].strip()
                item["Turnover"] =Turnover[0].strip()
                item["StockAmplitude"] =StockAmplitude[0].strip()
                item["highest"] = number[3].strip()
                item["lowest"] = number[4].strip()
                item["Pricetoday"] = number[5].strip()
                item["PrevClose"] =PrevClose[0].strip()
                yield item
        except Exception as err:
            print(err)

items

import scrapy
class gupiaoItem(scrapy.Item):

    id = scrapy.Field()              # row number
    StockNo = scrapy.Field()         # stock code
    StockName = scrapy.Field()       # stock name
    StockQuote = scrapy.Field()      # latest price
    Changerate = scrapy.Field()      # change rate
    Chg = scrapy.Field()             # change amount
    Volume = scrapy.Field()          # trading volume
    Turnover = scrapy.Field()        # turnover
    StockAmplitude = scrapy.Field()  # amplitude
    highest = scrapy.Field()         # day high
    lowest = scrapy.Field()          # day low
    Pricetoday = scrapy.Field()      # today's open
    PrevClose = scrapy.Field()       # previous close

pipelines

import pymysql
class SharesPipeline(object):
    def open_spider(self,spider):
        print("opened")
        print("{:4}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}\t{:12}\t{:12}\t{:8}\t{:8}\t{:8}\t{:8}".format("id","StockNo","StockName","StockQuote","Changerate","Chg","Volume","Turnover","StockAmplitude","highest","lowest","Pricetoday","PrevClose"))
        try:
            self.con=pymysql.connect(host="127.0.0.1",port=3306,user="root",passwd="root",db="db",charset="utf8")
            self.cursor=self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from stocks")
            self.opened=True
            self.count=0
        except Exception as err:
            print(err)
            self.opened=False
    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened=False
        print("closed")
        print("总共爬取",self.count,"条记录")
    def process_item(self, item, spider):
        try:
            print("{:4}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}\t{:12}\t{:12}\t{:8}\t{:8}\t{:8}\t{:8}".format(item["id"],item["StockNo"], item["StockName"], item["StockQuote"], item["Changerate"], item["Chg"], item["Volume"], item["Turnover"],
                    item["StockAmplitude"], item["highest"], item["lowest"], item["Pricetoday"], item["PrevClose"]))
            if self.opened:
                self.cursor.execute(
                    "insert into stocks (id,StockNo,StockName,StockQuote,Changerate,Chg,Volume,Turnover,StockAmplitude,Highest,Lowest,Pricetoday,PrevClose) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (item["id"],item["StockNo"], item["StockName"], item["StockQuote"], item["Changerate"], item["Chg"], item["Volume"], item["Turnover"],
                    item["StockAmplitude"], item["highest"], item["lowest"], item["Pricetoday"], item["PrevClose"]))
                self.count += 1
        except Exception as err:
            print(err)
        return item

middlewares changes

from time import sleep
from scrapy.http import HtmlResponse   # needed for building the replacement response

Modify process_response in class SharesDownloaderMiddleware as follows:

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Only the responses we want to tamper with are intercepted here:
        # the request is identified by its URL, and the response by its request.
        bro = spider.browser  # the browser object created in the spider class
        if request.url in spider.start_urls:
            # Build a new response from the browser-rendered page source (which
            # contains the dynamically loaded quote data) and return it in place
            # of the original response.
            bro.get(request.url)
            sleep(3)
            bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            sleep(1)
            page_text = bro.page_source  # includes the dynamically loaded content
            new_response = HtmlResponse(url=request.url, body=page_text, encoding="utf-8", request=request)
            return new_response
        else:
            return response

settings

ROBOTSTXT_OBEY = True  # this was set to False for the JSON approach; change it back here
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3141.8 Safari/537.36"
}

DOWNLOADER_MIDDLEWARES = {
    'shares.middlewares.SharesDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'shares.pipelines.SharesPipeline': 300,
}

Run results

Results of option 1:


Results of option 2:

Reflections

(1) I learned how to combine Scrapy with Selenium: the downloader middleware has to be modified, and the corresponding settings must be updated as well.

(2) Comparing fetching the JSON feed directly with driving a browser through Selenium:

  • Selenium is noticeably slower.
  • Selenium sees the live, fully rendered data.
  • Selenium plus XPath is quite convenient, and simpler than pulling the values out of the JSON feed with re.

(3) I did not manage to implement paging with Scrapy + Selenium before the deadline; I will add it later.

(4) When creating the table, the id column should be INT. I first declared it as VARCHAR, and the stored rows then sorted as 1, 10, ... instead of 1, 2, 3, ...

(5) I also had to install the Chrome browser driver, which I had not set up before, and add it to the PATH environment variable (a sketch of pointing Selenium at the driver directly follows below).
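
If adding the driver to PATH is inconvenient, Selenium can also be pointed at the executable directly. A small sketch; the path is a placeholder and executable_path is the Selenium 3 style keyword:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")
# The path below is a placeholder; point it at wherever chromedriver lives.
browser = webdriver.Chrome(executable_path=r"C:\tools\chromedriver.exe",
                           options=chrome_options)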

Task ③:

Requirement: become proficient with serializing and outputting Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL database storage route to crawl data from a foreign-exchange website.

Approach


Each record sits in its own tr, and the first row is the table header. Iterating over the td cells of each remaining row yields the data; after that it is the same routine as before: connect to the database and insert the rows.

Code

Create the table
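
A minimal pymysql sketch of an equivalent table definition; the column names follow the INSERT statement used in IcbcPipeline below, and the types are my assumptions:

import pymysql

con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                      passwd="root", db="db", charset="utf8")
cursor = con.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS ICBC (
        ID       INT PRIMARY KEY,
        Currency VARCHAR(32),
        TSP      VARCHAR(16),
        CSP      VARCHAR(16),
        TBP      VARCHAR(16),
        CBP      VARCHAR(16),
        Time     VARCHAR(16)
    ) DEFAULT CHARSET=utf8
""")
con.commit()
con.close()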

myspider

import scrapy
from ICBC.items import IcbcItem
from bs4 import UnicodeDammit
class MySpider(scrapy.Spider):
    name = "mySpider"
    def start_requests(self):
        url = 'http://fx.cmbchina.com/hq/'
        yield scrapy.Request(url=url, callback=self.parse)
    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            trr = selector.xpath("//table[@class='data']/tr")  # no tbody: the server's HTML does not contain one
            for tr in trr[1:]:
                Currency = tr.xpath("./td[@class='fontbold']/text()").extract_first()
                rates = tr.xpath("./td[@class='numberright']/text()").extract()
                TSP, CSP, TBP, CBP = rates[0], rates[1], rates[2], rates[3]
                Time=tr.xpath("./td[@align='center']/text()")[2].extract()
                item = IcbcItem()
                item["Currency"] = Currency.strip()
                item["TSP"] = TSP.strip()
                item["CSP"] = CSP.strip()
                item["TBP"] = TBP.strip()
                item["CBP"] = CBP.strip()
                item["Time"] = Time.strip()
                yield item
        except Exception as err:
            print(err)

items

import scrapy


class IcbcItem(scrapy.Item):
    # define the fields for your item here like:

    Currency = scrapy.Field()
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    Time = scrapy.Field()

pipelines

import pymysql
class IcbcPipeline:
    def open_spider(self,spider):
        print("opened")
        try:
            self.con=pymysql.connect(host="127.0.0.1",port=3306,user="root",passwd="root",db="db",charset="utf8")
            self.cursor=self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from ICBC")
            self.opened=True
            self.count=0
            print("{:2}\t{:7}\t{:8}\t{:4}\t{:4}\t{:4}\t{:20}".format("Id","Currency","TSP","CSP","TBP","CBP","Time"))
        except Exception as err:
            print(err)
            self.opened=False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened=False
        print("closed")
        print("总共爬取",self.count,"条信息")
    def process_item(self, item, spider):
        try:
            print("{:2}\t{:7}\t{:8}\t{:4}\t{:4}\t{:4}\t{:20}".format(self.count+1,item["Currency"],item["TSP"],item["CSP"],item["TBP"] ,item["CBP"],item["Time"]))
            if self.opened:
                self.cursor.execute(
                    "insert into ICBC(ID,Currency,TSP,CSP,TBP,CBP,Time) values(%s,%s,%s,%s,%s,%s,%s)",
                    (self.count+1,item["Currency"], item["TSP"], item["CSP"], item["TBP"], item["CBP"], item["Time"]))
                self.count += 1
        except Exception as err:
            print(err)
        return item

settings

DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3141.8 Safari/537.36"
}

ITEM_PIPELINES = {
    'ICBC.pipelines.IcbcPipeline': 300,
}

Run results


Reflections

(1) With Scrapy's XPath, a selector that kept tbody matched nothing and returned an empty list; removing tbody fixed it, since the tbody element is added by the browser and is not present in the raw HTML (see the short sketch below).
(2) This task is largely the same as the first one; it is practice with XPath, MySQL storage, and Scrapy's Item/Pipeline serialization and output.
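
A self-contained illustration of point (1); the HTML string is a simplified stand-in for the real page:

import scrapy

# tbody is inserted by the browser's DOM parser; the HTML the server actually
# returns has none, so an XPath copied from DevTools that keeps tbody matches
# nothing in the Scrapy response.
html = "<table class='data'><tr><td>USD</td><td>7.08</td></tr></table>"
selector = scrapy.Selector(text=html)
print(selector.xpath("//table[@class='data']/tbody/tr"))  # [] -- no match
print(selector.xpath("//table[@class='data']/tr"))        # matches the row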

