Assignment 4


Assignment ①:
Requirement: become proficient with serializing Item and Pipeline data in Scrapy; crawl book data from the Dangdang website using the Scrapy + XPath + MySQL storage route.
Output: the MySQL output is shown below.

1) Dangdang book crawling experiment
Code
Create the table (the id column is auto-increment; the rest follows the textbook):

CREATE TABLE books(
	bId INT PRIMARY KEY AUTO_INCREMENT,
	bTitle VARCHAR(512),
	bAuthor VARCHAR(256),
	bPublisher VARCHAR(256),
	bDate VARCHAR(32),
	bPrice VARCHAR(16),
	bDetail TEXT
)DEFAULT CHARACTER SET = utf8;
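
pipelines.py below connects to a database named crawler, so that database has to exist before the first run. A minimal setup sketch using pymysql, assuming the same local MySQL credentials (root/root) as in pipelines.py; the CREATE TABLE statement above is then run inside it:

import pymysql

# Create the "crawler" database that pipelines.py expects;
# afterwards the books table above can be created inside it.
conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="root", charset="utf8")
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS crawler DEFAULT CHARACTER SET utf8")
conn.close()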

items.py

import scrapy

class BookItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    publisher = scrapy.Field()
    date = scrapy.Field()
    price = scrapy.Field()
    detail = scrapy.Field()

mySpiders.py

import scrapy
from book.items import BookItem
from bs4 import UnicodeDammit

class MySpider(scrapy.Spider):
    name = "mySpider"
    key = "虞初新志" # 盖其收录《口技》,作者林嗣环是泉州安溪人
    source_url = "http://search.dangdang.com/"

    def start_requests(self):
        url = MySpider.source_url + "?key=" + MySpider.key
        yield scrapy.Request(url,self.parse)

    def parse(self,response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            # starts-with(@class,'line'): match <li> elements whose class attribute starts with "line"
            # (the '@ddt-pit' predicate is a quoted string literal, so it is always true and only documents the intent)
            lis = selector.xpath("//li['@ddt-pit'][starts-with(@class,'line')]")
            for li in lis:
                title=li.xpath("./a[position()=1]/@title").extract_first() 
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
                date =li.xpath("./p[@class='search_book_author']/span[position()=last()- 1]/text()").extract_first()
                price =li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                detail = li.xpath("./p[@class='detail']/text()").extract_first()
                
                item = BookItem()
                item['title'] = title.strip() if title else ""
                item['author'] = author.strip() if author else ""
                item['publisher'] = publisher.strip() if publisher else ""
                item['date'] = date.strip() if date else ""
                item['price'] = price.strip() if price else ""
                item['detail'] = detail.strip() if detail else ""
                yield item
            
            link = selector.xpath("//div[@class='paging']/ul/li[@class='next']/a/@href").extract_first()
            if link:
                url = response.urljoin(link)
                yield scrapy.Request(url,self.parse)
        except Exception as e:
            print(e)
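
Before running the full spider, the XPath expressions above can be debugged offline against a saved copy of a search results page. A minimal sketch, assuming the page has been saved locally as dangdang.html (a hypothetical file name):

import scrapy

# Load a locally saved Dangdang search results page and test the item XPath expressions.
with open("dangdang.html", "r", encoding="utf-8") as f:
    selector = scrapy.Selector(text=f.read())

for li in selector.xpath("//li['@ddt-pit'][starts-with(@class,'line')]"):
    title = li.xpath("./a[position()=1]/@title").extract_first()
    price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
    print(title, price)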

pipelines.py

from itemadapter import ItemAdapter
import pymysql

class BookPipeline:
    # Called when the spider is opened
    def open_spider(self,spider):
        print("opened")
        try:
            self.db = pymysql.connect(
                host="127.0.0.1",
                port=3306,
                user="root",
                passwd="root",
                db="crawler",
                charset="utf8"
            )
            # 游标对象
            self.cursor = self.db.cursor()
            # Clear the table so each run starts from an empty result set
            self.cursor.execute("delete from books")
            self.opened = True
            self.count = 0
        except Exception as e:
            print(e)
            self.opened = False

    # Called when the spider is closed
    def close_spider(self,spider):
        if self.opened:
            self.db.commit()
            self.db.close()
            self.opened = False
            print("closed")
            print("总共爬取",self.count,"本书籍")

    def process_item(self, item, spider):
        try:
            if self.opened:
                print(item["title"])
                print(item["author"])
                print(item["publisher"])
                print(item["date"])
                print(item["price"])
                print(item["detail"])
                self.cursor.execute("insert into books(bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) values (%s,%s,%s,%s,%s,%s)", (item["title"], item["author"], item["publisher"], item["date"], item["price"], item["detail"]))
                self.count += 1
        except Exception as e:
            print(e)
        return item

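After a crawl finishes, a quick query confirms that the pipeline actually wrote rows. A small check script, assuming the same crawler database and books table as above:

import pymysql

# Read back a few stored rows to confirm the pipeline inserted data.
db = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="root", db="crawler", charset="utf8")
cursor = db.cursor()
cursor.execute("SELECT bId, bTitle, bAuthor, bPrice FROM books LIMIT 5")
for row in cursor.fetchall():
    print(row)
db.close()
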
settings.py: uncomment the ITEM_PIPELINES setting so the pipeline actually runs (the same applies to experiments 2 and 3, with the project and pipeline names adjusted; not repeated there). For this project it is:

ITEM_PIPELINES = {
   'book.pipelines.BookPipeline': 300,
}

run.py (same for experiments 2 and 3; not repeated there)

from scrapy import cmdline

cmdline.execute("scrapy crawl mySpider -s LOG_ENABLED=False".split())
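
As a side note, the same thing can be done without cmdline through Scrapy's CrawlerProcess API; a rough alternative sketch, to be run from the project root so that get_project_settings() picks up settings.py:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project settings (including ITEM_PIPELINES) and run the spider by name.
process = CrawlerProcess(get_project_settings())
process.crawl("mySpider")
process.start()  # blocks until the crawl is finished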

Screenshot:


2) Reflections
This experiment follows the textbook example of the Scrapy + XPath + MySQL storage route. It deepened my understanding of locating elements with XPath, the page-turning mechanism, and database operations.

Assignment ②
Requirement: become proficient with serializing Item and Pipeline data in Scrapy; crawl stock information using the Scrapy + XPath + MySQL storage route.
Output: MySQL storage and output format as below; the table headers should be named in English and are self-defined.

No.  StockCode  StockName  LatestPrice  ChangeRate  ChangeAmount  Volume   Turnover  Amplitude  High  Low    Open  PrevClose
1    688093     N世华       28.47        62.22%      10.92         26.13万  7.6亿      22.34      32.0  28.08  30.2  17.55
2    ...        ...        ...          ...         ...           ...      ...       ...        ...   ...    ...   ...

1) Stock information crawling experiment
Code
Create the table (all columns are VARCHAR, which keeps the insert logic simple):

CREATE TABLE stock(
	`rank` VARCHAR(5),  -- rank is a reserved word in MySQL 8+, so it is quoted with backticks
	stockCode VARCHAR(10),
	stockName VARCHAR(10),
	latestPrice VARCHAR(10),
	changeRange VARCHAR(10),
	changeValue VARCHAR(10),
	dealNumber VARCHAR(10),
	dealTotal VARCHAR(10),
	amplitude VARCHAR(10),
	maxPrice VARCHAR(10),
	minPrice VARCHAR(10),
	today VARCHAR(10),
	yesterday VARCHAR(10)
)DEFAULT CHARACTER SET = utf8;

items.py

import scrapy

class Stock2Item(scrapy.Item):
    rank = scrapy.Field()
    stockCode = scrapy.Field()
    stockName = scrapy.Field()
    latestPrice = scrapy.Field()
    changeRange = scrapy.Field()
    changeValue = scrapy.Field()
    dealNumber = scrapy.Field()
    dealTotal = scrapy.Field()
    amplitude = scrapy.Field()
    maxPrice = scrapy.Field()
    minPrice = scrapy.Field()
    today = scrapy.Field()
    yesterday = scrapy.Field()

mySpiders.py

import scrapy
from stock2.items import Stock2Item
from selenium import webdriver
import time

class MySpider(scrapy.Spider):
    name = "mySpider"
    url = 'http://quote.eastmoney.com/center/gridlist.html#hs_a_board'
    page = 5

    def start_requests(self):
        yield scrapy.Request(self.url,self.parse)

    def parse(self,response):
        try:
            driver = webdriver.Chrome()
            driver.get(self.url)
            for i in range(self.page):
                # Paging on this site does not change the URL, so the next page cannot be reached
                # by concatenating URLs; instead Selenium simulates clicking the "next page" button.
                # Skip the click on the first iteration, since the first page is already loaded.
                if i:
                    # The first sleep waits for the "next page" button to load
                    time.sleep(3)
                    # Locate the "next page" button and click it
                    # (Selenium 4 would use driver.find_element(By.XPATH, ...) instead)
                    driver.find_element_by_xpath('//*[@id="main-table_paginate"]/a[2]').click()
                    # Without this sleep, duplicated rows may be scraped before the table refreshes
                    time.sleep(3)
            
                selector = scrapy.Selector(text=driver.page_source)  # page_source is a property, not a method; adding parentheses here cost me a while
                # Each stock corresponds to one <tr>; extract the fields with XPath
                trList = selector.xpath("//table[@id='table_wrapper-table']/tbody//tr")
                for tr in trList:
                    # Create a fresh item for each row so already-yielded items are not overwritten
                    item = Stock2Item()
                    item['rank'] = tr.xpath("./td[position()=1]/text()").extract_first()
                    item['stockCode'] = tr.xpath("./td[position()=2]/a/text()").extract_first()
                    item['stockName'] = tr.xpath("./td[position()=3]/a/text()").extract_first()
                    item['latestPrice'] = tr.xpath("./td[position()=5]/span/text()").extract_first()
                    item['changeRange'] = tr.xpath("./td[position()=6]/span/text()").extract_first()
                    item['changeValue'] = tr.xpath("./td[position()=7]/span/text()").extract_first()
                    item['dealNumber'] = tr.xpath("./td[position()=8]/text()").extract_first()
                    item['dealTotal'] = tr.xpath("./td[position()=9]/text()").extract_first()
                    item['amplitude'] = tr.xpath("./td[position()=10]/text()").extract_first()
                    item['maxPrice'] = tr.xpath("./td[position()=11]/span/text()").extract_first()
                    item['minPrice'] = tr.xpath("./td[position()=12]/span/text()").extract_first()
                    item['today'] = tr.xpath("./td[position()=13]/span/text()").extract_first()
                    item['yesterday'] = tr.xpath("./td[position()=14]/text()").extract_first()
                    yield item
            # Quit the browser once all pages have been scraped
            driver.quit()
        except Exception as e:
            print(e)
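
An alternative design would be to move Selenium out of parse() and into a downloader middleware, so the spider only ever sees rendered HTML. A rough sketch of that idea (not what is used above); the class name, module path and middleware priority are illustrative, and the paging clicks would still need separate handling:

from scrapy.http import HtmlResponse
from selenium import webdriver

class SeleniumMiddleware:
    # Enabled via DOWNLOADER_MIDDLEWARES = {'stock2.middlewares.SeleniumMiddleware': 543} in settings.py
    def __init__(self):
        self.driver = webdriver.Chrome()

    def process_request(self, request, spider):
        # Let the browser render the page, then hand the final HTML back to Scrapy
        # (driver cleanup on spider close is omitted for brevity)
        self.driver.get(request.url)
        return HtmlResponse(url=request.url, body=self.driver.page_source,
                            encoding="utf-8", request=request)

The trade-off is cleaner spider code versus losing fine-grained control over the in-page clicks, which is why the version above keeps the driver inside parse().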

pipelines.py

from itemadapter import ItemAdapter
import pymysql

class Stock2Pipeline:
    # Called when the spider is opened
    def open_spider(self, spider):
        print("opened")
        try:
            self.db = pymysql.connect(
                host="127.0.0.1",
                port=3306,
                user="root",
                passwd="root",
                db="crawler",
                charset="utf8"
            )
            self.cursor = self.db.cursor()
            self.opened = True
        except Exception as e:
            print(e)
            self.opened = False

    # Called when the spider is closed
    def close_spider(self, spider):
        if self.opened:
            self.db.commit()
            self.db.close()
            self.opened = False
            print("closed")

    def process_item(self, item, spider):
        try:
            if self.opened:
                self.cursor.execute("insert into stock(rank,stockCode,stockName,latestPrice,changeRange,changeValue,dealNumber,dealTotal,amplitude,maxPrice,minPrice,today,yesterday) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (
                    item["rank"], item["stockCode"], item["stockName"], item["latestPrice"], item["changeRange"], item["changeValue"], item["dealNumber"], item["dealTotal"], item["amplitude"], item["maxPrice"], item["minPrice"], item["today"], item["yesterday"]))
        except Exception as e:
            print(e)
        return item

Screenshot:

2) Reflections
This experiment is much like Experiment 1. In this site's page source the table cells carry no data; the page presumably fetches JSON from the backend via Ajax and fills the table in afterwards, so Selenium is needed to obtain the dynamically rendered HTML. I also want to thank A'Xin, who, "not deeming me lowly, graciously stooped" to help me find the bug where "span" had been typed as "sapn".
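
A quick way to confirm that the table really is Ajax-rendered is to fetch the page without a browser and check that the table body comes back empty. A small sketch, assuming the requests library is installed:

import requests
import scrapy

# Fetch the page without a browser; the stock table should come back empty,
# because the rows are filled in later by JavaScript.
url = "http://quote.eastmoney.com/center/gridlist.html#hs_a_board"
html = requests.get(url, timeout=10).text
rows = scrapy.Selector(text=html).xpath("//table[@id='table_wrapper-table']/tbody//tr")
print("rows found in static HTML:", len(rows))  # expected: 0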

Assignment ③:
Requirement: become proficient with serializing Item and Pipeline data in Scrapy; crawl foreign exchange data using the Scrapy framework + XPath + MySQL storage route.
Output: MySQL storage and output format

Id Currency TSP CSP TBP CBP Time
1 港币 86.60 86.60 86.26 85.65 15:36:30
2 ... ... ... ... ... ...

1) Foreign exchange crawling experiment
Code
Create the table:

CREATE TABLE exchange(
	Id INT PRIMARY KEY AUTO_INCREMENT,
	Currency VARCHAR(10),
	TSP VARCHAR(10),
	CSP VARCHAR(10),
	TBP VARCHAR(10),
	CBP VARCHAR(10),
	cTime VARCHAR(10)
)DEFAULT CHARACTER SET = utf8;

items.py

import scrapy

class WaihuiItem(scrapy.Item):
    Currency = scrapy.Field()
    TSP = scrapy.Field()	
    CSP = scrapy.Field()
    TBP = scrapy.Field()	
    CBP = scrapy.Field()
    Time = scrapy.Field()

mySpiders.py

import scrapy
from waihui.items import WaihuiItem

class MySpider(scrapy.Spider):
    name = "mySpider"
    url = "http://fx.cmbchina.com/hq/"

    def start_requests(self):
        yield scrapy.Request(self.url,self.parse)

    def parse(self,response):
        try:
            data = response.body.decode()
            selector = scrapy.Selector(text=data)
            trList = selector.xpath("//div[@id='realRateInfo']//tr")
            # The first <tr> is the header row, so skip it
            for tr in trList[1:]:
                item = WaihuiItem()
                # default="" avoids calling .strip() on None when a cell is missing
                item['Currency'] = tr.xpath("./td[position()=1]/text()").extract_first(default="").strip()
                item['TSP'] = tr.xpath("./td[position()=4]/text()").extract_first(default="").strip()
                item['CSP'] = tr.xpath("./td[position()=5]/text()").extract_first(default="").strip()
                item['TBP'] = tr.xpath("./td[position()=6]/text()").extract_first(default="").strip()
                item['CBP'] = tr.xpath("./td[position()=7]/text()").extract_first(default="").strip()
                item['Time'] = tr.xpath("./td[position()=8]/text()").extract_first(default="").strip()
                yield item
        except Exception as e:
            print(e)

pipelines.py

from itemadapter import ItemAdapter
import pymysql

class WaihuiPipeline:
    # Called when the spider is opened
    def open_spider(self,spider):
        print("opened")
        try:
            self.db = pymysql.connect(
                host="127.0.0.1",
                port=3306,
                user="root",
                passwd="root",
                db="crawler",
                charset="utf8"
            )
            self.cursor = self.db.cursor()
            self.opened = True
        except Exception as e:
            print(e)
            self.opened = False

    # Called when the spider is closed
    def close_spider(self,spider):
        if self.opened:
            self.db.commit()
            self.db.close()
            self.opened = False
            print("closed")

    def process_item(self, item, spider):
        try:
            if self.opened:
                print(item["Currency"],item["TSP"],item["CSP"],item["TBP"],item["CBP"],item["Time"])
                self.cursor.execute("insert into exchange(Currency,TSP,CSP,TBP,CBP,cTime) values (%s,%s,%s,%s,%s,%s)", (
                    item["Currency"], item["TSP"], item["CSP"], item["TBP"], item["CBP"], item["Time"]))
        except Exception as e:
            print(e)
        return item

Screenshot:

2) Reflections
This experiment is a bit simpler than the previous two: there is no paging, and only a little processing is needed to get the data we want. "Suddenly the clapper sounded once and every noise ceased; when the screen was removed, there was only one person, one table, one chair, one fan, and one clapper." (a closing line borrowed from "Kou Ji")
