Web Crawler Assignment 4

Assignment 1:

(1) Dangdang + MySQL Experiment

  • Requirements: master the serialization and output of Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage pipeline to crawl book data from dangdang.com.

  • Candidate site: http://www.dangdang.com/

  • Keyword: freely chosen by the student

  • Output: the MySQL output is shown below

(Screenshot of the MySQL output table omitted.)
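Note that the pipeline below assumes a books table already exists in the mydb database. A one-off setup script might look like the following sketch; the column names follow the INSERT statement in pipelines.py, while the column types and lengths are my own assumptions:

# setup_books.py - hypothetical one-off script to create the target table;
# column names match the INSERT in pipelines.py, types/lengths are assumptions
import pymysql

con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                      passwd="123456", db="mydb", charset="utf8")
con.cursor().execute("""
    create table if not exists books (
        id int auto_increment primary key,
        bTitle varchar(512),
        bAuthor varchar(256),
        bPublisher varchar(256),
        bDate varchar(32),
        bPrice varchar(32),
        bDetail text
    )
""")
con.commit()
con.close()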

items.py

import scrapy


class DangdangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    publisher = scrapy.Field()
    detail = scrapy.Field()
    price = scrapy.Field()
    pass

pipelines.py

import pymysql


class DangdangPipeline:
    # Open the MySQL connection and clear old rows from the books table
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456", db="mydb", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from books")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    # Commit the inserts and close the database connection
    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("总共爬取", self.count, "本书籍")

    def process_item(self, item, spider):
        try:
            print(item["title"])
            print(item["author"])
            print(item["publisher"])
            print(item["date"])
            print(item["price"])
            print(item["detail"])
            print()
            # Insert the record into the books table
            if self.opened:
                self.cursor.execute("insert into books (bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) values (%s,%s,%s,%s,%s,%s)",
                                    (item["title"], item["author"], item["publisher"], item["date"], item["price"], item["detail"]))
                self.count += 1
        except Exception as err:
            print(err)
        return item

settings.py

BOT_NAME = 'dangdang'

SPIDER_MODULES = ['dangdang.spiders']
NEWSPIDER_MODULE = 'dangdang.spiders'

ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'dangdang.pipelines.DangdangPipeline': 300,
}

mySpider.py

import scrapy
from ..items import DangdangItem
from bs4 import UnicodeDammit


class MySpider(scrapy.Spider):
    name = "mySpider"
    key = '大数据'  # search keyword ("big data")
    source_url = "http://search.dangdang.com"

    def start_requests(self):
        url = MySpider.source_url+"/?key="+MySpider.key
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            # Each search result is an <li> carrying a ddt-pit attribute, with a class starting with 'line'
            lis = selector.xpath("//li[@ddt-pit][starts-with(@class,'line')]")
            for li in lis:
                title = li.xpath("./a[position()=1]/@title").extract_first()
                price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date = li.xpath("./p[@class='search_book_author']/span[position()= last()-1]/text()").extract_first()
                publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
                detail = li.xpath("./p[@class='detail']/text()").extract_first()

                # Tidy up the extracted strings and hand the item to the pipeline
                item = DangdangItem()
                item["title"] = title.strip() if title else ""
                item["author"] = author.strip() if author else ""
                item["date"] = date.strip(' /') if date else ""
                item["publisher"] = publisher.strip() if publisher else ""
                item["price"] = price.strip() if price else ""
                item["detail"] = detail.strip() if detail else ""
                yield item

            link = selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
            # Follow the next-page link if there is one
            if link:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)

        except Exception as err:
            print(err)
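With items.py, pipelines.py, settings.py and mySpider.py in place, the crawl is normally started with scrapy crawl mySpider from the project root. As a sketch, it can also be launched from a small helper script (the file name run.py is my own choice):

# run.py - hypothetical helper placed next to scrapy.cfg in the project root
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# load settings.py (including ITEM_PIPELINES) and run the spider by its name
process = CrawlerProcess(get_project_settings())
process.crawl("mySpider")
process.start()  # blocks until the crawl finishes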

(2) Reflections

Overall the implementation was not particularly difficult; the textbook has a worked example to follow.

Assignment 2:

(1) Stock + MySQL Experiment

  • Requirements: master the serialization and output of Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage pipeline to crawl stock information.

  • Candidate sites: Eastmoney: https://www.eastmoney.com/

    Sina Finance (stocks): http://finance.sina.com.cn/stock/

  • Output: MySQL storage and output format as below; the column headers should be named in English (e.g. id for the serial number, bStockNo for the stock code, ...) and are designed by the students themselves:

    No. | Code | Name | Latest price | Change % | Change | Volume | Turnover | Amplitude | High | Low | Open | Prev. close
    1 | 688093 | N世华 | 28.47 | 62.22% | 10.92 | 26.13万 | 7.6亿 | 22.34 | 32.0 | 28.08 | 30.2 | 17.55
    2 | ......
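As in assignment 1, the pipeline assumes that the stocks table already exists in mydb; a minimal setup sketch (column names taken from the INSERT statement in pipelines.py, types are assumptions):

# setup_stocks.py - hypothetical one-off script; types/lengths are assumptions
import pymysql

con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                      passwd="123456", db="mydb", charset="utf8")
con.cursor().execute("""
    create table if not exists stocks (
        id int auto_increment primary key,
        code varchar(16),
        name varchar(64),
        close varchar(16),
        rate varchar(16),
        volume varchar(32),
        amount varchar(32),
        amplitude varchar(16)
    )
""")
con.commit()
con.close()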

items.py

import scrapy


class StockItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    code = scrapy.Field()       # stock code (f12)
    name = scrapy.Field()       # stock name (f14)
    close = scrapy.Field()      # latest price (f2)
    rate = scrapy.Field()       # change percentage (f3)
    volume = scrapy.Field()     # trading volume (f5)
    amount = scrapy.Field()     # turnover (f6)
    amplitude = scrapy.Field()  # amplitude (f7)
    pass

pipelines.py

import pymysql


class StockPipeline:
    # Open the MySQL connection and clear old rows from the stocks table
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456", db="mydb",
                                       charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from stocks")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    # Commit the inserts and close the database connection
    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("总共爬取", self.count, "支股票")

    def process_item(self, item, spider):
        try:
            print(item["code"])
            print(item["name"])
            print(item["close"])
            print(item["rate"])
            print(item["volume"])
            print(item["amount"])
            print(item["amplitude"])
            print()
            # Insert the record into the stocks table
            if self.opened:
                self.cursor.execute("insert into stocks (code,name,close,rate,volume,amount,amplitude) values (%s,%s,%s,%s,%s,%s,%s)",
                                    (item["code"], item["name"], item["close"], item["rate"], item["volume"], item["amount"], item["amplitude"]))
                self.count += 1
        except Exception as err:
            print(err)
        return item

settings.py

BOT_NAME = 'stock'

SPIDER_MODULES = ['stock.spiders']
NEWSPIDER_MODULE = 'stock.spiders'

ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'stock.pipelines.StockPipeline': 300,
}

mySpider.py

import scrapy
from ..items import StockItem
import re
import json


class MySpider(scrapy.Spider):
    name = "mySpider"
    start_urls = ["http://54.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124015380571520090935_1602750256400&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1602750256401"]

    def parse(self, response):
        try:
            data = response.text
            # Strip the jQuery JSONP callback wrapper and parse the JSON payload
            start = data.index('{')
            data = json.loads(data[start:len(data) - 2])

            if data['data']:
                # Extract and normalise the fields of each stock
                for stock in data['data']['diff']:
                    item = StockItem()
                    item["code"] = str(stock['f12'])
                    item["name"] = stock['f14']
                    item["close"] = None if stock['f2'] == "-" else str(stock['f2'])
                    item["rate"] = None if stock['f3'] == "-" else str(stock['f3'])+'%'
                    item["volume"] = None if stock['f5'] == "-" else str(stock['f5'])
                    item["amount"] = None if stock['f6'] == "-" else str(stock['f6'])
                    item["amplitude"] = None if stock['f7'] == "-" else str(stock['f7'])+'%'
                    yield item

                # Read the current page number from the URL and request the next page
                pn = re.compile("pn=[0-9]*").findall(response.url)[0]
                page = int(pn[3:])
                url = response.url.replace("pn="+str(page), "pn="+str(page+1))
                yield scrapy.Request(url=url, callback=self.parse)

        except Exception as err:
            print(err)

(2) Reflections

For some reason I just could not get the change amount (涨跌额) to scrape, so I gave up on it for now and will try again when I have time. The rest is much like the previous assignments; the only new part is writing to the MySQL database, so it was fine.
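For what it is worth, the request URL already asks for f4 in its fields list, and given that f2/f3/f5/f6/f7 map to latest price, change percentage, volume, turnover and amplitude, f4 is plausibly the missing change amount; that mapping is an unverified assumption. A sketch of how the conversion could be factored out and extended (the helper name to_row and the extra change field are hypothetical):

# hypothetical helper: convert one entry of data['data']['diff'] into a dict,
# assuming (unverified) that field f4 holds the change amount (涨跌额)
def to_row(stock):
    def val(key, suffix=""):
        v = stock.get(key)
        return None if v == "-" else str(v) + suffix

    return {
        "code": str(stock.get("f12")),
        "name": stock.get("f14"),
        "close": val("f2"),        # latest price
        "rate": val("f3", "%"),    # change percentage
        "change": val("f4"),       # assumed change amount
        "volume": val("f5"),
        "amount": val("f6"),
        "amplitude": val("f7", "%"),
    }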

Assignment 3:

(1) Currency + MySQL Experiment

  • Requirements: master the serialization and output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage pipeline to crawl data from a foreign-exchange site.

  • Candidate site: China Merchants Bank FX quotes: http://fx.cmbchina.com/hq/

  • Output: MySQL storage and output format

    Id | Currency | TSP | CSP | TBP | CBP | Time
    1 | 港币 | 86.60 | 86.60 | 86.26 | 85.65 | 15:36:30
    2 | ......
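Again the pipeline assumes the currency table already exists in mydb; a minimal setup sketch with assumed column types:

# setup_currency.py - hypothetical one-off script; types/lengths are assumptions
import pymysql

con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                      passwd="123456", db="mydb", charset="utf8")
con.cursor().execute("""
    create table if not exists currency (
        id int auto_increment primary key,
        currency varchar(32),
        TSP varchar(16),
        CSP varchar(16),
        TBP varchar(16),
        CBP varchar(16),
        time varchar(16)
    )
""")
con.commit()
con.close()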

items.py

import scrapy


class CurrencyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    currency = scrapy.Field()  # currency name
    TSP = scrapy.Field()       # forex selling price (现汇卖出价)
    CSP = scrapy.Field()       # cash selling price (现钞卖出价)
    TBP = scrapy.Field()       # forex buying price (现汇买入价)
    CBP = scrapy.Field()       # cash buying price (现钞买入价)
    time = scrapy.Field()      # quote time
    pass

pipelines.py

import pymysql


class CurrencyPipeline:
    # Open the MySQL connection and clear old rows from the currency table
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456", db="mydb",charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from currency")
            self.opened = True
        except Exception as err:
            print(err)
            self.opened = False

    # Commit the inserts and close the database connection
    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")

    def process_item(self, item, spider):
        try:
            print(item["currency"])
            print(item["TSP"])
            print(item["CSP"])
            print(item["TBP"])
            print(item["CBP"])
            print(item["time"])
            print()
            # Insert the record into the currency table
            if self.opened:
                self.cursor.execute("insert into currency (currency,TSP,CSP,TBP,CBP,time) values (%s,%s,%s,%s,%s,%s)",
                                    (item["currency"], item["TSP"], item["CSP"], item["TBP"], item["CBP"], item["time"]))
        except Exception as err:
            print(err)
        return item

settings.py

BOT_NAME = 'currency'

SPIDER_MODULES = ['currency.spiders']
NEWSPIDER_MODULE = 'currency.spiders'

ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'currency.pipelines.CurrencyPipeline': 300,
}

mySpider.py

import scrapy
from ..items import CurrencyItem
from bs4 import UnicodeDammit


class MySpider(scrapy.Spider):
    name = "mySpider"
    start_urls = ["http://fx.cmbchina.com/Hq/"]

    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            # Each exchange-rate quote is one row of the rates table
            trs = selector.xpath("//div[@id='realRateInfo']/table/tr")
            for tr in trs[1:]:
                currency = tr.xpath("./td[@class='fontbold'][position()=1]/text()").extract_first()
                tsp = tr.xpath("./td[@class='numberright'][position()=1]/text()").extract_first()
                csp = tr.xpath("./td[@class='numberright'][position()=2]/text()").extract_first()
                tbp = tr.xpath("./td[@class='numberright'][position()=3]/text()").extract_first()
                cbp = tr.xpath("./td[@class='numberright'][position()=4]/text()").extract_first()
                time = tr.xpath("./td[@align='center'][position()=3]/text()").extract_first()

                # Tidy up the extracted strings and hand the item to the pipeline
                item = CurrencyItem()
                item["currency"] = currency.strip() if currency else ""
                item["TSP"] = tsp.strip() if tsp else ""
                item["CSP"] = csp.strip() if csp else ""
                item["TBP"] = tbp.strip() if tbp else ""
                item["CBP"] = cbp.strip() if cbp else ""
                item["time"] = time.strip() if time else ""
                yield item
        except Exception as err:
            print(err)

(2) Reflections

At first I wrote the predicates as xpath("tag[condition1 and condition2]") and could not work out why it returned nothing; after changing it to xpath("tag[condition1][condition2]") it worked. The difference matters when one condition is positional: with chained predicates, position() is evaluated against the nodes left over from the previous predicate (e.g. the first td that has class 'numberright'), whereas with and both conditions are tested against the full set of td siblings, so position()=1 then demands the very first td child, which in this table has a different class.
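A small self-contained illustration of that difference (the HTML snippet is made up, loosely modelled on one row of the CMB rates table):

from scrapy import Selector

html = ("<table><tr><td class='fontbold'>港币</td>"
        "<td class='numberright'>86.60</td>"
        "<td class='numberright'>86.26</td></tr></table>")
sel = Selector(text=html)

# chained predicates: position() counts within the cells already filtered by
# @class, so this returns the first 'numberright' cell -> '86.60'
print(sel.xpath("//td[@class='numberright'][position()=1]/text()").extract_first())

# 'and': position() counts within all <td> children, so the cell would have to
# be the very first <td> AND carry class 'numberright' -> no match, prints None
print(sel.xpath("//td[@class='numberright' and position()=1]/text()").extract_first())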
