2023数据采集与融合技术实践作业三

第三次作业

一、作业内容

  • 作业①:
    • 要求:指定一个网站,爬取这个网站中的所有的所有图片,例如:中国气象网(http://www.weather.com.cn)。使用scrapy框架分别实现单线程和多线程的方式爬取。
    • 输出信息: 将下载的Url信息在控制台输出,并将下载的图片存储在images子文件中,并给出截图。
    • Gitee文件夹链接

代码

items:

class WeatherItem(scrapy.Item):
    """Carries one image URL from the spider to the download pipeline."""
    # Absolute URL of the image to download.
    img_link = scrapy.Field()

Spider:

import scrapy
import os
from ..items import WeatherItem
from bs4 import UnicodeDammit
class weatherSpider(scrapy.Spider):
    """Spider that collects every <img> URL from the China Weather homepage."""
    name = "weatherSpider"
    # NOTE: kept as a plain string (not the usual list) because start_requests
    # below reads it directly; changing the type would break nothing here but
    # would diverge from the run script's expectations.
    start_urls = 'http://www.weather.com.cn/'

    def start_requests(self):
        yield scrapy.Request(url=weatherSpider.start_urls, callback=self.parse)

    def parse(self, response):
        """Extract all <img src> values and yield one WeatherItem per image.

        Bug fix: src attributes are often relative (e.g. "/m2/i/icon.png");
        they are now resolved against the page URL so the download pipeline
        receives a fetchable absolute URL.
        """
        try:
            # Let BeautifulSoup's detector pick the right decoding (utf-8/gbk).
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            selector = scrapy.Selector(text=dammit.unicode_markup)
            for src in selector.xpath("//img/@src").getall():
                item = WeatherItem()
                # Resolve relative paths against the response URL.
                item['img_link'] = response.urljoin(src)
                yield item
        except Exception as err:
            # Best-effort crawl: report the failure instead of killing the spider.
            print(err)

pipelines:

  • 单线程:
import requests
import os
class WeatherPipeline(object):
    """Downloads each yielded image URL to a local folder (single-threaded)."""
    # Running counter used to generate unique file names.
    count = 0

    def process_item(self, item, spider):
        """Fetch item["img_link"] and save it as 图片<count>.jpg; always returns item."""
        WeatherPipeline.count += 1
        try:
            print(item["img_link"])
            filename = "图片" + str(WeatherPipeline.count) + ".jpg"
            directory = 'E:\\作业\\大三\\数据采集\\weather_images'
            # Bug fix: create the target folder if missing instead of failing on open().
            os.makedirs(directory, exist_ok=True)
            save_path = os.path.join(directory, filename)
            # Bug fix: add a timeout so a stalled server cannot hang the crawl forever.
            dllink = requests.get(item["img_link"], timeout=10)
            with open(save_path, 'wb') as file:
                file.write(dllink.content)
            print(filename + "下载完成")
        except Exception as err:
            # Best effort: a single failed download should not abort the pipeline.
            print(err)
        return item
  • 多线程:
import threading
class WeatherPipeline(object):
    """Downloads each yielded image URL; safe to call from multiple threads."""
    count = 0
    lock = threading.Lock()

    def process_item(self, item, spider):
        """Fetch item["img_link"] and save it under a unique name; always returns item."""
        # Local imports: this snippet only imports threading at module level,
        # but the download code below also needs os and requests.
        import os
        import requests
        # Bug fix: capture the counter *inside* the lock.  The original released
        # the lock before reading `count`, so two threads could observe the same
        # value and overwrite each other's file.
        with WeatherPipeline.lock:
            WeatherPipeline.count += 1
            my_count = WeatherPipeline.count
        try:
            print(item["img_link"])
            filename = "图片" + str(my_count) + ".jpg"
            directory = 'E:\\作业\\大三\\数据采集\\weather_images'
            # Bug fix: make sure the target folder exists before writing.
            os.makedirs(directory, exist_ok=True)
            save_path = os.path.join(directory, filename)
            # Bug fix: timeout prevents a stalled server from blocking a worker thread.
            dllink = requests.get(item["img_link"], timeout=10)
            with open(save_path, 'wb') as file:
                file.write(dllink.content)
            print(filename + "下载完成")
        except Exception as err:
            print(err)
        return item
# Spin up a small pool of worker threads that each execute run_pipeline.
def start_threads():
    """Launch four worker threads and block until every one has finished."""
    workers = [threading.Thread(target=run_pipeline) for _ in range(4)]
    for worker in workers:
        worker.start()
    # Wait for all workers to complete before returning.
    for worker in workers:
        worker.join()
# Instantiate WeatherPipeline; items would then be fed to its process_item method.
def run_pipeline():
    # Placeholder worker body: each thread builds its own pipeline instance.
    pipeline = WeatherPipeline()
    # Crawling logic could go here, passing scraped items to pipeline.process_item.
# Kick off the worker threads (NOTE: runs at module import time).
start_threads()
  • 其实也可以通过修改settings中的CONCURRENT_REQUESTS来实现多线程,它的默认值是16:
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

settings:

# Register the image-download pipeline; 300 is its execution-order priority.
ITEM_PIPELINES = {
   "demo.pipelines.WeatherPipeline": 300,
}

run:

# Programmatic equivalent of the shell command:
#   scrapy crawl weatherSpider -s LOG_ENABLED=False
from scrapy import cmdline

cmdline.execute(["scrapy", "crawl", "weatherSpider", "-s", "LOG_ENABLED=False"])

运行结果截图:

心得体会

第一个实验还是比较简单,就是xpath使用不够熟练,以及scrapy在运行时需要同时更改run和settings的内容,前几次运行时忘记改了,导致一直没有跑出内容。

  • 作业②:
    • 要求:熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法;使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取股票相关信息。
    • 候选网站:东方财富网:https://www.eastmoney.com/
    • 输出信息:MySQL数据库存储和输出格式如下:
      表头英文命名例如:序号id,股票代码:bStockNo……,由同学们自行定义设计
    • Gitee文件夹链接

代码

items:

class StockItem(scrapy.Item):
    """One stock-quote row scraped from the Eastmoney list API.

    Field names follow the author's pinyin abbreviations:
    daima = stock code, newprice = latest price, diezhanfu = change percent,
    diezhane = change amount, cjl = volume, cje = turnover, zf = amplitude,
    zg = day high, zd = day low, jk = open, zs = previous close.
    """
    name = scrapy.Field()
    daima = scrapy.Field()
    newprice = scrapy.Field()
    diezhanfu = scrapy.Field()
    diezhane = scrapy.Field()
    cjl = scrapy.Field()
    cje = scrapy.Field()
    zf = scrapy.Field()
    zg = scrapy.Field()
    zd = scrapy.Field()
    jk = scrapy.Field()
    zs = scrapy.Field()

Spider:

import scrapy
import json
from ..items import StockItem
import pymysql
class stocksSpider(scrapy.Spider):
    """Fetches the Eastmoney stock-list JSONP API and stores each quote in MySQL."""
    name = "stocksSpider"

    def start_requests(self):
        url = 'http://42.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124004594853117051989_1697955533168&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1697955533169/'
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Strip the JSONP wrapper, decode the quote list, store and yield each stock."""
        try:
            self.db = BankDB()
            self.db.openDB()
            try:
                content = response.text
                # The API returns JSONP: jQuery...( {...} );  keep only the JSON payload.
                payload = content.split('(', maxsplit=1)[1]
                payload = payload[:-2]  # drop the trailing ");"
                stocks = json.loads(payload)['data']['diff']
                for stock in stocks:
                    item = StockItem()
                    item['name'] = stock['f14']       # name
                    item['daima'] = stock['f12']      # code
                    item['newprice'] = stock['f2']    # latest price
                    item['diezhane'] = stock['f4']    # change amount
                    item['diezhanfu'] = stock['f3']   # change percent
                    item['cjl'] = stock['f5']         # volume
                    item['cje'] = stock['f6']         # turnover
                    item['zf'] = stock['f7']          # amplitude
                    item['zg'] = stock['f15']         # high
                    item['zd'] = stock['f16']         # low
                    item['jk'] = stock['f17']         # open
                    item['zs'] = stock['f18']         # previous close
                    self.db.insert(item['name'], item['daima'], item['newprice'],
                                   item['diezhane'], item['diezhanfu'], item['cjl'],
                                   item['cje'], item['zf'], item['zg'], item['zd'],
                                   item['jk'], item['zs'])
                    yield item
            finally:
                # Bug fix: always commit/close.  The original only closed on the
                # success path, leaking the connection whenever parsing raised.
                self.db.closeDB()
        except Exception as err:
            print(err)
class BankDB:
    """Thin pymysql wrapper around the `stock` table of the `bank` database."""

    def openDB(self):
        """Connect to MySQL and ensure an empty `stock` table exists."""
        self.con = pymysql.connect(host='localhost', user='root', password='LiamCapis1', db='bank')
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                'CREATE TABLE stock (id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,name VARCHAR(255) NOT NULL,code VARCHAR(10) NOT NULL,latest_price DECIMAL(8, 2) NOT NULL,price_change DECIMAL(8, 2) NOT NULL,price_change_percent DECIMAL(8, 2) NOT NULL,volume INT NOT NULL,turnover DECIMAL(12, 2) NOT NULL,amplitude DECIMAL(8, 2) NOT NULL,highest DECIMAL(8, 2) NOT NULL,lowest DECIMAL(8, 2) NOT NULL,opening_price DECIMAL(8, 2) NOT NULL,closing_price DECIMAL(8, 2) NOT NULL)')
        except Exception:
            # Table already exists: clear old rows so each run starts fresh.
            # (Narrowed from a bare `except:` so KeyboardInterrupt still propagates.)
            self.cursor.execute("delete from stock")

    def closeDB(self):
        """Commit pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, name, code, latest_price, price_change, price_change_percent, volume, turnover, amplitude, highest, lowest, opening_price, closing_price):
        """Insert one quote row; parameterized placeholders avoid SQL injection."""
        try:
            self.cursor.execute("INSERT INTO stock (name, code, latest_price, price_change, price_change_percent, volume, turnover, amplitude, highest, lowest, opening_price, closing_price) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);",
                                (name, code, latest_price, price_change, price_change_percent, volume, turnover, amplitude, highest, lowest, opening_price, closing_price))
        except Exception as err:
            # Best effort: report and keep inserting the remaining rows.
            print(err)

pipelines:

class StockPipeline(object):
    """Prints each stock item as an aligned console row (header before the first)."""
    # Class-wide item counter shared across all instances.
    count = 0

    def process_item(self, item, spider):
        """Print one formatted row for the item and pass it through unchanged."""
        StockPipeline.count += 1
        try:
            if StockPipeline.count == 1:
                header = ("名称", "代码", "最新价格", "跌涨额", "跌涨幅", "成交量",
                          "成交额", "振幅", "最高", "最低", "今开", "昨收")
                print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(*header))
            keys = ('name', 'daima', 'newprice', 'diezhane', 'diezhanfu', 'cjl',
                    'cje', 'zf', 'zg', 'zd', 'jk', 'zs')
            print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<20} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
                *(item[key] for key in keys)))
        except Exception as err:
            print(err)
        return item

settings:
需要把ROBOTSTXT_OBEY = True改为False,否则会一直无法执行Spider中的parse函数

# Scrapy obeys robots.txt by default, which blocks this API; must be disabled
# or parse() is never reached.
ROBOTSTXT_OBEY = False
# Register the console-printing pipeline; 300 is its priority order.
ITEM_PIPELINES = {
   "demo.pipelines.StockPipeline": 300,
}

run:

# Programmatic equivalent of the shell command:
#   scrapy crawl stocksSpider -s LOG_ENABLED=False
from scrapy import cmdline

cmdline.execute(["scrapy", "crawl", "stocksSpider", "-s", "LOG_ENABLED=False"])

运行结果截图:

心得体会

这个实验按照scrapy框架写加上之前已经有的代码,本来以为很快能写完,但因为开始一直没有修改ROBOTSTXT_OBEY导致一直跑不出内容,做了很多无用功。

  • 作业③:
    • 要求:熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法;使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取外汇网站数据。
    • 候选网站:中国银行网:https://www.boc.cn/sourcedb/whpj/
    • 输出信息:
    • Gitee文件夹链接

代码

items:

class BankItem(scrapy.Item):
    """One foreign-exchange quote row from the Bank of China rate table."""
    name = scrapy.Field()  # currency name
    TBP = scrapy.Field()   # telegraphic-transfer (spot exchange) buying price
    CBP = scrapy.Field()   # cash buying price
    TSP = scrapy.Field()   # spot exchange selling price
    CSP = scrapy.Field()   # cash selling price
    Time = scrapy.Field()  # publish time

Spider:

import scrapy
import os
from ..items import BankItem
from bs4 import UnicodeDammit
import re
import pymysql
class bankSpider(scrapy.Spider):
    """Scrapes the Bank of China FX rate table and stores each row in MySQL."""
    name = "bankSpider"
    start_urls = 'https://www.boc.cn/sourcedb/whpj/'

    def start_requests(self):
        # Bug fix: removed the stray debug print that built (and discarded)
        # a second Request object for no effect.
        yield scrapy.Request(url=bankSpider.start_urls, callback=self.parse)

    def parse(self, response):
        """Extract every rate row from the table, store it, and yield a BankItem."""
        try:
            self.db = BankDB()
            self.db.openDB()
            try:
                dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
                selector = scrapy.Selector(text=dammit.unicode_markup)
                tables = selector.xpath(
                    '//table[@cellpadding="0" and @align="left" and @cellspacing="0" and @width="100%"]')
                # Skip the first row: it is the table header.
                for row in tables.xpath('.//tr').getall()[1:]:
                    messages = re.findall(r"<td>(.*?)</td>", str(row))
                    # Bug fix: short/malformed rows used to raise IndexError,
                    # aborting the whole loop and leaking the DB connection.
                    if len(messages) < 7:
                        continue
                    item = BankItem()
                    item['name'] = messages[0]
                    item['TBP'] = messages[1]   # spot exchange buying price
                    item['CBP'] = messages[2]   # cash buying price
                    item['TSP'] = messages[3]   # spot exchange selling price
                    item['CSP'] = messages[4]   # cash selling price
                    item['Time'] = messages[6]  # publish time (messages[5] is skipped, as before)
                    self.db.insert(item['name'], item['TBP'], item['CBP'], item['TSP'], item['CSP'], item['Time'])
                    yield item
            finally:
                # Bug fix: always commit/close even when parsing raises.
                self.db.closeDB()
        except Exception as err:
            print(err)
class BankDB:
    """Thin pymysql wrapper around the `bank` FX-rate table."""

    def openDB(self):
        """Connect to MySQL and ensure an empty `bank` table exists."""
        self.con = pymysql.connect(host='localhost', user='root', password='LiamCapis1', db='bank')
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table bank (Currency varchar(16),TBP varchar(16),CBP varchar(16),TSP varchar(16),CSP varchar(16),Time varchar(20),constraint pk_bank primary key (currency, Time))")
        except Exception:
            # Table already exists: clear old rows so each run starts fresh.
            # (Narrowed from a bare `except:` so KeyboardInterrupt still propagates.)
            self.cursor.execute("delete from bank")

    def closeDB(self):
        """Commit pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, currency, TBP, CBP, TSP, CSP, Time):
        """Insert one FX row; parameterized placeholders avoid SQL injection."""
        try:
            self.cursor.execute("insert into bank (currency, TBP, CBP, TSP, CSP, Time) values (%s,%s,%s,%s,%s,%s)",
                                (currency, TBP, CBP, TSP, CSP, Time))
        except Exception as err:
            # Duplicate (currency, Time) keys and similar errors are reported, not fatal.
            print(err)

pipelines:

class BankPipeline(object):
    """Prints each FX item as a '|'-separated console row; header before the first."""
    # Class-wide item counter shared across all instances.
    count = 0

    def process_item(self, item, spider):
        """Print one formatted row for the item and pass it through unchanged."""
        BankPipeline.count += 1
        row_fmt = "{:<10}|{:<10}|{:<10}|{:<10}|{:<10}|{:<20}"
        try:
            if BankPipeline.count == 1:
                # First item: emit the column header and a separator rule.
                print(row_fmt.format('currency', 'TBP', 'CBP', 'TSP', 'CSP', 'Time'))
                print(row_fmt.format('-' * 10, '-' * 10, '-' * 10, '-' * 10, '-' * 10, '-' * 20))
            print(row_fmt.format(item['name'], item['TBP'], item['CBP'],
                                 item['TSP'], item['CSP'], item['Time']))
        except Exception as err:
            print(err)
        return item

settings:

# Register the console-printing pipeline; 300 is its priority order.
ITEM_PIPELINES = {
   "demo.pipelines.BankPipeline": 300,
}

run:

# Programmatic equivalent of the shell command:
#   scrapy crawl bankSpider -s LOG_ENABLED=False
from scrapy import cmdline

cmdline.execute(["scrapy", "crawl", "bankSpider", "-s", "LOG_ENABLED=False"])

运行结果截图:

心得体会:

学习了scrapy框架的使用以及mysql的表的创建、嵌入信息、提取信息,就是第三个实验在存入mysql中的顺序与在运行结果打印出来的不一样。

posted @ 2023-10-23 11:22  LiamCap  阅读(49)  评论(0)    收藏  举报