2023 Data Collection and Fusion Technology Practice — Assignment 3
Assignment 3
I. Assignment Content
- Task ①:
- Requirement: Pick a website and crawl all of the images on it, e.g., China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement the crawl in both a single-threaded and a multi-threaded way.
- Output: print the URLs of the downloaded images to the console, save the downloaded images in the images subfolder, and provide screenshots.
- Gitee folder link
Code
items:
```python
class WeatherItem(scrapy.Item):
    img_link = scrapy.Field()
```
Spider:

```python
import scrapy
from ..items import WeatherItem
from bs4 import UnicodeDammit


class weatherSpider(scrapy.Spider):
    name = "weatherSpider"
    start_urls = 'http://www.weather.com.cn/'

    def start_requests(self):
        url = weatherSpider.start_urls
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            # decode the page with BeautifulSoup's UnicodeDammit,
            # trying utf-8 first and gbk as a fallback
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            # collect the src attribute of every <img> on the page
            images = selector.xpath("//img/@src").getall()
            for image in images:
                item = WeatherItem()
                item['img_link'] = image
                yield item
                # yield scrapy.Request(url=image, callback=self.save_image)
        except Exception as err:
            print(err)
```
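One caveat worth noting (my addition, not in the original code): img/@src values are often relative paths, which would make the downstream requests.get calls fail. Scrapy's response.urljoin can absolutize them inside parse, for example:

```python
# inside parse(): resolve relative src values against the page URL
for image in images:
    item = WeatherItem()
    # e.g. /i/a.png -> http://www.weather.com.cn/i/a.png
    item['img_link'] = response.urljoin(image)
    yield item
```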
pipelines:
- Single-threaded:
```python
import os

import requests


class WeatherPipeline(object):
    count = 0

    def process_item(self, item, spider):
        WeatherPipeline.count += 1
        try:
            print(item["img_link"])
            # number the files sequentially: 图片1.jpg, 图片2.jpg, ...
            filename = "图片" + str(WeatherPipeline.count) + ".jpg"
            directory = 'E:\\作业\\大三\\数据采集\\weather_images'
            save_path = os.path.join(directory, filename)
            # download the image synchronously and write it to disk
            dllink = requests.get(item["img_link"])
            with open(save_path, 'wb') as file:
                file.write(dllink.content)
            print(filename + "下载完成")
        except Exception as err:
            print(err)
        return item
```
- Multi-threaded:
```python
import os
import threading

import requests


class WeatherPipeline(object):
    count = 0
    lock = threading.Lock()

    def process_item(self, item, spider):
        # guard the shared counter so concurrent threads never reuse a filename
        with WeatherPipeline.lock:
            WeatherPipeline.count += 1
        try:
            print(item["img_link"])
            filename = "图片" + str(WeatherPipeline.count) + ".jpg"
            directory = 'E:\\作业\\大三\\数据采集\\weather_images'
            save_path = os.path.join(directory, filename)
            dllink = requests.get(item["img_link"])
            with open(save_path, 'wb') as file:
                file.write(dllink.content)
            print(filename + "下载完成")
        except Exception as err:
            print(err)
        return item


# spawn several threads that run the pipeline concurrently
def start_threads():
    num_threads = 4  # number of worker threads
    threads = []
    for _ in range(num_threads):
        thread = threading.Thread(target=run_pipeline)  # each thread runs run_pipeline
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()  # wait for all threads to finish


# instantiate WeatherPipeline; the crawling logic that feeds
# data into process_item would go here
def run_pipeline():
    pipeline = WeatherPipeline()


# kick off the threads
start_threads()
```
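The thread-starting helpers above are more of a skeleton than a working downloader, since run_pipeline never actually feeds items in. A self-contained sketch of the same idea with concurrent.futures (the URLs and file names below are hypothetical placeholders, not from the original code):

```python
from concurrent.futures import ThreadPoolExecutor

import requests


def download(task):
    # each worker downloads one image and writes it to disk
    url, save_path = task
    resp = requests.get(url, timeout=10)
    with open(save_path, 'wb') as file:
        file.write(resp.content)


# hypothetical (url, path) pairs standing in for the crawled img links
tasks = [
    ('http://www.weather.com.cn/example1.png', '图片1.jpg'),
    ('http://www.weather.com.cn/example2.png', '图片2.jpg'),
]

# map the download function over the tasks with 4 worker threads;
# leaving the with-block waits for all downloads to finish
with ThreadPoolExecutor(max_workers=4) as pool:
    pool.map(download, tasks)
```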
- A similar effect can also be achieved by raising CONCURRENT_REQUESTS in settings (its default value is 16). Strictly speaking, Scrapy schedules requests asynchronously rather than with threads, but this is the idiomatic way to increase download concurrency:
```python
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
```
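A couple of related settings are worth knowing about; the values below are only examples to tune against the target site, not part of the original submission:

```python
# settings.py — example values, not from the original assignment
CONCURRENT_REQUESTS = 32             # global cap on in-flight requests
CONCURRENT_REQUESTS_PER_DOMAIN = 16  # cap per target domain
DOWNLOAD_DELAY = 0.25                # polite delay (seconds) between requests
```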
settings:

```python
ITEM_PIPELINES = {
    "demo.pipelines.WeatherPipeline": 300,
}
```
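As an aside (my addition, not part of the submission), Scrapy also ships a built-in ImagesPipeline that handles downloading, deduplication, and file naming by itself. A minimal sketch, assuming the item exposes an image_urls field; note that ImagesPipeline requires Pillow to be installed:

```python
# settings.py — a minimal sketch using Scrapy's built-in images pipeline
ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}
IMAGES_STORE = "images"  # downloaded files land under this directory

# items.py — ImagesPipeline expects these two field names
import scrapy


class WeatherImageItem(scrapy.Item):
    image_urls = scrapy.Field()  # list of image URLs to download
    images = scrapy.Field()      # filled in by the pipeline with download results
```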
run:

```python
from scrapy import cmdline

cmdline.execute("scrapy crawl weatherSpider -s LOG_ENABLED=False".split())
```

Screenshot of the run results:
Reflections
The first task was fairly straightforward. My main issues were that I am not yet fluent with XPath, and that running Scrapy requires updating both run and settings at the same time; the first few times I forgot to change them, so the crawl kept producing no output.
- Task ②:
- Requirement: Master the serialized output of Item and Pipeline data in Scrapy; crawl stock information using the Scrapy framework + XPath + MySQL database storage technical route.
- Candidate website: East Money (东方财富网): https://www.eastmoney.com/
- Output: the MySQL storage and output format is as follows:
Column headers are named in English, e.g., id for the sequence number, bStockNo for the stock code, and so on; the exact naming is up to each student.
- Gitee folder link
Code
items:
```python
class StockItem(scrapy.Item):
    name = scrapy.Field()       # stock name
    daima = scrapy.Field()      # stock code
    newprice = scrapy.Field()   # latest price
    diezhanfu = scrapy.Field()  # change percent
    diezhane = scrapy.Field()   # change amount
    cjl = scrapy.Field()        # trading volume
    cje = scrapy.Field()        # turnover
    zf = scrapy.Field()         # amplitude
    zg = scrapy.Field()         # daily high
    zd = scrapy.Field()         # daily low
    jk = scrapy.Field()         # opening price
    zs = scrapy.Field()         # previous close
```
Spider:

```python
import json

import pymysql
import scrapy

from ..items import StockItem


class stocksSpider(scrapy.Spider):
    name = "stocksSpider"

    def start_requests(self):
        url = 'http://42.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124004594853117051989_1697955533168&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1697955533169/'
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        try:
            self.db = BankDB()
            self.db.openDB()
            # the API returns JSONP: strip the jQuery...(...) wrapper, keeping
            # only the JSON between the first '(' and the trailing ');'
            content = response.text
            temp_file = content.split('(', maxsplit=1)[1]
            temp_file = temp_file[:-2]
            json_file = json.loads(temp_file)
            stocks = json_file['data']['diff']
            for stock in stocks:
                item = StockItem()  # instantiate StockItem
                item['name'] = stock['f14']      # stock name
                item['daima'] = stock['f12']     # stock code
                item['newprice'] = stock['f2']   # latest price
                item['diezhane'] = stock['f4']   # change amount
                item['diezhanfu'] = stock['f3']  # change percent
                item['cjl'] = stock['f5']        # trading volume
                item['cje'] = stock['f6']        # turnover
                item['zf'] = stock['f7']         # amplitude
                item['zg'] = stock['f15']        # daily high
                item['zd'] = stock['f16']        # daily low
                item['jk'] = stock['f17']        # opening price
                item['zs'] = stock['f18']        # previous close
                self.db.insert(item['name'], item['daima'], item['newprice'],
                               item['diezhane'], item['diezhanfu'], item['cjl'],
                               item['cje'], item['zf'], item['zg'], item['zd'],
                               item['jk'], item['zs'])
                yield item
            self.db.closeDB()
        except Exception as err:
            print(err)


class BankDB:
    def openDB(self):
        self.con = pymysql.connect(host='localhost', user='root',
                                   password='LiamCapis1', db='bank')
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                'CREATE TABLE stock ('
                'id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,'
                'name VARCHAR(255) NOT NULL,'
                'code VARCHAR(10) NOT NULL,'
                'latest_price DECIMAL(8, 2) NOT NULL,'
                'price_change DECIMAL(8, 2) NOT NULL,'
                'price_change_percent DECIMAL(8, 2) NOT NULL,'
                'volume INT NOT NULL,'
                'turnover DECIMAL(12, 2) NOT NULL,'
                'amplitude DECIMAL(8, 2) NOT NULL,'
                'highest DECIMAL(8, 2) NOT NULL,'
                'lowest DECIMAL(8, 2) NOT NULL,'
                'opening_price DECIMAL(8, 2) NOT NULL,'
                'closing_price DECIMAL(8, 2) NOT NULL)')
        except:
            # the table already exists: clear it instead
            self.cursor.execute("delete from stock")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, name, code, latest_price, price_change, price_change_percent,
               volume, turnover, amplitude, highest, lowest, opening_price,
               closing_price):
        try:
            self.cursor.execute(
                "INSERT INTO stock (name, code, latest_price, price_change, "
                "price_change_percent, volume, turnover, amplitude, highest, "
                "lowest, opening_price, closing_price) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);",
                (name, code, latest_price, price_change, price_change_percent,
                 volume, turnover, amplitude, highest, lowest, opening_price,
                 closing_price))
        except Exception as err:
            print(err)
```
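The slicing above works for this exact response, but it is brittle: a trailing newline would break the [:-2]. A regex-based helper would be more robust; this is a hypothetical alternative, not part of the submission. Note also that pn=1&pz=20 in the query string fetches only the first page of 20 stocks, so crawling more rows would mean looping over pn.

```python
import json
import re


def strip_jsonp(text):
    """Extract the JSON payload from a JSONP response like jQuery123({...});"""
    # greedy match grabs everything between the outermost parentheses
    match = re.search(r'\((.*)\)', text, re.S)
    if match is None:
        raise ValueError("no JSONP wrapper found")
    return json.loads(match.group(1))
```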
pipelines:

```python
class StockPipeline(object):
    count = 0

    def process_item(self, item, spider):
        StockPipeline.count += 1
        try:
            if StockPipeline.count == 1:
                # print the table header once, before the first row
                print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
                    "名称", "代码", "最新价格", "跌涨额", "跌涨幅", "成交量",
                    "成交额", "振幅", "最高", "最低", "今开", "昨收"))
            print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<20} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
                item['name'], item['daima'], item['newprice'], item['diezhane'],
                item['diezhanfu'], item['cjl'], item['cje'], item['zf'],
                item['zg'], item['zd'], item['jk'], item['zs']))
        except Exception as err:
            print(err)
        return item
```
settings:
ROBOTSTXT_OBEY = True must be changed to False here, otherwise the parse function in the Spider will never execute:

```python
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    "demo.pipelines.StockPipeline": 300,
}
```
run:

```python
from scrapy import cmdline

cmdline.execute("scrapy crawl stocksSpider -s LOG_ENABLED=False".split())
```

Screenshot of the run results:
Reflections
Since this task follows the Scrapy framework and builds on code from earlier assignments, I expected to finish it quickly; but at first I did not change ROBOTSTXT_OBEY, so nothing was crawled for a long time and a lot of effort was wasted.
- Task ③:
- Requirement: Master the serialized output of Item and Pipeline data in Scrapy; crawl foreign-exchange data using the Scrapy framework + XPath + MySQL database storage technical route.
- Candidate website: Bank of China (中国银行): https://www.boc.cn/sourcedb/whpj/
- Output:
- Gitee folder link
Code
items:
```python
class BankItem(scrapy.Item):
    name = scrapy.Field()  # currency name
    TBP = scrapy.Field()   # 现汇买入价 (telegraphic transfer buying price)
    CBP = scrapy.Field()   # 现钞买入价 (cash buying price)
    TSP = scrapy.Field()   # 现汇卖出价 (telegraphic transfer selling price)
    CSP = scrapy.Field()   # 现钞卖出价 (cash selling price)
    Time = scrapy.Field()  # publication time
```
Spider:

```python
import re

import pymysql
import scrapy
from bs4 import UnicodeDammit

from ..items import BankItem


class bankSpider(scrapy.Spider):
    name = "bankSpider"
    start_urls = 'https://www.boc.cn/sourcedb/whpj/'

    def start_requests(self):
        url = bankSpider.start_urls
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            self.db = BankDB()
            self.db.openDB()
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            # locate the exchange-rate table by its attributes
            tables = selector.xpath(
                '//table[@cellpadding="0" and @align="left" and '
                '@cellspacing="0" and @width="100%"]')
            rows = tables.xpath('.//tr').getall()
            row_num = 0
            for row in rows:
                if row_num == 0:
                    # the first row is the header: skip it
                    row_num += 1
                    continue
                # pull the cell contents out of the serialized row
                messages = re.findall(r"<td>(.*?)</td>", str(row))
                item = BankItem()
                item['name'] = messages[0]
                item['TBP'] = messages[1]
                item['CBP'] = messages[2]
                item['TSP'] = messages[3]
                item['CSP'] = messages[4]
                item['Time'] = messages[6]
                self.db.insert(item['name'], item['TBP'], item['CBP'],
                               item['TSP'], item['CSP'], item['Time'])
                yield item
            self.db.closeDB()
        except Exception as err:
            print(err)


class BankDB:
    def openDB(self):
        self.con = pymysql.connect(host='localhost', user='root',
                                   password='LiamCapis1', db='bank')
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table bank (Currency varchar(16),TBP varchar(16),"
                "CBP varchar(16),TSP varchar(16),CSP varchar(16),Time varchar(20),"
                "constraint pk_bank primary key (currency, Time))")
        except:
            # the table already exists: clear it instead
            self.cursor.execute("delete from bank")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, currency, TBP, CBP, TSP, CSP, Time):
        try:
            self.cursor.execute(
                "insert into bank (currency, TBP, CBP, TSP, CSP, Time) "
                "values (%s,%s,%s,%s,%s,%s)",
                (currency, TBP, CBP, TSP, CSP, Time))
        except Exception as err:
            print(err)
```
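Regex over serialized HTML works here, but cell text can also be pulled with the selector API directly. A hypothetical, self-contained sketch of that approach (the toy HTML stands in for the BOC page):

```python
import scrapy

HTML = """<table><tr><th>Currency</th><th>TBP</th></tr>
<tr><td>USD</td><td>709.1</td></tr></table>"""  # toy stand-in for the BOC page

selector = scrapy.Selector(text=HTML)
for row in selector.xpath('//table//tr')[1:]:  # [1:] skips the header row
    # './td//text()' collects the text of every cell in document order
    cells = row.xpath('./td//text()').getall()
    print(cells)  # ['USD', '709.1']
```

One caveat: './td//text()' silently drops empty cells, which can shift columns out of alignment, so the regex approach in the spider is not unreasonable for this page.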
pipelines:

```python
class BankPipeline(object):
    count = 0

    def process_item(self, item, spider):
        BankPipeline.count += 1
        try:
            if BankPipeline.count == 1:
                # the first row printed is the header
                print("{:<10}|{:<10}|{:<10}|{:<10}|{:<10}|{:<20}".format(
                    'currency', 'TBP', 'CBP', 'TSP', 'CSP', 'Time'))
                print("{:<10}|{:<10}|{:<10}|{:<10}|{:<10}|{:<20}".format(
                    '-' * 10, '-' * 10, '-' * 10, '-' * 10, '-' * 10, '-' * 20))
            print("{:<10}|{:<10}|{:<10}|{:<10}|{:<10}|{:<20}".format(
                item['name'], item['TBP'], item['CBP'], item['TSP'],
                item['CSP'], item['Time']))
        except Exception as err:
            print(err)
        return item
```
settings:

```python
ITEM_PIPELINES = {
    "demo.pipelines.BankPipeline": 300,
}
```
run:

```python
from scrapy import cmdline

cmdline.execute("scrapy crawl bankSpider -s LOG_ENABLED=False".split())
```

Screenshot of the run results:
Reflections:
I learned how to use the Scrapy framework and how to create MySQL tables and insert and retrieve data. One thing I noticed in the third task is that the order of the rows stored in MySQL differs from the order printed in the run output (see the note below).
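For what it's worth, this is expected behavior rather than a bug: relational tables have no inherent row order, and a SELECT without ORDER BY may return rows in any order. A minimal sketch of reading the table back deterministically, reusing the connection parameters from BankDB above:

```python
import pymysql

# reusing the connection parameters from BankDB above
con = pymysql.connect(host='localhost', user='root',
                      password='LiamCapis1', db='bank')
cursor = con.cursor()
# ask MySQL for an explicit order instead of relying on insertion order
cursor.execute("SELECT * FROM bank ORDER BY Time, Currency")
for row in cursor.fetchall():
    print(row)
con.close()
```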