Assignment 3

The Third Assignment
Task ①:

Requirements:

Pick a website and crawl all of the images on it, for example the China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement the crawl both single-threaded and multi-threaded.

– Be sure to limit the crawl, e.g., cap the total number of pages (last 2 digits of your student ID) and the total number of downloaded images (last 3 digits of your student ID).

Output: print the URL of each downloaded image to the console, save the downloaded images in the images subfolder, and include a screenshot.
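Both the single-threaded / multi-threaded requirement and the page and image caps can be handled in the project's settings.py. A minimal sketch, assuming hypothetical student-ID digits 23 and 123 (replace with your own):

```python
# settings.py (sketch) -- 23 and 123 are placeholder student-ID digits.

# "Single-threaded" crawl: only one request in flight at a time.
# Raise this (Scrapy's default is 16) for the concurrent version.
CONCURRENT_REQUESTS = 1

# Stop after this many pages (last 2 digits of the student ID).
CLOSESPIDER_PAGECOUNT = 23

# Stop after this many scraped items, i.e. image URLs
# (last 3 digits of the student ID).
CLOSESPIDER_ITEMCOUNT = 123
```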
spiders:
```python
import scrapy
from w0.items import W0Item
from bs4 import BeautifulSoup


class DangSpider(scrapy.Spider):
    name = "w0"
    allowed_domains = ["weather.com.cn"]
    start_urls = ["http://www.weather.com.cn/"]

    def parse(self, response):
        items = []
        soup = BeautifulSoup(response.body, "html.parser")

        # Collect the src attribute of every <img> tag on the page
        for img_tag in soup.find_all("img"):
            url = img_tag.get("src")
            if not url:
                continue
            i = W0Item()
            i['url'] = response.urljoin(url)  # resolve relative image URLs
            print(i['url'])
            items.append(i)

        return items
```
pipelines:
```python
from itemadapter import ItemAdapter
from random import randint
import os
import urllib.request


class W0Pipeline:
    def open_spider(self, spider):
        self.fp = open('book.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # self.fp.write(str(item))
        return item

    def close_spider(self, spider):
        self.fp.close()


class Img_downloadPipeline:

    def process_item(self, item, spider):
        print('download...')
        print(item)
        url = item['url']
        # Save each image into the imgs folder under a random file name
        os.makedirs('imgs', exist_ok=True)
        filename = os.path.join('imgs', str(randint(0, 9999999999)) + '.jpg')
        urllib.request.urlretrieve(url, filename)
        return item
```
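The spider imports W0Item from w0.items, and the two pipelines only run once they are registered in settings.py; neither file is shown above. A minimal sketch of what they might look like (the priorities 300/400 are arbitrary):

```python
# w0/items.py (sketch)
import scrapy

class W0Item(scrapy.Item):
    url = scrapy.Field()  # image URL collected by the spider

# w0/settings.py (sketch) -- enable both pipelines
ITEM_PIPELINES = {
    "w0.pipelines.W0Pipeline": 300,
    "w0.pipelines.Img_downloadPipeline": 400,
}
```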
The output is as follows:

Task ②:

Requirements: Become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage to crawl stock information.
Candidate site: Eastmoney: https://www.eastmoney.com/

Output: store the data in MySQL with the format below:
Name the columns in English, e.g., id for the serial number, bStockNo for the stock code, …; the exact schema is up to you.
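The requirement asks for a MySQL table with English column names such as id and bStockNo (the pipeline further down writes to SQLite instead). A minimal sketch of creating such a table with pymysql; the connection parameters, the table name stocks, and the column names bStockName / bPrice are assumptions, not part of the assignment:

```python
# Sketch only: create a MySQL table matching the fields the spider extracts.
import pymysql

conn = pymysql.connect(host="localhost", user="root", password="password",
                       database="stocks", charset="utf8mb4")
with conn.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS stocks (
            id         VARCHAR(16),  -- serial number
            bStockNo   VARCHAR(16),  -- stock code
            bStockName VARCHAR(32),  -- stock name
            bPrice     VARCHAR(16)   -- latest price
        )
    """)
conn.commit()
conn.close()
```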
spiders:
```python
import scrapy
from w1.items import W1Item
import json
import re


class DangSpider(scrapy.Spider):
    name = "w1"
    allowed_domains = ["eastmoney.com"]
    start_urls = ["http://54.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124015380571520090935_1602750256400&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1602750256401"]
    count = 1

    def parse(self, response):
        data = response.text
        # Strip the JSONP wrapper and parse the remainder as JSON
        start = data.index('{')
        data = json.loads(data[start:len(data) - 2])

        if data['data']:
            # Pick out and format the fields of interest
            for stock in data['data']['diff']:
                item = W1Item()
                item['id'] = str(self.count)
                self.count += 1
                item["number"] = str(stock['f12'])
                item["name"] = stock['f14']
                item["value"] = None if stock['f2'] == "-" else str(stock['f2'])
                yield item

            # Find the current page number and request the next page
            pn = re.compile("pn=[0-9]*").findall(response.url)[0]
            page = int(pn[3:])
            url = response.url.replace("pn=" + str(page), "pn=" + str(page + 1))
            yield scrapy.Request(url=url, callback=self.parse)
```
pipelines:
```python
from itemadapter import ItemAdapter
import sqlite3


class W1Pipeline:
    def process_item(self, item, spider):
        return item


class writeDB:

    def open_spider(self, spider):
        self.fp = sqlite3.connect('test.db')
        # SQL statement that creates the table
        sql_text_1 = '''CREATE TABLE IF NOT EXISTS scores
                (id TEXT,
                 代码 TEXT,
                 名称 TEXT,
                 价格 TEXT);'''
        # Execute the SQL statement
        self.fp.execute(sql_text_1)
        self.fp.commit()

    def close_spider(self, spider):
        self.fp.close()

    def process_item(self, item, spider):
        # Parameterised insert; plain string concatenation would break on
        # quotes in the data or on a None price
        self.fp.execute("INSERT INTO scores VALUES (?, ?, ?, ?)",
                        (item['id'], item['number'], item['name'], item['value']))
        self.fp.commit()
        return item
```
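As in the first task, w1/items.py and the pipeline registration are not shown. A minimal sketch consistent with the fields the spider fills in (the priority 300 is arbitrary):

```python
# w1/items.py (sketch)
import scrapy

class W1Item(scrapy.Item):
    id = scrapy.Field()      # running serial number
    number = scrapy.Field()  # stock code (field f12)
    name = scrapy.Field()    # stock name (field f14)
    value = scrapy.Field()   # latest price (field f2)

# w1/settings.py (sketch) -- route items into the writeDB pipeline
ITEM_PIPELINES = {
    "w1.pipelines.writeDB": 300,
}
```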

Result:

Task ③:

Requirements: Become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage to crawl data from a foreign-exchange website.
Candidate site: Bank of China: https://www.boc.cn/sourcedb/whpj/
spiders:
```python
import scrapy
from w2.items import W2Item
from bs4 import BeautifulSoup


class DangSpider(scrapy.Spider):
    name = "w2"
    allowed_domains = ["boc.cn"]
    start_urls = ["https://www.boc.cn/sourcedb/whpj/"]
    count = 0

    def parse(self, response):
        bs_obj = BeautifulSoup(response.body, features='lxml')
        # The second <table> on the page holds the exchange-rate rows
        t = bs_obj.find_all('table')[1]
        all_tr = t.find_all('tr')
        all_tr.pop(0)  # drop the header row
        for r in all_tr:
            item = W2Item()
            all_td = r.find_all('td')
            item['currency'] = all_td[0].text
            item['tbp'] = all_td[1].text
            item['cbp'] = all_td[2].text
            item['tsp'] = all_td[3].text
            item['csp'] = all_td[4].text
            item['time'] = all_td[6].text
            print(all_td)
            yield item

        # Follow the numbered pagination pages, stopping after five pages in total
        self.count += 1
        url = 'http://www.boc.cn/sourcedb/whpj/index_{}.html'.format(self.count)
        if self.count != 5:
            yield scrapy.Request(url=url, callback=self.parse)
```
pipelines:
```python
from itemadapter import ItemAdapter
import sqlite3


class W2Pipeline:
    def process_item(self, item, spider):
        return item


class writeDB:

    def open_spider(self, spider):
        self.fp = sqlite3.connect('test.db')
        # SQL statement that creates the table
        sql_text_1 = '''CREATE TABLE IF NOT EXISTS scores
                (Currency TEXT,
                 TBP TEXT,
                 CBP TEXT,
                 TSP TEXT,
                 CSP TEXT,
                 TIME TEXT);'''
        # Execute the SQL statement
        self.fp.execute(sql_text_1)
        self.fp.commit()

    def close_spider(self, spider):
        self.fp.close()

    def process_item(self, item, spider):
        # Parameterised insert instead of string concatenation
        self.fp.execute("INSERT INTO scores VALUES (?, ?, ?, ?, ?, ?)",
                        (item['currency'], item['tbp'], item['cbp'],
                         item['tsp'], item['csp'], item['time']))
        self.fp.commit()
        return item
```
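Here too, w2/items.py and the settings entry are left out above. A minimal sketch matching the fields used by the spider and pipeline (priority 300 is arbitrary):

```python
# w2/items.py (sketch)
import scrapy

class W2Item(scrapy.Item):
    currency = scrapy.Field()  # currency name
    tbp = scrapy.Field()       # telegraphic transfer buying price
    cbp = scrapy.Field()       # cash buying price
    tsp = scrapy.Field()       # telegraphic transfer selling price
    csp = scrapy.Field()       # cash selling price
    time = scrapy.Field()      # publication time

# w2/settings.py (sketch)
ITEM_PIPELINES = {
    "w2.pipelines.writeDB": 300,
}
```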

Result:
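To double-check what the writeDB pipeline stored, the scores table in test.db can also be queried directly; a minimal sketch, assuming it is run from the directory that contains test.db:

```python
# Sketch: print the first few exchange-rate rows written by the pipeline.
import sqlite3

conn = sqlite3.connect("test.db")
for row in conn.execute("SELECT * FROM scores LIMIT 10"):
    print(row)
conn.close()
```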
