2023 Data Collection and Fusion Technology Practice: Assignment 2


Assignment ①:

  • Requirement: Scrape the 7-day weather forecasts for a given set of cities from the China Weather website (http://www.weather.com.cn) and save them to a database.
  • Output format:
    No.  Region  Date        Weather  Temperature
    1    北京    15日(今天)  多云     ...

Code:

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3

class WeatherDB:
    def openDB(self):
        self.con=sqlite3.connect("weathers.db")
        self.cursor=self.con.cursor()
        try:
            # Create the table on the first run.
            self.cursor.execute("create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
        except Exception:
            # The table already exists: clear out rows from the previous run.
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()


    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " code cannot be found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")

            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        self.db.closeDB()

ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print("completed")
  • Result screenshot:
    (screenshot: result 1)

Experiment notes:

This was mainly a reproduction of sample code, so it was not particularly difficult.

Assignment ②

  • Requirement: Use the requests and BeautifulSoup libraries to crawl stock information from a target site and store it in a database.

  • Candidate sites: Eastmoney: https://www.eastmoney.com/
    Sina Finance stocks: http://finance.sina.com.cn/stock/

  • Technique: Open the F12 developer tools in Chrome and capture packets to find the URL used to load the stock list, then inspect the values the API returns. The request parameters (f1, f2, ...) select which fields come back, so they can be adjusted or trimmed as needed; a sketch of building such a request follows this list.

  • Reference: https://zhuanlan.zhihu.com/p/50099084

  • Output format:
    (image: output 2)

  • Gitee folder link: Practice Assignment 2
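
As a minimal sketch of the parameter trimming described above (assuming the same clist/get endpoint captured for the code below, and that the cb callback and cache-busting parameters are optional), the request can be built with only the fields the parsing code actually uses:

import requests

# Hypothetical helper: fetch one page of the stock list while keeping only
# the parameters of interest from the captured URL.
def fetch_stock_page(fs, page=1, page_size=20):
    base = "https://89.push2.eastmoney.com/api/qt/clist/get"
    fields = "f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,f18"
    query = (f"pn={page}&pz={page_size}&po=1&np=1&fltt=2&invt=2"
             f"&fid=f3&fs={fs}&fields={fields}")
    return requests.get(base + "?" + query).text

# Example: first page of 上证A股, using the filter string from the fs dict below.
# print(fetch_stock_page("m:1+t:2,m:1+t:23"))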

Code:

import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

def getData(url, gupiao):
    response = requests.get(url)
    database = []
    # The response is jsonp text, so each field is pulled out with a regex.
    daima = re.findall('f12":"(.*?)",', response.text)    # f12: stock code
    name = re.findall('f14":"(.*?)",', response.text)     # f14: stock name
    newprice = re.findall('f2":(.*?),', response.text)    # f2: latest price
    diezhanfu = re.findall('f3":(.*?),', response.text)   # f3: change percent
    diezhane = re.findall('f4":(.*?),', response.text)    # f4: change amount
    cjl = re.findall('f5":(.*?),', response.text)         # f5: volume
    cje = re.findall('f6":(.*?),', response.text)         # f6: turnover
    zf = re.findall('f7":(.*?),', response.text)          # f7: amplitude
    zg = re.findall('f15":(.*?),', response.text)         # f15: daily high
    zd = re.findall('f16":(.*?),', response.text)         # f16: daily low
    jk = re.findall('f17":(.*?),', response.text)         # f17: today's open
    zs = re.findall('f18":(.*?),', response.text)         # f18: previous close
    for i in range(0, len(zs)):
        data = []
        data.append(str(i+1))
        data.append(daima[i])      # code first, then name, matching the columns below
        data.append(name[i])
        data.append(newprice[i])
        data.append(diezhanfu[i])
        data.append(diezhane[i])
        data.append(cjl[i])
        data.append(cje[i])
        data.append(zf[i])
        data.append(zg[i])
        data.append(zd[i])
        data.append(jk[i])
        data.append(zs[i])
        database.append(data)
    columns = ["序号", "代码", "名称", "最新价格", "跌涨幅", "跌涨额", "成交量", "成交额", "振幅", "最高", "最低", "今开", "昨收"]
    df = pd.DataFrame(database, columns=columns)
    docename = gupiao + ".xlsx"
    df.to_excel(docename, index=False)
    print("已保存" + docename)

def main():
    desirable_page = 1
    # fs filter strings for each board, taken from the captured request URLs.
    fs = {
        "沪深京A股": "m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048",
        "上证A股": "m:1+t:2,m:1+t:23",
        "深证A股": "m:0+t:6,m:0+t:80",
        "北证A股": "m:0+t:81+s:2048",
        "新股": "m:0+t:81+s:2048",
        "创业板": "m:0+t:80"
    }
    for gupiao in fs:
        url = f"https://89.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112408428773349332392_1697292673356&pn={desirable_page}&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs={fs[gupiao]}&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23&_=1697292673357"
        getData(url, gupiao)
        # print(getData(url))

if __name__ == "__main__":
    main()
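
The requirement asks for the data to be stored in a database, while getData() above writes Excel files; a minimal sketch (assuming the DataFrame df built in getData) of additionally saving it into SQLite:

import sqlite3
import pandas as pd

# Hypothetical helper that could be called at the end of getData():
# write the same DataFrame into a SQLite table named after the board.
def save_to_sqlite(df: pd.DataFrame, gupiao: str, dbfile: str = "stocks.db"):
    con = sqlite3.connect(dbfile)
    # pandas creates or replaces the table based on the DataFrame columns.
    df.to_sql(gupiao, con, if_exists="replace", index=False)
    con.close()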
  • Packet-capture process:
    (screenshot)
  • Result screenshots:
    (screenshots)

Experiment notes:

I learned a new approach: capturing packets to find the URL that actually loads the data. The returned payload is not as easy to read as the elements shown in the developer tools, and it would normally call for JSON parsing, but I extracted the information directly with regular expressions and saved it to Excel in the end.
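
As noted above, the jsonp response could also be handled with real JSON parsing instead of regular expressions; a minimal sketch, assuming the body is a jQuery callback wrapping a JSON object and that the per-stock records sit in a data.diff list (which is what the f2 / f12 / f14 keys matched by the regexes suggest):

import json
import re
import requests

def parse_jsonp(url):
    text = requests.get(url).text
    # Strip the jQuery...(...) callback wrapper and parse the inner JSON.
    body = re.search(r'\((.*)\)', text, re.S).group(1)
    obj = json.loads(body)
    # Assumed structure: the list of stock records lives under data.diff.
    data = obj.get("data") or {}
    return data.get("diff", [])

# Each record would then be a dict keyed by field codes, e.g.:
# for rec in parse_jsonp(url):
#     print(rec.get("f12"), rec.get("f14"), rec.get("f2"))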

Assignment ③:

Code:

import re
import requests
from bs4 import BeautifulSoup
import pandas as pd

province = {
    'k': '江苏','n': '山东', 'o': '河南','p': '河北','q': '北京','r': '辽宁','s': '陕西','t': '四川','u': '广东',
    'v': '湖北','w': '湖南','x': '浙江','y': '安徽','z': '江西','A': '黑龙江','B': '吉林','C': '上海','D': '福建','E': '山西',
    'F': '云南','G': '广西','I': '贵州','J': '甘肃','K': '内蒙古','L': '重庆','M': '天津','N': '新疆','Y': '海南'
}

unicata = {
    'f': '综合',
    'e': '理工',
    'h': '师范',
    'm': '农业',
    'T': '林业',
}

def getData(url):
    response = requests.get(url)
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    uniname = re.findall(r'univNameCn:"(.*?)",', response.text)
    uniscore = re.findall(r'score:(.*?),', response.text)
    # province and univCategory appear in payload.js as single-letter
    # variable references; the dicts above translate them back to names.
    uniprovince = re.findall(r'province:(.*?),', response.text)
    unica = re.findall(r'univCategory:(.*?),', response.text)
    database = []
    for i in range(0, len(unica)):
        data = []
        data.append(str(i+1))
        data.append(uniname[i])
        # Fall back to the raw code if a mapping is missing.
        data.append(province.get(uniprovince[i], uniprovince[i]))
        data.append(unicata.get(unica[i], unica[i]))
        data.append(uniscore[i])
        database.append(data)
    columns = ["排名", "学校名称", "省市", "学校类型", "学校总分"]
    df = pd.DataFrame(database, columns=columns)
    df.to_excel('school_rank.xlsx', index=False)
    print("已保存school_rank.xlsx")


url = "http://www.shanghairanking.cn/_nuxt/static/1697106492/rankings/bcur/2021/payload.js"
getData(url)
  • Packet-capture process:
    (screenshot)
  • Result screenshots:
    (screenshots)

Experiment notes:

The steps were much the same as in the previous task, and I became more fluent with regular expressions.
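
As a small illustration of how the patterns above work, here they are applied to a made-up fragment shaped like the payload.js data (the values are hypothetical; only the field names match the real file):

import re

# Hypothetical fragment in the shape the regexes above expect.
sample = 'univNameCn:"示例大学",province:q,score:100.0,univCategory:f,'

print(re.findall(r'univNameCn:"(.*?)",', sample))   # ['示例大学']
print(re.findall(r'province:(.*?),', sample))       # ['q']
print(re.findall(r'score:(.*?),', sample))          # ['100.0']
print(re.findall(r'univCategory:(.*?),', sample))   # ['f']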
