数据采集与融合技术作业二

作业①

实验要求及结果

  • 要求
    在中国气象网(http://www.weather.com.cn)爬取给定城市集的7日天气预报,并保存在数据库中。
  • 代码
点击查看代码
from bs4 import BeautifulSoup, UnicodeDammit
import urllib.request
import sqlite3

# Weather database wrapper
class WeatherDB:
    """SQLite store holding one forecast row per (city, date) pair."""

    def openDB(self, path="weathers.db"):
        """Open the database and make sure the ``weathers`` table exists and is empty.

        Args:
            path: SQLite database to open. Defaults to ``weathers.db`` in the
                working directory; ``":memory:"`` gives a throwaway database.
        """
        self.con = sqlite3.connect(path)
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("create table weathers (wCity varchar(16), wDate varchar(16), wWeather varchar(64), wTemp varchar(32), constraint pk_weather primary key (wCity, wDate))")
        except sqlite3.OperationalError as err:
            # sqlite3 reports "table weathers already exists" as an
            # OperationalError; rows from a previous run are discarded so the
            # new run starts from an empty table. Other exception types are no
            # longer mistaken for this case.
            print("Table already exists, clearing data:", err)
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        """Commit pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        """Insert one forecast row.

        Database errors (for example a duplicate (city, date) primary key) are
        printed and the row is skipped; nothing is raised for them.
        """
        try:
            self.cursor.execute("insert into weathers (wCity, wDate, wWeather, wTemp) values (?, ?, ?, ?)",
                                (city, date, weather, temp))
        except sqlite3.Error as err:
            print("Insert error:", err)

    def show(self):
        """Print every stored row as a fixed-width table."""
        row_format = "%-16s%-16s%-32s%-16s"
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print(row_format % ("City", "Date", "Weather", "Temp"))
        for row in rows:
            print(row_format % (row[0], row[1], row[2], row[3]))

# Weather forecast scraper
class WeatherForecast:
    """Scrape forecast pages from weather.com.cn and store the rows in WeatherDB."""

    # Seconds to wait for the HTTP response before giving up on a city.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
        }
        # City name -> code used in the forecast page URL.
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}
        # Set by process(); forecastCity() writes through it.
        self.db = None

    def forecastCity(self, city):
        """Download, parse and store the forecast list for one city.

        Unknown cities, download failures and malformed list items are
        reported on stdout; none of them raise.

        Args:
            city: a key of ``self.cityCode``.
        """
        if city not in self.cityCode:
            print(f"{city} code cannot be found")
            return

        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            # The context manager closes the HTTP response even when read() fails.
            with urllib.request.urlopen(req, timeout=self.REQUEST_TIMEOUT) as resp:
                raw = resp.read()
            # Let UnicodeDammit pick between the two candidate encodings.
            dammit = UnicodeDammit(raw, ["utf-8", "gbk"])
            soup = BeautifulSoup(dammit.unicode_markup, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
        except Exception as err:
            # Best-effort boundary: one failing city must not stop the others.
            print("Request error:", err)
            return

        for li in lis:
            try:
                date = li.select('h1')[0].text
                weather = li.select('p[class="wea"]')[0].text
                low = li.select('p[class="tem"] i')[0].text
                # NOTE(review): the <span> holding the high temperature is
                # assumed to be missing on some items (e.g. today's entry late
                # in the day) - confirm against the live page. Such items used
                # to be dropped with an IndexError; now the low value is stored.
                high = li.select('p[class="tem"] span')
                temp = high[0].text + "/" + low if high else low
                print(city, date, weather, temp)
                self.db.insert(city, date, weather, temp)
            except Exception as err:
                # Skip only the malformed item and keep the rest of the list.
                print("Parsing error:", err)

    def process(self, cities):
        """Scrape every city in *cities*, print the stored table, then close the DB.

        Args:
            cities: iterable of city names; names missing from
                ``self.cityCode`` are reported and skipped.
        """
        self.db = WeatherDB()
        self.db.openDB()
        try:
            for city in cities:
                self.forecastCity(city)

            self.db.show()
        finally:
            # Commit and close even if printing the table fails.
            self.db.closeDB()


# Run the forecast scraper for the four listed cities, then report completion
ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print("Completed")
  • 运行结果

心得体会

  • 通过这次作业,我学会了如何利用Python的urllib.request和BeautifulSoup库来爬取中国气象网的7日天气预报数据。
  • 在这个过程中,我学会了如何分析和处理HTML页面结构。此外,我还熟练掌握了SQLite数据库的基本操作,包括创建表、插入数据和查询数据。

作业②

实验要求及结果

  • 要求:用requests和BeautifulSoup库方法定向爬取股票相关信息,并存储在数据库中。
    候选网站:东方财富网:https://www.eastmoney.com/

  • 代码:

点击查看代码


import json
import re
import sqlite3

import requests

# Connect to the SQLite database (sqlite3 creates the file when it is missing)
connection = sqlite3.connect('stocks_data.db')
cursor = connection.cursor()

# Create the stocks table on first run; later runs reuse the existing table,
# so rows from earlier runs are kept and new rows are appended to them.
create_table_sql = """
CREATE TABLE IF NOT EXISTS stocks (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    code TEXT,
    name TEXT,
    latest_price REAL,
    change_rate REAL,
    change_amount REAL,
    volume INTEGER,
    turnover INTEGER,
    amplitude REAL,
    highest REAL,
    lowest REAL,
    opening_price REAL,
    previous_close REAL,
    volume_ratio REAL,
    turnover_rate REAL,
    pe_ratio REAL,
    pb_ratio REAL
);
"""
cursor.execute(create_table_sql)
connection.commit()

# Request headers sent with every API call.
# NOTE(review): the Cookie value looks like a captured browser session (it
# embeds a 2024-10-15 timestamp) - confirm whether the API needs it at all.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0",
    "Cookie": "qgqp_b_id=c358a3ef431fdeb8bab829150de55857; st_si=45644677308239; HAList=ty-0-300059-%u4E1C%u65B9%u8D22%u5BCC; st_asi=delete; st_pvi=36719701084546; st_sp=2024-10-15%2014%3A56%3A34; st_inirUrl=https%3A%2F%2Fwww.eastmoney.com%2F; st_sn=7; st_psi=20241015151240611-113200301321-6110855007"
}

def _parse_jsonp(text):
    """Return the JSON document wrapped in a ``callback(...);`` JSONP response.

    Raises:
        ValueError: if *text* has no parenthesised payload or the payload is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    start = text.find("(")
    end = text.rfind(")")
    if start == -1 or end <= start:
        raise ValueError("response is not a JSONP document")
    # json.loads replaces the former eval(): it never executes the response
    # and it understands JSON null/true/false, which eval() rejected.
    return json.loads(text[start + 1:end])


# Fetch one page of stock data from the quote-list API
def get_html(cmd, page):
    """Fetch one page of the stock list and return the decoded payload.

    Args:
        cmd: text spliced after ``fid=`` in the query string; the entries of
            the module-level ``cmd`` table also carry the ``&fs=`` filter.
        page: page number sent as ``pn`` (20 rows per page, ``pz=20``).

    Returns:
        The decoded JSON payload.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-2xx HTTP status.
        ValueError: if the body is not a JSONP-wrapped JSON document.
    """
    url = f"https://98.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112409605352694558194_1728976797660&pn={page}&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&dect=1&wbp2u=|0|0|0|web&fid={cmd}&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1728976797661"
    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=header, timeout=10)
    response.raise_for_status()
    return _parse_jsonp(response.text)

# Market name -> query fragment passed to get_html(). Each value is spliced
# after "fid=" in the URL, so it carries both the fid value ("f3") and the
# "&fs=" market filter.
cmd = {
    "沪深京A股": "f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048",
    "上证A股": "f3&fs=m:1+t:2,m:1+t:23",
    "深证A股": "f3&fs=m:0+t:6,m:0+t:80",
    "北证A股": "f3&fs=m:0+t:81+s:2048",
}

# Scrape every market and store the rows in the database
MAX_PAGES = 2  # pages fetched per market

# API field keys listed in the same order as the columns of INSERT_SQL.
FIELD_KEYS = (
    "f12", "f14", "f2", "f3", "f4", "f5", "f6", "f7",
    "f15", "f16", "f17", "f18", "f10", "f8", "f9", "f23",
)

INSERT_SQL = """
INSERT INTO stocks (code, name, latest_price, change_rate, change_amount, volume, turnover, amplitude,
                    highest, lowest, opening_price, previous_close, volume_ratio, turnover_rate, pe_ratio, pb_ratio)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""

try:
    for market_name, market_code in cmd.items():
        # Iterating the page range directly avoids the extra request the old
        # loop sent for page MAX_PAGES + 1 before noticing it should stop.
        for page in range(1, MAX_PAGES + 1):
            data = get_html(market_code, page)
            print(f"正在爬取 {market_name} 第 {page} 页")
            # "data" is null when a market has no rows for this page; stop
            # paging that market instead of crashing on None['diff'].
            stock_list = (data.get('data') or {}).get('diff')
            if not stock_list:
                break
            cursor.executemany(
                INSERT_SQL,
                [tuple(stock[key] for key in FIELD_KEYS) for stock in stock_list],
            )
    # Commit once, and only after every market was fetched successfully.
    connection.commit()
finally:
    # Release the database handle even when a request or insert fails.
    cursor.close()
    connection.close()

print("股票数据已成功存储到SQLite数据库中")

  • 截图

心得体会

作业③

实验要求及结果

点击查看代码
import requests
import re
import sqlite3


class UniversityDB:
    """SQLite store for university ranking rows."""

    def __init__(self, path="universities.db"):
        """Open the database and make sure the ``universities`` table exists.

        Args:
            path: SQLite database to open. Defaults to ``universities.db`` in
                the working directory; ``":memory:"`` gives a throwaway database.
        """
        self.con = sqlite3.connect(path)
        self.cursor = self.con.cursor()
        self.create_table()

    def create_table(self):
        """Create the ``universities`` table unless it already exists."""
        # NOTE(review): existing rows are kept, so running the scraper twice
        # against the same file stores every ranking row twice.
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS universities (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                rank INTEGER,
                name TEXT,
                province TEXT,
                category TEXT,
                score REAL
            )
        """)
        self.con.commit()

    def closeDB(self):
        """Commit pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, rank, name, province, category, score):
        """Insert one ranking row; committed by closeDB()."""
        self.cursor.execute("""
            INSERT INTO universities (rank, name, province, category, score)
            VALUES (?, ?, ?, ?, ?)
        """, (rank, name, province, category, score))

    def show(self):
        """Print every stored row (without the id column) as an aligned table."""
        row_format = "{:<10} {:<20} {:<15} {:<15} {:<10}"
        self.cursor.execute("SELECT * FROM universities")
        rows = self.cursor.fetchall()
        print(row_format.format("排名", "学校", "省份", "类型", "总分"))
        for row in rows:
            print(row_format.format(row[1], row[2], row[3], row[4], row[5]))


class UniversityForecast:
    """Download a ranking payload script, parse it and store it in UniversityDB."""

    def __init__(self):
        self.db = UniversityDB()

    def fetch_data(self, url):
        """Return the body of *url* as text.

        Raises:
            requests.RequestException: on connection errors, timeouts or a
                non-2xx HTTP status.
        """
        # A timeout keeps a stalled connection from hanging the script.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text

    @staticmethod
    def _parameter_values(text):
        """Map each parameter of the payload's wrapper function to its call argument.

        The payload has the shape ``function(a,b,...){...mutations:...}(v1,v2,...)``;
        identifiers inside the body refer to the positional arguments.

        Raises:
            ValueError: if the parameter list or the argument list is not found.
        """
        signature = re.search(r'function\s*\((.*?)\)\s*\{', text)
        call = re.search(r'mutations:.*?\((.*?)\)', text)
        if signature is None or call is None:
            raise ValueError("payload does not contain the expected function wrapper")

        # Taking everything between the parentheses (instead of searching for
        # the literal names 'a' and 'pE') keeps the last parameter resolvable.
        params = [p.strip() for p in signature.group(1).split(',')]
        # Known limitation, as before: an argument that itself contains ','
        # or ')' breaks this split.
        values = [v.strip().strip('"') for v in call.group(1).split(',')]
        return dict(zip(params, values))

    def parse_data(self, text):
        """Extract ``(rank, name, province, category, score)`` tuples from *text*.

        ``rank`` is the 1-based position of the university in the payload and
        ``score`` is returned as a string. Province, category and score are
        resolved through the wrapper function's parameters when the payload
        stores them as identifiers.

        Raises:
            ValueError: if the payload layout is not recognised or a referenced
                parameter is unknown.
        """
        names = re.findall(r',univNameCn:"(.*?)",', text)
        scores = re.findall(r',score:(.*?),', text)
        categories = re.findall(r',univCategory:(.*?),', text)
        provinces = re.findall(r',province:(.*?),', text)

        lookup = self._parameter_values(text)
        if min(len(scores), len(categories), len(provinces)) < len(names):
            raise ValueError("payload has fewer score/category/province entries than names")

        universities = []
        rows = zip(names, scores, categories, provinces)
        for rank, (name, score, category, province) in enumerate(rows, start=1):
            try:
                province_name = lookup[province]
                category_name = lookup[category]
            except KeyError as err:
                raise ValueError(f"unknown payload parameter {err}") from None
            # A numeric literal is never a parameter name, so literal scores
            # pass through untouched; identifiers are replaced by their value.
            universities.append((rank, name, province_name, category_name, lookup.get(score, score)))
        return universities

    def process(self, url):
        """Fetch *url*, parse it and insert every university into the database.

        Any failure is reported on stdout instead of being raised; rows
        inserted before the failure stay in the database.
        """
        try:
            text = self.fetch_data(url)
            for rank, name, province, category, score in self.parse_data(text):
                self.db.insert(rank, name, province, category, float(score))
        except Exception as err:
            # Best-effort boundary: the caller still prints and closes the DB.
            print(f"Error processing data: {err}")

    def show_database(self):
        """Print the stored ranking table."""
        print("\n开始输出数据库:\n")
        self.db.show()

    def close_database(self):
        """Commit and close the underlying database."""
        self.db.closeDB()


# Usage example: scrape the 2021 ranking payload, print it and close the DB
if __name__ == "__main__":
    forecast = UniversityForecast()
    url = "https://www.shanghairanking.cn/_nuxt/static/1728872418/rankings/bcur/2021/payload.js"
    try:
        forecast.process(url)
        forecast.show_database()
    finally:
        # Commit and close the database even if printing the table fails.
        forecast.close_database()
    print("completed")
    print("输出数据库完成")

  • 运行结果:

心得体会

  • 正则表达式的应用:通过正则表达式提取复杂文本中的特定信息,提高了数据处理的灵活性,但也需要注意正则的匹配精度。
posted @ 2024-10-17 16:52  acedia7  阅读(26)  评论(0)    收藏  举报