Data Collection: Assignment 2

Task 1

Code and results
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3

class WeatherDB:
    def openDB(self):
        self.con=sqlite3.connect("weathers.db")
        self.cursor=self.con.cursor()
        try:
            # Composite primary key (wCity, wDate) prevents duplicate rows per city and day.
            self.cursor.execute("create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
        except Exception:
            # The table already exists: clear out the old rows instead.
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))

class WeatherForecast:
    def __init__(self):
        self.db = None
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101",
                         "深圳": "101280601"}  # city-to-code lookup table

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " code cannot be found")
            return

        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
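            # Each <li> under <ul class="t clearfix"> is one day's forecast:
            # <h1> holds the date, <p class="wea"> the condition,
            # <p class="tem"> the high (span) and low (i) temperatures.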
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()

        for city in cities:
            self.forecastCity(city)
        # self.db.show()
        self.db.closeDB()

ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print("completed")

Note that the forecast URL embeds a per-city code, e.g. 101010100 for Beijing. Swapping in another city's code is all it takes to view that city's weather page.
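Extending the scraper to another city is therefore just one more cityCode entry. A minimal sketch (the 天津 code below is an assumption; verify it against weather.com.cn):

ws = WeatherForecast()
ws.cityCode["天津"] = "101030100"  # assumed city code, check it against the site
ws.process(["天津"])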

Result: (screenshot omitted)

Reflections

Here I learned to exploit the patterned differences between URLs to visit different pages and fetch the desired information automatically. I also picked up some sqlite3 usage.
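For instance, the stored forecasts can be read straight back out of the weathers.db file created above:

import sqlite3

con = sqlite3.connect("weathers.db")
for row in con.execute("select * from weathers"):
    print(row)  # (wCity, wDate, wWeather, wTemp)
con.close()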

Link: https://gitee.com/wsxxs233/data-collection/tree/master/task2/blog/q1

Task 2

Code and results
import json
import sqlite3
import time

import beautifultable
import pandas as pd
import requests


def format_number_to_units(number, precision=2):
    """
    Convert a float into a string expressed in units of 万 (10^4) or 亿 (10^8).

    Args:
        number: the float to convert
        precision: number of decimal places to keep, default 2

    Returns:
        The converted string, e.g. "1.00万", "100.00万", "1.00亿".
    """
    abs_num = abs(number)

    # Pick the unit based on magnitude.
    if abs_num >= 1e8:  # at least 1亿 (10^8)
        result = number / 1e8
        unit = '亿'
    elif abs_num >= 1e4:  # at least 1万 (10^4) but below 1亿
        result = number / 1e4
        unit = '万'
    else:  # below 1万
        result = number
        unit = ''

    # Format with the requested number of decimal places.
    if unit:  # a unit was applied
        formatted_num = f"{result:.{precision}f}"
        return f"{formatted_num}{unit}"
    else:  # no unit: just format the original number
        return f"{result:.{precision}f}"


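# Sanity check of the helper (plain arithmetic):
#   format_number_to_units(123456789) -> '1.23亿'
#   format_number_to_units(54321)     -> '5.43万'
#   format_number_to_units(999)       -> '999.00'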

headers = {
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'connection': 'keep-alive',
    'cookie': 'fullscreengg=1; fullscreengg2=1; qgqp_b_id=110282da83710be39d3f04bc22ad49b4; st_si=47037965562448; st_asi=delete; st_nvi=ze_XYSElrfUrGQxs2o7x999b8; nid=0ee489b5a0c26ceb35970162b7983121; nid_create_time=1761721084597; gvi=IVLmXbTDpuiq7cx02UETE789b; gvi_create_time=1761721084597; st_pvi=74642916940458; st_sp=2025-10-29%2014%3A58%3A03; st_inirUrl=; st_sn=26; st_psi=20251029161701538-113200301321-3924716102',
    'host': 'push2.eastmoney.com',
    'referer': 'https://quote.eastmoney.com/center/gridlist.html',
    'sec-ch-ua': '"Microsoft Edge";v="141", "Not?A_Brand";v="8", "Chromium";v="141"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'script',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0',
}

table = beautifultable.BeautifulTable()
table.columns.header = ["No.", "Code", "Name", "Latest price", "Change %", "Change amt", "Volume", "Turnover", "Amplitude"]
table.columns.width = [15, 15, 15, 15, 15, 15, 15, 15, 15]
table.columns.alignment = beautifultable.enums.ALIGN_LEFT
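# Fetch the first five pages: in the query string, pn is the 1-based page number
# and pz the page size (20 rows); cb makes the server wrap the JSON in a jQuery
# callback (JSONP), which is stripped off below before json.loads.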
for i in range(5):
    url = f'https://push2.eastmoney.com/api/qt/clist/get?np=1&fltt=1&invt=2&cb=jQuery3710277153053545653_1761725821366&fs=m%3A0%2Bt%3A6%2Bf%3A!2%2Cm%3A0%2Bt%3A80%2Bf%3A!2%2Cm%3A1%2Bt%3A2%2Bf%3A!2%2Cm%3A1%2Bt%3A23%2Bf%3A!2%2Cm%3A0%2Bt%3A81%2Bs%3A262144%2Bf%3A!2&fields=f12%2Cf13%2Cf14%2Cf1%2Cf2%2Cf4%2Cf3%2Cf152%2Cf5%2Cf6%2Cf7%2Cf15%2Cf18%2Cf16%2Cf17%2Cf10%2Cf8%2Cf9%2Cf23&fid=f3&pn={i+1}&pz=20&po=1&dect=1&ut=fa5fd1943c7b386f172d6893dbfba10b&wbp2u=%7C0%7C0%7C0%7Cweb&_=1761725821372'
    time.sleep(5)

    response=requests.get(url,headers=headers)
    response.encoding=response.apparent_encoding
    response_text=response.text
    start_index = response_text.find('(') + 1
    end_index = response_text.rfind(')')
    json_data_str = response_text[start_index:end_index]
    js=json.loads(json_data_str)

    for idx, item in enumerate(js['data']['diff']):
        # fltt=1 makes the API return prices as integers scaled by 100, so divide back.
        f2 = f"{item['f2'] / 100:.2f}"
        f3 = f"{item['f3'] / 100:.2f}%"
        f4 = f"{item['f4'] / 100:.2f}"
        f7 = f"{item['f7'] / 100:.2f}%"
        table.rows.append([idx + 20 * i + 1, item["f12"], item["f14"], f2, f3, f4,
                           format_number_to_units(float(item["f5"])),
                           format_number_to_units(float(item["f6"])), f7])
    print(f'page {i + 1}')


# Persist the collected rows to SQLite via a pandas DataFrame.
df = table.to_df()
conn=sqlite3.connect('stocks.db')
df.to_sql(
    name='A',
    con=conn,
    if_exists='replace',
    index=False
)
conn.close()
# Read the data back from the database to verify the write
conn = sqlite3.connect('stocks.db')
# Read the whole table
result_df = pd.read_sql('SELECT * FROM A', conn)
print("Data read back from the database:")
print(result_df)
conn.close()

Result: (screenshot omitted)

Inspecting the network requests in the F12 developer tools shows that the stock data is served by this JS (JSONP) endpoint; downloading it with requests and parsing the embedded JSON yields the desired information.
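The callback-stripping step in isolation (a minimal sketch with a toy payload):

import json

text = 'jQuery3710277153053545653_1761725821366({"data": {"diff": []}});'
payload = text[text.find('(') + 1 : text.rfind(')')]  # strip the JSONP wrapper
print(json.loads(payload))  # {'data': {'diff': []}}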

Reflections

Here I learned how to fetch data through an API endpoint and parse it with json. I also found that sending the request headers a real browser would send makes it much less likely that the server blocks the request.
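A minimal sketch of that idea (an assumption: exactly which headers this endpoint checks is undocumented; User-Agent and Referer are the usual ones, and copying the full browser header set, as above, is the safe fallback):

import requests

minimal_headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0',
    'referer': 'https://quote.eastmoney.com/center/gridlist.html',
}
# response = requests.get(url, headers=minimal_headers)  # url as built in the script above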
Link: https://gitee.com/wsxxs233/data-collection/tree/master/task2/blog/q2

Task 3

Code and results
import re
import sqlite3

import beautifultable
import js2py
import pandas as pd
import requests
url='https://www.shanghairanking.cn/_nuxt/static/1762223212/rankings/bcur/2021/payload.js'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36 Edg/142.0.0.0',
    'cookie':'_clck=1vlmizh%5E2%5Eg0r%5E0%5E2112;'
             ' Hm_lvt_af1fda4748dacbd3ee2e3a69c3496570=1760366429,1760516847,1760627317,1762352079; '
             'Hm_lpvt_af1fda4748dacbd3ee2e3a69c3496570=1762352079; HMACCOUNT=3C6775A98EF952A2; '
             '_clsk=xrufho%5E1762352080299%5E1%5E1%5Ev.clarity.ms%2Fcollect'
}
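# The payload was fetched once and cached locally; uncomment the next three lines to refresh cache.js.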
# js=requests.get(url,headers=headers)
# with open("cache.js",'w',encoding='utf-8') as f:
#     f.write(js.text)


with open('cache.js', 'r', encoding='utf-8') as f:
    s = f.read()


# The payload has the shape:
#   __NUXT_JSONP__("/rankings/bcur/2021", (function(a, b, ...) { ... }(val1, val2, ...)));
pattern = r'__NUXT_JSONP__\("/rankings/bcur/\d+",\s*\(function\(([^)]*)\)\s*{([\s\S]*)}\s*\(([^)]*)\)\)\);'
match = re.search(pattern, s)

func_params = match.group(1)    # formal parameter list: a, b, c, d, e, f, ...
function_body = match.group(2)  # function body
actual_params = match.group(3)  # actual argument values


def extract_with_js_engine(func_params: str, function_body: str, actual_params: str):
    """
    Execute the payload function with a JavaScript engine and extract the data.
    """
    try:
        # Create a JavaScript execution context.
        context = js2py.EvalJs()

        # Build the complete JavaScript snippet.
        js_code = f"""
        // Define the function.
        function getData({func_params}) {{
            {function_body}
        }}

        // Call it with the actual arguments.
        var result = getData({actual_params});

        // Return the univData array.
        result.data[0].univData;
        """

        # Run the JavaScript.
        univ_data = context.eval(js_code)
        # Convert the JS array into a list of Python dicts.
        universities = []
        for univ in univ_data:
            university_info = {
                'Rank': univ.get('ranking', ''),
                'University': univ.get('univNameCn', ''),
                'English name': univ.get('univNameEn', ''),
                'Category': univ.get('univCategory', ''),
                'Province': univ.get('province', ''),
                'Score': univ.get('score', '')
            }
            universities.append(university_info)

        return universities
    except Exception as e:
        print(f"JavaScript execution failed: {e}")
        return []

table = beautifultable.BeautifulTable()
table.columns.header = ["Rank", "University", "English name", "Category", "Province", "Score"]
table.columns.width = [15, 15, 60, 15, 15, 15]
for data in extract_with_js_engine(func_params, function_body, actual_params):
    table.rows.append(list(data.values()))


df=table.to_df()
conn=sqlite3.connect('ranking.db')
df.to_sql(
    name='A',
    con=conn,
    if_exists='replace',
    index=False
)
conn.close()

# Read the data back from the database to verify the write
conn = sqlite3.connect('ranking.db')

# Read the whole table
result_df = pd.read_sql('SELECT * FROM A', conn)
print("Data read back from the database:")
print(result_df)


conn.close()

The downloaded payload is a JavaScript function whose data values are passed in as positional arguments, so pulling them out directly as JSON is awkward; running the function with js2py yields clean, structured results.
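A minimal js2py sketch of the same idea (a standalone toy function, not the actual payload):

import js2py

# Evaluating a JS function definition returns a callable Python wrapper.
add = js2py.eval_js("function add(a, b) { return a + b }")
print(add(1, 2))  # 3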


Reflections

I learned how to execute JavaScript code from Python and extract useful information from it.

Link: https://gitee.com/wsxxs233/data-collection/tree/master/task2/blog/q3
