## 数据采集与融合技术第二次实验

### 作业①

#### （1）实验内容

1 北京 7日（今天） 晴间多云，北部山区有阵雨或雷阵雨转晴转多云 31℃/17℃
2 北京 8日（明天） 多云转晴，北部地区有分散阵雨或雷阵雨转晴 34℃/20℃
3 北京 9日（后天） 晴转多云 36℃/22℃
4 北京 10日（周六） 阴转阵雨 30℃/19℃
5 北京 11日（周日） 阵雨 27℃/18℃
6......

#### （2）代码实现

（书本代码复现）

1) 导入所需的包

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3


2) 创建WeatherDB类

class WeatherDB:
    """SQLite-backed store for scraped forecasts, kept in the file ``weathers.db``.

    One row per (city, date); the pair is the table's primary key.
    """

    def openDB(self):
        """Open ``weathers.db`` and leave an empty ``weathers`` table ready for use.

        The table is created on first use. If it already exists, its rows are
        deleted so every run starts from a clean slate.
        """
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table weathers (wCity varchar(16), wDate varchar(16), wWeather varchar(64), wTemp varchar(32), constraint pk_weather primary key (wCity, wDate))")
        except sqlite3.OperationalError:
            # "table weathers already exists": reuse it, but drop the old rows.
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        """Commit pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        """Insert one forecast row.

        Database errors (for example a duplicate (city, date) key) are printed
        rather than raised, so one bad row does not abort a whole crawl.
        """
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)", (city, date, weather, temp))
        except sqlite3.Error as err:
            print(err)

    def show(self):
        """Print every stored row as a tab-separated table with a running index."""
        self.cursor.execute("select * from weathers")
        tplt = "{0:^16}\t{1:^16}\t{2:^16}\t{3:^32}\t{4:^16}"
        print(tplt.format("序号", "地区", "日期", "天气信息", "温度"))
        for count, row in enumerate(self.cursor.fetchall(), start=1):
            print(tplt.format(str(count), row[0], row[1], row[2], row[3]))


3) 创建WeatherForecast类

class WeatherForecast:
    """Scrape forecasts from weather.com.cn and store them through ``WeatherDB``."""

    def __init__(self):
        # Browser-like User-Agent sent with every request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"}
        # City name -> code used in the forecast page URL.
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        """Download the forecast page of ``city`` and insert one row per listed day.

        Unknown cities, download/parse failures and malformed list items are
        reported with ``print`` and skipped; nothing is raised for them.
        Requires ``self.db`` to be an opened ``WeatherDB`` (see ``process``).
        """
        if city not in self.cityCode:
            print(city + " code cannot be found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            with urllib.request.urlopen(req) as resp:
                data = resp.read()
            # Let bs4 pick between utf-8 and gbk before parsing.
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            soup = BeautifulSoup(dammit.unicode_markup, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
        except Exception as err:
            # Best-effort crawler: report the failure and move on to the next city.
            print(err)
            return

        for li in lis:
            try:
                date = li.select('h1')[0].text
                weather = li.select('p[class="wea"]')[0].text
                low = li.select('p[class="tem"] i')[0].text
                high = li.select('p[class="tem"] span')
            except IndexError as err:
                # A required element is missing; skip this item instead of
                # inserting it with values left over from the previous one.
                print(err)
                continue
            if high:
                # Daytime crawl: the page shows both temperatures ("high/low").
                temp = high[0].text + "/" + low
            else:
                # Night-time crawl: only a single temperature value is present.
                temp = low
            self.db.insert(city, date, weather, temp)

    def process(self, cities):
        """Crawl every city in ``cities``, print the stored table, then close the DB."""
        self.db = WeatherDB()
        self.db.openDB()
        try:
            for city in cities:
                self.forecastCity(city)
            self.db.show()
        finally:
            # Commit and close even if a crawl step raised.
            self.db.closeDB()


4) 运行程序

# Crawl the four configured cities, store the rows in weathers.db and print them.
ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])


5) 运行结果

#### （3）心得体会

# Daytime crawl: combine the <span> and <i> temperature values as "span/i".
temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text


# Night-time crawl: only the single <i> temperature value is taken.
temp = li.select('p[class="tem"] i')[0].text


### 作业②

#### （1）实验内容

1 688093 N世华 28.47 62.22% 10.92 26.13万 7.6亿 22.3% 32.0 28.08 30.2 17.55
2......

#### （2）代码实现

1) 导入所需的包

import urllib.request
import re
import sqlite3


2) 创建StockDB类

class StockDB:
    """SQLite-backed store for scraped stock quotes, kept in the file ``stocks.db``.

    One row per (sNum, sCode); the pair is the table's primary key.
    """

    def openDB(self):
        """Open ``stocks.db`` and leave an empty ``stocks`` table ready for use.

        The table is created on first use. If it already exists, its rows are
        deleted so every run starts from a clean slate.
        """
        self.con = sqlite3.connect("stocks.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table stocks (sNum varchar(16), sCode varchar(16), sName varchar(16), sNew_price varchar(16), sAmplitude1 varchar(16), sPrice varchar(16), sVolume varchar(16), sTurnover varchar(16), sAmplitude2 varchar(16), sHighest varchar(16), sLowest varchar(16), sToday varchar(16), sYesterday varchar(16), constraint pk_stock primary key (sNum, sCode))")
        except sqlite3.OperationalError:
            # "table stocks already exists": reuse it, but drop the old rows.
            self.cursor.execute("delete from stocks")

    def closeDB(self):
        """Commit pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, Num, Code, Name, New_price, Amplitude1, Price, Volume, Turnover, Amplitude2, Highest, Lowest, Today, Yesterday):
        """Insert one stock row; the arguments map one-to-one onto the table columns.

        Database errors (for example a duplicate key) are printed rather than
        raised, so one bad row does not abort a whole crawl.
        """
        try:
            self.cursor.execute(
                "insert into stocks (sNum, sCode, sName, sNew_price, sAmplitude1, sPrice, sVolume, sTurnover, sAmplitude2, sHighest, sLowest, sToday, sYesterday) values (?,?,?,?,?,?,?,?,?,?,?,?,?)",
                (Num, Code, Name, New_price, Amplitude1, Price, Volume, Turnover, Amplitude2, Highest, Lowest, Today, Yesterday))
        except sqlite3.Error as err:
            print(err)

    def show(self):
        """Print every stored row as a tab-separated table."""
        self.cursor.execute("select * from stocks")
        tplt = "{0:^10}\t{1:^10}\t{2:^15}\t{3:^15}\t{4:^15}\t{5:^15}\t{6:^15}\t{7:^15}\t{8:^15}\t{9:^15}\t{10:^15}\t{11:^15}\t{12:^15}"
        print(tplt.format("序号", "股票代码", "股票名称", "最新报价", "涨跌幅", "涨跌额", "成交量", "成交额", "振幅", "最高", "最低", "今开", "昨收"))
        for row in self.cursor.fetchall():
            # The table has exactly the 13 columns the template expects.
            print(tplt.format(*row))


3) 创建Stocks类

class Stocks:
    """Scrape stock quotes from the eastmoney list API and store them via ``StockDB``."""

    def __init__(self):
        # Browser-like User-Agent sent with every request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"}

    @staticmethod
    def _text(value):
        """Return ``value`` as text, mapping a JSON ``null`` to an empty string."""
        return "" if value is None else str(value)

    @staticmethod
    def _parseRows(text):
        """Turn one API response into a list of 12-field row tuples.

        The response is JSON wrapped in a JSONP callback (``name(...);``); the
        quote records are the objects in ``data.diff``. Numbers are kept as the
        exact text the server sent, so stored values do not pick up float
        artefacts, and negative values or ``"-"`` placeholders are preserved.
        """
        import json  # local import: only this helper needs it

        body = text.strip()
        if not body.startswith("{"):
            # Strip the JSONP wrapper: keep what is between the outer parentheses.
            body = body[body.find("(") + 1:body.rfind(")")]
        payload = json.loads(body, parse_float=str, parse_int=str)
        records = (payload.get("data") or {}).get("diff") or []

        as_text = Stocks._text
        rows = []
        for rec in records:
            rows.append((
                as_text(rec.get("f12")),        # stock code
                as_text(rec.get("f14")),        # stock name
                as_text(rec.get("f2")),         # latest price
                as_text(rec.get("f3")),         # change percentage
                as_text(rec.get("f4")),         # change amount
                as_text(rec.get("f5")),         # volume
                as_text(rec.get("f6")),         # turnover
                as_text(rec.get("f7")) + '%',   # amplitude
                as_text(rec.get("f15")),        # highest
                as_text(rec.get("f16")),        # lowest
                as_text(rec.get("f17")),        # today's open
                as_text(rec.get("f18")),        # previous close
            ))
        return rows

    def stockInformation(self, pages=range(1, 7)):
        """Fetch the given result pages and insert every quote into ``self.db``.

        Args:
            pages: iterable of page numbers to request (``pn`` in the URL);
                defaults to pages 1-6, 20 quotes per page.

        Rows are numbered consecutively across pages starting at 1. Network and
        decoding errors are not caught here and propagate to the caller.
        """
        prefix = "http://40.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112402500315997542808_1634088452555&pn="
        suffix = "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152"
        count = 1
        for i in pages:
            # The page number is the only part of the URL that changes.
            url = prefix + str(i) + suffix
            req = urllib.request.Request(url, headers=self.headers)
            with urllib.request.urlopen(req) as resp:
                data = resp.read().decode()
            for row in self._parseRows(data):
                self.db.insert(count, *row)
                count += 1

    def process(self):
        """Open the DB, crawl all pages, print the stored table, then close the DB."""
        self.db = StockDB()
        self.db.openDB()
        try:
            self.stockInformation()
            self.db.show()
        finally:
            # Commit and close even if a crawl step raised.
            self.db.closeDB()


4) 运行程序

# Crawl the stock list, store the rows in stocks.db and print them.
st = Stocks()
st.process()


5) 运行结果

### 作业③

1 清华大学 969.2

#### （2）代码实现

1) 导入所需的包

import urllib.request
import re
import sqlite3


2) 创建CollegeDB类

class CollegeDB:
    """SQLite-backed store for university rankings, kept in the file ``colleges.db``.

    One row per (rank, name); the pair is the table's primary key.
    """

    def openDB(self):
        """Open ``colleges.db`` and leave an empty ``colleges`` table ready for use.

        The table is created on first use. If it already exists, its rows are
        deleted so every run starts from a clean slate.
        """
        self.con = sqlite3.connect("colleges.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table colleges (cRank varchar(16), cName varchar(64), cScore varchar(16), constraint pk_college primary key (cRank, cName))")
        except sqlite3.OperationalError:
            # "table colleges already exists": reuse it, but drop the old rows.
            self.cursor.execute("delete from colleges")

    def closeDB(self):
        """Commit pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, rank, name, score):
        """Insert one ranking row.

        Database errors (for example a duplicate (rank, name) key) are printed
        rather than raised, so one bad row does not abort the whole import.
        """
        try:
            self.cursor.execute("insert into colleges (cRank, cName, cScore) values (?,?,?)", (rank, name, score))
        except sqlite3.Error as err:
            print(err)

    def show(self):
        """Print every stored row as a tab-separated table."""
        self.cursor.execute("select * from colleges")
        tplt = "{0:^16}\t{1:^16}\t{2:^16}"
        print(tplt.format("排名", "学校", "总分"))
        for row in self.cursor.fetchall():
            print(tplt.format(row[0], row[1], row[2]))


3) 创建Colleges类

class Colleges:
    """Scrape the university ranking payload and store it through ``CollegeDB``."""

    # NOTE(review): the statement that assigned the URL is missing from the
    # published code. This is the ranking page's payload.js path as best it
    # can be reconstructed; the numeric build segment changes over time, so
    # confirm it against the live site before running.
    url = "https://www.shanghairanking.cn/_nuxt/static/1632381606/rankings/bcur/2021/payload.js"

    def __init__(self):
        # Browser-like User-Agent sent with every request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"}

    def collegeInformation(self):
        """Download the payload, decode rank/name/score and insert them into ``self.db``.

        Scores and ranks appear in the payload either as literals or as
        placeholder names. Placeholders are looked up in the wrapping function's
        parameter list and replaced by the matching call argument.
        Any failure is printed; nothing is raised.
        """
        # literal_eval instead of eval: the text comes from a remote server, and
        # eval would also resolve placeholders such as ``i`` or ``re`` to Python
        # objects that happen to be in scope instead of failing over to the lookup.
        from ast import literal_eval

        try:
            req = urllib.request.Request(self.url, headers=self.headers)
            with urllib.request.urlopen(req) as resp:
                data = resp.read().decode()

            # Extract the raw fields.
            name = re.findall(r'univNameCn:"(.*?)"', data)
            score = re.findall(r'score:(.*?),', data)
            rank = re.findall(r'ranking:(.*?),', data)

            # Placeholder names (function parameters) and their values (call arguments).
            function = re.findall(r'function\((.*?)\){', data)
            function = function[0].split(',')
            item = re.findall(r'void 0}}\((.*?)\)', data)
            item = item[0].split(',')

            for i in range(len(score)):
                try:
                    score[i] = literal_eval(score[i])
                except Exception:
                    try:
                        index = function.index(score[i])
                        # The author found the argument list to be shifted by
                        # one relative to the parameter list, hence index + 1.
                        score[i] = literal_eval(item[index + 1])
                    except Exception as err:
                        print(err)

            for i in range(len(rank)):
                try:
                    index = function.index(rank[i])
                    # Rank arguments are quoted strings: unquote, then parse the number.
                    rank[i] = literal_eval(literal_eval(item[index + 1]))
                except Exception:
                    # "Special" entries are read without the +1 shift.
                    # NOTE(review): if the lookup itself failed, ``index`` still
                    # holds the value from an earlier iteration (or is unset on
                    # the first one, which aborts via the outer handler) —
                    # kept as in the original; confirm this is intended.
                    rank[i] = literal_eval(literal_eval(item[index]))

            for r, n, s in zip(rank, name, score):
                self.db.insert(r, n, s)
        except Exception as err:
            # Best-effort crawler: report the failure instead of crashing.
            print(err)

    def process(self):
        """Open the DB, import the ranking, print the stored table, then close the DB."""
        self.db = CollegeDB()
        self.db.openDB()
        try:
            self.collegeInformation()
            self.db.show()
        finally:
            # Commit and close even if a step raised.
            self.db.closeDB()


4) 运行程序

# Import the ranking, store the rows in colleges.db and print them.
ws = Colleges()
ws.process()


5) 运行结果

#### （3）心得体会

# Text between "function(" and "){" -> placeholder names;
# text between "void 0}}(" and ")" -> the values they stand for.
function = re.findall(r'function\((.*?)\){', data)
function = function[0].split(',')
item = re.findall(r'void 0}}\((.*?)\)', data)
item = item[0].split(',')

for i in range(len(score)):
    try:
        # NOTE(review): eval runs on text downloaded from the site; consider
        # ast.literal_eval so placeholder names cannot resolve to Python objects.
        score[i] = eval(score[i])
    except Exception:
        try:
            index = function.index(score[i])
            # The first value of item is empty, so the index found has to be shifted by +1.
            score[i] = eval(item[index + 1])  # replace the obfuscated total score with its number
        except Exception as err:
            print(err)


for i in range(len(rank)):
    try:
        index = function.index(rank[i])
        rank[i] = eval(eval(item[index + 1]))  # replace the obfuscated rank with its number
    except Exception:
        rank[i] = eval(eval(item[index]))  # replace the special entries with their number


posted on 2021-10-16 16:48  yzayr  阅读(15)  评论(0)  编辑  收藏  举报