# 【数据采集与融合】第二次实践

## 作业①

### 要求：在中国气象网（http://www.weather.com.cn）给定城市集的7日天气预报，并保存在 数据库

1 北京 7日（今天） 晴间多云，北部山区有阵雨或雷阵雨转晴转多云 31℃/17℃
2 北京 8日（明天） 多云转晴，北部地区有分散阵雨或雷阵雨转晴 34℃/20℃
3 北京 9日（后天） 晴转多云 36℃/22℃
4 北京 10日（周六） 阴转阵雨 30℃/19℃
5 北京 11日（周日） 阵雨 27℃/18℃
6......

### 核心代码如下：

#### 1.创建一个天气数据库的类

class WeatherDB:
def openDB(self):
self.con = sqlite3.connect("weathers.db")
self.cursor = self.con.cursor()
try:
self.cursor.execute("create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
except:
self.cursor.execute("delete from weathers")

def closeDB(self):
self.con.commit()
self.con.close()

def insert(self, city, date, weather, temp):
try:
self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
(city, date, weather, temp))
except Exception as err:
print(err)

def show(self):
global num
self.cursor.execute("select * from weathers")
rows = self.cursor.fetchall()
print("{0:^16}{1:{5}^16}{2:{5}^16}{3:{5}^32}{4:{5}^32}".format("序号","city","date", "weather","temp",chr(12288)))
for row in rows:
print("{0:^16}{1:{5}^16}{2:{5}^16}{3:{5}^32}{4:{5}^32}".format(str(num),row[0],row[1],row[2],row[3],chr(12288)))
num += 1

#### 2.爬取天气数据，并插入数据库

class WeatherForecast:
    """Scrape weather.com.cn 7-day forecast pages for a fixed set of cities
    and store the rows through a WeatherDB instance."""

    def __init__(self):
        # NOTE(review): the opening of this dict was lost in the paste;
        # reconstructed from the surviving "User-Agent" fragment.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        # City name -> weather.com.cn station code.
        self.cityCode = {"北京": "101010100", "上海": "101020100",
                         "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        """Fetch one city's 7-day page, parse each forecast <li>, insert into self.db."""
        if city not in self.cityCode:
            print(city + " code cannot be found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            # BUG FIX: the original was missing the line that builds `req`.
            req = urllib.request.Request(url, headers=self.headers)
            # BUG FIX: UnicodeDammit needs the raw bytes, not the response object.
            data = urllib.request.urlopen(req).read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            soup = BeautifulSoup(dammit.unicode_markup, "lxml")
            for li in soup.select("ul[class='t clearfix'] li"):
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select("p[class='tem']")[0].text.strip()
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    # One malformed <li> should not abort the whole page.
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        """Open the database, scrape every requested city, dump the table, close."""
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        self.db.show()
        self.db.closeDB()

### 1.4实验心得：

这次实验对之前爬取天气的实例进行了复现，让我加深了对爬取网页的代码的学习，同时我们还学习了对如何创建一个数据库类，并且向数据库中插入数据，学习如何在pycharm中连接数据库，查看表格

## 2.1作业内容

### 2.2实验步骤：

打开对应的url查看：

#### 2.核心代码：

def getHtml(url):
    """GET *url* with a browser User-Agent and return the response body as text.

    Prints the error and returns None on any failure.
    NOTE(review): the original request line was lost in the paste; rebuilt
    with stdlib urllib so the function is self-contained.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req) as resp:
            charset = resp.headers.get_content_charset() or "utf-8"
            return resp.read().decode(charset, errors="replace")
    except Exception as err:
        print(err)

num = 1  # running row number shared across successive getContent() calls


def getContent(html):
    """Extract the stock list embedded in the eastmoney JSONP response.

    Returns a list of rows:
    [num, code, name, latest price, rise/fall %, rise/fall, volume,
     turnover, amplitude, high, low, today open, prev close].
    """
    import json

    # Payload shape: ... "diff":[{...},{...}] ...
    # BUG FIX: the original pattern's brackets were mangled into `$` by the
    # paste; restored to match the bracketed array.
    matches = re.findall(r'"diff":\[(.*?)\]', html, re.M | re.S)
    # BUG FIX: parse with json.loads instead of eval() — never eval data
    # that arrives over the network.
    stocks = json.loads("[" + matches[0] + "]")
    global num
    result = []
    for stock in stocks:
        result.append([num, stock["f12"], stock["f14"], stock["f2"],
                       stock["f3"], stock["f4"], stock["f5"], stock["f6"],
                       stock["f7"], stock["f15"], stock["f16"], stock["f17"],
                       stock["f18"]])
        num += 1

    return result

# 股票数据库
# SQLite store for scraped stock quotes.
class stockDB:
    def openDB(self):
        """Open (or create) stocks.db and start from an empty table."""
        self.con = sqlite3.connect("stocks.db")
        self.cursor = self.con.cursor()
        # `if not exists` plus an explicit clear has the same net effect as
        # the original bare-except pattern, without hiding real errors.
        self.cursor.execute(
            "create table if not exists stocks ("
            "Num varchar(16), stockCode varchar(16),stockName varchar(16),"
            "Newprice varchar(16),RiseFallpercent varchar(16),RiseFall varchar(16),"
            "Turnover varchar(16),Dealnum varchar(16),Amplitude varchar(16),"
            "max varchar(16),min varchar(16),today varchar(16),yesterday varchar(16))")
        self.cursor.execute("delete from stocks")

    def closeDB(self):
        """Commit pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    # NOTE(review): parameter names (including builtin-shadowing `max`/`min`)
    # are kept so existing keyword callers don't break.
    def insert(self, Num, stockcode, stockname, newprice, risefallpercent,
               risefall, turnover, dealnum, Amplitude, max, min, today, yesterday):
        """Insert one quote row; errors are printed rather than raised."""
        try:
            self.cursor.execute(
                "insert into stocks(Num,stockCode,stockName,Newprice,RiseFallpercent,"
                "RiseFall,Turnover,Dealnum,Amplitude,max,min,today,yesterday) "
                "values (?,?,?,?,?,?,?,?,?,?,?,?,?)",
                (Num, stockcode, stockname, newprice, risefallpercent, risefall,
                 turnover, dealnum, Amplitude, max, min, today, yesterday))
        except Exception as err:
            print(err)

    # Script entry: print the header, scrape 5 pages of quotes, store them.
    # (Reconstructed: the paste destroyed this section's indentation.)
    # Row template — field 13 (chr(12288), the full-width space) pads CJK columns.
s = "{0:^10}\t{1:{13}^10}\t{2:{13}^10}\t{3:{13}^10}\t{4:{13}^10}\t{5:{13}^10}\t{6:{13}^10}\t{7:{13}^10}\t{8:{13}^10}\t{9:{13}^10}\t{10:{13}^10}\t{11:{13}^10}\t{12:{13}^10}"
print(s.format("序号", "股票代码", "股票名称", "最新价", "涨跌幅", "涨跌额",
               "成交量", "成交额", "振幅", "最高", "最低", "今收", "昨收", chr(12288)))
stockdb = stockDB()  # create the database object
stockdb.openDB()  # open the database
for page in range(1, 6):
    # Only five pages are fetched (the author's student ID ends in 5).
    url = "http://19.push2.eastmoney.com/api/qt/clist/get?cb=jQuery11240009917002240502182_1634088844934&pn=" + str(page) + "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1634088845178"
    html = getHtml(url)
    stocks = getContent(html)
    for stock in stocks:
        print(s.format(stock[0], stock[1], stock[2], stock[3], stock[4],
                       stock[5], stock[6], stock[7], stock[8], stock[9],
                       stock[10], stock[11], stock[12], chr(12288)))
        # persist each row
        stockdb.insert(stock[0], stock[1], stock[2], stock[3], stock[4],
                       stock[5], stock[6], stock[7], stock[8], stock[9],
                       stock[10], stock[11], stock[12])
stockdb.closeDB()

### 2.4实验心得：

1.学习了如何抓包，过程中有很多名字一样的response，要点进 F12 开发者工具的【网络】面板的【预览】功能，查看所要爬取的数据的内容，找到相应的url

2.同时也学习了如何创建数据库的类，把爬取下来的数据内传入到数据库里

3.这次实验不仅加深了对之前爬取网页内容的学习，还学习了新的内容，为我后面的爬虫学习之旅奠定了一定的基础

## 作业③

### 3.2实验步骤：

#### 获取数据

def getHtml(url):
    """GET *url* with a browser User-Agent and return the decoded body.

    Prints the error and returns None on any failure.
    NOTE(review): the original request line was lost in the paste; the
    surviving `raise_for_status`/`apparent_encoding` calls suggest it used
    `requests` — rebuilt here with stdlib urllib, which behaves equivalently
    (HTTPError on bad status, charset taken from the response headers).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req) as resp:
            charset = resp.headers.get_content_charset() or "utf-8"
            return resp.read().decode(charset, errors="replace")
    except Exception as err:
        print(err)

def getContent(html):
    """Pull (name, score) pairs out of the ranking page's embedded JS,
    print them as an aligned table, and store them via collegeDB."""
    name_pat = 'univNameCn:"(.*?)"'
    score_pat = 'score:(.*?),'

    namelist = re.findall(name_pat, html, re.S | re.M)
    scorelist = re.findall(score_pat, html, re.S | re.M)
    collegedb = collegeDB()
    collegedb.openDB()
    print("{0:^10}\t{1:{3}^10}\t{2:{3}^10}".format("排名", "学校名称", "总分", chr(12288)))
    # zip + enumerate replaces index arithmetic and also avoids an IndexError
    # if the two lists ever come back with different lengths.
    for rank, (name, score) in enumerate(zip(namelist, scorelist), start=1):
        print("{0:^10}\t{1:{3}^10}\t{2:{3}^10}".format(rank, name, score, chr(12288)))
        collegedb.insert(rank, name, score)
    # BUG FIX: the original never called closeDB(), so the inserts were
    # never committed and colleges.db stayed empty.
    collegedb.closeDB()

#### 创建数据库类：

class collegeDB:
    """SQLite store for the scraped university ranking."""

    def openDB(self):
        """Open (or create) colleges.db and start from an empty table."""
        self.con = sqlite3.connect("colleges.db")  # connects, creating the file if absent
        self.cursor = self.con.cursor()
        # `if not exists` plus an explicit clear has the same net effect as
        # the original bare-except pattern, without hiding real errors.
        self.cursor.execute(
            "create table if not exists colleges("
            "Rank varchar(10),Schoolname varchar(10),Score varchar(10))")
        self.cursor.execute("delete from colleges")

    def closeDB(self):
        """Commit pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, rank, schoolname, score):
        """Insert one ranking row; errors are printed rather than raised."""
        try:
            self.cursor.execute(
                "insert into colleges(Rank,Schoolname,Score) values (?,?,?)",
                (rank, schoolname, score))
        except Exception as err:
            print(err)

### 3.4实验心得：

1.这次实验学习了如何抓包，过程中有很多名字一样的response，要点进 F12 开发者工具的【网络】面板的【预览】功能，查看所要爬取的数据的内容，找到相应的url进行爬取。虽然这次实验和上一次爬取大学排名有一点相似，但是也增加了很多新的内容，学到了很多新知识

2.同时也学习了如何创建数据库的类，把爬取下来的数据内传入到数据库里

3.这次实验不仅加深了对之前爬取网页内容的学习，还学习了新的内容，为我后面的爬虫学习之旅奠定了一定的基础

### 全部代码地址：第二次实践作业 · 王欢/数据采集与融合 - 码云 - 开源中国 (gitee.com)

posted @ 2021-10-14 18:49  cowhorse  阅读(24)  评论(0编辑  收藏  举报