Assignment 2

Task 1

  • Requirements

Scrape the 7-day weather forecasts for a given set of cities from the China Weather site (http://www.weather.com.cn) and save them to a database.

  • Code

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3
class WeatherDB:
	def openDB(self):
		self.con = sqlite3.connect("weathers.db")
		self.cursor = self.con.cursor()
		try:
			# create the table on first run; the composite primary key (wCity, wDate) blocks duplicate rows
			self.cursor.execute("create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
		except Exception:
			# the table already exists: clear out the old rows instead
			self.cursor.execute("delete from weathers")
			
	def closeDB(self):
		self.con.commit()
		self.con.close()
		
	def insert(self,city,date,weather,temp):
		try:
			self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values(?,?,?,?)",(city,date,weather,temp))
		except Exception as err:
			print(err)
		
	def show(self):
		self.cursor.execute("select * from weathers")
		rows = self.cursor.fetchall()
		print("%-16s%-16s%-32s%-16s"%("city","data","weather","temp"))
		for row in rows:
			print("%-16s%-16s%-32s%-16s"%(row[0],row[1],row[2],row[3]))
			
class WeatherForecast:
	def __init__(self):
		self.headers = {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US;rv:1.9pre)Gecko/2008072421 Minefield/3.0.2pre"}
		self.cityCode = {"北京":"101010100","上海":"101020100","广州":"101280101","深圳":"101280601"}
		
	def forecastCity(self,city):
		if city not in self.cityCode:
			print(city+" code cannot be found")
			return 
		url = "http://www.weather.com.cn/weather/"+self.cityCode[city]+".shtml"
		try:
			req = urllib.request.Request(url,headers=self.headers)
			data = urllib.request.urlopen(req)
			data = data.read()
			dammit = UnicodeDammit(data,["utf-8","gbk"])
			data = dammit.unicode_markup
			soup = BeautifulSoup(data,"lxml")
			# each <li> in the 7-day forecast list holds one day's data
			lis = soup.select("ul[class='t clearfix'] li")
			for li in lis:
				try:
					date = li.select('h1')[0].text
					weather = li.select('p[class="wea"]')[0].text
					temp = li.select('p[class="tem"] span')[0].text+"/"+li.select('p[class="tem"] i')[0].text
					print(city,date, weather,temp)
					self.db.insert(city,date,weather,temp)
				except Exception as err:
					print(err)
		except Exception as err:
			print(err)
	
	def process(self,cities):
		self.db = WeatherDB()
		self.db.openDB()
		
		for city in cities:
			self.forecastCity(city)
		
		self.db.show()	
		self.db.closeDB()
		
ws = WeatherForecast()
ws.process(["北京","上海","广州","深圳"])
print("completed")
  • Output screenshot

  • Reflections

Reproducing the code from the textbook consolidated my grasp of BeautifulSoup and made me familiar with creating and working with a database.

Task 2

  • Requirements

Use the requests and BeautifulSoup libraries to do a targeted scrape of stock information.

  • Candidate sites

东方财富网 (Eastmoney): https://www.eastmoney.com/
新浪股票 (Sina Finance stocks): http://finance.sina.com.cn/stock/

  • Approach

1. The page data is loaded dynamically, so the first step is to find the URL the page's JS actually fetches the data from (via the browser's developer tools).
2. To cover different boards and multiple pages, the corresponding URL parameters are changed (fs selects the board, pn the page number) to page through the listings.
3. The stock information is extracted with a regular expression and then post-processed, field by field, into the values needed; a json-based alternative is sketched below.
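
Since the response is JSON wrapped in a jQuery callback (because of the cb= parameter), an alternative to the regex-and-split approach in the code below is to cut the wrapper off and let the json module do the parsing. A minimal sketch, assuming the response keeps the data.diff layout seen in the code below:

import json
import requests

def get_diff(url):
    # body looks like jQuery...({...}); -- keep only the part between the outer parentheses
    text = requests.get(url).text
    payload = text[text.index("(") + 1 : text.rindex(")")]
    return json.loads(payload)["data"]["diff"]  # one dict per stock, keyed by f2, f3, ...

Each record then comes back as a dict instead of a raw string, so no manual splitting on "},{" and "," is needed.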

  • Code

import requests
import re
import prettytable as pt

x = pt.PrettyTable()
x.field_names = ["No.","Code","Name","Latest","Change %","Change amt","Volume","Turnover","Amplitude","High","Low","Open","Prev close"]

#fetch the page data from the server with a GET request
def getHtml(cmd,page,fields):
    headers = {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US;rv:1.9pre)"
                            "Gecko/2008072421 Minefield/3.0.2pre"}
    #vary the URL parameters to reach different boards and page numbers
    url = "http://19.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112403324490377009397_1602209502288" \
          "&pn="+str(page)+"&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2" \
                           "&fid=f3&fs="+cmd+"&fields="+fields+"&_=1602209502289"
    r = requests.get(url,headers = headers)
    r.encoding = "utf-8"
    html = r.text
    pat = r'"diff":\[(.*?)\]'
    data = re.compile(pat,re.S).findall(html)	#with re.S, "." also matches newlines, so the whole body is matched as one string
    return data

#fetch and parse one page of stock records
def getOnePageStock(cmd,page,fields):
    data = getHtml(cmd,page,fields)
    #records are separated by "},{"; split on that, then strip the leading "{" and trailing "}"
    datas = data[0].split("},{")
    datas[0] = datas[0].replace('{','')
    datas[-1] = datas[-1].replace('}','')
    stocks = []
    for i in range(len(datas)):
        #split each record on commas into its per-field "key":value strings
        stock = datas[i].split(",")
        stocks.append(stock)
    return stocks

def main():
    cmd = {
        "沪深A股": "m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23",
        "上证A股": "m:1+t:2,m:1+t:23"
    }
    results = []
    #fields to request:
    #f12 code, f14 name, f2 latest, f3 change %, f4 change amt, f5 volume, f6 turnover, f7 amplitude, f15 high, f16 low, f17 open, f18 prev close
    fields = "f12,f14,f2,f3,f4,f5,f6,f7,f15,f16,f17,f18"
    for i in cmd.keys():
        page = 1
        #scrape the first three pages of each board
        while page <= 3:
            result = getOnePageStock(cmd[i],page,fields)
            results += result
            page += 1
    i = 0
    for stock in results:
        i = i+1
        #each field string looks like "f12":"000001"; keep only the value after the colon
        vals = [field.split(":")[1] for field in stock]
        x.add_row([i, vals[6], vals[7], vals[0], vals[1], vals[2], vals[3],
                   vals[4], vals[5], vals[8], vals[9], vals[10], vals[11]])
    print(x)
main()
  • Output screenshot

  • Reflections

1. Learned the workflow for scraping dynamically loaded page data, and how to find the data URL by inspecting the JS requests.
2. Became familiar with handling pagination.
3. Learned to use prettytable to print the results.

Task 3

  • Requirements

Pick stocks whose codes are a self-chosen 3-digit number plus the last 3 digits of your student ID, and fetch those stocks' information. The packet-capture approach is the same as in Task 2.

  • Candidate sites

东方财富网 (Eastmoney): https://www.eastmoney.com/
新浪股票 (Sina Finance stocks): http://finance.sina.com.cn/stock/

  • Approach

1. By comparing the URLs for stocks with different codes, work out how to query any stock directly by its code (the secid parameter; see the sketch after this list).
2. Once the URL for a given stock code is known, the rest follows the same steps as Task 2.
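
From the examples used in the code below, secid follows a "market.code" pattern: "0." prefixes the Shenzhen codes (002125, 300125) and "1." the Shanghai code (600125). A minimal helper sketch built on that assumption:

def make_secid(code):
    #assumption: Shanghai A-share codes start with "6" (market 1);
    #everything else here is treated as Shenzhen (market 0)
    market = "1" if code.startswith("6") else "0"
    return market + "." + code

print(make_secid("600125"))  # -> 1.600125
print(make_secid("002125"))  # -> 0.002125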

  • Code

import requests
import re
import prettytable as pt

x = pt.PrettyTable()
x.field_names = ["Code","Name","Open","High","Low"]

#fetch the stock data from the server with a GET request
def getHtml(num,fields):
    headers = {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US;rv:1.9pre)Gecko"
                            "/2008072421 Minefield/3.0.2pre"}
    #select the stock by changing the secid parameter
    url = "http://push2.eastmoney.com/api/qt/stock/get?ut=fa5fd1943c7b386f172d6893dbfba10b&invt=2" \
          "&fltt=2&fields="+fields+"&secid="+num+"&cb=jQuery11240959380450062036_1601458369843&_=1601458369844"
    r = requests.get(url,headers = headers)
    r.encoding = "utf-8"
    html = r.text
    pat = r'"data":\{(.*?)\}'
    data = re.compile(pat,re.S).findall(html)	#with re.S, "." also matches newlines, so the whole body is matched as one string
    return data

#fetch one stock's data and add it to the table
def getOnePageStock(num,fields):
    data = getHtml(num,fields)
    #split the "key":value pairs on commas
    datas = data[0].strip("'").split(',')
    result = []
    for i in datas:
        result.append(i.split(':')[1])
    x.add_row([result[3].strip('"'),result[4].strip('"'),result[2],result[0],result[1]])

def main():
    nums = ["0.002125","0.300125","1.600125"]
    #fields to request: f44 high, f45 low, f46 open, f57 code, f58 name
    fields = "f44,f45,f46,f57,f58"
    for num in nums:
        try:
            getOnePageStock(num,fields)
        except Exception:
            print("No such stock: "+num)
    print(x)
main()
  • Output screenshot

  • Reflections

Building on Task 2, a few tweaks to the URL were enough to get the data I wanted.

posted @ 2020-10-06 19:11  苏镜泽