# coding=utf-8
from bs4 import BeautifulSoup  # HTML parsing
import re  # regular expressions for text matching
import urllib.request, urllib.error  # building requests and fetching page data
import urllib.parse  # encoding the POST form data
import xlwt  # Excel output
import pymysql  # MySQL output
import parsel  # XPath extraction (current week number)
import math
from datetime import datetime
# Workflow:
# 1. Crawl the pages
# 2. Parse the data
# 3. Save the data (Excel and MySQL)
roomnamelist = ['基教','一教','二教','三教','龙山A','龙山B','龙山C','龙山D']
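# Order matches jiaoshilist below; the building codes come from the form
# comments in askURL, paired by list position (an inferred mapping):
# 基教→13, 一教→31, 二教→7, 三教→21, 龙山A→36, 龙山B→37, 龙山C→38, 龙山D→39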
def jiaoshi(jiaoshilist):
    # Crawl every building in turn.
    for i in range(8):
        name1 = roomnamelist[i]
        item1 = jiaoshilist[i]
        main(item1, name1)
def main(item1, name1):
    # item1 holds this building's five queries, one per daily time block.
    for i in range(5):
        print("Crawling the academic-affairs site......")
        baseurl = "https://tiedao.vatuu.com/vatuu/CourseAction"
        datalist = getData(baseurl, i, item1)
        savepath = f"{name1}教务系统{i}课表.xls"
        saveData(datalist, savepath)
        savedatasql(datalist)
        print("Saved " + savepath)
# Regexes for the fields of one result row.
findXh = re.compile(r'<td>(\d{1,2})</td>')  # 序号 (row number; also matches capacity)
findX = re.compile(r'<font color="#000080">(.*?)</font>')  # 校区 (campus)
findJxl = re.compile(r'<td>(.*)</td>')  # 教学楼 (building)
findJsmc = re.compile(r'<font color="#0000FF">(.*)</font>')  # 教室名称 (room name)
findJslx = re.compile(r'<td>(多媒体)</td>')  # 教室类型 (room type)
findRl = re.compile(r'<td>(\d*)</td>')  # 容量 (capacity)
findZt = re.compile(r'<font color="blue">(空闲)</font>')  # 状态 (status: free)
findSfkj = re.compile(r'(可借|不可借)')  # 是否可借 (bookable or not)
findZc = re.compile(r'<td>(第.*周)</td>')  # 周次 (week range)
findXq = re.compile(r'(星期.)')  # 星期 (day of week)
findJc = re.compile(r'<td>(第.*节)</td>')  # 节次 (period range)
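# Illustrative shape of a row these patterns expect, reconstructed from the
# patterns themselves rather than captured from the live site (an assumption):
#   <tr><td>1</td> <font color="#000080">本部</font> <td>基础楼</td>
#   <font color="#0000FF">基101</font> <td>多媒体</td> <td>120</td>
#   <font color="blue">空闲</font> 可借 <td>第1-16周</td> 星期三 <td>第1-4节</td></tr>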
# Each query triple is [day_time_text bitmask, school_area_code, building code].
# The five bitmasks select the site's five daily time blocks.
time_masks = ["0000000000011", "0000000011100", "0000001100000",
              "0000110000000", "1111000000000"]
# (area code, building code) per building, in roomnamelist order.
buildings = [("1", "13"), ("1", "31"), ("1", "7"), ("1", "21"),
             ("2", "36"), ("2", "37"), ("2", "38"), ("2", "39")]
jiaoshilist = [[[mask, area, code] for mask in time_masks]
               for area, code in buildings]
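# Sanity check (an added guard, not part of the original flow): the two
# parallel tables must pair one building per name.
assert len(jiaoshilist) == len(roomnamelist) == 8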
def getData(baseurl, i, item1):  # fetch and parse one query's result page
    datalist = []
    html = askURL(baseurl, item1[i])  # raw HTML of the result page
    # Parse every table row; each complete row describes one classroom.
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.select('tr'):
        data = []  # all fields of one classroom record
        item = str(item)
        # In a complete row the 1-2 digit pattern matches twice (row number
        # and capacity); take the first match as the row number.
        xh = re.findall(findXh, item)
        data.append(xh[0] if len(xh) == 2 else "")
        x = re.findall(findX, item)
        data.append(x[0] if x else "")  # campus
        jxl = re.findall(findJxl, item)
        data.append(jxl[1] if len(jxl) > 1 else "")  # building (second <td> match)
        jsmc = re.findall(findJsmc, item)
        data.append(jsmc[0] if jsmc else "")  # room name
        jslx = re.findall(findJslx, item)
        data.append(jslx[0] if jslx else "")  # room type
        rl = re.findall(findRl, item)
        data.append(rl[1] if len(rl) == 2 else "")  # capacity (second numeric match)
        zt = re.findall(findZt, item)
        data.append(zt[0] if zt else "")  # status
        fkj = re.findall(findSfkj, item)
        data.append(fkj[0] if fkj else "")  # bookable or not
        zc = re.findall(findZc, item)
        data.append(zc[0] if zc else "")  # week range
        xq = re.findall(findXq, item)
        data.append(xq[0] if xq else "")  # day of week
        jc = re.findall(findJc, item)
        data.append(jc[0] if jc else "")  # period range
        datalist.append(data)  # one classroom record
    return datalist
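# Each record is [row no., campus, building, room name, room type, capacity,
# status, bookable, week range, day of week, period range], the same order as
# the Excel header row written in saveData below.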
def saveData(datalist, savepath):
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)  # workbook
    sheet = book.add_sheet('教务系统课表', cell_overwrite_ok=True)  # worksheet
    col = ("序号", "校区", "教学楼", "教室名称", "教室类型", "容量",
           "状态", "是否可借", "周次", "星期", "节次")
    for i in range(0, 11):
        sheet.write(0, i, col[i])  # header row
    # The first three parsed rows are table chrome, so start at index 3 and
    # shift the output rows up to follow the header directly.
    for i in range(3, len(datalist)):
        print("Row %d" % i)
        data = datalist[i]
        for j in range(0, 11):
            if len(data[j]) == 0:
                break  # incomplete row: stop writing its cells
            sheet.write(i + 1 - 3, j, data[j])
    book.save(savepath)
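# Note: xlwt writes legacy .xls workbooks, which cap a sheet at 65536 rows;
# ample here, since one query returns at most one building's rooms.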
# Fetch the query page and read the current teaching week off its header.
def week():
    def askURL(url):  # plain GET, used only to load the query form page
        head = {  # spoofed request header to mimic a browser
            "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Mobile Safari/537.36"
        }
        request = urllib.request.Request(url, headers=head)
        html = ""
        try:
            response = urllib.request.urlopen(request)
            html = response.read().decode('utf-8')
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        return html  # the full HTML of the page

    html = askURL("https://tiedao.vatuu.com/vatuu/CourseAction?setAction=classroomQuery")
    findweek = re.compile(r'第(\d*)周')
    selector = parsel.Selector(html)
    week_text = selector.xpath('//*[@id="table2"]/tr[1]/td/text()[2]').get()
    week_text = week_text.strip()
    # Pull the digits out of text like "第12周"; the first match is the week.
    return re.findall(findweek, week_text)[0]
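# Example: when the page header contains "第12周", week() returns "12".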
def askURL(url, query):  # POST one classroom query and return the result HTML
    week_no = int(week()) - 1
    week_no = int(math.pow(2, week_no))  # one-hot mask: bit (week-1) set
    dayOfWeek = datetime.now().weekday()
    day_no = int(math.pow(2, dayOfWeek))  # one-hot mask for today
    data = bytes(urllib.parse.urlencode({
        # Building codes used by the site:
        #   基础楼-本部: 13   一教-本部: 31   二教-本部: 7   三教-本部: 21
        #   新教A-龙山: 36   新教B-龙山: 37   新教C-龙山: 38   新教D-龙山: 39
        "setAction": "classroomQuery",
        "PageAction": "Query",
        "day_time_text": f"{query[0]}",
        "school_area_code": f"{query[1]}",
        "building": f"{query[2]}",
        "week_no": f"{week_no}",
        "day_no": f"{day_no}",
        "day_time1": "ON",
        "B1": "查询"}), encoding="utf-8")
    headers = {  # browser-style headers sent with the request
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0"
    }
    # The User-Agent tells the server what kind of client we claim to be,
    # i.e. what content types we can accept.
    request = urllib.request.Request(url, headers=headers, data=data, method="POST")
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
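# Worked example of the one-hot encoding above (a sketch, not from any site
# documentation): in teaching week 3, week_no = 2**(3-1) = 4 (binary 100);
# on a Wednesday datetime.now().weekday() == 2, so day_no = 2**2 = 4. The
# server presumably decodes these masks the same way as day_time_text.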
def savedatasql(datalist):
    conn = pymysql.connect(host='localhost',
                           user='root',
                           password='root',
                           database='test',
                           cursorclass=pymysql.cursors.DictCursor)
    cursor = conn.cursor()  # open a cursor
    sql = ('insert into vatuu(xh,area,Jxl,Jsmc,Jslx,Rl,Zt,Sfkj,Zc,Xq,Jc) '
           'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
    # Skip the first three rows (table chrome), matching saveData.
    for i in range(3, len(datalist)):
        data1 = tuple(datalist[i])  # one record as the parameter tuple
        try:
            cursor.execute(sql, data1)
            conn.commit()
        except Exception as e:
            print('Insert failed:', e)
            conn.rollback()  # roll back the failed insert
    cursor.close()  # close the cursor
    conn.close()  # close the connection
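# Assumed shape of the target table, a sketch only (the real schema is not
# shown in this script; adjust names and types to your database):
#   CREATE TABLE vatuu (
#       xh VARCHAR(8), area VARCHAR(32), Jxl VARCHAR(32), Jsmc VARCHAR(32),
#       Jslx VARCHAR(16), Rl VARCHAR(8), Zt VARCHAR(8), Sfkj VARCHAR(8),
#       Zc VARCHAR(64), Xq VARCHAR(8), Jc VARCHAR(32)
#   );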
if __name__ == "__main__":  # run only when executed as a script
    jiaoshi(jiaoshilist)
    print("Successfully imported into the database")