Data Collection and Fusion Technology Practice, Assignment 1 (Class of 2023)
Task 1
1) Requirements
Use the requests and BeautifulSoup libraries to crawl the given URL (http://www.shanghairanking.cn/rankings/bcur/2020) and print the scraped university ranking information to the screen.
Output:
| Rank | University | Province/City | Type | Total Score |
|---|---|---|---|---|
| 1 | 清华大学 | 北京 | 综合 | 852.5 |
| 2 | ...... | | | |
Code:
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.76"
}

def getHTMLText(url):
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return "未能调用"  # sentinel string meaning "request failed"

def fillUnivList(soup):
    ulist = []
    table = soup.find("table", class_="rk-table")
    for tr in table.tbody.find_all("tr"):
        try:
            td_list = tr.find_all("td")
            if len(td_list) >= 5:
                rank = td_list[0].text.strip()      # rank
                name = td_list[1].a.text.strip()    # university name
                location = td_list[2].text.strip()  # province/city
                category = td_list[3].text.strip()  # institution type
                score = td_list[4].text.strip()     # total score
                ulist.append([rank, name, location, category, score])
        except Exception as err:
            print("发生异常:", err)
    return ulist

def printUnivList(ulist, num):
    # columns center-aligned at widths 10/14/6/8/6
    print("{:^10}\t{:^14}\t{:^6}\t{:^8}\t{:^6}".format("排名", "学校名称", "省市", "学校类型", "总分"))
    for i in range(min(num, len(ulist))):
        u = ulist[i]
        print("{:^10}\t{:^14}\t{:^6}\t{:^8}\t{:^6}".format(u[0], u[1], u[2], u[3], u[4]))

def main():
    url = "http://www.shanghairanking.cn/rankings/bcur/2020"
    html = getHTMLText(url)
    if html == "未能调用":
        print("无法获取网页内容")
        return
    soup = BeautifulSoup(html, "html.parser")
    ulist = fillUnivList(soup)
    printUnivList(ulist, 30)

main()
2) Reflections
In this exercise, wrapping each step in try-except made it easy to see which step went wrong during a run. The site also seems to be aimed at the general public, so its anti-scraping measures are relatively weak and fetching the data met essentially no obstacles.
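If finer-grained diagnostics were wanted, one refinement (only a sketch, not part of the submitted code; the function name and timeout value are arbitrary choices) would be to catch the specific requests exception types instead of a bare except, so the printed message says whether the failure was a timeout, a bad HTTP status, or something else:

```python
import requests

def fetch(url, headers=None):
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()              # turn 4xx/5xx responses into exceptions
        r.encoding = r.apparent_encoding
        return r.text
    except requests.exceptions.Timeout:
        print("request timed out:", url)
    except requests.exceptions.HTTPError as err:
        print("bad HTTP status:", err)
    except requests.exceptions.RequestException as err:
        print("request failed:", err)
    return None
```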
Task 2
1) Requirements
Use the requests and re libraries to build a price-comparison crawler for an online store of your choice: crawl the store's search results for the keyword "书包" (schoolbag) and extract the product names and prices.
Output:
| No. | Price | Product Name |
|---|---|---|
| 1 | 65.00 | xxx |
| 2 | ...... | |
Code:
import re
import urllib.request
import http.cookiejar

# cookies captured from a logged-in JD session; sent with the request so the
# search page is served instead of a login redirect
cookies = {
    "shshshfpa": "6a3735a9-83c4-d291-c1d8-5aaee220fef7-1661328416",
    "__jdu": "1508279914",
    "shshshfp": "2f609deafdea6ee925650952de2d2492",
    "shshshfpx": "6a3735a9-83c4-d291-c1d8-5aaee220fef7-1661328416",
    "areaId": "16",
    "ipLoc-djd": "16-1303-0-0",
    "PCSYCityID": "CN_350000_350100_0",
    "logintype": "wx",
    "unick": "chPuaBDUHXMk",
    "pin": "wdchPuaBDUHXMk",
    "npin": "wdchPuaBDUHXMk",
}

def getHtmlText(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.81",
            "Cookie": "; ".join(f"{k}={v}" for k, v in cookies.items()),
        }
        cj = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        req = urllib.request.Request(url, headers=headers)
        data = opener.open(req).read().decode('utf-8')
        # cache the page locally so it does not have to be re-fetched on every run
        with open("html.txt", "w", encoding='utf-8') as f:
            f.write(data)
    except Exception as err:
        print("未能调用", err)

def getData():
    result = []
    try:
        with open("html.txt", "r", encoding='utf-8') as f:
            html = f.read()
        name_pattern = r'<div class="p-name.*?">.*?<em>(.*?)</em>.*?</div>'
        price_pattern = r'<div class="p-price">(.*?)</div>'
        name_list = re.findall(name_pattern, html, re.S)
        price_list = re.findall(price_pattern, html, re.S)
        for i, (name, price) in enumerate(zip(name_list, price_list), start=1):
            name = re.sub(r'<.*?>', '', name).strip()    # strip remaining HTML tags
            price = re.sub(r'<.*?>', '', price).strip()  # strip remaining HTML tags
            result.append([i, price, name])
    except Exception as err:
        print("错误:", err)
    return result

def main():
    url = "https://search.jd.com/Search?keyword=%E4%B9%A6%E5%8C%85&enc=utf-8&wq=%E4%B9%A6%E5%8C%85&pvid=77bd6af9b1894e1296fd835c97958e17"
    getHtmlText(url)
    data = getData()
    print("{:^5}\t{:^5}\t{:^20}".format("序号", "价格", "商品名"))
    for item in data:
        print("{:^5}\t{:^5}\t{:^20}".format(item[0], item[1], item[2]))

if __name__ == "__main__":
    main()
2) Reflections
In this exercise my handling of regular expressions was not yet fluent, and one problem came up: on JD's search page some products show two prices, an original price and a student price. I adjusted the expression repeatedly but still could not match exactly one of them, so in the end both are kept; this still needs improvement. Another problem was JD's anti-scraping mechanism: the page cannot be fetched many times in a row. To work around it, the fetched page is saved as HTML text in a txt file, which makes it easy to extract the data from later.
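One possible way to narrow the price match (a sketch only; it assumes the displayed price sits in the first <i> tag inside each p-price div, which may not hold for every JD listing or page version):

```python
import re

# Illustrative sketch working on the locally cached page from the exercise above.
with open("html.txt", "r", encoding="utf-8") as f:
    html = f.read()

# Capture only the first <i>...</i> inside each p-price block, so a second
# price (e.g. a student price) in the same block is ignored.
price_pattern = r'<div class="p-price">.*?<i>(.*?)</i>'
price_list = [p.strip() for p in re.findall(price_pattern, html, re.S)]
print(price_list[:10])
```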
Task 3
1) Requirements
Crawl all JPEG and JPG files from a given page (https://xcb.fzu.edu.cn/info/1071/4481.htm) or a page of your own choice.
Output: save all JPEG and JPG files from the chosen page into a single folder.
Code:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.36"
}

# collect the .jpg/.jpeg links on the page
def getImagelinks(url):
    img_links = []
    try:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        for img in soup.find_all("img"):
            src = img.get("src")
            if src is not None and ('.jpg' in src or '.jpeg' in src):
                # build the absolute image URL from the (possibly relative) src
                full_url = urljoin(url, src)
                img_links.append(full_url)
    except Exception as err:
        print("未能获取网页图片链接:", err)
    return img_links

# download the images into a local folder
def download_images(img_links):
    folder_name = 'downloaded_images'
    os.makedirs(folder_name, exist_ok=True)
    for link in img_links:
        try:
            response = requests.get(link, stream=True, headers=headers)
            file_name = link.split('/')[-1].split('?')[0]
            file_name = file_name.replace(':', '_').replace('?', '_')
            with open(os.path.join(folder_name, file_name), 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            print(f'Downloaded: {file_name}')
        except Exception as e:
            print(e)

if __name__ == '__main__':
    url = "https://xcb.fzu.edu.cn/info/1071/4481.htm"
    img_links = getImagelinks(url)
    download_images(img_links)
2) Reflections
During this exercise the image links initially caused errors. It turned out that the links returned by getImagelinks() were relative paths missing the scheme (http or https), so requests.get() could not handle them. To fix this, urllib.parse.urljoin() is used to build the complete image URLs. A second issue was that the downloaded pictures could not be viewed at first because of a format problem; only later did I learn that Image.open() opens and displays images through the Pillow library, so Pillow has to be installed before that method can be used.
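To double-check that the downloaded files really are readable images, a small Pillow check can be run after download_images() (a sketch; it assumes Pillow has been installed with pip install pillow and that the files sit in the downloaded_images folder used above):

```python
import os
from PIL import Image

folder_name = 'downloaded_images'
for file_name in os.listdir(folder_name):
    path = os.path.join(folder_name, file_name)
    try:
        with Image.open(path) as img:
            img.verify()   # raises an exception if the file is corrupt or not an image
        print(f'{file_name}: valid image')
    except Exception as err:
        print(f'{file_name}: cannot be opened as an image ({err})')
```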