2023数据采集与融合技术实践作业一
作业①
1)、实验
- 代码
import re
import urllib.request
from bs4 import BeautifulSoup

# Scrape the 2020 Best Chinese Universities Ranking and print the top 15 rows
# as a tab-separated table: rank, school name, province, type, total score.
url = "http://www.shanghairanking.cn/rankings/bcur/2020"
# Context manager ensures the HTTP connection is closed (original leaked it).
with urllib.request.urlopen(url) as response:
    html = response.read()
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", class_="rk-table")
print("排名\t学校名称\t省市\t学校类型\t总分")
# [1:16] skips the header row and keeps only the first 15 data rows,
# replacing the original manual counter + break.
for row in table.find_all("tr")[1:16]:
    columns = row.find_all("td")
    rank = columns[0].text.strip()
    # Keep only CJK characters, then drop the "双一流" badge text that the
    # site appends after the school name.
    school_name = re.sub(r'[^\u4e00-\u9fa5]+', '', columns[1].text.strip())
    school_name = school_name.replace('双一流', '')
    province = columns[2].text.strip()
    school_type = columns[3].text.strip()
    total_score = columns[4].text.strip()
    print(f"{rank}\t{school_name}\t{province}\t{school_type}\t{total_score}")
- 运行结果
![]()
2)、心得体会
学习了解了urllib.request模块和BeautifulSoup模块，掌握了如何爬取网页信息，并用标签来锁定网页中所需要的内容。不足的是最后的输出没有进行格式化，观赏性较差。
作业②
1)、实验
- 代码:
import urllib.request
from bs4 import BeautifulSoup
import re
def getHTML(url):
    """Fetch *url* with a desktop-browser User-Agent and return the body decoded as GBK."""
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.81'
    }
    req = urllib.request.Request(url, headers=headers)
    resp = urllib.request.urlopen(req)
    # Dangdang serves GBK-encoded pages, so decode accordingly.
    return resp.read().decode('gbk')
def getData(url, desirable_page):
    """Scrape and print product names and prices from Dangdang search results.

    url: search URL without the page index suffix.
    desirable_page: number of result pages to fetch and print.
    """
    for i in range(1, desirable_page + 1):
        search_url = url + "&page_index=" + str(i)
        html = getHTML(search_url)
        # Plain regex extraction: the surrounding markup is unique enough that
        # a full BeautifulSoup parse is unnecessary here.
        namelist = re.findall(r".' alt=' (.*?)' /><p class=", html)
        pricelist = re.findall(r'<span class="price_n">¥(.*?)</span>', html)
        print("\n")
        print("-------这是第%d页--------" % i)
        print("{:<3}\t{:<25}\t{:>}\t".format('序号', '商品名称', '商品价格'))
        # zip stops at the shorter list, so a stray extra match in one regex
        # can no longer raise IndexError as parallel index access did.
        for j, (name, price) in enumerate(zip(namelist, pricelist), start=1):
            print("{:<3}\t{:<25}\t{:>}\t".format(str(j), name, price))
def main():
    """Entry point: search Dangdang for a keyword and print two result pages."""
    # Explicit import: the module top only imports urllib.request, and relying
    # on it to bind urllib.parse as a side effect is a CPython implementation
    # detail, not a guarantee.
    import urllib.parse

    desirable_page = 2  # number of pages to print
    key = '书包'  # search keyword
    # Percent-encode the non-ASCII keyword for use in the query string.
    key_encoded = urllib.parse.quote(key)
    url = f"http://search.dangdang.com/?key={key_encoded}&act=input"
    getData(url, desirable_page)

if __name__ == "__main__":
    main()
- 运行结果
![]()
2)、心得体会
学习了通过re库,即使用正则表达式来进行对所需内容的提取,其中也尝试了使用soup来进行选取,对比了两者的不同,正则表达式在前后内容都是独一无二的时候比较好使用,否则容易筛选出多个相同的内容。
同时也学会了如何通过在网址后加上&page_index=页码来进行页面索引。
作业③
1)、实验
- 代码:
import os
import urllib.request
import requests
from bs4 import BeautifulSoup
import re
def getHTML(url):
    """Fetch *url* via urllib with a browser User-Agent; return the body decoded as UTF-8."""
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31'
    }
    req = urllib.request.Request(url, headers=headers)
    resp = urllib.request.urlopen(req)
    # The target site serves UTF-8 pages.
    return resp.read().decode('UTF-8')
def gettext(url):
    """Fetch *url* with requests and return the response text decoded as UTF-8.

    NOTE(review): this helper is never called elsewhere in this script.
    """
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31'
    resp = requests.get(url=url, headers={"User-Agent": ua})
    # Force UTF-8 decoding regardless of the declared charset.
    resp.encoding = 'utf-8'
    return resp.text
def download_image(url, directory, filename):
    """Download the image at *url* and save it as directory/filename.

    Prints a success message with the saved path, or a failure message when
    the HTTP status is not 200.
    """
    response = requests.get(url)
    if response.status_code == 200:
        # Bug fix: open() raises FileNotFoundError if the target directory
        # does not exist yet, so create it first.
        os.makedirs(directory, exist_ok=True)
        save_path = os.path.join(directory, filename)
        with open(save_path, 'wb') as file:
            file.write(response.content)
        print("图片下载完成:", save_path)
    else:
        print("无法下载图片")
def getData(url):
    """Download every <img> referenced on the page at *url* into ./fzu as <n>.jpg."""
    html = getHTML(url)
    # src values are site-relative paths; the regex anchors on the width
    # attribute that follows each src in this page's markup.
    imagelist = re.findall(r'<img src="(.*?)" width=', html)
    for j, src in enumerate(imagelist, start=1):
        img_url = f'http://xcb.fzu.edu.cn{src}'
        # The .jpg extension must be part of the filename or the saved file
        # is not recognized as an image.
        download_image(img_url, 'fzu', f'{j}.jpg')
def main():
    """Entry point: download all images from the FZU publicity-department article."""
    url = "https://xcb.fzu.edu.cn/info/1071/4481.htm"
    # The original body duplicated getData() line for line (same regex, same
    # download loop); delegate to the helper instead of repeating it.
    getData(url)

if __name__ == "__main__":
    main()
- 运行结果:
![]()
![]()
2)、心得体会
学习了如何从网页下载图片,下载的时候需要在图片命名加上.jpg或者.png等其他的图片属性,否则无法得到图片。





浙公网安备 33010602011771号