2023 Data Collection and Fusion Technology Practice: Assignment 1

Assignment ①

1) Experiment

  • Code
import re
import urllib.request
from bs4 import BeautifulSoup

url = "http://www.shanghairanking.cn/rankings/bcur/2020"
response = urllib.request.urlopen(url)
html = response.read()
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", class_="rk-table")
print("排名\t学校名称\t省市\t学校类型\t总分")
count = 0
for row in table.find_all("tr")[1:]:  # skip the header row
    columns = row.find_all("td")
    rank = columns[0].text.strip()
    # The name cell also contains the English name and a "双一流" badge;
    # keep only the Chinese characters, then strip the badge text.
    school_name = re.sub(r'[^\u4e00-\u9fa5]+', '', columns[1].text.strip())
    school_name_encode = school_name.replace('双一流', '')
    province = columns[2].text.strip()
    school_type = columns[3].text.strip()
    total_score = columns[4].text.strip()

    print(f"{rank}\t{school_name_encode}\t{province}\t{school_type}\t{total_score}")

    count += 1
    if count == 15:  # only print the top 15 universities
        break

  • Run results

2) Reflections

This exercise introduced the urllib.request and BeautifulSoup modules: I learned how to crawl a web page and use tags to pin down the content I need. One shortcoming is that the final output is not formatted, so it is not very readable.
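To address that alignment shortcoming, here is a minimal sketch (my own addition, not part of the assignment code): tab stops misalign because Chinese characters take two terminal cells, so the helper pads by display width instead of character count.

def pad(text, width):
    # Chinese characters occupy two cells in most terminals, so compute the
    # display width rather than the character count before padding with spaces.
    display = sum(2 if '\u4e00' <= ch <= '\u9fa5' else 1 for ch in text)
    return text + ' ' * max(width - display, 0)

# Hypothetical sample row, for illustration only
rank, name, province, kind, score = "1", "清华大学", "北京", "综合", "852.5"
print(pad(rank, 6) + pad(name, 22) + pad(province, 8) + pad(kind, 8) + score)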

Assignment ②

1) Experiment

  • Code:
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup
import re


def getHTML(url):
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.81'
    }
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read().decode('gbk')  # Dangdang pages are GBK-encoded
    return html


def getData(url, desirable_page):
    for i in range(1, desirable_page + 1):
        search_url = url + "&page_index=" + str(i)
        html = getHTML(search_url)
        # BeautifulSoup alternative that was also tried:
        # soup = BeautifulSoup(html, "html.parser")
        # for items in soup.find_all('p', attrs={"class": "name", "name": "title"}):
        #     names = items.find_all('a')
        #     for name in names:
        #         title = name["title"]
        #         nameList.append(title)
        # The regex below is tailored to Dangdang's current markup and is brittle
        namelist = re.findall(".' alt=' (.*?)' /><p class=", html)
        # for items in soup.find_all('span', attrs={"class": "price_n"}):
        #     priceList.append(items.string)
        pricelist = re.findall('<span class="price_n">&yen;(.*?)</span>', html)
        print("\n")
        print("-------这是第%d页--------" % i)
        print("{:<3}\t{:<25}\t{:>}\t".format('序号', '商品名称', '商品价格'))
        # zip guards against the two lists differing in length
        for j, (name, price) in enumerate(zip(namelist, pricelist), start=1):
            print("{:<3}\t{:<25}\t{:>}\t".format(str(j), name, price))


def main():
    desirable_page = 2  # how many pages to scrape
    key = '书包'  # search keyword
    key_encoded = urllib.parse.quote(key)
    url = f"http://search.dangdang.com/?key={key_encoded}&act=input"
    getData(url, desirable_page)


if __name__ == "__main__":
    main()

  • Run results

2) Reflections

This exercise used the re library, i.e. regular expressions, to extract the desired content; I also tried selecting with BeautifulSoup and compared the two. Regular expressions are easy to use when the text before and after the target is unique; otherwise they tend to match multiple identical pieces of content.
I also learned how to page through results by appending a parameter such as &page_index= to the URL.
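For comparison, here is a minimal BeautifulSoup sketch of the same extraction, based on the commented-out attempt in the code above (the class names are assumptions about Dangdang's current markup):

from bs4 import BeautifulSoup

def parse_with_soup(html):
    # Select by tag and class instead of relying on unique surrounding text
    soup = BeautifulSoup(html, "html.parser")
    names = [a["title"]
             for p in soup.find_all('p', attrs={"class": "name", "name": "title"})
             for a in p.find_all('a') if a.has_attr("title")]
    prices = [span.string for span in soup.find_all('span', attrs={"class": "price_n"})]
    return list(zip(names, prices))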

Assignment ③

1) Experiment

  • Code:
import os
import urllib.request
import requests
import re


def getHTML(url):
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31'
    }
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read().decode('UTF-8')  # the page is UTF-8 encoded
    return html


def gettext(url):
    # requests-based alternative to getHTML; kept for comparison, not used below
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31'
    }
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    return response.text


def download_image(url, directory, filename):
    response = requests.get(url)
    if response.status_code == 200:
        os.makedirs(directory, exist_ok=True)  # create the target folder if it does not exist
        save_path = os.path.join(directory, filename)
        with open(save_path, 'wb') as file:
            file.write(response.content)
        print("图片下载完成:", save_path)
    else:
        print("无法下载图片")


def getData(url):
    html = getHTML(url)
    # src attributes on the page are site-relative paths, so the host is prepended below
    imagelist = re.findall('<img src="(.*?)" width=', html)
    for j in range(0, len(imagelist)):
        img_url = f'http://xcb.fzu.edu.cn{imagelist[j]}'
        filename = str(j + 1) + '.jpg'
        download_image(img_url, 'fzu', filename)


def main():
    # main previously repeated getData's body inline; calling it avoids the duplication
    url = "https://xcb.fzu.edu.cn/info/1071/4481.htm"
    getData(url)


if __name__ == "__main__":
    main()
  • Run results:

2) Reflections

This exercise covered downloading images from a web page. When saving, an image extension such as .jpg or .png must be appended to the filename; otherwise the saved file is not recognized as an image.
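Hardcoding .jpg works for this page, but a small sketch (a hypothetical helper, not part of the assignment code) can take the extension from the image URL instead:

import os.path

def filename_for(img_url, index):
    # Use the extension from the URL path, falling back to .jpg if there is none
    ext = os.path.splitext(img_url.split('?')[0])[1] or '.jpg'
    return f"{index}{ext}"

print(filename_for('http://xcb.fzu.edu.cn/__local/photo.png', 1))  # -> 1.png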
