Data Collection and Fusion Technology Practice (Class of 2023): Assignment 1

Task 1

1) Requirements

Use the requests and BeautifulSoup libraries to crawl the given URL (http://www.shanghairanking.cn/rankings/bcur/2020) and print the scraped university ranking information to the screen.

Output format:

Rank  School  Province/City  Type  Total Score
1  Tsinghua University  Beijing  Comprehensive  852.5
2  ......

Code:

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.76"
}

def getHTMLText(url):
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return None

def fillUnivList(soup):
    ulist = []
    table = soup.find("table", class_="rk-table")
    for tr in table.tbody.find_all("tr"):
        try:
            td_list = tr.find_all("td")
            if len(td_list) >= 5:
                rank = td_list[0].text.strip()      # rank
                name = td_list[1].a.text.strip()    # school name
                location = td_list[2].text.strip()  # province/city
                category = td_list[3].text.strip()  # school type
                score = td_list[4].text.strip()     # total score
                ulist.append([rank, name, location, category, score])
        except Exception as err:
            print("发生异常:", err)
    return ulist

def printUnivList(ulist, num):
    print("{:^10}\t{:^14}\t{:^6}\t{:^8}\t{:^6}".format("排名", "学校名称", "省市" ,"学校类型", "总分"))  # 取10/14/6/8/6位中间对齐
    for i in range(min(num, len(ulist))):
        u = ulist[i]
        print("{:^10}\t{:^14}\t{:^6}\t{:^8}\t{:^6}".format(u[0], u[1], u[2], u[3], u[4]))

def main():
    url = "http://www.shanghairanking.cn/rankings/bcur/2020"
    html = getHTMLText(url)

    if html is None:
        print("Failed to fetch the page content")
        return

    soup = BeautifulSoup(html, "html.parser")

    ulist = fillUnivList(soup)
    printUnivList(ulist, 30)

main()

2) Reflections

In this task, handling exceptions with try-except made it easy to see which step went wrong at runtime. In addition, since this site is aimed at the general public, its anti-crawling measures are relatively weak, so fetching the data met essentially no obstacles.
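A minimal sketch of that idea (the stage labels are mine, not part of the assignment code): splitting the request and the parsing into separate try-except blocks makes it obvious which stage failed.

import requests
from bs4 import BeautifulSoup

def fetch_and_parse(url):
    # Stage 1: network request
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.RequestException as err:
        print("fetch failed:", err)
        return None
    # Stage 2: HTML parsing
    try:
        return BeautifulSoup(resp.text, "html.parser")
    except Exception as err:
        print("parse failed:", err)
        return None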

Task 2

1) Requirements

Use the requests and re libraries to build a targeted price-comparison crawler for an online store of your own choice: crawl the store's search results for the keyword "书包" (schoolbag) and extract the product names and prices.

Output format:

No.  Price  Product Name
1  65.00  xxx
2  ......

Code:

import re
import urllib.request
import http.cookiejar

cookies = {
    "shshshfpa": "6a3735a9-83c4-d291-c1d8-5aaee220fef7-1661328416",
    "__jdu": "1508279914",
    "shshshfp": "2f609deafdea6ee925650952de2d2492",
    "shshshfpx": "6a3735a9-83c4-d291-c1d8-5aaee220fef7-1661328416",
    "areaId": "16",
    "ipLoc-djd": "16-1303-0-0",
    "PCSYCityID": "CN_350000_350100_0",
    "logintype": "wx",
    "unick": "chPuaBDUHXMk",
    "pin": "wdchPuaBDUHXMk",
    "npin": "wdchPuaBDUHXMk",
}

def getHtmlText(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.81",
            # Attach the saved JD cookies defined above; without them the search page may reject the request
            "Cookie": "; ".join(f"{k}={v}" for k, v in cookies.items())
        }
        cj = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        req = urllib.request.Request(url, headers=headers)
        data = opener.open(req).read().decode('utf-8')
        # Save the raw page to disk so it can be parsed later without re-requesting
        with open("html.txt", "w", encoding='utf-8') as f:
            f.write(data)
    except Exception as err:
        print("Request failed:", err)

def getData():
    result = []
    try:
        with open("html.txt", "r", encoding='utf-8') as f:
            html = f.read()
            name_pattern = r'<div class="p-name.*?">.*?<em>(.*?)</em>.*?</div>'
            price_pattern = r'<div class="p-price">(.*?)</div>'
            name_list = re.findall(name_pattern, html, re.S)
            price_list = re.findall(price_pattern, html, re.S)

            for i, (name, price) in enumerate(zip(name_list, price_list), start=1):
                name = re.sub(r'<.*?>', '', name).strip()    # strip remaining HTML tags
                price = re.sub(r'<.*?>', '', price).strip()  # strip remaining HTML tags
                result.append([i, price, name])
    except Exception as err:
        print("错误:", err)
    return result

def main():
    url = "https://search.jd.com/Search?keyword=%E4%B9%A6%E5%8C%85&enc=utf-8&wq=%E4%B9%A6%E5%8C%85&pvid=77bd6af9b1894e1296fd835c97958e17"
    getHtmlText(url)
    data = getData()

    print("{:^5}\t{:^5}\t{:^15}".format("序号", "价格", "商品名"))
    for item in data:
        print("{:^5}\t{:^5}\t{:^20}".format(item[0], item[1], item[2]))

if __name__ == "__main__":
    main()

2) Reflections

During this task I was not yet fluent with regular expressions, and one problem came up: on JD product listings some items display two prices, a regular price and a student price. After repeatedly adjusting the pattern I still could not match just one of them, so in the end both are kept, which still needs improvement. Another problem was JD's anti-crawling mechanism: the page cannot be requested many times in a row. To work around this, the fetched page is saved as HTML text in a .txt file, so the data can be extracted from it later without re-requesting the site.
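As a possible follow-up, one way to keep only the first price is to anchor the regular expression on the first <i>...</i> inside the p-price block. This is only a sketch; the HTML snippet below is a made-up simplification of the JD markup and may not match every listing.

import re

# Hypothetical search-result fragment with two prices (regular price + student price)
html = '''
<div class="p-price">
  <strong><em>¥</em><i>65.00</i></strong>
  <span class="price-student"><em>¥</em><i>59.00</i></span>
</div>
'''

# Non-greedy match up to the first <i>...</i> inside the p-price div,
# so only the first (regular) price is captured.
m = re.search(r'<div class="p-price">.*?<i>(.*?)</i>', html, re.S)
print(m.group(1) if m else "no match")  # 65.00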

Task 3

1) Requirements

Crawl all JPEG and JPG files from a given page (https://xcb.fzu.edu.cn/info/1071/4481.htm) or a page of your own choice.
Output: save all JPEG and JPG files from the chosen page into a single folder.

Code:

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.36"
}

def getImagelinks(url):
    img_links = []
    try:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        for img in soup.find_all("img"):
            src = img.get("src")
            if src is not None and ('.jpg' in src or '.jpeg' in src):
                # build the absolute image URL
                full_url = urljoin(url, src)
                img_links.append(full_url)
    except Exception as err:
        print("Failed to fetch image links from the page:", err)
    return img_links

# Download the images into a local folder
def download_images(img_links):
    folder_name = 'downloaded_images'
    os.makedirs(folder_name, exist_ok=True)

    for link in img_links:
        try:
            response = requests.get(link, stream=True, headers=headers)
            file_name = link.split('/')[-1].split('?')[0]
            file_name = file_name.replace(':', '_').replace('?', '_')
            with open(os.path.join(folder_name, file_name), 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            print(f'Downloaded: {file_name}')
        except Exception as e:
            print(e)

if __name__ == '__main__':
    url = "https://xcb.fzu.edu.cn/info/1071/4481.htm"
    img_links = getImagelinks(url)
    download_images(img_links)

2) Reflections

During this task the image requests kept failing. After looking into it, it turned out that the links returned by getImagelinks() were relative paths without a scheme (http or https), so requests.get() could not handle them. To fix this, urllib.parse.urljoin() is used to build the full image URLs. A second issue was that the downloaded photos could not be viewed and showed a format error; it turned out that Image.open() opens and displays images through the Pillow library, which has to be installed before it can be used.
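A small sketch of both fixes: urljoin() turning a relative src into an absolute URL (the relative path here is invented for illustration), and Pillow verifying that a downloaded file really is a readable image (this assumes the file exists and that Pillow has been installed with pip install Pillow).

from urllib.parse import urljoin
from PIL import Image

page_url = "https://xcb.fzu.edu.cn/info/1071/4481.htm"
relative_src = "/__local/pic/example.jpg"  # hypothetical src taken from an <img> tag

# urljoin resolves the relative path against the page URL
full_url = urljoin(page_url, relative_src)
print(full_url)  # https://xcb.fzu.edu.cn/__local/pic/example.jpg

# Open and verify a downloaded file; verify() raises if the image data is broken
img = Image.open("downloaded_images/example.jpg")
img.verify()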
