102302138 林楚涵 — Assignment 1

A Complete Write-up of the Three Web-Crawler Tasks


Task ①: Crawling University Rankings with requests + BeautifulSoup

① Core Code and Run Screenshots

import requests
from bs4 import BeautifulSoup

def get_university_ranking():
    # Target URL
    url = 'http://www.shanghairanking.cn/rankings/bcur/2020'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }

    # Fetch the page
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Locate the ranking table
    table = soup.find('table', class_='rk-table')
    rows = table.find_all('tr')[1:]  # drop the header row

    # Print the header line
    print('排名\t学校名称\t省市\t类型\t总分')
    print('-' * 60)

    # Iterate over each row
    for tr in rows:
        td = tr.find_all('td')
        if len(td) < 5:
            continue
        rank = td[0].text.strip()
        # Keep only the Chinese school name, dropping the English line
        name = td[1].text.split('\n')[0].strip()
        province = td[2].text.strip()
        category = td[3].text.strip()
        score = td[4].text.strip()

        print(f'{rank}\t{name}\t{province}\t{category}\t{score}')

if __name__ == '__main__':
    get_university_ranking()

(Run screenshots)

② Reflections

I first parsed the returned HTML into a tree with BeautifulSoup, then used find('table', class_='rk-table') to land on the "trunk" in one step, pruned the header row with find_all('tr')[1:], and finally called find_all('td') on each tr, which is like picking five leaves off each branch (rank, school name, province, category, total score). There were no tangled regex groups and no character-offset arithmetic; everything was located step by step along the root-branch-leaf hierarchy, which is intuitive and hard to get wrong. BeautifulSoup's "tree operation" mindset not only let me extract the data quickly at the code level, it also turned the page from a jumbled string into a visible hierarchy, sharply lowering the cost of understanding and maintaining the script. That is the main reason Task ① ran smoothly and stays easy to extend.
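
The same root-branch-leaf walk can also be written with CSS selectors, which I find a handy cross-check. The snippet below is only a sketch and assumes the 2020 page still serves the static table with class rk-table that the code above targets; the selectors mirror that code rather than a fresh inspection of the page.

import requests
from bs4 import BeautifulSoup

resp = requests.get(
    'http://www.shanghairanking.cn/rankings/bcur/2020',
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'},
    timeout=10
)
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, 'html.parser')

# table.rk-table -> tr -> td is the same trunk-branch-leaf path as find()/find_all()
for tr in soup.select('table.rk-table tr')[1:]:  # skip the header row
    td = tr.select('td')
    if len(td) < 5:
        continue
    # take the first non-empty line of each cell (handles the bilingual name cell)
    cols = [td[i].get_text('\n', strip=True).split('\n')[0] for i in range(5)]
    print('\t'.join(cols))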


Task ②: Parsing Dangdang Backpack Listings with Regular Expressions

① Core Code and Run Screenshots

import re
import requests
from typing import List, Dict, Optional


class DangDangCrawler:
    """当当网商品信息采集器"""

    BASE_URL = 'https://search.dangdang.com/'
    DEFAULT_ENCODING = 'gbk'

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def _extract_product_data(self, html_content: str) -> List[Dict[str, str]]:
        """从HTML中提取商品数据"""
        product_pattern = re.compile(
            r'<li.*?ddt-pit="(?P<id>\d+)".*?'
            r'<a title="(?P<name>[^"]+)".*?'
            r'<span class="price_n">\s*&yen;\s*(?P<price>[\d.]+)\s*</span>',
            re.DOTALL
        )

        return [
            {
                'id': match.group('id'),
                'name': match.group('name'),
                'price': match.group('price')
            }
            for match in product_pattern.finditer(html_content)
        ]

    def _format_output(self, products: List[Dict[str, str]]) -> None:
        """格式化输出商品信息"""
        header = f"{'编号':<5} {'价格':<10} {'商品名称'}"
        separator = '-' * 70
        print(header)
        print(separator)

        for product in products:
            print(f"{product['id']:>5}  ¥{product['price']:>8}  {product['name'][:40]:40}")

    def fetch_products(self, keyword: str = '书包') -> None:
        """获取并显示商品信息"""
        try:
            response = self.session.get(
                self.BASE_URL,
                params={'key': keyword, 'act': 'input'},
                timeout=10
            )
            response.encoding = self.DEFAULT_ENCODING

            products = self._extract_product_data(response.text)
            self._format_output(products)

        except requests.RequestException as e:
            print(f"网络请求失败: {str(e)}")
        except Exception as e:
            print(f"处理过程中发生错误: {str(e)}")


if __name__ == "__main__":
    crawler = DangDangCrawler()
    crawler.fetch_products()

(Run screenshots)

② Reflections

The whole pipeline is split into four independent steps, initialization → request → parsing → output, with each method doing exactly one thing, which makes debugging easier and leaves room to swap the parser or add multithreading later. In particular, _extract_product_data writes the regular expression with named groups, so a single finditer pulls out the id, name, and price in one pass; this reads far better than positional groups and avoids the pain of counting parentheses to find the right index. Defining the private method _format_output also decouples scraping from presentation: if I later want to write a CSV or load the records into a database, only that layer needs replacing, which is the open-closed principle in action. On the encoding side, Dangdang serves its pages in GBK; without manually setting response.encoding = 'gbk', the Chinese text turns into mojibake, which drove home the rule of checking the character set before crawling.
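
As a concrete illustration of swapping only the output layer, here is a sketch of a CSV writer that could stand in for _format_output. The method name _save_to_csv and the output path are mine for illustration, not part of the assignment code.

import csv
from typing import List, Dict


def _save_to_csv(self, products: List[Dict[str, str]], path: str = 'dangdang_products.csv') -> None:
    """Drop-in alternative to _format_output: write the products to a CSV file."""
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=['id', 'name', 'price'])
        writer.writeheader()
        writer.writerows(products)  # each dict already has exactly these three keys
    print(f'已保存 {len(products)} 条记录到 {path}')


# Hypothetical wiring: attach the function as a method and call it inside
# fetch_products in place of self._format_output(products).
# DangDangCrawler._save_to_csv = _save_to_csv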


Task ③: Batch-Downloading Images from the 「影像福大」 Section of the FZU News Site

① Core Code and Run Screenshots

import urllib.request
import re
import os
import time
from urllib.parse import urljoin, urlparse


class FZUNewsImageSpider:
    def __init__(self):
        self.base_url = "https://news.fzu.edu.cn/yxfd.htm"
        self.domain = "https://news.fzu.edu.cn/"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.all_images = []
        self.all_news = []

    def get_html_content(self, url):
        """获取网页内容"""
        try:
            req = urllib.request.Request(url=url, headers=self.headers)
            response = urllib.request.urlopen(req)
            html_content = response.read().decode('utf-8')
            return html_content
        except Exception as e:
            print(f"获取网页内容失败 {url}: {e}")
            return None

    def parse_pagination(self, html_content):
        """解析分页链接"""
        # 匹配分页链接,包括上一页、下一页和页码
        pagination_pattern = r'<a href="([^"]*)"[^>]*>(\d+)</a>'
        pagination_matches = re.findall(pagination_pattern, html_content)

        page_urls = []
        for url, page_num in pagination_matches:
            if url and not url.startswith('javascript:'):
                full_url = urljoin(self.domain, url)
                page_urls.append((full_url, page_num))

        # De-duplicate while preserving order
        unique_pages = list(dict.fromkeys(page_urls))
        return unique_pages

    def parse_image_links(self, html_content, page_url):
        """解析图片链接"""
        # 改进的图片匹配模式,匹配各种图片格式
        img_pattern = r'<img[^>]*src=["\']([^"\']*\.(?:jpg|jpeg|png|gif|webp|bmp))["\'][^>]*alt=["\']([^"\']*)["\'][^>]*>'

        images = []
        img_matches = re.findall(img_pattern, html_content, re.IGNORECASE)

        for img_src, alt_text in img_matches:
            # Build the absolute image URL
            full_img_url = urljoin(page_url, img_src)

            title = alt_text.strip() if alt_text.strip() else "未命名图片"

            images.append({
                'url': full_img_url,
                'title': title,
                'page_url': page_url
            })

        return images

    def parse_news_links(self, html_content, page_url):
        """解析新闻链接"""
        # 改进的新闻链接匹配模式
        news_pattern = r'<a[^>]*href=["\'](info/[^"\']*/\d+\.htm)["\'][^>]*title=["\']([^"\']*)["\'][^>]*>'

        news_links = []
        matches = re.findall(news_pattern, html_content)

        for link, title in matches:
            full_link = urljoin(self.domain, link)
            news_links.append({
                'title': title.strip(),
                'url': full_link,
                'page_url': page_url
            })

        return news_links

    def crawl_page(self, url, page_num="1"):
        """爬取单个页面"""
        print(f"\n正在爬取第 {page_num} 页: {url}")

        html_content = self.get_html_content(url)
        if not html_content:
            return [], [], ''  # keep the (images, news, html) shape so callers can unpack safely

        # Parse images and news links on the current page
        images = self.parse_image_links(html_content, url)
        news_links = self.parse_news_links(html_content, url)

        print(f"第 {page_num} 页找到 {len(images)} 张图片, {len(news_links)} 条新闻")

        return images, news_links, html_content

    def download_image(self, img_url, filename):
        """下载单张图片"""
        try:
            req = urllib.request.Request(url=img_url, headers=self.headers)
            response = urllib.request.urlopen(req)
            img_data = response.read()

            with open(filename, 'wb') as f:
                f.write(img_data)

            print(f"下载成功: {filename}")
            return True
        except Exception as e:
            print(f"下载失败 {filename}: {e}")
            return False

    def crawl_all_pages(self, download_images=True):  # download images by default
        """Crawl all pages"""
        print("开始爬取福州大学新闻网影像福大所有页面...")

        # Crawl the first page
        images, news, html_content = self.crawl_page(self.base_url, "1")
        self.all_images.extend(images)
        self.all_news.extend(news)

        # Parse pagination links
        page_links = self.parse_pagination(html_content)
        print(f"\n发现 {len(page_links)} 个分页链接")

        # Crawl the remaining pages
        for page_url, page_num in page_links:
            # Skip the first page (already crawled)
            if page_num != "1":
                time.sleep(1)  # throttle requests
                images, news, _ = self.crawl_page(page_url, page_num)
                self.all_images.extend(images)
                self.all_news.extend(news)

        # De-duplicate by URL
        self.all_images = list({img['url']: img for img in self.all_images}.values())
        self.all_news = list({news['url']: news for news in self.all_news}.values())

        self.display_results()

        # Download the images
        if download_images and self.all_images:
            self.download_all_images()

    def display_results(self):
        """显示爬取结果"""
        print("\n" + "=" * 60)
        print("爬取完成!汇总结果:")
        print("=" * 60)

        print(f"\n总共找到 {len(self.all_images)}  张图片:")
        print("-" * 60)
        for i, img in enumerate(self.all_images[:10], 1):  # show only the first 10
            print(f"{i}. 标题: {img['title']}")
            print(f"   链接: {img['url']}")
            print(f"   来源页面: {img['page_url']}")
            print("-" * 30)

        if len(self.all_images) > 10:
            print(f"... 还有 {len(self.all_images) - 10} 张图片未显示")

        print(f"\n总共找到 {len(self.all_news)}  条新闻:")
        print("-" * 60)
        for i, news in enumerate(self.all_news[:10], 1):  # show only the first 10
            print(f"{i}. 标题: {news['title']}")
            print(f"   链接: {news['url']}")
            print(f"   来源页面: {news['page_url']}")
            print("-" * 30)

        if len(self.all_news) > 10:
            print(f"... 还有 {len(self.all_news) - 10} 条新闻未显示")

    def download_all_images(self):
        """下载所有图片"""
        print("\n开始下载所有图片...")
        # Create the download directory
        download_dir = 'fzu_images_all'
        if not os.path.exists(download_dir):
            os.makedirs(download_dir)

        success_count = 0
        statuses = []  # per-image success flags, used for the report below
        for i, img in enumerate(self.all_images, 1):
            # Strip characters that are illegal in filenames
            safe_title = re.sub(r'[\\/*?:"<> |]', '', img['title'])

            # Determine the file extension from the URL
            parsed_url = urlparse(img['url'])
            file_ext = os.path.splitext(parsed_url.path)[1]
            if not file_ext:
                file_ext = '.jpg'  # default extension

            filename = f"{download_dir}/{i:03d}_{safe_title}{file_ext}"

            # Truncate overly long filenames
            if len(filename) > 200:
                filename = f"{download_dir}/{i:03d}_image{file_ext}"

            ok = self.download_image(img['url'], filename)
            statuses.append(ok)
            if ok:
                success_count += 1

            time.sleep(0.5)  # throttle requests

        print(f"\n图片下载完成!成功下载 {success_count}/{len(self.all_images)}  张图片")

        # Write a download report
        report_file = f"{download_dir}/download_report.txt"
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write(" 福州大学新闻网影像福大图片下载报告\n")
            f.write("=" * 50 + "\n\n")
            f.write(f" 总图片数: {len(self.all_images)}\n")
            f.write(f" 成功下载: {success_count}\n")
            f.write(f" 失败数量: {len(self.all_images) - success_count}\n\n")

            f.write(" 下载详情:\n")
            for i, img in enumerate(self.all_images, 1):
                status = "成功" if i <= success_count else "失败"
                f.write(f"{i:03d}.  {img['title']} - {status}\n")

    def save_to_file(self):
        """将结果保存到文件"""
        with open('fzu_news_crawl_results.txt', 'w', encoding='utf-8') as f:
            f.write(" 福州大学新闻网影像福大爬取结果\n")
            f.write("=" * 50 + "\n\n")

            f.write(" 图片列表:\n")
            f.write("-" * 30 + "\n")
            for i, img in enumerate(self.all_images, 1):
                f.write(f"{i}.  {img['title']}\n")
                f.write(f"    链接: {img['url']}\n")
                f.write(f"    来源: {img['page_url']}\n\n")

            f.write("\n 新闻列表:\n")
            f.write("-" * 30 + "\n")
            for i, news in enumerate(self.all_news, 1):
                f.write(f"{i}.  {news['title']}\n")
                f.write(f"    链接: {news['url']}\n")
                f.write(f"    来源: {news['page_url']}\n\n")

        print("结果已保存到 fzu_news_crawl_results.txt")


def main():
    # Create the spider instance
    spider = FZUNewsImageSpider()

    print("=" * 60)
    print("福州大学新闻网影像福大图片下载器")
    print("=" * 60)

    # Crawl all pages and download the images
    spider.crawl_all_pages(download_images=True)

    # Save the results to a file
    spider.save_to_file()

    print("\n程序执行完毕!")
    print(f"图片保存在: fzu_images_all/ 目录")
    print(f"爬取结果保存在: fzu_news_crawl_results.txt")


if __name__ == "__main__":
    main()

(Run screenshots)

② Reflections

I first broke the task into four independent stages, fetch the page → extract images and news → de-duplicate → write to disk, each wrapped in its own method, which leaves room for future features (videos, PDFs, resumable downloads). Writing the regex was actually satisfying: a single pattern with two capture groups pulls out src and alt at the same time. But I soon found that a double quote embedded in a news title could break the pattern, so I changed [^"] to a non-greedy .*? combined with re.DOTALL, which acts as a fuse for the regex. The real time sink was preventing the same image from being downloaded twice: de-duplicating with the URL as the key is the cleanest approach, but I also wanted to keep the title and the source page, so I compromised with a temporary URL-to-record dictionary, trading space for time. The "edge" features, the download directory, filename sanitization, truncation of overlong names, failure counting, and the download report, make up half of the code, yet they decide whether the script can run end to end without manual cleanup afterwards.
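
A minimal sketch of that regex "fuse": replacing the [^"]* part with a non-greedy .*? plus re.DOTALL lets the pattern skip over extra attributes and line breaks between href and title. The sample HTML below is invented for illustration and is not taken from the site.

import re

sample_html = '''<li><a class="img"
    href="info/1011/12345.htm"
    target="_blank" title="影像福大 · “金秋”掠影">
    <img src="/__local/a/b/photo1.jpg" alt="金秋"></a></li>'''

# Non-greedy .*? with re.DOTALL tolerates extra attributes and newlines inside
# the tag; full-width quotes inside the title do not terminate the match.
news_pattern = re.compile(
    r'<a[^>]*?href=["\'](?P<link>info/[^"\']*/\d+\.htm)["\'].*?'
    r'title=["\'](?P<title>.*?)["\']',
    re.DOTALL
)

for m in news_pattern.finditer(sample_html):
    print(m.group('link'), m.group('title'))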

posted @ 2025-10-27 11:45  Linn13D