102302126_李坤铭_Assignment 4

Assignment ①:
Become proficient with Selenium: locating HTML elements, scraping data from Ajax-rendered pages, and waiting for HTML elements to load.
Use the Selenium framework plus a MySQL storage pipeline to scrape stock data for the three boards "沪深A股", "上证A股", and "深证A股".
1) Code:

import sqlite3
import logging
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from typing import List, Tuple, Optional


APP_CONFIG = {
    "database_name": "stocks.db",
    "max_pages_to_crawl": 3,
    "page_load_timeout": 10,
    "implicit_wait_time": 5,
    "market_boards": [
        ("沪深A股", "#hs_a_board"),
        ("上证A股", "#sh_a_board"),
        ("深证A股", "#sz_a_board")
    ]
}


# 配置日志记录
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)


class StockDataScraper:
    """股票数据爬取类,用于从东方财富网抓取股票数据并存储到SQLite数据库"""
    
    def __init__(self):
        self.web_driver = None
        self.db_connection = None

    def setup_web_driver(self) -> webdriver.Chrome:
        """初始化并配置Chrome WebDriver"""
        chrome_options = Options()
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')

        driver = webdriver.Chrome(options=chrome_options)
        driver.maximize_window()
        driver.implicitly_wait(APP_CONFIG["implicit_wait_time"])
        return driver

    def setup_database(self) -> sqlite3.Connection:
        """初始化数据库并创建必要的表和索引"""
        connection = sqlite3.connect(APP_CONFIG["database_name"])
        cursor = connection.cursor()

        # 如果表已存在则删除重建
        cursor.execute("DROP TABLE IF EXISTS stock_records")

        # 创建新表
        table_creation_sql = """
        CREATE TABLE IF NOT EXISTS stock_records (
            record_id INTEGER PRIMARY KEY AUTOINCREMENT,
            market_board TEXT NOT NULL,
            stock_symbol TEXT NOT NULL,
            company_name TEXT NOT NULL,
            current_price TEXT,
            price_change_percent TEXT,
            price_change_amount TEXT,
            trading_volume TEXT,
            turnover_amount TEXT,
            price_range TEXT,
            daily_high TEXT,
            daily_low TEXT,
            opening_price TEXT,
            previous_close TEXT,
            timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """
        cursor.execute(table_creation_sql)
        connection.commit()

        # 创建索引以提高查询性能
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_stock_symbol ON stock_records(stock_symbol)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_market_board ON stock_records(market_board)")
        connection.commit()

        logger.info("数据库初始化完成")
        return connection

    def wait_until_table_loaded(self, wait_timeout: int = 10) -> bool:
        """等待表格数据加载完成"""
        try:
            WebDriverWait(self.web_driver, wait_timeout).until(
                EC.presence_of_element_located((By.XPATH, "//table//tbody/tr"))
            )
            return True
        except Exception as error:
            logger.warning(f"等待表格加载超时: {error}")
            return False

    def extract_row_data(self, row_element, board_type: str) -> Optional[Tuple]:
        """从表格行中提取股票数据"""
        try:
            cells = row_element.find_elements(By.TAG_NAME, "td")
            if len(cells) < 14:
                return None

            # 提取并清理每个单元格的数据
            def safe_get_text(element, default=""):
                return element.text.strip() if element and element.text else default

            return (
                board_type,
                safe_get_text(cells[1], "N/A"),  # 股票代码
                safe_get_text(cells[2], "N/A"),  # 股票名称
                safe_get_text(cells[4], "0.00"),  # 最新价
                safe_get_text(cells[5], "0.00%"),  # 涨跌幅
                safe_get_text(cells[6], "0.00"),  # 涨跌额
                safe_get_text(cells[7], "0"),  # 成交量
                safe_get_text(cells[8], "0.00万"),  # 成交额
                safe_get_text(cells[9], "0.00%"),  # 振幅
                safe_get_text(cells[10], "0.00"),  # 最高
                safe_get_text(cells[11], "0.00"),  # 最低
                safe_get_text(cells[12], "0.00"),  # 今开
                safe_get_text(cells[13], "0.00")   # 昨收
            )
        except Exception as error:
            logger.error(f"解析行数据时出错: {error}")
            return None

    def scrape_board_data(self, board_name: str, board_selector: str) -> int:
        """爬取指定板块的股票数据"""
        records_count = 0
        target_url = f"http://quote.eastmoney.com/center/gridlist.html{board_selector}"
        logger.info(f"开始爬取 {board_name} 数据,URL: {target_url}")

        try:
            self.web_driver.get(target_url)
            if not self.wait_until_table_loaded():
                logger.warning(f"无法加载 {board_name} 的表格数据")
                return 0

            for current_page in range(1, APP_CONFIG["max_pages_to_crawl"] + 1):
                logger.info(f"  正在处理第 {current_page} 页...")
                time.sleep(2)  # 短暂等待页面稳定

                # 获取当前页的所有数据行
                rows = self.web_driver.find_elements(By.XPATH, "//table//tbody/tr")
                if not rows:
                    logger.warning(f"第 {current_page} 页未找到任何数据行")
                    break

                # 处理每一行数据
                batch_data = []
                for row in rows:
                    extracted_data = self.extract_row_data(row, board_name)
                    if extracted_data:
                        batch_data.append(extracted_data)

                # 保存数据到数据库
                if batch_data:
                    self.store_to_database(batch_data)
                    records_count += len(batch_data)
                    logger.info(f"    第 {current_page} 页保存成功,共 {len(batch_data)} 条记录")

                # 如果不是最后一页,尝试翻页
                if current_page < APP_CONFIG["max_pages_to_crawl"]:
                    if not self.navigate_to_next_page():
                        logger.warning("无法继续翻页,可能已到达最后一页")
                        break

            logger.info(f"{board_name} 数据爬取完成,共获取 {records_count} 条记录")
            return records_count

        except Exception as error:
            logger.error(f"爬取 {board_name} 数据时发生错误: {error}")
            return records_count

    def navigate_to_next_page(self) -> bool:
        """导航到下一页"""
        try:
            next_page_button = WebDriverWait(self.web_driver, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "a[title='下一页']"))
            )
            self.web_driver.execute_script("arguments[0].click();", next_page_button)
            time.sleep(2)  # 等待新页面加载
            return self.wait_until_table_loaded()
        except Exception as error:
            logger.warning(f"翻页操作失败: {error}")
            return False

    def store_to_database(self, records: List[Tuple]):
        """将数据批量存储到数据库"""
        if not records:
            return

        insert_query = """
        INSERT INTO stock_records 
        (market_board, stock_symbol, company_name, current_price, price_change_percent, 
         price_change_amount, trading_volume, turnover_amount, price_range, daily_high, 
         daily_low, opening_price, previous_close)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """

        try:
            cursor = self.db_connection.cursor()
            cursor.executemany(insert_query, records)
            self.db_connection.commit()
        except Exception as error:
            logger.error(f"数据库存储操作失败: {error}")
            self.db_connection.rollback()
            raise

    def execute(self):
        """执行爬虫任务"""
        logger.info("股票数据爬虫启动...")

        try:
            # 初始化数据库和WebDriver
            self.db_connection = self.setup_database()
            self.web_driver = self.setup_web_driver()

            # 爬取各个板块的数据
            total_records = 0
            for board_name, board_selector in APP_CONFIG["market_boards"]:
                records_fetched = self.scrape_board_data(board_name, board_selector)
                total_records += records_fetched

            # 输出统计信息
            cursor = self.db_connection.cursor()
            cursor.execute("SELECT COUNT(*) FROM stock_records")
            final_record_count = cursor.fetchone()[0]

            logger.info(f"爬虫任务完成!共爬取 {total_records} 条数据,数据库中存储 {final_record_count} 条记录")

        except Exception as error:
            logger.error(f"爬虫执行过程中发生错误: {error}", exc_info=True)

        finally:
            # 清理资源
            if self.web_driver:
                self.web_driver.quit()
                logger.info("WebDriver实例已关闭")

            if self.db_connection:
                self.db_connection.close()
                logger.info("数据库连接已释放")

            logger.info("股票数据爬虫执行结束")


if __name__ == "__main__":
    import sys

    # 测试模式处理
    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        APP_CONFIG["max_pages_to_crawl"] = 1
        logger.info("测试模式已启用,仅爬取第一页数据")

    start_timestamp = time.time()

    # 创建并运行爬虫实例
    scraper = StockDataScraper()
    scraper.execute()

    execution_time = time.time() - start_timestamp
    logger.info(f"程序总执行时间: {execution_time:.2f} 秒")
Output:

(screenshot: run output of the stock scraper)
2) Reflections:
Through this exercise I learned how to use Selenium WebDriver to scrape complex, dynamically rendered pages. A key practical lesson: the ChromeDriver version must match the locally installed Chrome version (at least the major version), otherwise the driver fails and the scraper retrieves nothing.
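Note: the assignment brief asks for MySQL storage, while the code above writes to a local SQLite file (stocks.db). Below is a minimal sketch of how the storage layer could be pointed at MySQL instead, assuming a local MySQL server, a database named stocks, and the pymysql driver (pip install pymysql); the host, user, and password values are placeholders rather than a real environment.

import pymysql

def create_mysql_connection():
    # Placeholder connection parameters -- replace with the real server, user, and password.
    return pymysql.connect(
        host="localhost",
        user="root",
        password="your_password",
        database="stocks",
        charset="utf8mb4",
    )

MYSQL_CREATE_SQL = """
CREATE TABLE IF NOT EXISTS stock_records (
    record_id INT AUTO_INCREMENT PRIMARY KEY,
    market_board VARCHAR(16), stock_symbol VARCHAR(16), company_name VARCHAR(64),
    current_price VARCHAR(16), price_change_percent VARCHAR(16), price_change_amount VARCHAR(16),
    trading_volume VARCHAR(32), turnover_amount VARCHAR(32), price_range VARCHAR(16),
    daily_high VARCHAR(16), daily_low VARCHAR(16), opening_price VARCHAR(16),
    previous_close VARCHAR(16), created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""

MYSQL_INSERT_SQL = """
INSERT INTO stock_records
(market_board, stock_symbol, company_name, current_price, price_change_percent,
 price_change_amount, trading_volume, turnover_amount, price_range, daily_high,
 daily_low, opening_price, previous_close)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

def store_to_mysql(connection, records):
    """Batch-insert scraped rows; MySQL uses %s placeholders instead of SQLite's ?."""
    if not records:
        return
    with connection.cursor() as cursor:
        cursor.execute(MYSQL_CREATE_SQL)
        cursor.executemany(MYSQL_INSERT_SQL, records)
    connection.commit()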
Assignment ②:
Requirements:
Become proficient with Selenium: locating HTML elements, simulating user login, scraping data from Ajax-rendered pages, and waiting for HTML elements to load.
Use the Selenium framework + MySQL to scrape course information from the China MOOC platform (icourse163.org): course ID, course name, school, lead instructor, team members, enrollment count, course schedule, and course description.
1) Code:

import json
import os
import time
import csv
import sqlite3
import traceback
from datetime import datetime
from typing import List, Dict, Optional, Tuple

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (TimeoutException, NoSuchElementException, 
                                      WebDriverException)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options


class MOOCCourseScraper:
    """中国大学MOOC课程信息爬取工具(优化版)"""

    DEFAULT_CONFIG = {
        "cookie_file": "mooc_cookies.json",
        "driver_path": r"C:\Program Files\Google\Chrome\Application\chromedriver.exe",
        "output_dir": "mooc_data",
        "timeout": 15,
        "max_pages": 3,
        "courses_per_page": 10,
        "headless_mode": False,
        "retry_attempts": 3,
    }

    def __init__(self, custom_config: Optional[Dict] = None):
        """初始化爬虫配置"""
        self.config = {**self.DEFAULT_CONFIG, **(custom_config or {})}
        self.web_driver = None
        self.wait = None
        self.db_connection = None
        self.db_cursor = None

        # 创建输出目录
        os.makedirs(self.config["output_dir"], exist_ok=True)
        self._initialize_database()

    def _initialize_database(self) -> None:
        """初始化SQLite数据库"""
        db_path = os.path.join(self.config["output_dir"], "mooc_courses.db")
        self.db_connection = sqlite3.connect(db_path, check_same_thread=False)
        self.db_cursor = self.db_connection.cursor()

        # 创建课程表
        self.db_cursor.execute("""
        CREATE TABLE IF NOT EXISTS courses (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            course_id TEXT NOT NULL,
            course_name TEXT NOT NULL,
            university TEXT,
            teacher TEXT,
            team TEXT,
            participants INTEGER DEFAULT 0,
            schedule TEXT,
            description TEXT,
            url TEXT,
            category TEXT,
            crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            UNIQUE(course_id)
        )
        """)
        self.db_connection.commit()
        print(f"数据库初始化完成: {db_path}")

    def _setup_browser_driver(self) -> bool:
        """配置并启动浏览器驱动"""
        try:
            # 检查驱动是否存在
            if not os.path.exists(self.config["driver_path"]):
                print(f"错误: ChromeDriver未找到于 {self.config['driver_path']}")
                return False

            # 配置浏览器选项
            options = Options()
            if self.config["headless_mode"]:
                options.add_argument('--headless')
                options.add_argument('--no-sandbox')
                options.add_argument('--disable-dev-shm-usage')

            options.add_experimental_option('excludeSwitches', ['enable-logging', 'enable-automation'])
            options.add_argument('--log-level=3')
            options.add_argument('--disable-blink-features=AutomationControlled')
            options.add_argument('--disable-gpu')
            options.add_argument('--disable-infobars')
            options.add_argument('--start-maximized')
            options.add_argument(
                'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
            )

            # 启动浏览器
            service = Service(executable_path=self.config["driver_path"])
            self.web_driver = webdriver.Chrome(service=service, options=options)
            self.wait = WebDriverWait(self.web_driver, self.config["timeout"])

            # 设置超时
            self.web_driver.set_page_load_timeout(30)
            self.web_driver.set_script_timeout(30)

            print("浏览器启动成功")
            return True

        except Exception as e:
            print(f"浏览器启动失败: {str(e)}")
            return False

    def _load_user_cookies(self) -> bool:
        """加载用户Cookie(如果存在)"""
        if not os.path.exists(self.config["cookie_file"]):
            print(f"警告: Cookie文件未找到于 {self.config['cookie_file']}")
            print("将以匿名模式访问")
            return True

        try:
            with open(self.config["cookie_file"], 'r', encoding='utf-8') as f:
                cookie_data = json.load(f)

            # 访问首页
            self._safe_navigate("https://www.icourse163.org/")

            # 清除现有Cookie
            self.web_driver.delete_all_cookies()

            # 添加新Cookie
            success_count = 0
            for cookie in cookie_data.get("cookies", []):
                try:
                    # 格式化Cookie
                    formatted_cookie = {
                        'name': cookie.get('name'),
                        'value': cookie.get('value'),
                        'domain': f".{cookie.get('domain')}" if not cookie.get('domain', '').startswith('.') 
                                else cookie.get('domain'),
                        'path': cookie.get('path', '/'),
                        'expiry': int(cookie.get('expiry')) if cookie.get('expiry') else None,
                        'secure': cookie.get('secure', False)
                    }

                    # 添加有效Cookie
                    if all([formatted_cookie.get('name'), formatted_cookie.get('value'), formatted_cookie.get('domain')]):
                        self.web_driver.add_cookie(formatted_cookie)
                        success_count += 1

                except (ValueError, WebDriverException) as e:
                    print(f"添加Cookie失败: {str(e)}")

            print(f"成功添加 {success_count} 个Cookie")
            self.web_driver.refresh()
            time.sleep(2)
            return True

        except Exception as e:
            print(f"加载Cookie失败: {str(e)}")
            return False

    def _safe_navigate(self, url: str, retries: int = 3) -> bool:
        """安全导航到指定URL"""
        for attempt in range(retries):
            try:
                print(f"访问页面: {url} (尝试 {attempt + 1}/{retries})")
                self.web_driver.get(url)
                time.sleep(2)  # 等待页面加载
                return True

            except TimeoutException:
                print(f"警告: 页面加载超时")
                if attempt < retries - 1:
                    time.sleep(2)
                    continue
                return False

            except WebDriverException as e:
                print(f"浏览器错误: {str(e)}")
                if "invalid session id" in str(e) or "disconnected" in str(e):
                    self._restart_driver()
                return False

        return False

    def _restart_driver(self) -> bool:
        """重新启动浏览器驱动"""
        print("尝试重新启动浏览器...")
        self._close_driver()
        return self._setup_browser_driver() and self._load_user_cookies()

    def _parse_course_list(self, url: str) -> List[Dict]:
        """解析课程列表页面"""
        print(f"解析课程列表: {url}")
        courses = []

        try:
            # 获取页面源码
            time.sleep(2)  # 等待动态内容加载
            soup = BeautifulSoup(self.web_driver.page_source, 'html.parser')

            # 尝试多种选择器查找课程卡片
            course_selectors = [
                '.course-card', '.m-course-list .course-card',
                '.j-course-list .course-card', '.u-courseList .courseCard',
                '[class*="courseCard"]', '[class*="course-card"]'
            ]

            course_elements = None
            for selector in course_selectors:
                course_elements = soup.select(selector)
                if course_elements:
                    print(f"使用选择器 '{selector}' 找到 {len(course_elements)} 个课程")
                    break

            # 如果没有找到课程卡片,尝试提取单个课程链接
            if not course_elements:
                print("未找到标准课程卡片,尝试提取单个课程链接...")
                course_links = []
                for link in soup.find_all('a', href=True):
                    if '/course/' in link['href'] and 'icourse163.org' not in link['href']:
                        course_url = f"https://www.icourse163.org{link['href']}" if link['href'].startswith('/') else link['href']
                        course_links.append({
                            'url': course_url,
                            'title': link.get_text(strip=True),
                            'university': self._extract_university_from_element(link.parent)
                        })

                print(f"找到 {len(course_links)} 个课程链接")
                course_elements = course_links

            # 提取课程信息
            for element in course_elements[:self.config["courses_per_page"]]:
                try:
                    if isinstance(element, dict):  # 处理预提取的链接
                        course_info = self._extract_course_from_link(element)
                    else:  # 处理HTML元素
                        course_info = self._extract_course_from_element(element)

                    if course_info and course_info.get('course_name'):
                        courses.append(course_info)

                except Exception as e:
                    print(f"课程信息提取失败: {str(e)}")
                    continue

        except Exception as e:
            print(f"页面解析错误: {str(e)}")
            traceback.print_exc()

        return courses

    def _extract_university_from_element(self, element) -> str:
        """从HTML元素中提取学校信息"""
        if not element:
            return "未知"

        school_selectors = [
            '.school-name', '.university', '.u-course-uni',
            '[class*="school"]', '[class*="uni"]'
        ]

        for selector in school_selectors:
            school_elem = element.select_one(selector)
            if school_elem:
                return school_elem.get_text(strip=True)

        return "未知"

    def _extract_course_from_link(self, link_data: Dict) -> Dict:
        """从预提取的链接数据中提取课程信息"""
        course_id = ""
        if '/course/' in link_data['url']:
            course_id = link_data['url'].split('/course/')[1].split('/')[0].split('?')[0]

        return {
            "course_id": course_id,
            "course_name": link_data.get('title', '未知'),
            "university": link_data.get('university', '未知'),
            "teacher": "未知",
            "team": "",
            "participants": 0,
            "schedule": "未知",
            "description": "",
            "url": link_data.get('url', '')
        }

    def _extract_course_from_element(self, element) -> Dict:
        """从HTML元素中提取课程信息"""
        try:
            # 提取课程链接
            link = element.find('a', href=True)
            if not link:
                return {}

            course_url = f"https://www.icourse163.org{link['href']}" if link['href'].startswith('/') else link['href']

            # 提取课程ID
            course_id = ""
            if '/course/' in course_url:
                course_id = course_url.split('/course/')[1].split('/')[0].split('?')[0]

            # 提取课程名称
            course_name = self._extract_text(element, [
                '.course-name', '.title', '.f-thide', '.u-course-name', 'h3', 'h4'
            ], default="未知")

            # 提取学校信息
            university = self._extract_text(element, [
                '.school-name', '.university', '.u-course-uni', '[class*="school"]', '[class*="uni"]'
            ], default="未知")

            # 提取教师信息
            teacher = self._extract_text(element, [
                '.teacher-name', '.teacher', '.u-course-teacher', '.lecturer'
            ], default="未知")

            # 提取参与人数
            participants = self._extract_number(element, [
                '.hot', '.count', '.participants', '.enrollment'
            ], default=0)

            # 提取课程进度
            schedule = self._extract_text(element, [
                '.time', '.schedule', '.u-course-time', '.date', '.period'
            ], default="未知")

            # 提取课程简介
            description = self._extract_text(element, [
                '.brief', '.description', '.u-course-brief', '.intro'
            ], default="", max_length=200)

            return {
                "course_id": course_id,
                "course_name": course_name,
                "university": university,
                "teacher": teacher,
                "team": teacher,
                "participants": participants,
                "schedule": schedule,
                "description": description,
                "url": course_url
            }

        except Exception as e:
            print(f"课程信息提取错误: {str(e)}")
            return {}

    def _extract_text(self, element, selectors: List[str], default: str = "", max_length: int = None) -> str:
        """从元素中提取文本"""
        for selector in selectors:
            target = element.select_one(selector)
            if target:
                text = target.get_text(strip=True)
                return text[:max_length] if max_length else text
        return default

    def _extract_number(self, element, selectors: List[str], default: int = 0) -> int:
        """从元素中提取数字"""
        for selector in selectors:
            target = element.select_one(selector)
            if target:
                text = target.get_text(strip=True)
                try:
                    if '万' in text:
                        return int(float(text.replace('万', '').replace('+', '')) * 10000)
                    elif 'k' in text.lower():
                        return int(float(text.lower().replace('k', '')) * 1000)
                    return int(''.join(filter(str.isdigit, text)) or 0)
                except ValueError:
                    continue
        return default

    def _scrape_page_with_retry(self, url: str, category: str = "未分类") -> List[Dict]:
        """带重试机制的页面爬取"""
        print(f"\n开始处理分类: {category}")

        for attempt in range(self.config["retry_attempts"]):
            try:
                # 访问页面
                if not self._safe_navigate(url):
                    print(f"页面访问失败,剩余重试次数: {self.config['retry_attempts'] - attempt - 1}")
                    time.sleep(2)
                    continue

                # 解析课程
                courses = self._parse_course_list(url)
                if not courses:
                    print("首次解析未找到课程,尝试滚动页面...")
                    self.web_driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(2)
                    courses = self._parse_course_list(url)

                # 保存到数据库
                saved_count = 0
                for course in courses:
                    course['category'] = category
                    if self._save_to_database(course):
                        saved_count += 1

                print(f"成功保存 {saved_count}/{len(courses)} 门课程")
                return courses

            except Exception as e:
                print(f"爬取过程中出错 (尝试 {attempt + 1}/{self.config['retry_attempts']}): {str(e)}")
                traceback.print_exc()

                if attempt < self.config["retry_attempts"] - 1:
                    time.sleep(3)
                    if not self._restart_driver():
                        print("无法恢复浏览器会话")
                        return []

        return []

    def _save_to_database(self, course: Dict) -> bool:
        """保存课程数据到数据库"""
        try:
            self.db_cursor.execute("""
            INSERT OR REPLACE INTO courses 
            (course_id, course_name, university, teacher, team, participants, 
             schedule, description, url, category)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                course.get('course_id', ''),
                course.get('course_name', ''),
                course.get('university', ''),
                course.get('teacher', ''),
                course.get('team', ''),
                course.get('participants', 0),
                course.get('schedule', ''),
                course.get('description', ''),
                course.get('url', ''),
                course.get('category', '')
            ))
            self.db_connection.commit()
            return True

        except Exception as e:
            print(f"数据库保存失败: {str(e)}")
            self.db_connection.rollback()
            return False

    def scrape_courses(self) -> List[Dict]:
        """执行课程爬取任务"""
        print("=" * 60)
        print("中国大学MOOC课程爬取开始")
        print("=" * 60)

        # 启动浏览器
        if not self._setup_browser_driver():
            return []

        try:
            # 加载用户会话
            self._load_user_cookies()

            # 定义要爬取的URL
            target_urls = [
                ("https://www.icourse163.org/", "首页推荐"),
                ("https://www.icourse163.org/category/computer", "计算机"),
                ("https://www.icourse163.org/search.htm?search=python", "Python"),
                ("https://www.icourse163.org/search.htm?search=数据分析", "数据分析"),
            ]

            all_courses = []
            for idx, (url, category) in enumerate(target_urls[:self.config["max_pages"]]):
                print(f"\n{'=' * 40}")
                print(f"进度: {idx + 1}/{min(len(target_urls), self.config['max_pages'])}")

                courses = self._scrape_page_with_retry(url, category)
                all_courses.extend(courses)

                if idx < len(target_urls) - 1:
                    time.sleep(2)  # 礼貌性延迟

            print(f"\n爬取完成!共获取 {len(all_courses)} 门课程")

            # 保存结果
            if all_courses:
                self._save_results(all_courses)
                self._print_statistics(all_courses)

            return all_courses

        except Exception as e:
            print(f"爬取过程中发生错误: {str(e)}")
            traceback.print_exc()
            return []

        finally:
            self._cleanup()

    def _save_results(self, courses: List[Dict]) -> None:
        """保存爬取结果到文件"""
        # CSV文件
        csv_path = os.path.join(self.config["output_dir"], "mooc_courses.csv")
        self._save_as_csv(courses, csv_path)

        # Excel文件
        excel_path = os.path.join(self.config["output_dir"], "mooc_courses.xlsx")
        self._save_as_excel(courses, excel_path)

        # 文本摘要
        summary_path = os.path.join(self.config["output_dir"], "course_summary.txt")
        self._save_summary(courses, summary_path)

    def _save_as_csv(self, courses: List[Dict], filepath: str) -> bool:
        """保存为CSV文件"""
        try:
            headers = ["序号", "课程ID", "课程名称", "学校", "教师", "团队", 
                      "参与人数", "课程进度", "简介", "链接", "分类"]

            with open(filepath, 'w', newline='', encoding='utf-8-sig') as f:
                writer = csv.writer(f)
                writer.writerow(headers)

                for idx, course in enumerate(courses, 1):
                    writer.writerow([
                        idx,
                        course.get('course_id', ''),
                        course.get('course_name', ''),
                        course.get('university', ''),
                        course.get('teacher', ''),
                        course.get('team', ''),
                        course.get('participants', 0),
                        course.get('schedule', ''),
                        course.get('description', ''),
                        course.get('url', ''),
                        course.get('category', '')
                    ])

            print(f"CSV文件已保存: {filepath}")
            return True

        except Exception as e:
            print(f"CSV保存失败: {str(e)}")
            return False

    def _save_as_excel(self, courses: List[Dict], filepath: str) -> bool:
        """保存为Excel文件"""
        try:
            data = [{
                "序号": idx,
                "课程ID": courseget('course_id', ''),
                "课程名称": course.get('course_name', ''),
                "学校": course.get('university', ''),
                "教师": course.get('teacher', ''),
                "团队": course.get('team', ''),
                "参与人数": course.get('participants', 0),
                "课程进度": course.get('schedule', ''),
                "简介": course.get('description', ''),
                "链接": course.get('url', ''),
                "分类": course.get('category', '')
            } for idx, course in enumerate(courses, 1)]

            df = pd.DataFrame(data)
            df.to_excel(filepath, index=False)
            print(f"Excel文件已保存: {filepath}")
            return True

        except ImportError:
            print("警告: 未安装pandas库,无法生成Excel文件")
            print("请使用命令安装: pip install pandas openpyxl")
            return False
        except Exception as e:
            print(f"Excel保存失败: {str(e)}")
            return False

    def _save_summary(self, courses: List[Dict], filepath: str) -> None:
        """保存文本摘要"""
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(f"中国大学MOOC课程摘要\n")
            f.write(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"课程总数: {len(courses)}\n\n")

            for idx, course in enumerate(courses[:10], 1):  # 只显示前10个
                f.write(f"{idx}. {course.get('course_name')}\n")
                f.write(f"   学校: {course.get('university')}\n")
                f.write(f"   教师: {course.get('teacher')}\n")
                f.write(f"   人数: {course.get('participants'):,}\n")
                f.write(f"   分类: {course.get('category')}\n")
                f.write(f"   链接: {course.get('url')}\n")
                f.write("-" * 50 + "\n")

        print(f"摘要文件已保存: {filepath}")

    def _print_statistics(self, courses: List[Dict]) -> None:
        """打印统计信息"""
        print("\n" + "=" * 50)
        print("爬取结果统计")
        print("=" * 50)

        if not courses:
            print("未获取到任何课程数据")
            return

        # 基本统计
        print(f"总课程数: {len(courses)}")

        # 分类统计
        category_stats = {}
        for course in courses:
            category = course.get('category', '未分类')
            category_stats[category] = category_stats.get(category, 0) + 1

        print("\n分类分布:")
        for category, count in sorted(category_stats.items()):
            print(f"  {category}: {count} 门")

        # 热门课程
        top_courses = sorted(courses, key=lambda x: x.get('participants', 0), reverse=True)[:5]
        print("\n热门课程(按参与人数):")
        for course in top_courses:
            print(f"  {course.get('course_name')} ({course.get('university')}) - "
                  f"{course.get('participants'):,} 人")

    def _cleanup(self) -> None:
        """清理资源"""
        self._close_driver()
        self._close_database()

    def _close_driver(self) -> None:
        """关闭浏览器驱动"""
        if self.web_driver:
            try:
                self.web_driver.quit()
                print("浏览器已关闭")
            except Exception as e:
                print(f"关闭浏览器时出错: {str(e)}")

    def _close_database(self) -> None:
        """关闭数据库连接"""
        if self.db_connection:
            try:
                self.db_connection.close()
                print("数据库连接已关闭")
            except Exception as e:
                print(f"关闭数据库时出错: {str(e)}")


def main():
    """主程序入口"""
    print("=" * 60)
    print("中国大学MOOC课程爬取工具")
    print("=" * 60)

    # 配置参数
    scraper_config = {
        "driver_path": r"C:\Program Files\Google\Chrome\Application\chromedriver.exe",
        "cookie_file": "mooc_cookies.json",
        "max_pages": 3,
        "headless_mode": False,
    }

    # 创建并运行爬虫
    scraper = MOOCCourseScraper(scraper_config)
    courses = scraper.scrape_courses()

    # 显示结果
    if courses:
        print("\n" + "=" * 50)
        print("示例课程:")
        for course in courses[:3]:  # 显示前3个课程
            print(f"\n{course.get('course_name')} ({course.get('university')})")
            print(f"教师: {course.get('teacher')}")
            print(f"参与人数: {course.get('participants'):,}")
            print(f"链接: {course.get('url')}")
    else:
        print("\n未获取到任何课程数据")


if __name__ == "__main__":
    main()
Output:

(screenshot: run output of the MOOC course scraper)
2) Reflections: Through this exercise I came to understand the difference between explicit and implicit waits, practiced the core skills of Selenium scraping, worked through table design and batch inserts for the database layer, and got a clearer picture of how web sessions work and how to keep a logged-in state alive by reloading saved cookies.
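One detail the code above leaves implicit: it expects a mooc_cookies.json file with a top-level "cookies" list, but never shows where that file comes from. Here is a minimal sketch of one way to produce it, assuming you complete the login manually in the browser window that opens; the file name matches the config default, everything else is illustrative.

import json
from selenium import webdriver

def export_cookies(output_file="mooc_cookies.json"):
    driver = webdriver.Chrome()
    try:
        driver.get("https://www.icourse163.org/")
        # Complete the login manually in the opened window, then press Enter here.
        input("Log in in the browser, then press Enter to save cookies...")
        # get_cookies() returns dicts with name/value/domain/path/expiry/secure,
        # which is the shape _load_user_cookies() reads from the "cookies" key.
        cookies = driver.get_cookies()
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump({"cookies": cookies}, f, ensure_ascii=False, indent=2)
        print(f"Saved {len(cookies)} cookies to {output_file}")
    finally:
        driver.quit()

if __name__ == "__main__":
    export_cookies()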
Assignment ③:
Requirements:
Get familiar with big-data related cloud services and with using Xshell.
Task 1: Generate test data with a Python script

(screenshot: test-data generation script and its output)
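For reference, a minimal sketch of what a Task 1 test-data generator might look like, assuming the goal is to keep appending comma-separated records to a local text file so a collector such as Flume can pick them up; the output path, field layout, and record count are assumptions, not the experiment's actual values.

import random
import time
from datetime import datetime

OUTPUT_FILE = "/tmp/test_data.txt"  # assumed path; use the directory the experiment specifies

def make_record():
    """Build one fake record: timestamp, a random user id, and a random metric value."""
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    user_id = random.randint(1000, 9999)
    value = round(random.uniform(0, 100), 2)
    return f"{now},{user_id},{value}"

def main(interval_seconds=1.0, total_records=100):
    with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
        for _ in range(total_records):
            f.write(make_record() + "\n")
            f.flush()  # flush each line so a tailing agent sees it promptly
            time.sleep(interval_seconds)

if __name__ == "__main__":
    main()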
Task 2: Configure Kafka

(screenshots: Kafka configuration steps)
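After configuring Kafka, one quick sanity check is to push a few test messages into the topic from Python. This is a sketch using the kafka-python package (pip install kafka-python); the broker address and topic name are placeholders for whatever was configured in this task.

from kafka import KafkaProducer

BROKER = "192.168.0.100:9092"  # placeholder: the cluster's Kafka broker address
TOPIC = "test_topic"           # placeholder: the topic created in this task

producer = KafkaProducer(bootstrap_servers=BROKER)
for i in range(5):
    producer.send(TOPIC, f"test message {i}".encode("utf-8"))
producer.flush()  # block until all queued messages are delivered
producer.close()
print("5 test messages sent")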
Task 3: Install the Flume client

(screenshot: Flume client installation)
Task 4: Configure Flume to collect data

(screenshot: Flume data-collection configuration)
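If the Flume agent configured here forwards the generated test data into a Kafka topic, the end-to-end pipeline can be verified with a small consumer: the records produced in Task 1 should be printed back. Again a sketch with kafka-python, where the broker address and topic name are placeholders.

from kafka import KafkaConsumer

consumer = KafkaConsumer(
    "test_topic",                            # placeholder: topic Flume writes to
    bootstrap_servers="192.168.0.100:9092",  # placeholder: broker address
    auto_offset_reset="earliest",            # read from the beginning of the topic
    consumer_timeout_ms=10000,               # stop iterating after 10s of silence
)
for message in consumer:
    print(message.value.decode("utf-8", errors="replace"))
consumer.close()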
