Assignment 4

1. Master locating HTML elements with Selenium, scraping Ajax-loaded page data, and waiting for HTML elements. Use the Selenium framework + MySQL storage to crawl stock data for three boards: 沪深A股 (all A-shares), 上证A股 (Shanghai A-shares), and 深证A股 (Shenzhen A-shares).

Experiment

Page Structure

Screenshot 2025-12-09 233434

Within the table, each tr element is one row and each td is one field of that row.

Core Code

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pymysql

# Connect to MySQL
conn = pymysql.connect(
    host="127.0.0.1",
    user="root",
    password="123456",
    database="stock",
    charset="utf8mb4"
)
cursor = conn.cursor()

# Launch the Selenium-controlled browser
driver = webdriver.Edge()

# Base URL of the Eastmoney stock list pages
BASE_URL = "http://quote.eastmoney.com/center/gridlist.html#"

def close_ad_popup():
    """关闭东方财富网每次都弹出的开户广告"""
    try:
        # 方法1:点右上角的 X(最常见)
        close_btn = driver.find_element(By.CSS_SELECTOR, "div.close-btn, a.close, img.close, span.close")
        close_btn.click()
        print("已关闭弹窗广告(方法1)")
        time.sleep(1)
    except:
        try:
            # 方法2:新版广告的关闭按钮(2025年实测最稳)
            driver.execute_script("""
                var btn = document.querySelector('div[ad-tag] .close-btn') || 
                          document.querySelector('.popup-ad .close') ||
                          document.querySelector('img[src*="close"], img[alt="close"]');
                if (btn) btn.click();
            """)
            print("已关闭弹窗广告(方法2)")
            time.sleep(1)
        except:
            # 方法3:直接隐藏整个广告容器(万能兜底)
            driver.execute_script("""
                var ads = document.querySelectorAll('div[ad-tag], div.popup-ad, .advertisement, div[class*="ad"], iframe');
                ads.forEach(ad => ad.style.display = 'none');
            """)
            print("已强制隐藏所有广告元素(方法3)")

def crawl_board(hash_code, board_code):
    # 1) Build the URL and open the board page
    url = BASE_URL + hash_code
    driver.get(url)

    time.sleep(3)                    # give the ad popup time to appear
    close_ad_popup()

    # 2) Simple wait for the page and the Ajax data to finish loading
    time.sleep(5)

    # 3) Locate every row of the stock table
    rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
    print("Rows found:", len(rows))

    sql = """
        INSERT INTO stocks
        (board, seq_no, stock_no, stock_name,
         last_price, change_pct, change_amt,
         volume, amount, amplitude,
         high_price, low_price, open_price, pre_close)
        VALUES
        (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """

    for row in rows:
        tds = row.find_elements(By.TAG_NAME, "td")
        if len(tds) < 14:
            # Skip empty or malformed rows; tds[13] is read below,
            # so a valid row needs at least 14 cells
            continue

        # Read the cells in the order of the page's table header
        # (tds[3] is skipped: it holds the chart/link column, not data):
        seq_no      = tds[0].text.strip()   # row number
        stock_no    = tds[1].text.strip()   # stock code
        stock_name  = tds[2].text.strip()   # stock name
        last_price  = tds[4].text.strip()   # latest price
        change_pct  = tds[5].text.strip()   # change percentage
        change_amt  = tds[6].text.strip()   # change amount
        volume      = tds[7].text.strip()   # trading volume
        amount      = tds[8].text.strip()   # turnover
        amplitude   = tds[9].text.strip()   # amplitude
        high_price  = tds[10].text.strip()  # daily high
        low_price   = tds[11].text.strip()  # daily low
        open_price  = tds[12].text.strip()  # opening price
        pre_close   = tds[13].text.strip()  # previous close

        data = (
            board_code,
            seq_no, stock_no, stock_name,
            last_price, change_pct, change_amt,
            volume, amount, amplitude,
            high_price, low_price, open_price, pre_close
        )

        cursor.execute(sql, data)

    conn.commit()
    print(board_code, "board saved to database!\n")

if __name__ == "__main__":
    try:
        # Crawl the three boards in turn
        crawl_board("hs_a_board", "hs_a")  # all A-shares (SSE & SZSE)
        crawl_board("sh_a_board", "sh_a")  # Shanghai A-shares
        crawl_board("sz_a_board", "sz_a")  # Shenzhen A-shares
    finally:
        # Release resources
        cursor.close()
        conn.close()
        driver.quit()
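
The fixed time.sleep() calls above work, but they are fragile. As a hedged alternative, here is a minimal sketch of the explicit-wait approach the assignment asks for (same table selector as above; the 15-second timeout is an assumption):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Edge()
driver.get("http://quote.eastmoney.com/center/gridlist.html#hs_a_board")
# Block until at least one data row exists (up to 15 s) instead of
# sleeping for a fixed amount of time
rows = WebDriverWait(driver, 15).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table tbody tr"))
)
print("Rows found:", len(rows))
driver.quit()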

Results

Screenshot 2025-12-09 233610

Screenshot 2025-12-09 212658

Reflections

After this site loads it shows an ad popup that gets in the way of the crawl. Three selector strategies close it: div[ad-tag] .close-btn finds the close button inside a div that carries an ad-tag attribute; img[src*="close"] matches images whose src contains "close" (substring match); img[alt="close"] matches images whose alt is exactly "close". Trying these in turn is enough to dismiss the ad.
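
To make the three selector forms concrete, a small sketch of the fallback chain (it reuses the driver from the script above; the ad markup itself is assumed):

from selenium.webdriver.common.by import By

def first_match(driver, selectors):
    # Try each CSS selector in order; return the first matching element
    for sel in selectors:
        hits = driver.find_elements(By.CSS_SELECTOR, sel)
        if hits:
            return hits[0]
    return None

# div[ad-tag] .close-btn -> close button inside a div that has an ad-tag attribute
# img[src*="close"]      -> images whose src contains the substring "close"
# img[alt="close"]       -> images whose alt equals exactly "close"
btn = first_match(driver, ['div[ad-tag] .close-btn', 'img[src*="close"]', 'img[alt="close"]'])
if btn:
    btn.click()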

Link: https://gitee.com/wugao00882999/data-collection/blob/master/%E4%BD%9C%E4%B8%9A4/stocks.py

2. Master locating HTML elements with Selenium, simulating user login, scraping Ajax-loaded data, and waiting for HTML elements. Use Selenium + MySQL to crawl course information from the icourse163 MOOC site (course number, course name, school name, lead teacher, team members, number of participants, course progress, and course description).

Experiment

Page Structure

image

image

As the screenshots show, the page is clearly rendered by dynamic (Ajax) loading, which makes it a natural fit for a browser-driven crawl.
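
A quick way to see this: the course cards are absent from the initial HTML and only appear once the front end fetches them, so the crawler should wait for the list container rather than for the raw page load. A minimal sketch, reusing the driver and the channel page opened in the code below:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait until the Ajax-rendered course list container is present in the DOM
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'channel-course-list')))
print('cards rendered:', len(driver.find_elements(
    By.XPATH, '//*[@id="channel-course-list"]/div/div/div[2]/div[1]/div')))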

Core Code

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pymysql

options = webdriver.EdgeOptions()
# Anti-bot-detection options
options.use_chromium = True  # use the Chromium-based Edge
options.add_argument('--disable-blink-features=AutomationControlled')  # hide the automation fingerprint
options.add_argument('--no-sandbox')  # avoid sandbox problems in some environments


driver = webdriver.Edge(options=options)  
driver.maximize_window()

# Connect to MySQL and recreate the course table
db = pymysql.connect(host='127.0.0.1', user='root', password='123456', port=3306, database='mooc')
cursor = db.cursor()
cursor.execute('DROP TABLE IF EXISTS courseMessage')
sql = '''CREATE TABLE courseMessage(cCourse varchar(64),cCollege varchar(64),cTeacher varchar(16),cTeam varchar(256),cCount varchar(16),
cProcess varchar(32),cBrief varchar(2048))'''
cursor.execute(sql)

def spiderOnePage():
    time.sleep(3)
    courses = driver.find_elements(By.XPATH, '//*[@id="channel-course-list"]/div/div/div[2]/div[1]/div')
    current_window_handle = driver.current_window_handle
    for course in courses:
        cCourse = course.find_element(By.XPATH, './/h3').text  # course name
        cCollege = course.find_element(By.XPATH, './/p[@class="_2lZi3"]').text  # school/college
        cTeacher = course.find_element(By.XPATH, './/div[@class="_1Zkj9"]').text  # lead teacher
        cCount = course.find_element(By.XPATH, './/div[@class="jvxcQ"]/span').text  # number of participants
        cProcess = course.find_element(By.XPATH, './/div[@class="jvxcQ"]/div').text  # course progress

        course.click()  # opens the course detail page in a new browser tab
        Handles = driver.window_handles
        if len(Handles) < 2:
            continue  # no detail tab opened; skip this course
        driver.switch_to.window(Handles[1])
        time.sleep(3)

        # Extract the course description
        cBrief = driver.find_element(By.XPATH, '//*[@id="j-rectxt2"]').text  # try the fixed-ID brief first
        if len(cBrief) == 0:
            cBriefs = driver.find_elements(By.XPATH, '//*[@id="content-section"]/div[4]/div//*')  # fall back to all children of the brief section
            cBrief = ""
            for c in cBriefs:
                cBrief += c.text
        cBrief = cBrief.strip()  # no manual escaping needed: the INSERT below is parameterized
        nameList = []
        cTeachers = driver.find_elements(By.XPATH, '//div[@class="um-list-slider_con_item"]')  # teaching-team member cards
        for Teacher in cTeachers:
            name = Teacher.find_element(By.XPATH, './/h3[@class="f-fc3"]').text.strip()
            nameList.append(name)
        nextButton = driver.find_elements(By.XPATH, '//div[@class="um-list-slider_next f-pa"]')  # "next" arrow of the team slider
        while len(nextButton) != 0:
            nextButton[0].click()
            time.sleep(3)
            cTeachers = driver.find_elements(By.XPATH, '//div[@class="um-list-slider_con_item"]')
            for Teacher in cTeachers:
                name = Teacher.find_element(By.XPATH, './/h3[@class="f-fc3"]').text.strip()
                nameList.append(name)
            nextButton = driver.find_elements(By.XPATH, '//div[@class="um-list-slider_next f-pa"]')
        cTeam = ','.join(nameList)

        # Close the detail tab and switch back to the course list
        driver.close()
        driver.switch_to.window(current_window_handle)

        # Insert into the database (parameterized query avoids quoting/injection problems)
        cursor.execute(
            'INSERT INTO courseMessage VALUES (%s,%s,%s,%s,%s,%s,%s)',
            (cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief))
        db.commit()

driver.get('https://www.icourse163.org/')
# Pull the course-channel link out of the homepage navigation, then open it
driver.get(WebDriverWait(driver, 10, 0.48).until(EC.presence_of_element_located((By.XPATH, '//*[@id="app"]/div/div/div[1]/div[1]/div[1]/span[1]/a'))).get_attribute('href'))
spiderOnePage()
count = 1
# Pagination: the 10th <a> in the pager is the "next page" button;
# class '_3YiUU ' (trailing space included) marks it as still clickable
next_page = driver.find_element(By.XPATH, '//*[@id="channel-course-list"]/div/div/div[2]/div[2]/div/a[10]')
while next_page.get_attribute('class') == '_3YiUU ':
    if count == 2:  # limit this experiment to two pages
        break
    count += 1
    next_page.click()
    spiderOnePage()
    next_page = driver.find_element(By.XPATH, '//*[@id="channel-course-list"]/div/div/div[2]/div[2]/div/a[10]')

# Close the database connection
cursor.close()
db.close()

time.sleep(3)
driver.quit()
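
One fragile spot above: the single len(Handles) < 2 check can race with the new tab opening. A hedged variant that waits for the second window explicitly (a fragment using the same variable names as spiderOnePage):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

course.click()
# Wait up to 10 s for the detail tab instead of checking window_handles once
WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
detail = [h for h in driver.window_handles if h != current_window_handle][0]
driver.switch_to.window(detail)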

Results

Screenshot 2025-12-09 225428

Reflections

This task taught me how to crawl an education platform: handling multi-window page interactions, and walking the paginated teacher-team slider to collect every team member.

Link: https://gitee.com/wugao00882999/data-collection/blob/master/%E4%BD%9C%E4%B8%9A4/icourse1.py

3. Get familiar with big-data services and with Xshell, and complete the tasks in 华为云_大数据实时分析处理实验手册-Flume日志采集实验(部分)v2.docx (the Huawei Cloud real-time big-data processing lab manual, Flume log-collection experiment, partial).

Experiment

Generating test data with a Python script

image
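
The generator script itself is only visible in the screenshot, so for reference here is a minimal sketch of what such a generator can look like (the log path, field layout, and one-second interval are assumptions, not the manual's exact script):

import random
import time

ACTIONS = ['login', 'view', 'click', 'purchase', 'logout']

# Append one fake business-event line per second to the file Flume will tail
with open('/tmp/test.log', 'a') as f:
    while True:
        line = '{},{},user_{}'.format(
            time.strftime('%Y-%m-%d %H:%M:%S'),
            random.choice(ACTIONS),
            random.randint(1, 1000))
        f.write(line + '\n')
        f.flush()  # make each line visible to the tailing process immediately
        time.sleep(1)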

Configuring Kafka

image

image
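
Besides the console consumer, the topic can be checked from Python; a sketch using the kafka-python package (the topic name and broker address are placeholders for the values configured in the MRS cluster):

from kafka import KafkaConsumer

# Print every message Flume publishes to the topic, from the beginning
consumer = KafkaConsumer(
    'topic_log',                           # hypothetical topic name
    bootstrap_servers='192.168.0.1:9092',  # hypothetical broker address
    auto_offset_reset='earliest')
for msg in consumer:
    print(msg.value.decode('utf-8'))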

Installing the Flume client

image

image

Configuring Flume to collect data

image

Screenshot 2025-11-23 210621

Screenshot 2025-11-23 204315

Screenshot 2025-11-23 215426
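
The Flume agent configuration is only captured in the screenshots above, so for reference here is a hedged sketch of a properties file with the same shape (agent name, log path, topic, and broker address are all assumptions, not the experiment's exact values):

# Tail the generated log file and publish each line to a Kafka topic
client.sources = s1
client.channels = c1
client.sinks = k1

client.sources.s1.type = exec
client.sources.s1.command = tail -F /tmp/test.log
client.sources.s1.channels = c1

client.channels.c1.type = memory
client.channels.c1.capacity = 10000

client.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
client.sinks.k1.kafka.topic = topic_log
client.sinks.k1.kafka.bootstrap.servers = 192.168.0.1:9092
client.sinks.k1.channel = c1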

Reflections

This Flume log-collection experiment gave me a systematic view of how the core big-data components fit together: a Python script simulates a continuous business data stream, Flume tails the log file and acts as the collection layer, and Kafka buffers the stream as a distributed message queue that downstream consumers can read at their own pace under high concurrency.
