Blog 4

Assignment ①:

o Requirements:

▪ Become proficient with Selenium: locating HTML elements, scraping Ajax-rendered page data, and waiting for HTML elements to load.

▪ Use the Selenium framework + MySQL storage pipeline to crawl stock data for the three boards "沪深 A 股", "上证 A 股", and "深证 A 股".

o Candidate site: 东方财富网 (Eastmoney):

http://quote.eastmoney.com/center/gridlist.html#hs_a_board

By inspecting the page's HTML structure, we find where the stock data lives: each stock is rendered as one <tr> row inside the quote table's <tbody>.
Full code:

import time
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ===========================
# MySQL configuration
# ===========================
DB_HOST = "localhost"
DB_USER = "scrapy"
DB_PASS = "777456qqQ"
DB_NAME = "stockdb"

db = pymysql.connect(
    host=DB_HOST,
    user=DB_USER,
    password=DB_PASS,
    database=DB_NAME,
    charset="utf8mb4"
)
cursor = db.cursor()

# ===========================
# Helper: safe float conversion
# ===========================
def safe_float(v):
    try:
        if not v:
            return 0.0
        v = v.replace(",", "").replace("万", "").replace("亿", "").replace("--", "").strip()
        if v == "":
            return 0.0
        return float(v)
    except:
        return 0.0

# ===========================
# Save one row (upsert)
# ===========================
def save(row):
    sql = """
    INSERT INTO eastmoney_stock(
        board, xuhao, daima, mingcheng, zuixinbaojia,
        zhangdiefu, zhangdiee, chengjiaoliang,
        chengjiaoe, zhenfu, zuigao, zuidi,
        jinkai, zuoshou
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    ON DUPLICATE KEY UPDATE
        zuixinbaojia=VALUES(zuixinbaojia),
        zhangdiefu=VALUES(zhangdiefu),
        zhangdiee=VALUES(zhangdiee),
        chengjiaoliang=VALUES(chengjiaoliang),
        chengjiaoe=VALUES(chengjiaoe),
        zhenfu=VALUES(zhenfu),
        zuigao=VALUES(zuigao),
        zuidi=VALUES(zuidi),
        jinkai=VALUES(jinkai),
        zuoshou=VALUES(zuoshou)
    """
    cursor.execute(sql, row)
    db.commit()

# ===========================
# Crawl a single board
# ===========================
def crawl_board(board_name, url, max_pages=2):
    print(f"\n>>> 正在抓取板块:{board_name}")
    driver.get(url)
    wait = WebDriverWait(driver, 20)

    page = 1
    while page <= max_pages:
        print(f"  - 抓取第 {page} 页 ...")
        wait.until(EC.presence_of_all_elements_located((By.XPATH, "//tbody/tr")))
        rows = driver.find_elements(By.XPATH, "//tbody/tr")

        for r in rows:
            tds = r.find_elements(By.TAG_NAME, "td")
            if len(tds) < 13:
                continue
            xuhao = tds[0].text.strip()
            daima = tds[1].text.strip()
            mingcheng = tds[2].text.strip()
            zuixinbaojia = tds[3].text.strip()
            zhangdiefu = tds[4].text.strip()
            zhangdiee = tds[5].text.strip()
            chengjiaoliang = tds[6].text.strip()
            chengjiaoe = tds[7].text.strip()
            zhenfu = tds[8].text.strip()
            zuigao = tds[9].text.strip()
            zuidi = tds[10].text.strip()
            jinkai = tds[11].text.strip()
            zuoshou = tds[12].text.strip()

            # Skip rows whose code field is not a numeric stock code
            if not daima.isdigit():
                continue

            data = (
                board_name, xuhao, daima, mingcheng, zuixinbaojia,
                zhangdiefu, zhangdiee, chengjiaoliang,
                chengjiaoe, zhenfu, zuigao, zuidi,
                jinkai, zuoshou
            )
            save(data)
            print(f"    ✔ 保存 {daima} {mingcheng}")

        # Go to the next page
        if page < max_pages:
            try:
                next_btn = wait.until(
                    EC.element_to_be_clickable((By.XPATH, '//a[@title="下一页"]'))
                )
                next_btn.click()
                print("  - 翻页成功")
                time.sleep(2)
            except:
                print("  - 找不到下一页按钮,提前结束。")
                break
        page += 1

# ===========================
# Main program
# ===========================
if __name__ == "__main__":
    opt = Options()
    # opt.add_argument("--headless")  # uncomment to run headless
    driver = webdriver.Chrome(options=opt)

    boards = {
        "沪深A股": "https://quote.eastmoney.com/center/gridlist.html#hs_a_board",
        "上证A股": "https://quote.eastmoney.com/center/gridlist.html#sh_a_board",
        "深证A股": "https://quote.eastmoney.com/center/gridlist.html#sz_a_board",
    }

    for name, url in boards.items():
        crawl_board(name, url, max_pages=2)

    driver.quit()
    cursor.close()
    db.close()
    print("\n全部板块前 2 页爬取完成!")

  

Run results:

Assignment ②:

o Requirements:

▪ Become proficient with Selenium: locating HTML elements, simulating user login, scraping Ajax-rendered page data, and waiting for HTML elements to load.

▪ Use the Selenium framework + MySQL to crawl course information from 中国 mooc 网 (course ID, course name, school, lead instructor, team members, enrollment count, course schedule, and course description).

o Candidate site: 中国 mooc 网: https://www.icourse163.org

Login is simulated by scanning a QR code; cookie.py was written to handle the first-time QR login and save the resulting cookies.

Full code:

import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

COOKIE_FILE = "cookies.json"
CHROME_DRIVER_PATH = r"D:\python\pythonProject1\.venv\Scripts\chromedriver.exe"

def save_cookies(driver):
    cookies = driver.get_cookies()
    with open(COOKIE_FILE, "w", encoding="utf-8") as f:
        json.dump(cookies, f, ensure_ascii=False, indent=2)
    print("\n>>> Cookie 已保存到 cookies.json!\n")

def generate_cookie():
    print(">>> 正在启动 Chrome,请稍等…")

    # ===== A Service object is needed so chromedriver loads correctly =====
    service = Service(executable_path=CHROME_DRIVER_PATH)

    options = webdriver.ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    options.add_argument('--log-level=3')

    driver = webdriver.Chrome(service=service, options=options)
    wait = WebDriverWait(driver, 20)

    driver.get("https://www.icourse163.org/")
    time.sleep(2)

    print(">>> 浏览器已打开,请稍等...")

    # Click the login button in the top-right corner (based on the current page structure)
    try:
        login_button = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//div[contains(@class,'loginBtn')]")
        ))
        login_button.click()
        print(">>> 已点击登录按钮")
    except:
        print(">>> 登录按钮未找到,尝试备用 XPATH…")
        try:
            fallback_login_btn = wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//div[contains(text(),'登录')]")
            ))
            fallback_login_btn.click()
        except:
            print(">>> 无法找到登录按钮!程序退出")
            driver.quit()
            return

    time.sleep(2)

    # Switch to "QR-code login"
    try:
        qrcode_btn = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//img[contains(@src,'qrcode')]")
        ))
        qrcode_btn.click()
        print(">>> 已切换到扫码登录,请扫码!")
    except:
        print(">>> 找不到二维码按钮,将直接等待扫码界面出现…")

    input("\n>>> 请使用手机扫码登录,完成后按 Enter 继续… ")

    # 保存 cookie
    save_cookies(driver)

    driver.quit()
    print(">>> 已关闭浏览器")


if __name__ == "__main__":
    generate_cookie()

  

The crawler extracts each field by evaluating XPath expressions against the course page; the course links to crawl are listed directly in the code:

Full code:

import json
import os
import time
import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ================== Configuration ==================
CHROME_DRIVER_PATH = r"D:\python\pythonProject1\.venv\Scripts\chromedriver.exe"

COOKIE_FILE = "cookies.json"

DB_HOST = "localhost"
DB_USER = "scrapy"
DB_PASS = "777456qqQ"
DB_NAME = "stockdb"

# List of course URLs to crawl
COURSE_URLS = [
    "https://www.icourse163.org/course/ZJU-199001",
    "https://www.icourse163.org/course/XJTU-1002529011",
    "https://www.icourse163.org/course/PKU-1206624828"
]
# ============================================


def load_cookies(driver):
    if not os.path.exists(COOKIE_FILE):
        print("cookies.json 不存在,请先生成 cookie!")
        exit()

    driver.get("https://www.icourse163.org/")
    time.sleep(2)

    with open(COOKIE_FILE, "r", encoding="utf-8") as f:
        cookies = json.load(f)

    for cookie in cookies:
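        # Drop fields that Chrome's add_cookie() commonly rejects (sameSite) or that may be stale (expiry)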
        cookie.pop("sameSite", None)
        cookie.pop("expiry", None)
        try:
            driver.add_cookie(cookie)
        except Exception as e:
            print("添加 cookie 失败:", e)

    driver.get("https://www.icourse163.org/")
    time.sleep(2)
    print("使用 cookie 登录成功!")


def safe_text(driver, wait, xpath):
    try:
        return wait.until(EC.presence_of_element_located((By.XPATH, xpath))).text.strip()
    except:
        return ""


def safe_attr(driver, wait, xpath, attr):
    try:
        return wait.until(EC.presence_of_element_located((By.XPATH, xpath))).get_attribute(attr)
    except:
        return ""


def crawl_course(driver, wait, url, cursor, db):
    print(f"\n正在爬取:{url}")
    driver.get(url)
    time.sleep(2)

    cCourse = safe_text(driver, wait,
                        "/html/body/div[5]/div[2]/div[1]/div/div/div/div[2]/div[2]/div/div[2]/div[1]/span[1]")
    cCollege = safe_attr(driver, wait,
                         "/html/body/div[5]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div/a/img",
                         "alt")

    # Collect all teachers (lead instructor + team)
    all_teachers = []

    def get_teachers_from_page():
        page_teachers = []
        try:
            teacher_container = driver.find_element(By.XPATH, "//div[@class='m-teachers_teacher-list']")
            slider_con = teacher_container.find_element(By.XPATH, ".//div[@class='um-list-slider_con']")
            con_items = slider_con.find_elements(By.XPATH, ".//div[@class='um-list-slider_con_item']")
            for item in con_items:
                try:
                    img = item.find_element(By.XPATH, ".//img")
                    name = img.get_attribute("alt")
                    if name:
                        page_teachers.append(name)
                except:
                    continue
        except Exception as e:
            print("获取老师失败:", e)
        return page_teachers

    # First page of the teacher list
    all_teachers.extend(get_teachers_from_page())

    # Handle teacher-list pagination, if any
    try:
        buttons = driver.find_elements(By.XPATH,
                                       "/html/body/div[5]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div/div/div[2]/div/div[1]/span")
        if buttons:
            buttons[0].click()
            time.sleep(2)
            while True:
                page_teachers = get_teachers_from_page()
                all_teachers.extend(page_teachers)
                buttons = driver.find_elements(By.XPATH,
                                               "/html/body/div[5]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div/div/div[2]/div/div[1]/span")
                if len(buttons) == 2:
                    buttons[1].click()
                    time.sleep(2)
                else:
                    break
    except Exception as e:
        print("老师翻页失败:", e)

    cTeacher = all_teachers[0] if all_teachers else ""
    cTeam = ",".join(all_teachers)

    # Enrollment count
    try:
        elem = wait.until(EC.presence_of_element_located((By.XPATH,
            "/html/body/div[5]/div[2]/div[1]/div/div/div/div[2]/div[2]/div/div[3]/div/div[1]/div[4]/span[2]"
        )))
        cCount = elem.text.strip()
    except:
        cCount = ""

    # Course schedule / progress
    cProcess = safe_text(driver, wait,
                         "/html/body/div[5]/div[2]/div[1]/div/div/div/div[2]/div[2]/div/div[3]/div/div[1]/div[2]/div/span[2]"
                         )

    # Course description
    cBrief = safe_text(driver, wait,
                       "/html/body/div[5]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/div[2]/div[1]"
                       )

    cursor.execute("""
        INSERT INTO course_info(url, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s)
    """, (url, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief))

    db.commit()
    print(f"爬取完成:{cCourse}")


def main():
    db = pymysql.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASS,
        database=DB_NAME,
        charset="utf8mb4"
    )
    cursor = db.cursor()

    cursor.execute("""
    CREATE TABLE IF NOT EXISTS course_info(
        id INT AUTO_INCREMENT PRIMARY KEY,
        url VARCHAR(255),
        cCourse VARCHAR(255),
        cCollege VARCHAR(255),
        cTeacher VARCHAR(255),
        cTeam TEXT,
        cCount VARCHAR(50),
        cProcess VARCHAR(255),
        cBrief TEXT
    );
    """)
    db.commit()

    service = Service(CHROME_DRIVER_PATH)
    driver = webdriver.Chrome(service=service)
    wait = WebDriverWait(driver, 20)

    # Log in using the saved cookies
    load_cookies(driver)

    # Crawl the specified course URLs
    for url in COURSE_URLS:
        crawl_course(driver, wait, url, cursor, db)

    driver.quit()
    db.close()
    print("\n全部课程爬取完毕!")


if __name__ == "__main__":
    main()
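
The absolute XPaths used above (e.g. /html/body/div[5]/...) are tied to the current page layout and break easily when icourse163 changes its markup. As a minimal sketch of one way to soften this (not part of the original script; the relative XPath in the usage comment is a hypothetical fallback, not a verified selector), a helper can try several candidate XPaths in order:

from selenium.webdriver.common.by import By

# Fallback locator sketch: try candidate XPaths in order and return the first
# non-empty text, so a single layout change does not blank out every field.
def first_text(driver, xpaths, default=""):
    for xp in xpaths:
        elems = driver.find_elements(By.XPATH, xp)
        if elems and elems[0].text.strip():
            return elems[0].text.strip()
    return default

# Hypothetical usage (the second XPath is illustrative only):
# cCourse = first_text(driver, [
#     "/html/body/div[5]/div[2]/div[1]/div/div/div/div[2]/div[2]/div/div[2]/div[1]/span[1]",
#     "//span[contains(@class,'course-title')]",
# ])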

  

Run results:

Assignment ③:

Requirements:

Get familiar with big-data related cloud services and with using Xshell.

Complete the tasks in the document 华为云_大数据实时分析处理实验手册-Flume日志采集实验(部分)v2.docx, i.e. the five tasks below; see the document for the detailed steps.

Environment setup:

Task 1: Enable the MapReduce service

Real-time analytics hands-on:

Task 1: Generate test data with a Python script

Task 2: Configure Kafka

Task 3: Install the Flume client

Task 4: Configure Flume to collect data

1. Generate test data with a Python script (a minimal sketch follows this list)

2. Configure Kafka

3. Install the Flume client

4. Configure Flume to collect data
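
The actual generator script and data format are specified in the lab manual; as a rough stand-in for task 1 (the output path, field layout, and one-record-per-second pacing are all assumptions, not taken from the manual), a minimal test-data generator could look like this:

import random
import time
from datetime import datetime

# Minimal test-data generator sketch (assumed CSV format: timestamp,user_id,amount).
# The output path /tmp/flume_test.log is an assumption; use whatever file the
# Flume source in the lab manual is configured to tail.
def generate(path="/tmp/flume_test.log", lines=100):
    with open(path, "a", encoding="utf-8") as f:
        for _ in range(lines):
            record = "{},{},{}".format(
                datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                random.randint(1000, 9999),           # fake user id
                round(random.uniform(1, 500), 2),     # fake purchase amount
            )
            f.write(record + "\n")
            f.flush()
            time.sleep(1)  # emit one record per second so Flume can pick them up live

if __name__ == "__main__":
    generate()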
