102302122 许志安 Assignment 4

Data Collection: Assignment 4

Assignment 1: Crawling SH/SZ A-share stock data with Selenium + MySQL

Requirements:

▪ Become proficient with Selenium: locating HTML elements, scraping Ajax-loaded pages, and waiting for elements to appear.

▪ Use Selenium + MySQL to crawl the stock data of the three boards "沪深A股" (SH & SZ A-shares), "上证A股" (Shanghai A-shares), and "深证A股" (Shenzhen A-shares).

o Candidate site: Eastmoney (东方财富网): http://quote.eastmoney.com/center/gridlist.html#hs_a_board

o Output: store and print the data via MySQL in the format below. Column names should be in English and are designed by each student, e.g. id for the serial number, bStockNo for the stock code, and so on.

Approach: inspect the page in the browser's DevTools to find the XPath of the required buttons and table cells, then let Selenium click through the pages and scrape the target fields.

The code and results are as follows:

 

import time
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# MySQL connection; adjust the credentials to your own environment
db = pymysql.connect(
    host="localhost",
    user="root",
    password="xuzhian123",
    database="stockdb",
    charset="utf8mb4"
)
cursor = db.cursor()

def insert_stock(data):
    # insert one row into the eastmoney_stock table and commit at once
    sql = """
    INSERT INTO eastmoney_stock(
        board, xuhao, daima, mingcheng, zuixinbaojia,
        zhangdiefu, zhangdiee, chengjiaoliang,
        chengjiaoe, zhenfu, zuigao, zuidi,
        jinkai, zuoshou
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    cursor.execute(sql, data)
    db.commit()

def crawl_board(board_name, url):
    # crawl the first two pages of one board (uses the global driver from __main__)
    print(f"\n>>> Crawling board: {board_name}")

    driver.get(url)
    wait = WebDriverWait(driver, 15)
    time.sleep(2)  # give the Ajax-rendered table time to load

    for page in range(1, 3):  # pages 1 and 2 only

        print(f"  - Crawling page {page} ...")

        # wait until the Ajax-rendered table rows are present
        wait.until(EC.presence_of_element_located((By.XPATH, "//table//tbody/tr")))

        rows = driver.find_elements(By.XPATH, "//table//tbody/tr")

        for r in rows:
            tds = r.find_elements(By.TAG_NAME, "td")
            if len(tds) < 13:
                continue
            xuhao = tds[0].text.strip()
            daima = tds[1].text.strip()
            mingcheng = tds[2].text.strip()
            zuixinbaojia = tds[3].text.strip()
            zhangdiefu = tds[4].text.strip()
            zhangdiee = tds[5].text.strip()
            chengjiaoliang = tds[6].text.strip()
            chengjiaoe = tds[7].text.strip()
            zhenfu = tds[8].text.strip()
            zuigao = tds[9].text.strip()
            zuidi = tds[10].text.strip()
            jinkai = tds[11].text.strip()
            zuoshou = tds[12].text.strip()

            if not daima.isdigit():  # skip header/blank rows without a numeric code
                continue

            data = (
                board_name, xuhao, daima, mingcheng, zuixinbaojia,
                zhangdiefu, zhangdiee, chengjiaoliang,
                chengjiaoe, zhenfu, zuigao, zuidi,
                jinkai, zuoshou
            )
            insert_stock(data)

        if page < 2:
            try:
                # click the "下一页" (next page) button and wait for the reload
                next_btn = wait.until(
                    EC.element_to_be_clickable(
                        (By.XPATH, '//*[@id="mainc"]/div/div/div[4]/div/a[@title="下一页"]')
                    )
                )
                next_btn.click()
                print("  > Moved to the next page")
                time.sleep(2)

            except Exception as e:
                print("  - Next-page button not found, stopping early:", e)
                break

    print(f"{board_name}: first 2 pages done")

if __name__ == "__main__":
    driver = webdriver.Chrome()

    # board name -> list-page URL; the name is stored in the DB "board" column
    boards = {
        "沪深A股": "https://quote.eastmoney.com/center/gridlist.html#hs_a_board",
        "上证A股": "https://quote.eastmoney.com/center/gridlist.html#sh_a_board",
        "深证A股": "https://quote.eastmoney.com/center/gridlist.html#sz_a_board",
    }

    for name, url in boards.items():
        crawl_board(name, url)

    driver.quit()
    db.close()

    print("\nAll boards: first two pages crawled successfully!")
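Note that insert_stock() assumes the eastmoney_stock table already exists. Below is a minimal one-off setup sketch that creates a matching table: the column names come from the INSERT statement above, while the types (VARCHAR throughout, since the page text can contain placeholders such as "-") are my own assumption.

import pymysql

# One-off setup: create the table that insert_stock() writes to.
# Column names match the INSERT above; the types are an assumption.
db = pymysql.connect(
    host="localhost", user="root", password="xuzhian123",
    database="stockdb", charset="utf8mb4"
)
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS eastmoney_stock(
        id INT AUTO_INCREMENT PRIMARY KEY,
        board VARCHAR(20),
        xuhao VARCHAR(10),
        daima VARCHAR(10),
        mingcheng VARCHAR(50),
        zuixinbaojia VARCHAR(20),
        zhangdiefu VARCHAR(20),
        zhangdiee VARCHAR(20),
        chengjiaoliang VARCHAR(20),
        chengjiaoe VARCHAR(20),
        zhenfu VARCHAR(20),
        zuigao VARCHAR(20),
        zuidi VARCHAR(20),
        jinkai VARCHAR(20),
        zuoshou VARCHAR(20)
    );
""")
db.commit()
db.close()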

 

Reflections: this assignment took my understanding of Selenium a step further.

Assignment 2: Crawling China University MOOC course information with Selenium + MySQL

Requirements:

▪ Become proficient with Selenium: locating HTML elements, simulating user login, scraping Ajax-loaded pages, and waiting for elements to appear.

▪ Use Selenium + MySQL to crawl course information from China University MOOC (course ID, course name, school, lead teacher, team members, enrollment count, course schedule, course description).

o Candidate site: China University MOOC: https://www.icourse163.org

o Output: MySQL storage and output format.

Approach: same as the previous task, i.e. locate the XPath of the required buttons and fields via DevTools and let Selenium navigate and scrape, with cookie-based login added so the crawler runs as an authenticated user.

The code and results are as follows:

Main crawler:

import json, os, time, pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

COOKIE_FILE = "cookies.json"

# ---------- basic helpers ----------
def safe_text(wait, xpath):
    # return the stripped text of an element, or "" if it never appears
    try:
        return wait.until(EC.presence_of_element_located((By.XPATH, xpath))).text.strip()
    except Exception:
        return ""

def safe_attr(wait, xpath, attr):
    # return an attribute of an element, or "" if it never appears
    try:
        return wait.until(EC.presence_of_element_located((By.XPATH, xpath))).get_attribute(attr)
    except Exception:
        return ""

# ---------- log in with saved cookies ----------
def load_cookies(driver):
    if not os.path.exists(COOKIE_FILE):
        print("cookies.json not found - generate the cookie file first!")
        exit()

    driver.get("https://www.icourse163.org/")
    time.sleep(2)

    # drop fields that Selenium's add_cookie() may reject
    for c in json.load(open(COOKIE_FILE, "r", encoding="utf-8")):
        c.pop("sameSite", None)
        c.pop("expiry", None)
        try:
            driver.add_cookie(c)
        except Exception:
            pass

    # reload so the cookies take effect
    driver.get("https://www.icourse163.org/")
    time.sleep(2)
    print("Logged in with saved cookies!")

# ---------- collect teacher names ----------
def get_all_teachers(driver):
    all_teachers = []

    def collect():
        # teacher names live in the alt attribute of the avatar images
        lst = []
        try:
            items = driver.find_elements(By.XPATH,
                "//div[@class='m-teachers_teacher-list']//div[@class='um-list-slider_con_item']//img"
            )
            for img in items:
                name = img.get_attribute("alt")
                if name:
                    lst.append(name)
        except Exception:
            pass
        return lst

    # first slide
    all_teachers += collect()

    # slider arrow buttons (0, 1, or 2 of them)
    while True:
        buttons = driver.find_elements(
            By.XPATH, "//div[@class='um-list-slider_con']/div/div[1]/span"
        )
        if len(buttons) == 2:   # middle slide: click the right arrow
            buttons[1].click()
            time.sleep(1.5)
            all_teachers += collect()
        else:
            break

    return all_teachers

# ---------- crawl a single course page ----------
def crawl_course(driver, wait, url, cursor, db):
    print(f"\nCrawling: {url}")
    driver.get(url)
    time.sleep(2)

    # absolute XPaths copied from DevTools; brittle if the page layout changes
    cCourse  = safe_text(wait, "/html/body/div[5]/div[2]/div[1]/div/div/div/div[2]/div[2]/div/div[2]/div[1]/span[1]")
    cCollege = safe_attr(wait, "/html/body/div[5]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div/a/img", "alt")

    teachers = get_all_teachers(driver)
    print(f"Found {len(teachers)} teacher(s): {teachers}")

    cTeacher = teachers[0] if teachers else ""
    cTeam    = ",".join(teachers)

    cCount = safe_text(wait, "/html/body/div[5]/div[2]/div[1]/div/div/div/div[2]/div[2]/div/div[3]/div/div[1]/div[4]/span[2]")
    cProcess = safe_text(wait, "/html/body/div[5]/div[2]/div[1]/div/div/div/div[2]/div[2]/div/div[3]/div/div[1]/div[2]/div/span[2]")
    cBrief   = safe_text(wait, "/html/body/div[5]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/div[2]/div[1]")

    cursor.execute("""
        INSERT INTO course_info(url, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s)
    """, (url, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief))
    db.commit()

    print(f"Finished: {cCourse}")

# ---------- main program ----------
def main():
    db = pymysql.connect(
        host="localhost", user="root", password="xuzhian123",
        database="mooc_comment_db", charset="utf8mb4"
    )
    cursor = db.cursor()

    cursor.execute("""
        CREATE TABLE IF NOT EXISTS course_info(
            id INT AUTO_INCREMENT PRIMARY KEY,
            url VARCHAR(255),
            cCourse VARCHAR(255),
            cCollege VARCHAR(255),
            cTeacher VARCHAR(255),
            cTeam TEXT,
            cCount VARCHAR(50),
            cProcess VARCHAR(255),
            cBrief TEXT
        );
    """)
    db.commit()

    # Selenium setup: build ChromeOptions first, because
    # add_experimental_option() returns None and must not be chained
    options = webdriver.ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    driver = webdriver.Chrome(options=options)
    wait = WebDriverWait(driver, 20)

    load_cookies(driver)

    # read course URLs from stdin
    print("Enter course URLs (multiple allowed, separated by spaces or newlines; blank line to finish):")
    urls = []
    while True:
        line = input().strip()
        if not line: break
        urls.extend(line.split())

    if not urls:
        print("No URLs entered, exiting.")
        return

    for url in urls:
        crawl_course(driver, wait, url, cursor, db)

    driver.quit()
    db.close()
    print("\nAll courses crawled!")

if __name__ == "__main__":
    main()

Cookie saver:

import json, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

COOKIE_FILE = "cookies.json"


def save_cookies(driver):
    with open(COOKIE_FILE, "w", encoding="utf-8") as f:
        json.dump(driver.get_cookies(), f, ensure_ascii=False, indent=2)
    print("Cookies saved to cookies.json!")


def generate_cookie():
    options = webdriver.ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    options.add_argument('--log-level=3')

    driver = webdriver.Chrome(options=options)
    wait = WebDriverWait(driver, 20)

    print("Please scan the QR code to log in...")
    driver.get("https://www.icourse163.org/")
    time.sleep(2)

    # ---------- open the login dialog ----------
    login_selectors = [
        "/html/body/div[5]/div[2]/div[1]/div/div/div[1]/div[3]/div[3]/div",
        "//div[contains(text(),'登录')]"
    ]
    for sel in login_selectors:
        try:
            wait.until(EC.element_to_be_clickable((By.XPATH, sel))).click()
            print("Login dialog opened")
            break
        except Exception:
            pass
    else:
        print("Could not find the login button on the home page")
        driver.quit()
        return

    time.sleep(2)

    # ---------- switch to QR-code login ----------
    qrcode_selectors = [
        "/html/body/div[13]/div/div[2]/div/div/div/div/div/div[1]/div/div[2]/div[2]/img",
        "//img[contains(@alt,'码') or contains(@title,'码')]",
        "//div[contains(@class,'qrcode')]//img",
        "//*[contains(text(),'扫码登录') or contains(text(),'二维码')]"
    ]

    for sel in qrcode_selectors:
        try:
            wait.until(EC.element_to_be_clickable((By.XPATH, sel))).click()
            print("Switched to QR-code login, please scan...")
            break
        except Exception:
            pass

    input("Press Enter after scanning the QR code and logging in...")

    save_cookies(driver)
    driver.quit()


if __name__ == "__main__":
    generate_cookie()

Reflections: another step further in my understanding of Selenium.

• Assignment 3:

• Get familiar with the big-data-related cloud services and with using Xshell.

• Complete the tasks in the document 华为云_大数据实时分析处理实验手册-Flume日志采集实验(部分)v2.docx, i.e. the five tasks below; see the document for the detailed steps.

• Environment setup:

· Task 1: Provision the MapReduce (MRS) service

• Real-time analysis development practice:

· Task 1: Generate test data with a Python script

· Task 2: Configure Kafka

· Task 3: Install the Flume client

· Task 4: Configure Flume to collect data

Output: screenshots of the key steps or results of the experiment.

Experiment 2: Real-time big data analysis

1.1 Apply for an elastic public IP (EIP)

1.2 Purchase a MapReduce (MRS) cluster

1.3 Provision the RDS cloud database service

1.4 Create an elastic resource pool

1.5 Enable the Data Lake Visualization (DLV) service

2.1 Generate test data with a Python script
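The manual ships its own generator script; the sketch below only illustrates the idea of this step, appending one fake log record per second to a file that a collector can tail. The path /tmp/genlog/access.log and the record layout are my assumptions, not the manual's values.

import os
import random
import time

LOG_DIR = "/tmp/genlog"
LOG_FILE = os.path.join(LOG_DIR, "access.log")
ACTIONS = ["view", "click", "cart", "buy"]

os.makedirs(LOG_DIR, exist_ok=True)

# append one fake record per second so that a `tail -F`-style
# collector (e.g. the Flume source in task 2.4) picks them up
with open(LOG_FILE, "a", encoding="utf-8") as f:
    for _ in range(100):
        record = "{},user{},{},{}".format(
            int(time.time()),          # event timestamp
            random.randint(1, 50),     # fake user id
            random.choice(ACTIONS),    # fake action type
            random.randint(1, 999),    # fake item id
        )
        f.write(record + "\n")
        f.flush()
        time.sleep(1)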

2.2 Configure Kafka
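Once Flume starts feeding the topic (task 2.4), one way to confirm that messages are actually arriving is to run a short consumer from the client node. This is only a sketch: it assumes the kafka-python package is installed, and the broker address and topic name are placeholders for your own cluster's values.

from kafka import KafkaConsumer  # pip install kafka-python

# Broker address and topic name are placeholders - substitute the
# values of your own MRS Kafka instance.
consumer = KafkaConsumer(
    "flume_topic",
    bootstrap_servers="192.168.0.10:9092",
    auto_offset_reset="earliest",   # start from the oldest message
    consumer_timeout_ms=10000,      # give up after 10 s of silence
)

for msg in consumer:
    print(msg.value.decode("utf-8", errors="replace"))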

2.3 Install the Flume client

2.4 Configure Flume to collect data
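The actual properties file comes from the lab manual; as a rough sketch of the shape of such a configuration, a minimal Flume agent that tails the generated log and forwards each line to Kafka could look like this (every agent/source/sink name, path, and address below is a placeholder):

# Flume agent sketch: tail the test-data file and push each line to Kafka.
# All names, paths, and addresses are placeholders, not the manual's values.
client.sources = s1
client.channels = c1
client.sinks = k1

# exec source: follow the test-data file from task 2.1
client.sources.s1.type = exec
client.sources.s1.command = tail -F /tmp/genlog/access.log
client.sources.s1.channels = c1

# in-memory buffer between source and sink
client.channels.c1.type = memory
client.channels.c1.capacity = 10000
client.channels.c1.transactionCapacity = 1000

# Kafka sink: one log line per Kafka message
client.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
client.sinks.k1.kafka.bootstrap.servers = 192.168.0.10:9092
client.sinks.k1.kafka.topic = flume_topic
client.sinks.k1.channel = c1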

2.5 Prepare the result and dimension tables in MySQL

Create the result table for the Flink job and verify that it was created

2.6 Analyze the data with a Flink job in DLI

Edit the Flink job's SQL script

Test the address connectivity

Check the job's run details

2.7 Visualize the data with DLV
