第四次作业

作业①

股票爬取

实验要求

使用Selenium框架+ MySQL数据库存储技术路线爬取“沪深A股”、“上证A股”、“深证A股”3个板块的股票数据信息。

核心代码和运行结果

点击查看代码
import pymysql
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
# MySQL connection settings used by the stock crawler.
MYSQL_HOST = "localhost"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"
MYSQL_DB = "stocks_db"
MYSQL_PORT = 3306

# (board display name, URL fragment appended to the grid-list page address).
board_list = [
    ("沪深A股", "#hs_a_board"),
    ("上证A股", "#sh_a_board"),
    ("深证A股", "#sz_a_board"),
]

# Number of result pages read for each board.
max_page = 3


def get_driver():
    """Start Chrome with automation-hiding options and return the driver.

    The window is maximised before the driver is handed back to the caller.
    """
    chrome_options = Options()
    # Both settings aim to keep the site from detecting an automated browser.
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])

    browser = webdriver.Chrome(options=chrome_options)
    browser.maximize_window()
    return browser


# Highest cell index read from a table row (prev_close comes from tds[13]).
_LAST_CELL_INDEX = 13


def _create_stock_table(conn, cursor):
    """Drop any existing ``stock_data`` table and create an empty one.

    Every scraped field is stored as text (VARCHAR), exactly as read from
    the page.
    """
    cursor.execute("DROP TABLE IF EXISTS stock_data")
    sql = '''
          CREATE TABLE stock_data \
          ( \
              id             INT AUTO_INCREMENT PRIMARY KEY, \
              board_type     VARCHAR(50), \
              stock_code     VARCHAR(20), \
              stock_name     VARCHAR(50), \
              latest_price   VARCHAR(20), \
              change_percent VARCHAR(20), \
              change_amount  VARCHAR(20), \
              volume         VARCHAR(50), \
              turnover       VARCHAR(50), \
              amplitude      VARCHAR(20), \
              high           VARCHAR(20), \
              low            VARCHAR(20), \
              open_price     VARCHAR(20), \
              prev_close     VARCHAR(20)
          ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
          '''
    cursor.execute(sql)
    conn.commit()


def _rows_on_current_page(driver, board_name):
    """Return one 13-item tuple per stock row of the page shown in *driver*.

    Rows that have too few cells, or an empty stock-code cell, are skipped.
    """
    rows = []
    for tr in driver.find_elements(By.XPATH, "//table//tbody/tr"):
        tds = tr.find_elements(By.TAG_NAME, "td")
        # Cells up to index 13 are read below, so a shorter row must be
        # skipped here; the previous ``< 10`` guard let rows with 10-13 cells
        # through and they crashed the crawl with IndexError.
        if len(tds) <= _LAST_CELL_INDEX:
            continue
        code = tds[1].text
        if not code:
            continue
        # Cell 3 is deliberately not stored (presumably a links column —
        # TODO confirm against the live page).
        rows.append(
            (board_name, code, tds[2].text)
            + tuple(tds[k].text for k in range(4, _LAST_CELL_INDEX + 1))
        )
    return rows


def _insert_rows(conn, cursor, rows):
    """Bulk-insert scraped *rows* into ``stock_data`` and commit."""
    insert_sql = '''
                                 INSERT INTO stock_data (board_type, stock_code, stock_name, latest_price,
                                                         change_percent, change_amount, volume, turnover,
                                                         amplitude, high, low, open_price, prev_close)
                                 VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                                 '''
    cursor.executemany(insert_sql, rows)
    conn.commit()


def main():
    """Crawl every board in ``board_list`` and store the rows in MySQL.

    For each board the first ``max_page`` pages of the grid list are read and
    each page is written to a freshly created ``stock_data`` table. If the
    database connection cannot be opened the function prints the error and
    returns. The browser, cursor and connection are always released, even
    when table creation, browser start-up or scraping fails.
    """
    print("作业开始运行了...")
    print("正在连接 MySQL 数据库...")

    try:
        conn = pymysql.connect(
            host=MYSQL_HOST,
            user=MYSQL_USER,
            password=MYSQL_PASSWORD,
            database=MYSQL_DB,
            port=MYSQL_PORT,
            charset='utf8mb4'
        )
        c = conn.cursor()
    except Exception as e:
        print(f"数据库连接失败: {e}")
        return

    # Stays None until the browser has started, so the cleanup below knows
    # whether there is anything to quit.
    driver = None
    try:
        _create_stock_table(conn, c)
        print("数据库表创建成功!")

        driver = get_driver()

        for b_name, b_code in board_list:
            url = "http://quote.eastmoney.com/center/gridlist.html" + b_code
            print(f"\n正在爬取:{b_name},网址是:{url}")
            driver.get(url)
            # NOTE(review): boards differ only in the URL fragment, so the
            # refresh presumably forces the page to load the new board.
            driver.refresh()

            for i in range(1, max_page + 1):
                data_to_save = _rows_on_current_page(driver, b_name)

                if data_to_save:
                    _insert_rows(conn, c, data_to_save)
                    print(f"    第 {i} 页保存成功,存了 {len(data_to_save)} 条数据。")
                else:
                    print("lose")

                if i < max_page:
                    try:
                        next_button = driver.find_element(By.CSS_SELECTOR, "a[title='下一页']")
                        # Click through JavaScript rather than WebElement.click().
                        driver.execute_script("arguments[0].click();", next_button)
                        # Fixed pause to let the next page of rows render.
                        time.sleep(5)
                    except Exception as e:
                        print(f"翻页失败: {e}")
                        break
            time.sleep(2)
    finally:
        if driver is not None:
            driver.quit()
        c.close()
        conn.close()
        print("easy ")


if __name__ == "__main__":
    main()

image
image

心得体会

image
我首先用 XPath 抓取表格的每一行 tr,然后直接定位其中的 td 标签,每个 td 标签内就是我需要的数据;翻页时用 CSS 选择器按 title 属性定位“下一页”按钮。最开始我用的是 .click(),但由于弹出广告的缘故会点击失败,所以我最后改用 JS 点击,这样就可以成功翻页。
image
gitee:https://gitee.com/abcman12/2025_crawl_project/blob/master/作业4/stock.csv

作业②

mooc爬取

实验要求

使用Selenium框架+MySQL爬取中国mooc网课程资源信息(课程号、课程名称、学校名称、主讲教师、团队成员、参加人数、课程进度、课程简介)

核心代码与运行结果

点击查看代码
import pymysql
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

# --- Database settings (edit to match your environment) ---
MYSQL_HOST = "localhost"
MYSQL_USER = "root"
MYSQL_PASS = "root"
MYSQL_DB = "mooc"

# Page opened first; the course links are collected from it.
target_url = (
    "https://www.icourse163.org/home.htm"
    "?userId=1528358600#/home/course"
)


def _create_courses_table(conn, cur):
    """Drop any existing ``courses`` table and create an empty one."""
    cur.execute("DROP TABLE IF EXISTS courses")
    sql = '''
          CREATE TABLE courses \
          ( \
              id       INT AUTO_INCREMENT PRIMARY KEY, \
              cCourse  VARCHAR(255), \
              cCollege VARCHAR(255), \
              cTeacher VARCHAR(255), \
              cCount   VARCHAR(100), \
              cProcess VARCHAR(100), \
              cBrief   TEXT, \
              url      VARCHAR(500)
          )
          '''
    cur.execute(sql)
    conn.commit()


def _first_text(driver, xpath, default):
    """Return the text of the first element matching *xpath*, else *default*."""
    found = driver.find_elements(By.XPATH, xpath)
    return found[0].text if found else default


def _college_name(driver, default):
    """Return the school name of the course page loaded in *driver*.

    The name is taken from the logo image's ``alt`` attribute, then from its
    ``title`` attribute. When no logo image is found, the text of the link
    under ``#j-teacher`` is used. *default* is returned when none of these
    yields a value.
    """
    pic_xpath = '/html/body/div[5]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div/a/img'
    logos = driver.find_elements(By.XPATH, pic_xpath)
    if logos:
        # Fall back to the placeholder when both attributes are missing;
        # previously a missing ``title`` stored None as the school name.
        return (
            logos[0].get_attribute('alt')
            or logos[0].get_attribute('title')
            or default
        )
    return _first_text(driver, '//*[@id="j-teacher"]/div/a', default)


def _scrape_course(driver):
    """Read one course page and return its fields as a tuple.

    Returns ``(course, college, teacher, count, process, brief)``. Each field
    falls back to a placeholder string when its element is not found.
    """
    # Absolute XPaths copied from the page; they depend on its exact layout.
    course_xpath = '/html/body/div[5]/div[2]/div[1]/div/div/div/div[2]/div[2]/div/div[2]/div[1]/span[1]'
    teacher_xpath = '/html/body/div[5]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div/div/div[2]/div/div/div[1]/div/div/h3'
    brief_xpath = '/html/body/div[5]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/div[2]/div[1]'
    count_xpath = '/html/body/div[5]/div[2]/div[1]/div/div/div/div[2]/div[2]/div/div[3]/div/div[1]/div[4]/span[2]'
    process_xpath = '/html/body/div[5]/div[2]/div[1]/div/div/div/div[2]/div[2]/div/div[3]/div/div[1]/div[4]/span[1]'

    c_college = _college_name(driver, "未找到")
    c_course = _first_text(driver, course_xpath, "未找到")
    c_teacher = _first_text(driver, teacher_xpath, "未找到")

    c_brief = _first_text(driver, brief_xpath, "无简介")
    # Keep only the first 200 characters of a long introduction.
    if len(c_brief) > 200:
        c_brief = c_brief[:200] + "..."

    c_count = _first_text(driver, count_xpath, "0")
    c_process = _first_text(driver, process_xpath, "未找到")
    return c_course, c_college, c_teacher, c_count, c_process, c_brief


def _save_course(conn, cur, record):
    """Insert one course *record* (seven values) into ``courses`` and commit."""
    insert_sql = '''
                     INSERT INTO courses (cCourse, cCollege, cTeacher, cCount, cProcess, cBrief, url)
                     VALUES (%s, %s, %s, %s, %s, %s, %s)
                     '''
    cur.execute(insert_sql, record)
    conn.commit()


def main():
    """Crawl the course pages linked from ``target_url`` into MySQL.

    Recreates the ``courses`` table, opens ``target_url`` in Chrome, waits
    for a manual QR-code login, collects every distinct course link on the
    page, then visits each link and stores one row per course. The browser,
    cursor and connection are released even when a step fails.
    """
    print("正在连接数据库...")
    conn = pymysql.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASS,
        database=MYSQL_DB,
        charset='utf8mb4'
    )
    cur = conn.cursor()
    print("数据库连接成功")

    # Stays None until the browser has started, so cleanup can tell whether
    # there is anything to quit.
    driver = None
    try:
        _create_courses_table(conn, cur)
        print("表建好了")

        # --- Start the browser ---
        options = webdriver.ChromeOptions()
        options.add_argument('--disable-blink-features=AutomationControlled')
        driver = webdriver.Chrome(options=options)
        driver.maximize_window()

        # --- Collect the course links ---
        print(f"打开网页:{target_url}")
        driver.get(target_url)

        print("扫码登录")
        # Fixed pause that gives the user time to scan the login QR code.
        time.sleep(10)

        print("开始获取课程链接")
        url_list = []
        seen = set()
        for link in driver.find_elements(By.TAG_NAME, 'a'):
            href = link.get_attribute('href')
            if not href or href in seen:
                continue
            # Keep course pages only; search-result links are excluded.
            if '/course/' in href and 'search.htm' not in href:
                seen.add(href)
                url_list.append(href)

        print(f"一共找到 {len(url_list)} 个课程")
        for count, u in enumerate(url_list, start=1):
            print(f"\n正在爬第 {count} 个:{u}")

            driver.get(u)
            time.sleep(3)

            record = _scrape_course(driver)
            _save_course(conn, cur, record + (u,))
            print(f"    保存成功:{record[0]}")
    finally:
        if driver is not None:
            driver.quit()
        cur.close()
        conn.close()

    print("运行结束")


if __name__ == "__main__":
    main()

image

image

心得体会

我选择爬取的是mooc的我的课程部分的信息,如图所示

image

image

我的主要思路是先拿到每门课程的 url,再把题目要求的每项信息对应的 XPath 复制下来,用这种简单粗暴的方式逐项爬取,从而满足题目要求。

gitee:https://gitee.com/abcman12/2025_crawl_project/blob/master/作业4/mooc.csv

作业③

flume

实验要求

完成文档“华为云_大数据实时分析处理实验手册-Flume日志采集实验(部分)v2.docx”中的任务

运行结果

python脚本生成测试数据,查看生成结果
image
配置kafka
image
创建topic
image
安装flume客户端
image
安装成功
image

配置flume采集数据
image
image

心得体会

熟悉了flume操作过程

posted @ 2025-11-30 11:51  abc。  阅读(0)  评论(0)    收藏  举报