数据采集作业4

目录

1.第一题


使用Selenium框架+ MySQL数据库存储技术路线爬取“沪深A股”、“上证A股”、“深证A股”3个板块的股票数据信息。

1.1主要代码


第一题全部代码

点击查看代码
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
import pymysql
from beautifultable import BeautifulTable

driver = webdriver.Edge(service=Service())
class DatabaseManager:
    def __init__(self):
        self.db = None
        self.cursor = None
        self.connect()

    def connect(self):
        """连接数据库"""
        try:
            self.db = pymysql.connect(
                host='localhost',
                user='root',
                password='826922',
                database='stocks_db',
                charset='utf8mb4',
                port=3306
            )
            self.cursor = self.db.cursor()
            print("MySQL数据库连接成功")
            self.create_table()
        except Exception as e:
            print(f"MySQL连接失败: {e}")
            self.db = None
            self.cursor = None

    def create_table(self):
        """创建数据表"""
        self.cursor.execute('DROP TABLE IF EXISTS stocks1')
        try:
            self.cursor.execute('''
                CREATE TABLE IF NOT EXISTS stocks1 (
                   stock_code VARCHAR(20) PRIMARY KEY,
                    stock_name VARCHAR(50),
                    last_price DECIMAL(10,2),
                    change_percent VARCHAR(30),
                    change_amount DECIMAL(10,2),
                    volume VARCHAR(30),
                    turnover VARCHAR(30),
                    turnover_rate VARCHAR(30),
                    high_price DECIMAL(10,2),
                    low_price DECIMAL(10,2),
                    open_price DECIMAL(10,2),
                    close_price DECIMAL(10,2)
                ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
            ''')
            self.db.commit()
            print("数据表 stocks1 创建成功或已存在")
        except Exception as e:
            print(f"创建表失败: {e}")

    def insert_stock_data(self, stock_data):
        """插入股票数据"""
        if not self.db or not self.cursor:
            print("数据库未连接,跳过数据插入")
            return False
        try:
            sql = '''
                INSERT INTO stocks1 
                (stock_code, stock_name, last_price, change_percent, change_amount, 
                 volume, turnover,turnover_rate,high_price, low_price, open_price, close_price)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s)
                ON DUPLICATE KEY UPDATE
                stock_name = VALUES(stock_name),
                last_price = VALUES(last_price),
                change_percent = VALUES(change_percent),
                change_amount = VALUES(change_amount),
                volume = VALUES(volume),
                turnover = VALUES(turnover),turnover_rate = VALUES(turnover_rate),
                high_price = VALUES(high_price),
                low_price = VALUES(low_price),
                open_price = VALUES(open_price),
                close_price = VALUES(close_price)
            '''

            self.cursor.executemany(sql, stock_data)
            self.db.commit()
            print(f"成功插入/更新 {self.cursor.rowcount} 条股票数据")
            return True
        except Exception as e:
            self.db.rollback()
            print(f"数据插入失败: {e}")
            return False
def get_html(url, max_page):
    """获取HTML数据"""
    driver.get(url)
    current_page = 1
    data = []

    while current_page <= max_page:
        try:
            print(f"\n正在获取第 {current_page} 页数据...")

            # 等待表格数据加载完成
            wait = WebDriverWait(driver, 10)
            marks = wait.until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, '//table//tbody/tr[td[2]/a[contains(@href, "quote.eastmoney.com")]]'))
            )
            print(f"找到 {len(marks)} 条数据")

            # 提取当前页的所有数据
            for mark in marks:
                try:
                    code = mark.find_element(By.XPATH, './/td[2]').text
                    name = mark.find_element(By.XPATH, './/td[3]').text

                    # 价格信息
                    last_price = mark.find_element(By.XPATH, './/td[5]').text
                    change_percent = mark.find_element(By.XPATH, './/td[6]').text
                    change_amount = mark.find_element(By.XPATH, './/td[7]').text

                    # 成交量信息
                    volume = mark.find_element(By.XPATH, './/td[8]').text
                    turnover = mark.find_element(By.XPATH, './/td[9]').text
                    turnover_rate = mark.find_element(By.XPATH, './/td[10]').text

                    # 价格区间
                    high_price = mark.find_element(By.XPATH, './/td[11]').text
                    low_price = mark.find_element(By.XPATH, './/td[12]').text
                    open_price = mark.find_element(By.XPATH, './/td[13]').text
                    close_price = mark.find_element(By.XPATH, './/td[14]').text

                    data.append([ code, name, last_price, change_percent, change_amount,
                                 volume, turnover, turnover_rate, high_price, low_price,
                                 open_price, close_price])

                except NoSuchElementException as e:
                    print(f"提取数据时元素未找到: {e}")
                    continue

            # 翻页逻辑(处理完当前页所有数据后再翻页)
            if current_page < max_page:
                if not click_next_page():
                    print("没有下一页,停止翻页")
                    break
            else:
                break  # 已经达到最大页数,退出循环

            # 当前页处理完成,页码递增
            current_page += 1
            time.sleep(2)  # 翻页间隔

        except TimeoutException:
            print("页面加载超时,未找到数据")
            break
        except Exception as e:
            print(f"获取第 {current_page} 页数据时出错: {e}")
            break

    return data  # 添加返回值


def click_next_page():
    try:
        time.sleep(2)

        # 查找并点击下一页按钮
        next_buttons = driver.find_elements(By.XPATH, '//a[@title="下一页"]')

        for button in next_buttons:
            if button.is_displayed() and button.is_enabled():
                driver.execute_script("arguments[0].click();", button)
                time.sleep(3)
                return True

        return False

    except Exception as e:
        print(f"翻页失败: {e}")
        return False
def main():
    try:
        markets = [
            "https://quote.eastmoney.com/center/gridlist.html#hs_a_board",
            "https://quote.eastmoney.com/center/gridlist.html#sh_a_board",
            "https://quote.eastmoney.com/center/gridlist.html#sz_a_board"
        ]

        market_names = ["沪深京A股", "上证A股", "深证A股"]
        all_data = []
        db_manager = DatabaseManager()
        for i, url in enumerate(markets):
            print(f"\n{'=' * 50}")
            print(f"开始处理板块: {market_names[i]}")
            print(f"{'=' * 50}")

            data = get_html(url, 2)
            if data:
                all_data.extend(data)
                print(f"{market_names[i]} 获取到 {len(data)} 条数据")
                # 显示当前板块的数据预览
                print(f"\n{market_names[i]} 数据:")
                table = BeautifulTable()
                table.set_style(BeautifulTable.STYLE_COMPACT)
                table.columns.alignment = BeautifulTable.ALIGN_CENTER
                table.columns.header = [
                    '代码', '名称', '最新价', '涨跌幅', '涨跌额',
                    '成交量', '成交额', '换手率', '最高价', '最低价', '开盘价', '昨收价'
                ]

                table.columns.width = [14,12,12,12,12,12,12,12,12,12,12,12]
                for product in data:
                    # 简化商品名称显示
                    table.rows.append(product)
                print(table)
            else:
                print(f"{market_names[i]} 没有获取到数据")

            # 板块间短暂停顿
            if i < len(markets) - 1:
                time.sleep(3)
        if all_data:
            success = db_manager.insert_stock_data(all_data)
            if success:
                print(f"成功保存股票数据到数据库")
            else:
                print("保存数据到数据库失败")
        else:
            print("没有获取到任何股票数据")
    except Exception as e:
        print(f"主程序执行出错: {e}")
    finally:
        input("按回车键关闭浏览器...")
        driver.quit()


if __name__ == "__main__":
    main()

image

通过图片可以看到通过标签//table//tbody/tr定位到每个股票的数据,再通过td[i]来提取到对应的数据,但是仅靠这个提取,会出现一个空的标头,所以加上[td[2]/a[contains(@href, "quote.eastmoney.com")]],来更好的定位.

def get_html(url, max_page):
    """获取HTML数据"""
    driver.get(url)
    current_page = 1
    data = []

    while current_page <= max_page:
        try:
            print(f"\n正在获取第 {current_page} 页数据...")

            # 等待表格数据加载完成
            wait = WebDriverWait(driver, 10)
            marks = wait.until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, '//table//tbody/tr[td[2]/a[contains(@href, "quote.eastmoney.com")]]'))
            )
            print(f"找到 {len(marks)} 条数据")

            # 提取当前页的所有数据
            for mark in marks:
                try:
                    code = mark.find_element(By.XPATH, './/td[2]').text
                    name = mark.find_element(By.XPATH, './/td[3]').text

                    # 价格信息
                    last_price = mark.find_element(By.XPATH, './/td[5]').text
                    change_percent = mark.find_element(By.XPATH, './/td[6]').text
                    change_amount = mark.find_element(By.XPATH, './/td[7]').text

                    # 成交量信息
                    volume = mark.find_element(By.XPATH, './/td[8]').text
                    turnover = mark.find_element(By.XPATH, './/td[9]').text
                    turnover_rate = mark.find_element(By.XPATH, './/td[10]').text

                    # 价格区间
                    high_price = mark.find_element(By.XPATH, './/td[11]').text
                    low_price = mark.find_element(By.XPATH, './/td[12]').text
                    open_price = mark.find_element(By.XPATH, './/td[13]').text
                    close_price = mark.find_element(By.XPATH, './/td[14]').text

                    data.append([ code, name, last_price, change_percent, change_amount,
                                 volume, turnover, turnover_rate, high_price, low_price,
                                 open_price, close_price])

                except NoSuchElementException as e:
                    print(f"提取数据时元素未找到: {e}")
                    continue

            # 翻页逻辑(处理完当前页所有数据后再翻页)
            if current_page < max_page:
                if not click_next_page():
                    print("没有下一页,停止翻页")
                    break
            else:
                break  

            # 当前页处理完成,页码递增
            current_page += 1
            time.sleep(2)  # 翻页间隔

        except TimeoutException:
            print("页面加载超时,未找到数据")
            break
        except Exception as e:
            print(f"获取第 {current_page} 页数据时出错: {e}")
            break

    return data  

image

对于翻页,通过定位到下一页的按钮,然后自动点击。

def click_next_page():
    try:
        time.sleep(2)

        # 查找并点击下一页按钮
        next_buttons = driver.find_elements(By.XPATH, '//a[@title="下一页"]')

        for button in next_buttons:
            if button.is_displayed() and button.is_enabled():
                driver.execute_script("arguments[0].click();", button)
                time.sleep(3)
                return True

        return False

    except Exception as e:
        print(f"翻页失败: {e}")
        return False

再往数据库里面存储数据

点击查看代码
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
import pymysql
from beautifultable import BeautifulTable

driver = webdriver.Edge(service=Service())
class DatabaseManager:
    def __init__(self):
        self.db = None
        self.cursor = None
        self.connect()

    def connect(self):
        """连接数据库"""
        try:
            self.db = pymysql.connect(
                host='localhost',
                user='root',
                password='826922',
                database='stocks_db',
                charset='utf8mb4',
                port=3306
            )
            self.cursor = self.db.cursor()
            print("MySQL数据库连接成功")
            self.create_table()
        except Exception as e:
            print(f"MySQL连接失败: {e}")
            self.db = None
            self.cursor = None

    def create_table(self):
        """创建数据表"""
        self.cursor.execute('DROP TABLE IF EXISTS stocks1')
        try:
            self.cursor.execute('''
                CREATE TABLE IF NOT EXISTS stocks1 (
                   stock_code VARCHAR(20) PRIMARY KEY,
                    stock_name VARCHAR(50),
                    last_price DECIMAL(10,2),
                    change_percent VARCHAR(30),
                    change_amount DECIMAL(10,2),
                    volume VARCHAR(30),
                    turnover VARCHAR(30),
                    turnover_rate VARCHAR(30),
                    high_price DECIMAL(10,2),
                    low_price DECIMAL(10,2),
                    open_price DECIMAL(10,2),
                    close_price DECIMAL(10,2)
                ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
            ''')
            self.db.commit()
            print("数据表 stocks1 创建成功或已存在")
        except Exception as e:
            print(f"创建表失败: {e}")

    def insert_stock_data(self, stock_data):
        """插入股票数据"""
        if not self.db or not self.cursor:
            print("数据库未连接,跳过数据插入")
            return False
        try:
            sql = '''
                INSERT INTO stocks1 
                (stock_code, stock_name, last_price, change_percent, change_amount, 
                 volume, turnover,turnover_rate,high_price, low_price, open_price, close_price)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s)
                ON DUPLICATE KEY UPDATE
                stock_name = VALUES(stock_name),
                last_price = VALUES(last_price),
                change_percent = VALUES(change_percent),
                change_amount = VALUES(change_amount),
                volume = VALUES(volume),
                turnover = VALUES(turnover),turnover_rate = VALUES(turnover_rate),
                high_price = VALUES(high_price),
                low_price = VALUES(low_price),
                open_price = VALUES(open_price),
                close_price = VALUES(close_price)
            '''

            self.cursor.executemany(sql, stock_data)
            self.db.commit()
            print(f"成功插入/更新 {self.cursor.rowcount} 条股票数据")
            return True
        except Exception as e:
            self.db.rollback()
            print(f"数据插入失败: {e}")
            return False

1.2实验结果

从运行结果里面查看:
image
从Mysql里面查看:
image

1.3心得提会

在本次实验中,通过使用 Selenium 爬取了东方财富网的股票数据。在提取数据的时候,不需要做可以处理,可以直接读出单位与正确的数据格式,其次,在翻页的时候,可以通过定位到下一页的按钮的位置,直接进行点击,而非之前的,需要采用url里面的页数的参数,来作为翻页。最后往数据库里面使用ON DUPLICATE KEY UPDATE语句的设计很实用,能够实现数据的更新而非简单插入。

通过这个股票数据爬取项目的实践,我深刻体会到编程不仅是代码的堆砌,更是一场与数据和细节的博弈。在数据抓取中,最复杂的往往不是技术本身,而是对网站结构变化的应对和处理各种意外情况的能力。翻页逻辑的调试、数据格式的统一、异常处理的完善,每一步都需要严谨的思考。

2.第二题


使用Selenium框架+MySQL爬取中国mooc网课程资源信息(课程号、课程名称、学校名称、主讲教师、团队成员、参加人数、课程进度、课程简介)

2.1主要代码


第二题全部代码

点击查看代码
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from beautifultable import BeautifulTable
from selenium.webdriver.common.keys import Keys
import pymysql
driver = webdriver.Edge(service=Service())


#数据库:
class DatabaseManager:
    def __init__(self):
        self.db = None
        self.cursor = None
        self.connect()

    def connect(self):
        """连接数据库"""
        try:
            self.db = pymysql.connect(
                host='localhost',
                user='root',
                password='826922',
                database='mooc_db',
                charset='utf8mb4',
                port=3306
            )
            self.cursor = self.db.cursor()
            print("MySQL数据库连接成功")
            self.create_table()
        except Exception as e:
            print(f"MySQL连接失败: {e}")
            self.db = None
            self.cursor = None

    def create_table(self):
        """创建数据表"""
        self.cursor.execute('DROP TABLE IF EXISTS mooc1')
        try:
            self.cursor.execute('''
                CREATE TABLE IF NOT EXISTS mooc1 (
                    mooc_name VARCHAR(255),
                    school VARCHAR(100),
                    teacher VARCHAR(100),
                    teamates TEXT,
                    num VARCHAR(50),
                    schedule VARCHAR(100),
                    text TEXT
                ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
            ''')
            self.db.commit()
            print("数据表 mooc1 创建成功或已存在")
        except Exception as e:
            print(f"创建表失败: {e}")

    def insert_mooc_data(self, mooc_data):
        """插入mooc数据"""
        if not self.db or not self.cursor:
            print("数据库未连接,跳过数据插入")
            return False
        try:
            sql = '''
                INSERT INTO mooc1 
                (mooc_name, school, teacher, teamates, num,schedule,text)
                VALUES (%s, %s, %s, %s, %s, %s, %s)
            '''

            self.cursor.executemany(sql, mooc_data)
            self.db.commit()
            print(f"成功插入/更新 {self.cursor.rowcount} 条股票数据")
            return True
        except Exception as e:
            self.db.rollback()
            print(f"数据插入失败: {e}")
            return False
def click_login_button():
    try:
        wait = WebDriverWait(driver, 10)

        login_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME,'_3uWA6')))
        login_button.click()
        print("点击登录按钮成功")
        return True
    except Exception as e:
        print(f"点击登录按钮失败: {e}")
        return False

def fill_login_form():
    try:
        try:
        # 查找所有iframe
            iframes = driver.find_elements(By.TAG_NAME, "iframe")
            print(f"找到 {len(iframes)} 个iframe")

            # 切换到第一个iframe(通常是登录框)
            driver.switch_to.frame(iframes[0])
            print("已切换到第一个iframe")
        except Exception as e:
            print(f"切换iframe失败: {e}")

        time.sleep(2)  #等待切换至iframe结构

        #1.找到手机号输入框
        phone_input = driver.find_element(By.ID, "phoneipt")
        phone_input.send_keys("15880128200")
        print("已输入手机号")
        time.sleep(1)

        password_input = driver.find_element(By.CLASS_NAME, "j-inputtext")
        password_input.send_keys("Xwj.826922")

        print("已输入密码")

    # 点击登录
        submit_btn = driver.find_element(By.ID, 'submitBtn')
        submit_btn.click()
        print("已提交登录")
        #再切换回原本的iframe结构
        driver.switch_to.default_content()
        return True

    except Exception as e:
        print(f"自动填写失败: {e}")
        return False

def find_selection(keyword):
    print("等待页面加载...")
    time.sleep(3)
    try:
        select_btn = driver.find_element(By.CLASS_NAME, "ant-input")
        select_btn.clear()
        select_btn.send_keys(keyword)
        print("已成功输入数据")
        select_btn.send_keys(Keys.ENTER)
        print("已按回车键搜索")
        time.sleep(10)
        return True
    except Exception as e:
        print(f"搜索失败:{e}")
        return False

def get_html():
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "_3NYsM")))
    data=[]
    # 给页面更多时间加载
    time.sleep(2)

    # 查找所有搜索结果项
    result_items = driver.find_elements(By.CLASS_NAME, "_3NYsM")
    print(f"找到 {len(result_items)} 个搜索结果")
    for mark in result_items:
        title = mark.find_element(By.CLASS_NAME, '_1vfZ-').text.strip()
        author_elements = mark.find_elements(By.CLASS_NAME, '_3t_C8')
        teacher=author_elements[0].text
        teammates = ' '.join([auth.text.strip() for auth in author_elements])
        school = mark.find_elements(By.CLASS_NAME, '_3vJDG')[0].text.strip() if mark.find_elements(By.CLASS_NAME, '_3vJDG') else 'Null'
        num=mark.find_element(By.CLASS_NAME, '_CWjg').text.strip()
        text = mark.find_element(By.CLASS_NAME, '_3JEMz').text.strip()
        schedule = mark.find_elements(By.CLASS_NAME, 'NOdDs')[0].text.strip() if mark.find_elements(By.CLASS_NAME, 'NOdDs') else 'Null'
        data.append([title,school,teacher, teammates, num,schedule,text])
    return data

def main(url):
    db_manager =DatabaseManager()
    all_data=[]
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    click_login_button()
    time.sleep(10)  # 等待10秒
    fill_login_form()
    time.sleep(10)
    print("等待登录完成...")
    time.sleep(5)  # 等待页面跳转和加载
    # 检查是否登录成功
    print("在新页面查找搜索框...")
    find_selection("计算机网络")
    all_data=get_html()
    if all_data:
        success=db_manager.insert_mooc_data(all_data)
        if success:
            print(f"成功保存{len(all_data)}条数据到数据库")
        else:
            print("保存数据到数据库失败")
    else:
        print("没有获取到课程数据")
    print("等待结束,页面即将关闭")
    return all_data
if __name__ == '__main__':
    url="https://www.icourse163.org/"
    all_data=main(url)
    table = BeautifulTable()
    table.set_style(BeautifulTable.STYLE_COMPACT)
    table.columns.alignment = BeautifulTable.ALIGN_CENTER
    table.columns.header = [
        '课程名称', '学校', '老师', '团队成员', '参与人数',
        '进度', '简介'
    ]

    table.columns.width = [20, 18, 14,20,14,20,100]
    for course in all_data:
        # 简化商品名称显示
        table.rows.append(course)
    print(table)

image

对于爬去中国mooc网时,首先要是实现自动登录的功能,所以使用click_login_button函数,找到定位到‘登录’的按钮

def click_login_button():
    try:
        wait = WebDriverWait(driver, 10)

        login_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME,'_3uWA6')))
        login_button.click()
        print("点击登录按钮成功")
        return True
    except Exception as e:
        print(f"点击登录按钮失败: {e}")
        return False

image
点击登录后,会弹出登录页面,但是输入框是一个嵌套的iframe结构,所以需要切换至iframe,在这里面定位到输入手机号和密码,输入手机号与密码,实现自动登录,并且登录完后需要再重新切换回原本的iframe结构。

def fill_login_form():
    try:
        try:
        # 查找所有iframe
            iframes = driver.find_elements(By.TAG_NAME, "iframe")
            print(f"找到 {len(iframes)} 个iframe")

            # 切换到第一个iframe(通常是登录框)
            driver.switch_to.frame(iframes[0])
            print("已切换到第一个iframe")
        except Exception as e:
            print(f"切换iframe失败: {e}")

        time.sleep(2)  #等待切换至iframe结构

        #1.找到手机号输入框
        phone_input = driver.find_element(By.ID, "phoneipt")
        phone_input.send_keys("15880128200")
        print("已输入手机号")
        time.sleep(1)

        password_input = driver.find_element(By.CLASS_NAME, "j-inputtext")
        password_input.send_keys("Xwj.826922")

        print("已输入密码")

    # 点击登录
        submit_btn = driver.find_element(By.ID, 'submitBtn')
        submit_btn.click()
        print("已提交登录")
        #再切换回原本的iframe结构
        driver.switch_to.default_content()
        return True

    except Exception as e:
        print(f"自动填写失败: {e}")
        return False

image
在成功登录后,定位搜索框,input数据,再点击搜索,找到需要的课程。

def find_selection(keyword):
    print("等待页面加载...")
    time.sleep(3)
    try:
        select_btn = driver.find_element(By.CLASS_NAME, "ant-input")
        select_btn.clear()
        select_btn.send_keys(keyword)
        print("已成功输入数据")
        select_btn.send_keys(Keys.ENTER)
        print("已按回车键搜索")
        time.sleep(10)
        return True
    except Exception as e:
        print(f"搜索失败:{e}")
        return False

image

在搜索后,到达课程页面,最后开始爬取课程,将没有的数据设置为Null。

def get_html():
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "_3NYsM")))
    data=[]
    # 给页面更多时间加载
    time.sleep(2)

    # 查找所有搜索结果项
    result_items = driver.find_elements(By.CLASS_NAME, "_3NYsM")
    print(f"找到 {len(result_items)} 个搜索结果")
    for mark in result_items:
        title = mark.find_element(By.CLASS_NAME, '_1vfZ-').text.strip()
        author_elements = mark.find_elements(By.CLASS_NAME, '_3t_C8')
        teacher=author_elements[0].text
        teammates = ' '.join([auth.text.strip() for auth in author_elements])
        school = mark.find_elements(By.CLASS_NAME, '_3vJDG')[0].text.strip() if mark.find_elements(By.CLASS_NAME, '_3vJDG') else 'Null'
        num=mark.find_element(By.CLASS_NAME, '_CWjg').text.strip()
        text = mark.find_element(By.CLASS_NAME, '_3JEMz').text.strip()
        schedule = mark.find_elements(By.CLASS_NAME, 'NOdDs')[0].text.strip() if mark.find_elements(By.CLASS_NAME, 'NOdDs') else 'Null'
        data.append([title,school,teacher, teammates, num,schedule,text])
    return data

最后往数据库里面插入数据:

点击查看代码
class DatabaseManager:
    def __init__(self):
        self.db = None
        self.cursor = None
        self.connect()

    def connect(self):
        """连接数据库"""
        try:
            self.db = pymysql.connect(
                host='localhost',
                user='root',
                password='826922',
                database='mooc_db',
                charset='utf8mb4',
                port=3306
            )
            self.cursor = self.db.cursor()
            print("MySQL数据库连接成功")
            self.create_table()
        except Exception as e:
            print(f"MySQL连接失败: {e}")
            self.db = None
            self.cursor = None

    def create_table(self):
        """创建数据表"""
        self.cursor.execute('DROP TABLE IF EXISTS mooc1')
        try:
            self.cursor.execute('''
                CREATE TABLE IF NOT EXISTS mooc1 (
                    mooc_name VARCHAR(255),
                    school VARCHAR(100),
                    teacher VARCHAR(100),
                    teamates TEXT,
                    num VARCHAR(50),
                    schedule VARCHAR(100),
                    text TEXT
                ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
            ''')
            self.db.commit()
            print("数据表 mooc1 创建成功或已存在")
        except Exception as e:
            print(f"创建表失败: {e}")

    def insert_mooc_data(self, mooc_data):
        """插入mooc数据"""
        if not self.db or not self.cursor:
            print("数据库未连接,跳过数据插入")
            return False
        try:
            sql = '''
                INSERT INTO mooc1 
                (mooc_name, school, teacher, teamates, num,schedule,text)
                VALUES (%s, %s, %s, %s, %s, %s, %s)
            '''

            self.cursor.executemany(sql, mooc_data)
            self.db.commit()
            print(f"成功插入/更新 {self.cursor.rowcount} 条股票数据")
            return True
        except Exception as e:
            self.db.rollback()
            print(f"数据插入失败: {e}")
            return False

2.2运行结果


从终端查看
image

从数据库里面查看
image

2.3心得体会


这次实践更像一次“逆向工程”——我需要模拟一个真实用户的完整操作链条,从点击登录、填写信息,到搜索、提取数据。这个过程让我意识到,爬虫不仅是在获取数据,更是在理解一个复杂系统的运作方式。

在自动登录环节,仍有很多挑战,比如iframe的嵌套结构让元素定位变得困难,页面加载的不确定性需要精准的等待策略。每一个time.sleep()都不只是简单的暂停,而是对页面响应节奏的尊重。特别是切换回主框架的那一步,稍有不慎就会导致后续所有操作失败,这让我深刻体会到程序执行“上下文”的重要性。

数据提取时,面对看似规整的页面结构,实际处理中却充满变数。课程信息可能缺失教师,团队成员字段需要合并处理,参与人数和进度的格式需要清洗。每一个字段都可能隐藏着意外,代码必须具备足够的“弹性”来消化这些不完美。

3.第三题

掌握大数据相关服务,熟悉Xshell的使用
完成文档 华为云_大数据实时分析处理实验手册-Flume日志采集实验(部分)v2.docx 中的任务,即为下面5个任务,具体操作见文档。
环境搭建:
任务一:开通MapReduce服务
实时分析开发实战:
任务一:Python脚本生成测试数据
任务二:配置Kafka
任务三: 安装Flume客户端
任务四:配置Flume采集数据

实验步骤

任务一:环境搭建

3.1申请弹性公网IP

image

3.2开通MRS服务

image

任务二:实时分析开发实战

3.3 编写python脚本测试数据

image

3.4配置Kfaka

image

3.5配置Flume

image

3.6配置Flume采集数据(说明Kafka和flume互通)

image

心得体会


从本地Python脚本生成数据,到云端Kafka传输,再到Flume采集,我清晰地看到了数据如何在不同系统间流动。这种端到端的实践让我明白,真正的挑战往往不在单一技术的使用,而在于如何让多个复杂组件无缝协作。

实践中,“配置 - 测试 - 校验” 的闭环操作(如用 Kafka 消费者验证 Flume 采集、通过日志排查 Flink 报错)能有效提前规避问题,Linux 基础与脚本能力(如vim编辑、crontab定时)也大幅提升了操作效率。同时意识到,资源规划(合理选择 MRS、RDS 规格)与安全配置(仅开放必要端口、及时备份数据)是实验顺利推进的基础。后续可从架构优化(MRS 多节点部署、DLI 弹性扩缩容)与技术延伸(Flink CDC、CSS 日志分析)入手,进一步挖掘 “云 + 实时大数据” 的应用价值。

posted @ 2025-12-09 02:22  XWJ_777  阅读(8)  评论(0)    收藏  举报