Data Collection Assignment 4 — 102302111 海米沙
Assignment 1
(1) Task: Use Selenium + MySQL to crawl stock data from three boards — 沪深A股 (SH & SZ A-shares), 上证A股 (Shanghai A-shares), and 深证A股 (Shenzhen A-shares). Practise locating HTML elements with Selenium, scraping Ajax-rendered pages, and waiting for elements to load.
Gitee folder link: https://gitee.com/haimisha/2025_creat_project/blob/master/作业四/股票
Code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import pymysql
import time

# Path to chromedriver.exe
CHROMEDRIVER_PATH = r"D:\Google Chrome\chromedriver-win32\chromedriver.exe"

# Target board URLs
BOARD_URLS = [
    "http://quote.eastmoney.com/center/gridlist.html#hs_a_board",  # SH & SZ A-shares
    "http://quote.eastmoney.com/center/gridlist.html#sh_a_board",  # Shanghai A-shares
    "http://quote.eastmoney.com/center/gridlist.html#sz_a_board",  # Shenzhen A-shares
]
# -------------------------- MySQL connection --------------------------
def connect_mysql():
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='123456',
        database='stock_db',
        charset="utf8mb4",
        port=3306
    )
    print("MySQL connected")
    return conn
# -------------------------- Create the data table --------------------------
def create_stock_table(conn):
    cursor = conn.cursor()
    create_sql = """
    CREATE TABLE IF NOT EXISTS stock_data (
        id INT AUTO_INCREMENT PRIMARY KEY,
        bStockNo VARCHAR(20) NOT NULL,
        bStockName VARCHAR(50) NOT NULL,
        latestPrice DECIMAL(10,2),
        priceChangeRate DECIMAL(6,2),
        priceChangeAmount DECIMAL(10,2),
        tradingVolume BIGINT,
        tradingAmount BIGINT,
        amplitude DECIMAL(6,2),
        highestPrice DECIMAL(10,2),
        lowestPrice DECIMAL(10,2),
        openingPrice DECIMAL(10,2),
        previousClose DECIMAL(10,2)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    """
    cursor.execute(create_sql)
    conn.commit()
    cursor.close()
    print("Table ready")
# -------------------------- Data cleaning --------------------------
def clean_data(text, data_type):
    text = text.strip().replace(",", "").replace(" ", "")
    if not text or text == "-":
        return None
    try:
        if data_type == "rate":
            return float(text.replace("%", ""))
        elif data_type == "volume":
            # "万" = 10,000 (e.g. "12.3万" -> 123000)
            return int(float(text.replace("万", "")) * 10000) if "万" in text else int(text)
        elif data_type == "amount":
            if "亿" in text:  # "亿" = 100,000,000
                return int(float(text.replace("亿", "")) * 100000000)
            elif "万" in text:
                return int(float(text.replace("万", "")) * 10000)
            return int(text)
        else:
            return round(float(text), 2)
    except (ValueError, TypeError):
        return None  # tolerate malformed cells instead of aborting the crawl
# -------------------------- Core crawling logic --------------------------
def crawl_board(driver, conn, board_url):
    driver.get(board_url)
    time.sleep(3)  # extra wait so the Ajax request can finish (key tweak)
    cursor = conn.cursor()
    page = 1
    while True:
        try:
            # XPath of the quote table body
            table_xpath = "//div[@class='quotetable']//table//tbody"
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, table_xpath))
            )
            rows = driver.find_elements(By.XPATH, f'{table_xpath}/tr')
            if len(rows) == 0:
                print(f"Page {page} has no data, stopping")
                break
            print(f"Crawling page {page}, {len(rows)} rows...")
            for row in rows:
                cols = row.find_elements(By.TAG_NAME, "td")
                if len(cols) < 15:
                    continue
                # Extract the fields
                bStockNo = cols[1].text.strip()                        # stock code
                bStockName = cols[2].text.strip()                      # stock name
                latestPrice = clean_data(cols[4].text, "price")        # latest price
                priceChangeRate = clean_data(cols[5].text, "rate")     # change rate
                priceChangeAmount = clean_data(cols[6].text, "price")  # change amount
                tradingVolume = clean_data(cols[7].text, "volume")     # trading volume
                tradingAmount = clean_data(cols[8].text, "amount")     # trading amount
                amplitude = clean_data(cols[9].text, "rate")           # amplitude
                highestPrice = clean_data(cols[10].text, "price")      # daily high
                lowestPrice = clean_data(cols[11].text, "price")       # daily low
                openingPrice = clean_data(cols[12].text, "price")      # open
                previousClose = clean_data(cols[13].text, "price")     # previous close
                # Insert into the database
                if bStockNo and bStockName:
                    insert_sql = """
                        INSERT INTO stock_data (
                            bStockNo, bStockName, latestPrice, priceChangeRate,
                            priceChangeAmount, tradingVolume, tradingAmount, amplitude,
                            highestPrice, lowestPrice, openingPrice, previousClose
                        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    """
                    cursor.execute(insert_sql, (
                        bStockNo, bStockName, latestPrice, priceChangeRate,
                        priceChangeAmount, tradingVolume, tradingAmount, amplitude,
                        highestPrice, lowestPrice, openingPrice, previousClose
                    ))
            conn.commit()
            # Pagination: each board is limited to 2 pages
            if page >= 2:
                break
            try:
                # Locate the next-page button by its text ("下一页" = next page);
                # adjust the selector if the site's markup changes
                next_btn = driver.find_element(By.XPATH, "//a[contains(text(), '下一页')]")
                if "disabled" in (next_btn.get_attribute("class") or ""):
                    board_name = board_url.split("#")[-1].replace("_a_board", "")
                    print(f"{board_name} board finished!")
                    break
                next_btn.click()
                page += 1
                time.sleep(3)  # slow down to avoid being blocked
            except Exception:
                print(f"No next page, {board_url.split('#')[-1].replace('_a_board', '')} board finished")
                break
        except Exception as e:
            print(f"Failed to crawl page {page}: {str(e)}")
            break
    cursor.close()
# -------------------------- Main program --------------------------
if __name__ == "__main__":
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    # Pretend to be a normal browser
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36")
    options.add_argument("--ignore-certificate-errors")  # ignore certificate errors
    # Start the driver
    try:
        service = Service(executable_path=CHROMEDRIVER_PATH)
        driver = webdriver.Chrome(service=service, options=options)
        print("Chrome driver started!")
    except Exception as e:
        print(f"Driver failed to start: {str(e)}")
        exit()
    # Run the crawl
    conn = connect_mysql()
    create_stock_table(conn)
    for url in BOARD_URLS:
        crawl_board(driver, conn, url)
    # Release resources
    conn.close()
    driver.quit()
    print("\nDone! Data saved to the stock_data table")
Run result:

(2) Reflections: a single wrong XPath, one missing exception handler, or even a time.sleep() with the wrong number of seconds can bring the whole program down. These pitfalls taught me that programming is not just writing code — it is thinking about how to make code run "intelligently" in a messy environment.
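One concrete fix for the sleep-tuning pitfall mentioned above is to replace fixed time.sleep() calls with explicit waits, which block only as long as needed and fail loudly on timeout. A minimal sketch (the helper name and timeout value are mine, not taken from the assignment code):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def wait_for_rows(driver, xpath, timeout=20):
    # Poll until at least one matching element exists, instead of guessing
    # how long the Ajax request will take.
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_all_elements_located((By.XPATH, xpath))
        )
        return driver.find_elements(By.XPATH, xpath)
    except TimeoutException:
        return []  # the caller decides whether "no rows" means last page or error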
Assignment 2
(1) Task: Use Selenium + MySQL to crawl course information from China MOOC (icourse163): course ID, course name, school name, lead instructor, team members, enrolment count, course schedule, and course description. Practise locating HTML elements with Selenium, simulating user login, scraping Ajax-rendered pages, and waiting for elements to load.
Gitee folder link: https://gitee.com/haimisha/2025_creat_project/blob/master/作业四/慕课
Code:
# mooc_crawler_final.py
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import pymysql

# ==================== Configuration ====================
# ChromeDriver path
CHROMEDRIVER_PATH = r"D:\Google Chrome\chromedriver-win32\chromedriver.exe"
# Database settings
DB_CONFIG = {
    'host': 'localhost',
    'user': 'root',
    'password': '123456',
    'port': 3306,
    'database': 'mooc_courses'
}
# ==================== Initialisation ====================
def init_database():
    """Initialise the database connection and table"""
    try:
        db = pymysql.connect(**DB_CONFIG)
        cursor = db.cursor()
        print("Database connected")
        # Create the table
        cursor.execute('DROP TABLE IF EXISTS courseMessage')
        sql = '''CREATE TABLE courseMessage(
            cCourse VARCHAR(255),
            cCollege VARCHAR(100),
            cTeacher VARCHAR(100),
            cTeam VARCHAR(500),
            cCount VARCHAR(50),
            cProcess VARCHAR(100),
            cBrief TEXT,
            crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4'''
        cursor.execute(sql)
        print("Table created")
        return db, cursor
    except Exception as e:
        print(f"Database initialisation failed: {e}")
        exit(1)
def init_browser():
    """Initialise the browser (with anti-bot-detection options)"""
    print("Starting browser...")
    options = Options()
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--start-maximized')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36')
    # Core anti-detection settings
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    try:
        service = Service(CHROMEDRIVER_PATH)
        driver = webdriver.Chrome(service=service, options=options)
        # Hide the webdriver flag from the page's JavaScript
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        print("Browser ready")
        return driver
    except Exception as e:
        print(f"Browser initialisation failed: {e}")
        exit(1)
# ==================== Crawler core ====================
def handle_privacy_popup(driver):
    """Dismiss the privacy-policy popup (handles several popup styles)"""
    try:
        time.sleep(2)
        privacy_selectors = [
            "//button[contains(text(), '同意')]",   # '同意' = Agree
            "//button[contains(text(), '接受')]",   # '接受' = Accept
            "//div[contains(@class, 'privacy')]//button[last()]",
            "//div[contains(@class, 'popup-close')]"
        ]
        for selector in privacy_selectors:
            try:
                agree_btn = WebDriverWait(driver, 3).until(
                    EC.element_to_be_clickable((By.XPATH, selector))
                )
                agree_btn.click()
                print("Privacy popup dismissed")
                time.sleep(1)
                return True
            except Exception:
                continue
    except Exception:
        pass
    return False
def navigate_to_computer_category(driver):
    """Navigate to the Computer Science category page"""
    print("Opening icourse163...")
    driver.get("https://www.icourse163.org/")
    time.sleep(3)
    # Dismiss the privacy popup
    handle_privacy_popup(driver)
    # Find and click the Computer Science category (prefer the nav bar)
    try:
        # Wait for the nav bar to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[@class="nav"]'))
        )
        # '计算机' = Computer Science
        computer_links = driver.find_elements(By.XPATH,
            "//div[@class='nav']//a[contains(text(), '计算机') or contains(@href, 'computer')]")
        if computer_links:
            # Scroll the element into view before clicking
            driver.execute_script("arguments[0].scrollIntoView();", computer_links[0])
            time.sleep(1)
            computer_links[0].click()
            print("Clicked the Computer Science category")
            time.sleep(3)
            return True
    except Exception as e:
        print(f"Nav-bar navigation failed: {e}")
    # Fallback: open the category URL directly
    try:
        driver.get("https://www.icourse163.org/category/computer")
        print("Opened the category page directly")
        time.sleep(3)
        # Verify we landed on the right page
        WebDriverWait(driver, 10).until(
            EC.title_contains("计算机")
        )
        return True
    except Exception as e:
        print(f"Cannot open the category page: {e}")
        return False
def spider_one_page(driver, db, cursor, page_num=1):
    """Crawl one page of courses"""
    print(f"\nCrawling page {page_num}...")
    time.sleep(4)  # generous load time for slow networks
    current_window_handle = driver.current_window_handle
    saved_count = 0
    try:
        # Wait for the course-list container
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, '//div[contains(@id, "channel-course-list")]'))
        )
        # Locate the course cards (hashed class names may change between releases)
        courses = driver.find_elements(By.XPATH,
            '//div[contains(@id, "channel-course-list")]//div[contains(@class, "course-card") or @class="_1fpi1"]')
        print(f"Found {len(courses)} courses")
        if not courses:
            print("No course elements found; the page structure may have changed")
            return 0

        def text_or(card, xpath, default):
            # Return the first matching element's text, or a default if absent
            elems = card.find_elements(By.XPATH, xpath)
            return elems[0].text.strip() if elems else default

        for i, course in enumerate(courses):
            try:
                print(f"\nProcessing course {i + 1}...")
                # Basic fields, with fallbacks for missing elements
                cCourse = text_or(course, './/h3', "unknown course")
                cCollege = text_or(course, './/p[contains(@class, "_2lZi3") or contains(@class, "college")]', "unknown college")
                cTeacher = text_or(course, './/div[contains(@class, "_1Zkj9") or contains(@class, "teacher")]', "unknown teacher")
                cCount = text_or(course, './/div[contains(@class, "jvxcQ") or contains(@class, "count")]/span', "0")
                cProcess = text_or(course, './/div[contains(@class, "jvxcQ") or contains(@class, "process")]/div', "unknown schedule")
                print(f"Course: {cCourse}")
                print(f"College: {cCollege}")
                # Open the course detail page
                driver.execute_script("arguments[0].click();", course)
                time.sleep(2)
                # Wait for the new window, then switch to it
                WebDriverWait(driver, 5).until(
                    EC.number_of_windows_to_be(2)
                )
                handles = driver.window_handles
                driver.switch_to.window(handles[-1])
                time.sleep(3)
                # Course description (several possible page layouts)
                cBrief = ""
                try:
                    # Fixed-ID element first
                    cBrief = driver.find_element(By.XPATH, '//*[@id="j-rectxt2"]').text.strip()
                    if not cBrief:
                        # Fallback: union of candidate description containers
                        brief_elements = driver.find_elements(By.XPATH,
                            '//div[contains(@class, "course-brief")]//* | //div[contains(@id, "content-section")]/div[4]//*')
                        cBrief = " ".join([elem.text.strip() for elem in brief_elements])
                except Exception as e:
                    print(f"Failed to extract the description: {e}")
                    cBrief = "no description"
                # Parameterised queries handle quoting, so only whitespace needs trimming
                cBrief = cBrief.strip()
                # Teaching team (deduplicated)
                nameList = []
                try:
                    # Wait for the team slider
                    WebDriverWait(driver, 5).until(
                        EC.presence_of_element_located((By.XPATH, '//div[contains(@class, "um-list-slider")]'))
                    )
                    cTeachers = driver.find_elements(By.XPATH, '//div[@class="um-list-slider_con_item"]')
                    for Teacher in cTeachers:
                        name = Teacher.find_element(By.XPATH, './/h3[contains(@class, "f-fc3") or contains(@class, "name")]').text.strip()
                        if name and name not in nameList:
                            nameList.append(name)
                    # Page through the slider, with a hard cap to avoid an endless loop
                    nextButton = driver.find_elements(By.XPATH, '//div[@class="um-list-slider_next f-pa" and not(@style="display: none;")]')
                    click_count = 0
                    while nextButton and click_count < 5:
                        nextButton[0].click()
                        time.sleep(2)
                        cTeachers = driver.find_elements(By.XPATH, '//div[@class="um-list-slider_con_item"]')
                        for Teacher in cTeachers:
                            name = Teacher.find_element(By.XPATH, './/h3[contains(@class, "f-fc3") or contains(@class, "name")]').text.strip()
                            if name and name not in nameList:
                                nameList.append(name)
                        nextButton = driver.find_elements(By.XPATH, '//div[@class="um-list-slider_next f-pa" and not(@style="display: none;")]')
                        click_count += 1
                except Exception as e:
                    print(f"Failed to extract the team: {e}")
                    nameList = [cTeacher] if cTeacher else ["unknown team"]
                cTeam = '、'.join(nameList)
                # Keep within the VARCHAR(500) column
                if len(cTeam) > 490:
                    cTeam = cTeam[:490] + "..."
                # Close the detail page and return to the list
                driver.close()
                driver.switch_to.window(current_window_handle)
                time.sleep(1)
                # Insert (parameterised to avoid SQL injection)
                try:
                    sql = '''INSERT INTO courseMessage (cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)
                             VALUES (%s, %s, %s, %s, %s, %s, %s)'''
                    cursor.execute(sql, (cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief))
                    db.commit()
                    print(f"Saved: {cCourse}")
                    saved_count += 1
                except Exception as e:
                    print(f"Save failed: {e}")
                    db.rollback()
                # Pause to stay under the anti-bot radar
                time.sleep(1)
            except Exception as e:
                print(f"Error while processing a course: {e}")
                # Make sure we are back on the main window before continuing
                try:
                    if driver.current_window_handle != current_window_handle:
                        driver.close()
                        driver.switch_to.window(current_window_handle)
                except Exception:
                    pass
                continue
        print(f"Page {page_num} done, {saved_count} courses saved")
        return saved_count
    except Exception as e:
        print(f"Error while crawling the page: {e}")
        return 0
def go_to_next_page(driver):
    """Go to the next page (text-based locator is more stable than an index)"""
    try:
        # Wait for the pagination area
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[contains(@class, "pagination")]'))
        )
        # Locate by the "下一页" (next page) label instead of a positional a[10]
        next_page = driver.find_element(By.XPATH,
            '//div[contains(@class, "pagination")]//a[contains(text(), "下一页")]')
        # Clickable check; '_3YiUU ' is a site-specific hashed class and may change
        if next_page.get_attribute('class') == '_3YiUU ' and not next_page.get_attribute('disabled'):
            driver.execute_script("arguments[0].scrollIntoView();", next_page)
            time.sleep(1)
            next_page.click()
            time.sleep(4)  # wait for the new page to load
            return True
        else:
            print("Already on the last page")
            return False
    except Exception as e:
        print(f"Pagination failed: {e}")
        return False
# ==================== Main program ====================
def main():
    print("=" * 60)
    print("icourse163 MOOC course crawler")
    print("=" * 60)
    driver = None
    db = None
    cursor = None
    try:
        # 1. Initialise the database
        db, cursor = init_database()
        # 2. Initialise the browser
        driver = init_browser()
        # 3. Navigate to the Computer Science category
        if not navigate_to_computer_category(driver):
            print("Cannot open the course page, exiting")
            return
        # 4. Ask how many pages to crawl
        try:
            page_count_input = input("\nHow many pages to crawl (1-2 recommended)? ").strip()
            page_count = int(page_count_input) if page_count_input else 1
            # Cap the page count to stay polite
            page_count = min(page_count, 5)
        except ValueError:
            page_count = 1
            print("Invalid input, defaulting to 1 page")
        total_saved = 0
        # 5. Crawl each page
        for page in range(1, page_count + 1):
            saved = spider_one_page(driver, db, cursor, page)
            total_saved += saved
            # Turn the page unless this is the last one
            if page < page_count:
                if not go_to_next_page(driver):
                    print(f"Cannot page past page {page}, stopping early")
                    break
        # 6. Report the results
        print("\n" + "=" * 60)
        print(f"Done! {total_saved} courses saved in this run")
        # Total rows now in the table
        cursor.execute("SELECT COUNT(*) FROM courseMessage")
        total_count = cursor.fetchone()[0]
        print(f"The table now holds {total_count} course records")
        # Show the five most recent courses
        print("\nMost recently crawled courses:")
        cursor.execute("SELECT cCourse, cCollege, cTeacher FROM courseMessage ORDER BY crawl_time DESC LIMIT 5")
        results = cursor.fetchall()
        for i, row in enumerate(results, 1):
            # Truncate long course names for tidy output
            course_name = row[0][:30] + "..." if len(row[0]) > 30 else row[0]
            print(f"{i}. {course_name} - {row[1]} - {row[2]}")
        # 7. Optional CSV export
        export_csv = input("\nExport to CSV? (y/n): ").strip().lower()
        if export_csv == 'y':
            import csv
            cursor.execute("SELECT * FROM courseMessage")
            all_results = cursor.fetchall()
            field_names = ['course', 'college', 'teacher', 'team', 'enrolment', 'schedule', 'description', 'crawl time']
            with open('mooc_courses_result.csv', 'w', newline='', encoding='utf-8-sig') as f:
                writer = csv.writer(f)
                writer.writerow(field_names)
                writer.writerows(all_results)
            print(f"Exported {len(all_results)} rows to mooc_courses_result.csv")
        print("\nAll done!")
    except KeyboardInterrupt:
        print("\n\nInterrupted by the user")
    except Exception as e:
        print(f"\nUnexpected error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Release all resources
        print("\nClosing connections...")
        if cursor:
            cursor.close()
        if db:
            db.close()
        if driver:
            driver.quit()
        print("All connections closed")

# ==================== Entry point ====================
if __name__ == "__main__":
    main()
Run result:

(2) Reflections: in this assignment I learned how to cope with a real site's anti-scraping measures and with dynamically changing page structures. When XPath locators failed, working through the documentation, debugging step by step, and trying several fallback strategies until one succeeded was where I learned the most.
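One pattern that helped with the locator failures described above is trying several candidate XPaths in order and taking the first hit. A small illustrative sketch (the helper name and example selectors are mine, not from the assignment code):

from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def find_first(driver, xpaths, default=None):
    # The MOOC site's hashed CSS class names change between front-end
    # releases, so keeping several fallback selectors lets the crawler
    # survive small page-structure updates.
    for xp in xpaths:
        try:
            return driver.find_element(By.XPATH, xp)
        except NoSuchElementException:
            continue
    return default

# Usage: title_el = find_first(driver, ['//h3[@class="_3EwZv"]', '//h3'])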
Assignment 3
(1) Task: Complete the tasks in the handbook 华为云_大数据实时分析处理实验手册-Flume 日志采集实验(部分)v2.docx, i.e. the five tasks below; see the handbook for the detailed steps (screenshots omitted here).
Workflow:
Task 1: Generate test data with a Python script
Log in to the master node of the MRS cluster
Write the Python script (a minimal sketch follows this task's steps)
Create a directory for the test data
Run the script to verify it works
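The handbook's script itself appears only in the omitted screenshots; the sketch below shows what a test-data generator of this kind could look like (the output path, record format, and record count are assumptions, not the handbook's exact script):

import random
import time

# Hypothetical path; point this at the directory created in the step above.
OUTPUT_FILE = "/tmp/test_data/source.log"

def gen_record():
    # One fake event per line: timestamp, user id, action.
    ts = time.strftime("%Y-%m-%d %H:%M:%S")
    uid = random.randint(1000, 9999)
    action = random.choice(["click", "view", "buy"])
    return f"{ts},{uid},{action}"

if __name__ == "__main__":
    with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
        for _ in range(100):
            f.write(gen_record() + "\n")
    print("100 test records written to", OUTPUT_FILE)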

Task 2: Configure Kafka
Open MRS Manager cluster management
Download the Kafka client
Verify the downloaded client package
Install the Kafka client
Create a topic in Kafka (a hedged kafka-python equivalent follows)
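The handbook creates the topic with Kafka's command-line tools; as an illustration in the same language as the rest of this report, here is a hedged kafka-python equivalent (the broker address, topic name, partition count, and replication factor are all placeholders):

from kafka.admin import KafkaAdminClient, NewTopic

# Placeholder broker; use the MRS Kafka broker list in practice.
admin = KafkaAdminClient(bootstrap_servers="broker-1:9092")
admin.create_topics([
    NewTopic(name="flume_kafka_topic", num_partitions=3, replication_factor=2)
])
admin.close()
print("Topic created")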

Task 3: Install the Flume client
Open MRS Manager cluster management
Download the Flume client
Verify the downloaded client package
Install the Flume runtime environment
Install the Flume client
Restart the Flume service

Task 4: Configure Flume data collection
Edit the Flume configuration file (a hedged sample agent configuration follows)
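The actual configuration lives in the omitted screenshots; the properties-format sketch below is a minimal Flume agent definition of the kind this task sets up (the agent name, log path, broker list, and topic name are placeholders, not the handbook's values):

a1.sources = r1
a1.channels = c1
a1.sinks = k1
# Source: tail the test-data file produced in Task 1 (path is a placeholder)
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /tmp/test_data/source.log
a1.sources.r1.channels = c1
# Channel: memory is fast but loses events on restart; file is durable but slower
a1.channels.c1.type = memory
a1.channels.c1.capacity = 10000
a1.channels.c1.transactionCapacity = 1000
# Sink: publish to the Kafka topic created in Task 2
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.bootstrap.servers = broker-1:9092
a1.sinks.k1.kafka.topic = flume_kafka_topic
a1.sinks.k1.channel = c1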

Create a consumer to read the data from the Kafka topic (Python sketch below)
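The handbook verifies the pipeline with Kafka's console consumer; a hedged Python equivalent using kafka-python (broker and topic are placeholders matching the sketches above):

from kafka import KafkaConsumer

consumer = KafkaConsumer(
    "flume_kafka_topic",                 # placeholder topic from Task 2
    bootstrap_servers="broker-1:9092",   # placeholder broker address
    auto_offset_reset="earliest",        # read from the beginning of the topic
    consumer_timeout_ms=10000            # stop after 10 s of silence
)
for msg in consumer:
    print(msg.value.decode("utf-8"))
consumer.close()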

Prepare the result table and dimension tables in MySQL
Log in to the MySQL DAS management console.
Create the database
Open the SQL console
Create the dimension tables and insert the data
Create the result table for the Flink job

(2) Reflections: this lab showed me Flume's reliability as a collection agent, and Kafka's role as a high-throughput message queue that absorbs traffic peaks. During configuration, the Flume source/sink must match the Kafka topic, and the serialization format has to be consistent end to end. Choosing the Flume channel type is a trade-off between performance and data reliability (a memory channel is fast but loses events on failure; a file channel is durable but slower).
