Blog 4
Assignment ①:
o Requirements:
▪ Become proficient with Selenium: locating HTML elements, scraping Ajax-loaded page data, and waiting for HTML elements.
▪ Use the Selenium framework plus MySQL storage to scrape stock data for three boards: 沪深A股 (SSE & SZSE A-shares), 上证A股 (Shanghai A-shares), and 深证A股 (Shenzhen A-shares).
o Candidate site: Eastmoney (东方财富网):
http://quote.eastmoney.com/center/gridlist.html#hs_a_board

By inspecting the page's HTML structure, I located where the stock data is stored: each stock is one <tr> row inside the table body, so the rows can be fetched with the XPath //tbody/tr.
Full code:
import time
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ===========================
# MySQL configuration
# ===========================
DB_HOST = "localhost"
DB_USER = "scrapy"
DB_PASS = "777456qqQ"
DB_NAME = "stockdb"

db = pymysql.connect(
    host=DB_HOST,
    user=DB_USER,
    password=DB_PASS,
    database=DB_NAME,
    charset="utf8mb4"
)
cursor = db.cursor()
# ===========================
# Helper: safely convert a cell value to float
# ===========================
def safe_float(v):
    try:
        if not v:
            return 0.0
        # Strip thousands separators, the 万/亿 unit suffixes and "--" placeholders
        v = v.replace(",", "").replace("万", "").replace("亿", "").replace("--", "").strip()
        if v == "":
            return 0.0
        return float(v)
    except Exception:
        return 0.0
# ===========================
# Save one row to MySQL
# ===========================
def save(row):
    sql = """
        INSERT INTO eastmoney_stock(
            board, xuhao, daima, mingcheng, zuixinbaojia,
            zhangdiefu, zhangdiee, chengjiaoliang,
            chengjiaoe, zhenfu, zuigao, zuidi,
            jinkai, zuoshou
        ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        ON DUPLICATE KEY UPDATE
            zuixinbaojia=VALUES(zuixinbaojia),
            zhangdiefu=VALUES(zhangdiefu),
            zhangdiee=VALUES(zhangdiee),
            chengjiaoliang=VALUES(chengjiaoliang),
            chengjiaoe=VALUES(chengjiaoe),
            zhenfu=VALUES(zhenfu),
            zuigao=VALUES(zuigao),
            zuidi=VALUES(zuidi),
            jinkai=VALUES(jinkai),
            zuoshou=VALUES(zuoshou)
    """
    cursor.execute(sql, row)
    db.commit()
# ===========================
# Crawl a single board
# ===========================
def crawl_board(board_name, url, max_pages=2):
    print(f"\n>>> 正在抓取板块:{board_name}")
    driver.get(url)
    wait = WebDriverWait(driver, 20)
    page = 1
    while page <= max_pages:
        print(f" - 抓取第 {page} 页 ...")
        # Wait for the Ajax-rendered table rows before reading them
        wait.until(EC.presence_of_all_elements_located((By.XPATH, "//tbody/tr")))
        rows = driver.find_elements(By.XPATH, "//tbody/tr")
        for r in rows:
            tds = r.find_elements(By.TAG_NAME, "td")
            if len(tds) < 13:
                continue
            xuhao = tds[0].text.strip()
            daima = tds[1].text.strip()
            mingcheng = tds[2].text.strip()
            zuixinbaojia = tds[3].text.strip()
            zhangdiefu = tds[4].text.strip()
            zhangdiee = tds[5].text.strip()
            chengjiaoliang = tds[6].text.strip()
            chengjiaoe = tds[7].text.strip()
            zhenfu = tds[8].text.strip()
            zuigao = tds[9].text.strip()
            zuidi = tds[10].text.strip()
            jinkai = tds[11].text.strip()
            zuoshou = tds[12].text.strip()
            # Skip rows whose code is not a pure-digit stock code
            if not daima.isdigit():
                continue
            data = (
                board_name, xuhao, daima, mingcheng, zuixinbaojia,
                zhangdiefu, zhangdiee, chengjiaoliang,
                chengjiaoe, zhenfu, zuigao, zuidi,
                jinkai, zuoshou
            )
            save(data)
            print(f" ✔ 保存 {daima} {mingcheng}")
        # Move to the next page
        if page < max_pages:
            try:
                next_btn = wait.until(
                    EC.element_to_be_clickable((By.XPATH, '//a[@title="下一页"]'))
                )
                next_btn.click()
                print(" - 翻页成功")
                time.sleep(2)
            except Exception:
                print(" - 找不到下一页按钮,提前结束。")
                break
        page += 1
# ===========================
# Main program
# ===========================
if __name__ == "__main__":
    opt = Options()
    # opt.add_argument("--headless")  # uncomment to run headless
    driver = webdriver.Chrome(options=opt)
    boards = {
        "沪深A股": "https://quote.eastmoney.com/center/gridlist.html#hs_a_board",
        "上证A股": "https://quote.eastmoney.com/center/gridlist.html#sh_a_board",
        "深证A股": "https://quote.eastmoney.com/center/gridlist.html#sz_a_board",
    }
    for name, url in boards.items():
        crawl_board(name, url, max_pages=2)
    driver.quit()
    cursor.close()
    db.close()
    print("\n全部板块前 2 页爬取完成!")
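The save() function relies on ON DUPLICATE KEY UPDATE, which only works if eastmoney_stock has a UNIQUE key to match on. The table definition is not shown in this post, so the following is only a minimal sketch of a compatible schema; the column types and the unique key on (board, daima) are my assumptions:
# Hypothetical one-off setup, run once before the crawler.
# Column types and the UNIQUE KEY are assumptions, chosen so that
# ON DUPLICATE KEY UPDATE in save() has a key to match on.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS eastmoney_stock(
        id INT AUTO_INCREMENT PRIMARY KEY,
        board VARCHAR(20),
        xuhao VARCHAR(10),
        daima VARCHAR(10),
        mingcheng VARCHAR(50),
        zuixinbaojia VARCHAR(20),
        zhangdiefu VARCHAR(20),
        zhangdiee VARCHAR(20),
        chengjiaoliang VARCHAR(20),
        chengjiaoe VARCHAR(20),
        zhenfu VARCHAR(20),
        zuigao VARCHAR(20),
        zuidi VARCHAR(20),
        jinkai VARCHAR(20),
        zuoshou VARCHAR(20),
        UNIQUE KEY uk_board_daima (board, daima)
    );
""")
db.commit()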
Run result:

Assignment ②:
o Requirements:
▪ Become proficient with Selenium: locating HTML elements, simulating user login, scraping Ajax-loaded page data, and waiting for HTML elements.
▪ Use the Selenium framework plus MySQL to scrape course information from China University MOOC (course ID, course name, school name, lead teacher, team members, number of participants, course progress, course description).
o Candidate site: China University MOOC (中国大学MOOC): https://www.icourse163.org
Login is simulated by QR-code scanning: cookie.py handles the first scan-to-login and saves the resulting cookies for reuse.
Full code:
import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

COOKIE_FILE = "cookies.json"
CHROME_DRIVER_PATH = r"D:\python\pythonProject1\.venv\Scripts\chromedriver.exe"

def save_cookies(driver):
    # Dump all cookies from the logged-in session to a JSON file
    cookies = driver.get_cookies()
    with open(COOKIE_FILE, "w", encoding="utf-8") as f:
        json.dump(cookies, f, ensure_ascii=False, indent=2)
    print("\n>>> Cookie 已保存到 cookies.json!\n")
def generate_cookie():
    print(">>> 正在启动 Chrome,请稍等…")
    # ===== A Service object is required to load the local chromedriver =====
    service = Service(executable_path=CHROME_DRIVER_PATH)
    options = webdriver.ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    options.add_argument('--log-level=3')
    driver = webdriver.Chrome(service=service, options=options)
    wait = WebDriverWait(driver, 20)
    driver.get("https://www.icourse163.org/")
    time.sleep(2)
    print(">>> 浏览器已打开,请稍等...")
    # Click the login button at the top right (current page structure)
    try:
        login_button = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//div[contains(@class,'loginBtn')]")
        ))
        login_button.click()
        print(">>> 已点击登录按钮")
    except Exception:
        print(">>> 登录按钮未找到,尝试备用 XPATH…")
        try:
            fallback_login_btn = wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//div[contains(text(),'登录')]")
            ))
            fallback_login_btn.click()
        except Exception:
            print(">>> 无法找到登录按钮!程序退出")
            driver.quit()
            return
    time.sleep(2)
    # Switch to QR-code login
    try:
        qrcode_btn = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//img[contains(@src,'qrcode')]")
        ))
        qrcode_btn.click()
        print(">>> 已切换到扫码登录,请扫码!")
    except Exception:
        print(">>> 找不到二维码按钮,将直接等待扫码界面出现…")
    input("\n>>> 请使用手机扫码登录,完成后按 Enter 继续… ")
    # Save the cookies for later reuse
    save_cookies(driver)
    driver.quit()
    print(">>> 已关闭浏览器")

if __name__ == "__main__":
    generate_cookie()
The crawler extracts each field with XPath expressions; the course links to crawl are hard-coded in the script (run cookie.py first so that cookies.json exists):

Full code:
import json
import os
import time
import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ================== Configuration ==================
CHROME_DRIVER_PATH = r"D:\python\pythonProject1\.venv\Scripts\chromedriver.exe"
COOKIE_FILE = "cookies.json"
DB_HOST = "localhost"
DB_USER = "scrapy"
DB_PASS = "777456qqQ"
DB_NAME = "stockdb"
# Course links to crawl
COURSE_URLS = [
    "https://www.icourse163.org/course/ZJU-199001",
    "https://www.icourse163.org/course/XJTU-1002529011",
    "https://www.icourse163.org/course/PKU-1206624828"
]
# ============================================
def load_cookies(driver):
    if not os.path.exists(COOKIE_FILE):
        print("cookies.json 不存在,请先生成 cookie!")
        exit()
    driver.get("https://www.icourse163.org/")
    time.sleep(2)
    with open(COOKIE_FILE, "r", encoding="utf-8") as f:
        cookies = json.load(f)
    for cookie in cookies:
        # Drop fields Selenium cannot re-import
        cookie.pop("sameSite", None)
        cookie.pop("expiry", None)
        try:
            driver.add_cookie(cookie)
        except Exception as e:
            print("添加 cookie 失败:", e)
    driver.get("https://www.icourse163.org/")
    time.sleep(2)
    print("使用 cookie 登录成功!")

def safe_text(driver, wait, xpath):
    # Return the element's text, or "" if it never appears
    try:
        return wait.until(EC.presence_of_element_located((By.XPATH, xpath))).text.strip()
    except Exception:
        return ""

def safe_attr(driver, wait, xpath, attr):
    # Return the element's attribute value, or "" if it never appears
    try:
        return wait.until(EC.presence_of_element_located((By.XPATH, xpath))).get_attribute(attr)
    except Exception:
        return ""
def crawl_course(driver, wait, url, cursor, db):
    print(f"\n正在爬取:{url}")
    driver.get(url)
    time.sleep(2)
    cCourse = safe_text(driver, wait,
        "/html/body/div[5]/div[2]/div[1]/div/div/div/div[2]/div[2]/div/div[2]/div[1]/span[1]")
    cCollege = safe_attr(driver, wait,
        "/html/body/div[5]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div/a/img",
        "alt")
    # Collect all teacher names (they sit in a slider that may need paging)
    all_teachers = []

    def get_teachers_from_page():
        page_teachers = []
        try:
            teacher_container = driver.find_element(By.XPATH, "//div[@class='m-teachers_teacher-list']")
            slider_con = teacher_container.find_element(By.XPATH, ".//div[@class='um-list-slider_con']")
            con_items = slider_con.find_elements(By.XPATH, ".//div[@class='um-list-slider_con_item']")
            for item in con_items:
                try:
                    img = item.find_element(By.XPATH, ".//img")
                    name = img.get_attribute("alt")
                    if name:
                        page_teachers.append(name)
                except Exception:
                    continue
        except Exception as e:
            print("获取老师失败:", e)
        return page_teachers

    # First slider page
    all_teachers.extend(get_teachers_from_page())
    # Page through the slider if there are more teachers
    try:
        buttons = driver.find_elements(By.XPATH,
            "/html/body/div[5]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div/div/div[2]/div/div[1]/span")
        if buttons:
            buttons[0].click()
            time.sleep(2)
            while True:
                page_teachers = get_teachers_from_page()
                all_teachers.extend(page_teachers)
                buttons = driver.find_elements(By.XPATH,
                    "/html/body/div[5]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div/div/div[2]/div/div[1]/span")
                if len(buttons) == 2:
                    buttons[1].click()
                    time.sleep(2)
                else:
                    break
    except Exception as e:
        print("老师翻页失败:", e)
    cTeacher = all_teachers[0] if all_teachers else ""
    cTeam = ",".join(all_teachers)
    # Number of participants
    try:
        elem = wait.until(EC.presence_of_element_located((By.XPATH,
            "/html/body/div[5]/div[2]/div[1]/div/div/div/div[2]/div[2]/div/div[3]/div/div[1]/div[4]/span[2]"
        )))
        cCount = elem.text.strip()
    except Exception:
        cCount = ""
    # Course progress
    cProcess = safe_text(driver, wait,
        "/html/body/div[5]/div[2]/div[1]/div/div/div/div[2]/div[2]/div/div[3]/div/div[1]/div[2]/div/span[2]"
    )
    # Course description
    cBrief = safe_text(driver, wait,
        "/html/body/div[5]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/div[2]/div[1]"
    )
    cursor.execute("""
        INSERT INTO course_info(url, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s)
    """, (url, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief))
    db.commit()
    print(f"爬取完成:{cCourse}")
def main():
    db = pymysql.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASS,
        database=DB_NAME,
        charset="utf8mb4"
    )
    cursor = db.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS course_info(
            id INT AUTO_INCREMENT PRIMARY KEY,
            url VARCHAR(255),
            cCourse VARCHAR(255),
            cCollege VARCHAR(255),
            cTeacher VARCHAR(255),
            cTeam TEXT,
            cCount VARCHAR(50),
            cProcess VARCHAR(255),
            cBrief TEXT
        );
    """)
    db.commit()
    service = Service(CHROME_DRIVER_PATH)
    driver = webdriver.Chrome(service=service)
    wait = WebDriverWait(driver, 20)
    # Log in with the saved cookies
    load_cookies(driver)
    # Crawl the configured course links
    for url in COURSE_URLS:
        crawl_course(driver, wait, url, cursor, db)
    driver.quit()
    db.close()
    print("\n全部课程爬取完毕!")

if __name__ == "__main__":
    main()
Run result:

Assignment ③:
Requirements:
Get familiar with the relevant big-data services and with using Xshell.
Complete the tasks in the document 华为云_大数据实时分析处理实验手册-Flume日志采集实验(部分)v2.docx, i.e. the five tasks below; see the document for the detailed steps.
Environment setup:
Task 1: Provision the MapReduce Service (MRS)
Real-time analytics development:
Task 1: Generate test data with a Python script
Task 2: Configure Kafka
Task 3: Install the Flume client
Task 4: Configure Flume to collect data
1. Generating test data with a Python script

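The generator script itself comes from the lab manual (see the screenshot above) and is not reproduced here. As a rough sketch of what such a generator might look like, the following appends one fake record per second to a log file that a Flume source can pick up; the file path, record format, and interval are all assumptions, not taken from the manual:
# Hypothetical test-data generator sketch (not the script from the manual).
# The log path and record layout are assumptions; adjust to your Flume source config.
import random
import time
from datetime import datetime

LOG_FILE = "/tmp/flume_test/access.log"   # assumed directory monitored by Flume

def make_record():
    user_id = random.randint(1000, 9999)
    page = random.choice(["/index", "/detail", "/cart", "/pay"])
    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    return f"{ts},{user_id},{page}\n"

if __name__ == "__main__":
    while True:
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(make_record())
        time.sleep(1)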
2. Configuring Kafka

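In the experiment the Kafka topic is created on the MRS cluster with the standard Kafka tooling (see the screenshot above). Purely as an illustrative alternative, assuming an unauthenticated test cluster and the kafka-python package, a topic could also be created from Python; the broker address, topic name, and partition/replication settings below are assumptions:
# Hypothetical sketch using kafka-python (pip install kafka-python); not the
# procedure from the lab manual. Broker address and topic settings are assumptions.
from kafka.admin import KafkaAdminClient, NewTopic

admin = KafkaAdminClient(bootstrap_servers="192.168.0.100:9092")  # assumed broker address
admin.create_topics([
    NewTopic(name="flume_kafka_topic", num_partitions=1, replication_factor=1)  # assumed topic
])
admin.close()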
3. Installing the Flume client

4. Configuring Flume to collect data

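Once Flume is forwarding data, one way to confirm that the records generated in step 1 actually reach Kafka is to consume the topic. The manual does this with the console consumer; the sketch below is a hedged Python alternative, again assuming an unauthenticated cluster, the kafka-python package, and the same assumed topic and broker as above:
# Hypothetical verification sketch using kafka-python; the topic name and broker
# address are assumptions and should match the Flume sink configuration.
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    "flume_kafka_topic",                        # assumed topic written to by Flume
    bootstrap_servers=["192.168.0.100:9092"],   # assumed Kafka broker address
    auto_offset_reset="earliest",
)
for msg in consumer:
    print(msg.value.decode("utf-8"))            # each record produced by the generator script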