数据采集第四次作业
作业内容
作业一:
要求:
熟练掌握 Selenium 查找HTML元素、爬取Ajax网页数据、等待HTML元素等内容。
使用Selenium框架+ MySQL数据库存储技术路线爬取“沪深A股”、“上证A股”、“深证A股”3个板块的股票数据信息。
候选网站:东方财富网:http://quote.eastmoney.com/center/gridlist.html#hs_a_board
输出信息:MySQL 数据库存储和输出格式如下,表头应使用英文命名,例如:序号 id,股票代码 bStockNo……,由同学们自行定义设计表头:
Gitee文件夹链接 :https://gitee.com/hongjinju/songwenton/blob/master/作业四/4-1.py
代码与运行结果:
代码:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import pymysql
# Location of the local ChromeDriver binary.
CHROMEDRIVER_PATH = 'D:\\chromedriver\\chromedriver.exe'
# Page that is loaded first; the other boards are reached by clicking tabs.
BOARD_URL = 'https://quote.eastmoney.com/center/gridlist.html#hs_a_board'
# Seconds to pause after clicking a board tab before reading the table.
TAB_SWITCH_PAUSE = 2

# Destination columns, in INSERT order.
STOCK_COLUMNS = (
    'id', 'stock_code', 'stock_name', 'latest_price', 'change_percent',
    'change_amount', 'volume', 'turnover', 'amplitude', 'highest', 'lowest',
    'open_price', 'last_close',
)
# 1-based <td> position read for each entry of STOCK_COLUMNS.
# The 4th cell of every row is deliberately not stored.
STOCK_CELL_INDEXES = (1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)

# (destination table, XPath of the tab to click first). ``None`` means the
# board is already displayed right after BOARD_URL has been loaded.
BOARDS = (
    ('stock_data', None),
    ('stock_data2', '//*[@id="nav_sh_a_board"]/a'),
    ('stock_data3', '//*[@id="nav_sz_a_board"]/a'),
)


def build_insert_sql(table):
    """Return the parameterized INSERT statement for *table*.

    *table* is interpolated directly into the SQL text, so it must be one of
    the trusted names listed in BOARDS, never user input.
    """
    columns = ', '.join(STOCK_COLUMNS)
    placeholders = ', '.join(['%s'] * len(STOCK_COLUMNS))
    return f"INSERT INTO {table} ({columns}) VALUES ({placeholders})"


def connect_to_database():
    """Open the MySQL connection shared by all three boards."""
    config = {
        'host': 'localhost',
        'user': 'root',
        'password': '123456',
        'database': 'stock4_1',  # must name an existing database
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor
    }
    return pymysql.connect(**config)


def read_rows(driver):
    """Return one tuple of cell texts per table row currently displayed.

    Each tuple holds the text of the cells listed in STOCK_CELL_INDEXES, in
    STOCK_COLUMNS order.
    """
    return [
        tuple(
            row.find_element(By.XPATH, f".//td[{index}]").text
            for index in STOCK_CELL_INDEXES
        )
        for row in driver.find_elements(By.XPATH, "//tbody//tr")
    ]


def save_rows(connection, table, rows):
    """Insert *rows* into *table* and commit them as one transaction.

    A MySQL error is reported and the transaction is rolled back, so a
    half-inserted board is never committed together with the next one.
    """
    if not rows:
        return
    try:
        with connection.cursor() as cursor:
            cursor.executemany(build_insert_sql(table), rows)
        connection.commit()
    except pymysql.MySQLError as e:
        print(f"Error: {e}")
        try:
            connection.rollback()
        except pymysql.MySQLError as rollback_error:
            # The connection itself may be gone; keep the best-effort behaviour.
            print(f"Error: {rollback_error}")


def scrape_boards():
    """Scrape the boards listed in BOARDS and store each in its own table.

    The browser and the database connection are always released, even when
    Selenium or MySQL raises part-way through.
    """
    service = Service(executable_path=CHROMEDRIVER_PATH)
    driver = webdriver.Chrome(service=service)
    try:
        driver.get(BOARD_URL)
        driver.implicitly_wait(5)
        connection = connect_to_database()
        try:
            for table, tab_xpath in BOARDS:
                if tab_xpath is not None:
                    driver.find_element(By.XPATH, tab_xpath).click()
                    # Give the Ajax table time to reload with the new board.
                    time.sleep(TAB_SWITCH_PAUSE)
                save_rows(connection, table, read_rows(driver))
        finally:
            connection.close()
    finally:
        driver.quit()


if __name__ == "__main__":
    scrape_boards()
运行结果:



心得体会:通过实验一,我进一步理解了数据库与爬虫的结合,同时更熟练地掌握了 XPath。
作业二:
要求:
熟练掌握 Selenium 查找HTML元素、实现用户模拟登录、爬取Ajax网页数据、等待HTML元素等内容。
使用Selenium框架+MySQL爬取中国mooc网课程资源信息(课程号、课程名称、学校名称、主讲教师、团队成员、参加人数、课程进度、课程简介)
候选网站:中国mooc网:https://www.icourse163.org
输出信息:MySQL 数据库存储和输出格式
Gitee文件夹链接:https://gitee.com/hongjinju/songwenton/blob/master/作业四/4-2.py
代码与运行结果:
代码:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
import pymysql
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Location of the local ChromeDriver binary.
CHROMEDRIVER_PATH = 'D:\\chromedriver\\chromedriver.exe'
MOOC_HOME_URL = "https://www.icourse163.org"
# Upper bound, in seconds, for every explicit wait during login/navigation.
WAIT_SECONDS = 10
# Number of course cards that are opened and stored.
MAX_COURSES = 9

# Columns written to the ``mooktables`` table, in INSERT order.
COURSE_COLUMNS = ('cCourse', 'cCollege', 'cTeacher', 'cTeam', 'cCount', 'cProcess')
COURSE_INSERT_SQL = (
    "INSERT INTO mooktables (" + ", ".join(COURSE_COLUMNS) + ") "
    "VALUES (" + ", ".join(["%s"] * len(COURSE_COLUMNS)) + ")"
)


def load_credentials(environ):
    """Return ``(phone, password)`` read from the mapping *environ*.

    The account is taken from the ``MOOC_PHONE`` and ``MOOC_PASSWORD`` keys so
    that no real credentials are stored in the source file.

    Raises:
        RuntimeError: if either value is missing or empty.
    """
    phone = environ.get("MOOC_PHONE")
    password = environ.get("MOOC_PASSWORD")
    if not phone or not password:
        raise RuntimeError(
            "Set the MOOC_PHONE and MOOC_PASSWORD environment variables "
            "before running this script."
        )
    return phone, password


def connect_to_database():
    """Open the MySQL connection used to store the courses."""
    config = {
        'host': 'localhost',
        'user': 'root',
        'password': '123456',
        'database': 'Mook',  # must name an existing database
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor
    }
    return pymysql.connect(**config)


def log_in(driver, phone, password):
    """Log in through the site's login iframe and accept the follow-up dialog."""
    wait = WebDriverWait(driver, WAIT_SECONDS)

    # Open the "login / register" dialog.
    wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "div._3uWA6[role='button']"))
    ).click()

    # The login form lives inside an iframe.
    iframe = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "iframe[src*='index_dl2_new.html']"))
    )
    driver.switch_to.frame(iframe)
    wait.until(EC.presence_of_element_located((By.ID, "phoneipt"))).send_keys(phone)
    # NOTE(review): located by class name only; confirm this selector cannot
    # match the phone field as well.
    wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "j-inputtext"))
    ).send_keys(password)
    wait.until(EC.element_to_be_clickable((By.ID, "submitBtn"))).click()

    # Re-select the most recent window, which also leaves the iframe context.
    driver.switch_to.window(driver.window_handles[-1])

    # Confirm the "agree" dialog.
    wait.until(
        EC.element_to_be_clickable((By.XPATH, '//button[ @class="btn ok"]'))
    ).click()


def read_course(driver):
    """Read the fields of the course detail page shown in the current window.

    Returns a dict keyed by COURSE_COLUMNS plus ``cBrief``.
    """
    # NOTE(review): teacher and team are read from the same element, so both
    # columns receive identical text; a more specific locator is needed to
    # tell them apart.
    staff_text = driver.find_element(By.XPATH, "//div[@class='cnt f-fl']").text
    return {
        'cCourse': driver.find_element(
            By.XPATH, '//span[@class="course-title f-ib f-vam"]').text,
        'cCollege': driver.find_element(
            By.XPATH, "//img[@class='u-img']").get_attribute('alt'),
        'cTeacher': staff_text,
        'cTeam': staff_text,
        'cCount': driver.find_element(By.XPATH, "//span[@class='count']").text,
        'cProcess': driver.find_element(
            By.XPATH,
            "//div[@class='course-enroll-info_course-info_term-info_term-time']").text,
        # NOTE(review): scraped but not persisted, because the INSERT has no
        # cBrief column; confirm whether mooktables should store it.
        'cBrief': driver.find_element(
            By.XPATH, "//div[@class='f-richEditorText']").text,
    }


def scrape_courses():
    """Log in, open the national-course channel and store the first courses.

    The browser and the database connection are always released, even when
    Selenium or MySQL raises part-way through.
    """
    import os

    phone, password = load_credentials(os.environ)
    service = Service(executable_path=CHROMEDRIVER_PATH)
    driver = webdriver.Chrome(service=service)
    try:
        connection = connect_to_database()
        try:
            driver.get(MOOC_HOME_URL)
            time.sleep(2)
            log_in(driver, phone, password)

            wait = WebDriverWait(driver, WAIT_SECONDS)
            # Open the national-course channel.
            wait.until(
                EC.element_to_be_clickable(
                    (By.XPATH, '//a[@href="https://www.icourse163.org/channel/2001.htm"]'))
            ).click()
            driver.execute_script("window.scrollTo(0, 0);")
            # Wait for the course cards instead of reading the list immediately.
            courses = wait.until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, '//ul[@class="_2mEuw"][position()=1]/div'))
            )

            # From here on only plain find_element calls are used; let them
            # wait for the detail pages to render.
            driver.implicitly_wait(20)
            try:
                with connection.cursor() as cursor:
                    for course in courses[:MAX_COURSES]:
                        course.click()
                        # The detail page opens in a new window.
                        driver.switch_to.window(driver.window_handles[-1])
                        record = read_course(driver)
                        driver.close()
                        # Return to the course list.
                        driver.switch_to.window(driver.window_handles[0])
                        cursor.execute(
                            COURSE_INSERT_SQL,
                            tuple(record[column] for column in COURSE_COLUMNS),
                        )
                        connection.commit()
            except pymysql.MySQLError as e:
                print(f"Error: {e}")
        finally:
            connection.close()
    finally:
        driver.quit()


if __name__ == "__main__":
    scrape_courses()
运行结果:

心得体会:通过任务二,我进一步理解了 Selenium 的模拟点击和页面跳转功能。
作业三:
要求:
掌握大数据相关服务,熟悉Xshell的使用
完成文档 华为云_大数据实时分析处理实验手册-Flume日志采集实验(部分)v2.docx 中的任务,即为下面5个任务,具体操作见文档。
环境搭建:
任务一:开通MapReduce服务




实时分析开发实战:
任务一:Python脚本生成测试数据

任务二:配置Kafka


任务三: 安装Flume客户端

任务四:配置Flume采集数据


                    
                
                
            
        
浙公网安备 33010602011771号