2023数据采集与融合技术作业四

作业1

要求

熟练掌握 Selenium 查找 HTML 元素、爬取 Ajax 网页数据、等待 HTML 元素等内容
使用 Selenium 框架+ MySQL 数据库存储技术路线爬取“沪深 A 股”、“上证 A 股”、“深证 A 股”3 个板块的股票数据信息。
候选网站：东方财富网：http://quote.eastmoney.com/center/gridlist.html#hs_a_board

输出信息

Code

import selenium
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
import time

# Script: open the Eastmoney quote center and print the stock table body for
# three boards in turn -- the board selected by the URL hash, then the boards
# reached through the "nav_sh_a_board" and "nav_sz_a_board" navigation links.
TABLE_BODY_XPATH = './/*[@id="table_wrapper-table"]/tbody'
BOARD_NAV_XPATHS = (
    '//*[@id="nav_sh_a_board"]/a',
    '//*[@id="nav_sz_a_board"]/a',
)

Option = webdriver.ChromeOptions()
url = "http://quote.eastmoney.com/center/gridlist.html#hs_a_board"
chrome_options = Option
# chrome_options.add_argument('--headless')  # uncomment to hide the browser window
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get(url)
    # Pause BEFORE the first read, exactly as is done after every board
    # switch below; the table is loaded via Ajax, so reading it straight
    # after get() can hit a table that has not been filled in yet.
    time.sleep(5)
    print(driver.find_element(By.XPATH, TABLE_BODY_XPATH).text)

    for nav_xpath in BOARD_NAV_XPATHS:
        # Click the board's navigation link, give the table time to reload,
        # then print the refreshed table body.
        driver.find_element(By.XPATH, nav_xpath).click()
        time.sleep(5)
        print(driver.find_element(By.XPATH, TABLE_BODY_XPATH).text)
finally:
    # Always close the browser so no Chrome process is left running,
    # even when an element lookup fails.
    driver.quit()

心得体会

学会了如何用 click() 切换板块：只需要在页面元素中找到能定位该板块的 XPath 即可。本题难度不大，对 Selenium 的熟悉程度有所加深。

作业2

要求

熟练掌握 Selenium 查找 HTML 元素、实现用户模拟登录、爬取 Ajax 网页数据、等待 HTML 元素等内容。
使用 Selenium 框架+MySQL 爬取中国 mooc 网课程资源信息（课程号、课程名称、学校名称、主讲教师、团队成员、参加人数、课程进度、课程简介）
候选网站：中国 mooc 网：https://www.icourse163.org

输出信息

MYSQL 数据库存储和输出格式

Code

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from prettytable import PrettyTable
from selenium.webdriver.common.keys import Keys
import time
import random

# Search-results page to scrape; the query string is the URL-encoded
# keyword "操作系统" (operating systems).
url = "https://www.icourse163.org/search.htm?search=%E6%93%8D%E4%BD%9C%E7%B3%BB%E7%BB%9F#/"
# List declared for storing dict records.
data_list = []
# Output table with one column per scraped course field.
table = PrettyTable(
    ['课程号', '课程名称', '学校名称', '主讲教师', '团队成员', '参加人数', '课程进度', '课程简介']
)

# Chrome driver configured not to load images, which indirectly speeds up
# page access.
options = webdriver.ChromeOptions()
options.add_experimental_option(
    'prefs',
    {'profile.managed_default_content_settings.images': 2},
)
driver = webdriver.Chrome(options=options)

def _text_or_default(parent, xpath, default='none'):
    """Return the text of the first element under *parent* matching *xpath*.

    Returns *default* when no element matches, so optional course fields
    do not abort the scrape.
    """
    # Imported locally so this block is self-contained with respect to the
    # file's import section.
    from selenium.common.exceptions import NoSuchElementException

    try:
        return parent.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return default


def start_spider():
    """Log in on the search page and collect every listed course into ``table``.

    Loads ``url`` with the module-level ``driver``, opens the login dialog,
    fills in the (blank placeholder) credentials inside the login iframe and
    submits them. It then adds one row per course card to the module-level
    ``table``: running number, course name, school, main teacher, team
    members, participant count, progress and introduction. Optional fields
    that are missing are stored as ``'none'``.

    Raises:
        selenium.common.exceptions.TimeoutException: if the login button or
            the login iframe does not appear within 10 seconds.
        selenium.common.exceptions.NoSuchElementException: if a login input
            or a mandatory course field (name, main teacher, participant
            count) is missing.
    """
    # Request the search page.
    driver.get(url)

    # Click the login button once it is present.
    WebDriverWait(driver, 10, 0.48).until(
        EC.presence_of_element_located((By.XPATH, '//a[@class="f-f0 navLoginBtn"]'))
    ).click()
    # The login form lives in an iframe that is located by its frameborder
    # attribute.
    iframe = WebDriverWait(driver, 10, 0.48).until(
        EC.presence_of_element_located((By.XPATH, '//*[@frameborder="0"]'))
    )
    driver.switch_to.frame(iframe)

    # Enter account and password, then submit. The credentials are left
    # blank here; fill them in before running.
    driver.find_element(By.XPATH, '//*[@id="phoneipt"]').send_keys("")
    time.sleep(2)
    driver.find_element(By.XPATH, '//*[@class="j-inputtext dlemail"]').send_keys("")
    time.sleep(2)
    driver.find_element(By.ID, 'submitBtn').click()

    # Leave the login iframe again: while the driver is switched into a
    # frame, element lookups only search inside that frame, so the course
    # cards of the main document would never be found.
    driver.switch_to.default_content()
    # NOTE(review): if the site reloads the page after login, a wait may be
    # needed here before reading the course cards -- confirm against the site.

    # Extract the information from every course card on the page.
    cards = driver.find_elements(
        By.XPATH, '//div[@class="u-clist f-bgw f-cb f-pr j-href ga-click"]'
    )
    for count, link in enumerate(cards, start=1):
        # Course name (mandatory).
        course_name = link.find_element(
            By.XPATH, './/span[@class=" u-course-name f-thide"]'
        ).text
        print("course name ", course_name)

        # School name; may be absent when the course is offered by an
        # organisation teacher or by the platform itself.
        school_name = _text_or_default(link, './/a[@class="t21 f-fc9"]')
        print("school ", school_name)

        # Main teacher (mandatory).
        m_teacher = link.find_element(By.XPATH, './/a[@class="f-fc9"]').text
        print("laoshi：", m_teacher)

        # Team members (optional).
        team_member = _text_or_default(link, './/span[@class="f-fc9"]')

        # Participant count. str.replace returns a new string, so the result
        # must be assigned for the "参加" label to actually be stripped.
        join_count = link.find_element(By.XPATH, './/span[@class="hot"]').text
        join_count = join_count.replace('参加', '')
        print("参加人数", join_count)

        # Course progress (optional).
        process = _text_or_default(link, './/span[@class="txt"]')
        print('jingdu ', process)

        # Course introduction (optional).
        introduction = _text_or_default(
            link, './/span[@class="p5 brief f-ib f-f0 f-cb"]'
        )
        print(introduction)

        table.add_row([
            count, course_name, school_name, m_teacher,
            team_member, join_count, process, introduction,
        ])


def main():
    """Run the course scraper."""
    start_spider()


if __name__ == '__main__':
    try:
        main()
    finally:
        # Quit the browser even when scraping fails, so no Chrome process
        # is left running.
        driver.quit()
    # Print the collected course table.
    print(table)

心得体会

小问题1：最开始用 id 去定位登录框，但总是报找不到元素的错误。原因是输入框和登录按钮的标签都位于 iframe 标签中，需要先使用 switch_to.frame() 将查找区域切换到对应的 iframe，才能找到输入框。iframe 一般都有唯一的 ID 属性，但该网站的 iframe ID 是动态生成的，每次加载都不一样，因此应该根据 frameborder 属性值来定位该 iframe。