Loading

selenium+mysql 爬取LEI官网数据

import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import pymysql

# Location of the chromedriver executable used to drive Chrome.
webdriver_path = "C:/chromedriver/chromedriver.exe"
# Wrap the executable path in a Selenium Service object.
service = Service(executable_path=webdriver_path)
# Connection settings for the local MySQL database that receives the rows.
db_settings = {
    "host": "127.0.0.1",
    "user": "root",
    "password": "******",
    "db": "lei_db",
    "charset": "utf8mb4",
}
# Connect to the database first, then obtain a cursor for the INSERTs.
db = pymysql.connect(**db_settings)
cursor = db.cursor()
# Start the Chrome browser session through the service defined above.
driver = webdriver.Chrome(service=service)
# Open the target page and give it a fixed pause before interacting with it.
driver.get("https://www.leichina.org/cei/2935720/2935943/index.html")
time.sleep(10)
driver.maximize_window()
# Wait until the document body is present.
WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
# The search form is inside the "frame2" iframe: wait for it to exist
# (instead of failing immediately if it is late) and switch into it.
iframe = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, "frame2")))
driver.switch_to.frame(iframe)
# Click the "more" button once it is clickable.
more_btn = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "moreBtn")))
more_btn.click()
# Wait for the drop-down to appear, then pick its second option (index 1).
# NOTE(review): the meaning of that option is not visible here - confirm on the page.
select_element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "#btn2 > select")))
select = Select(select_element)
select.select_by_index(1)
# Click the captcha refresh icon so a fresh captcha image is generated.
icon_element = driver.find_element(By.ID, "changepic")
icon_element.click()
# Wait for the captcha image element to be present.
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#imgcode")))
# The captcha is solved by hand: block until the operator types it in.
user_input = input("请打开验证码图片URL,在浏览器中查看验证码,并输入验证码:")
# Type the captcha into its input box. The id contains a colon, which must be
# backslash-escaped in CSS; a raw string keeps that backslash without relying
# on Python's deprecated handling of the invalid "\:" escape sequence.
vcode_input = driver.find_element(By.CSS_SELECTOR, r"#searchForm\:vCode")
vcode_input.send_keys(user_input)
# Submit the query.
query_btn = driver.find_element(By.ID, "searchForm:j_id36")
query_btn.click()
time.sleep(10)
# Wait for the result table to be present before scraping starts.
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, r"#resultForm\:cdmLegalEntityListData")))
# Scrape every result page: for each table row, read the summary columns,
# open the detail page to read the English company name, insert the record
# into the `lei` table, and return to the result list.
total_pages = 10692
# CSS selectors used repeatedly below. The JSF ids contain colons, which must
# be backslash-escaped in CSS; raw strings keep the backslash without relying
# on Python's deprecated handling of the invalid "\:" escape sequence.
result_table_selector = r"#resultForm\:cdmLegalEntityListData"
result_body_selector = r"#resultForm\:cdmLegalEntityListData\:tb"
sql = "INSERT INTO lei (lei, company_cn_name, status, address, company_en_name) VALUES (%s, %s, %s, %s, %s)"
for page in range(1, total_pages + 1):
    print(f"正在处理第{page}页...")
    time.sleep(10)
    # Wait for the result table of the current page.
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, result_table_selector)))
    # Only used to count the rows: the last page may hold fewer than 10.
    table = driver.find_element(By.CSS_SELECTOR, result_body_selector)
    rows = table.find_elements(By.TAG_NAME, "tr")
    r_count = len(rows)
    for i in range(r_count):
        # Re-locate the table on every iteration: visiting the detail page
        # and coming back invalidates previously found elements.
        table = driver.find_element(By.CSS_SELECTOR, result_body_selector)
        rows = table.find_elements(By.TAG_NAME, "tr")
        row = rows[i]
        cells = row.find_elements(By.TAG_NAME, "td")
        lei = cells[0].text
        company_cn_name = cells[1].text
        status = cells[2].text
        address = cells[3].text

        # Follow the link in the fifth column to open the detail page.
        detail_link = cells[4].find_element(By.TAG_NAME, "a")
        detail_link.click()
        time.sleep(5)

        # Wait for the detail page body to be present.
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        # Read the English company name from the detail page.
        company_en_name = driver.find_element(
            By.CSS_SELECTOR,
            r"#j_id3\:j_id6 > table > tbody > tr > td > table > tbody > tr:nth-child(2) > td.txtLeft").text
        # Insert the record using a parameterized statement.
        cursor.execute(sql, (lei, company_cn_name, status, address, company_en_name))

        # Click the back button to return to the result list.
        back_btn = driver.find_element(By.CSS_SELECTOR,
                                       "#j_id3 > table > tbody > tr > td > table > tbody > tr:nth-child(1) > td > table > tbody > tr > td > table:nth-child(15) > tbody > tr:nth-child(2) > td > table > tbody > tr > td:nth-child(2) > a > img")
        back_btn.click()
        time.sleep(5)

        # Commit after each record so progress survives a later failure.
        db.commit()
        print(f"已经爬取第{i+1}条...")
    # Go to the next page, except after the last one: the previous
    # `page <= total_pages` test was always true and clicked "next" even
    # when no further page was left to process.
    if page < total_pages:
        next_page_btn = driver.find_element(By.CSS_SELECTOR, '#resultForm > table > tbody > tr:nth-child(1) > td > table > tbody > tr > td > table:nth-child(2) > tbody > tr:nth-child(2) > td > a:nth-child(9)')
        next_page_btn.click()
        time.sleep(20)  # fixed pause while the next page loads
# Release resources: close the cursor and the database connection, then shut
# down the browser session so no Chrome/chromedriver process is left running.
cursor.close()
db.close()
driver.quit()
posted @ 2024-05-10 14:18  知之不若行之  阅读(3)  评论(0)  编辑  收藏  举报