Assignment 4

Assignment ①:
- Requirements:
  - Become proficient with Selenium: locating HTML elements, scraping Ajax-rendered pages, and waiting for HTML elements.
  - Use the Selenium framework + MySQL storage to scrape stock data for three boards: "沪深A股", "上证A股", and "深证A股".
- Candidate site: Eastmoney (东方财富网): http://quote.eastmoney.com/center/gridlist.html#hs_a_board
- Output: data stored and output via a MySQL database in the format below. Column headers should use English names, e.g. id for 序号 (row number), bStockNo for 股票代码 (stock code), and so on; students design the table headers themselves (one possible design is sketched right after this list):
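Since the assignment leaves the schema to the student, here is a minimal sketch of one possible table, assuming a local MySQL database named `stocks_db`. Only `id` and `bStockNo` come from the assignment text; every other column name is my own choice and mirrors the columns visible on the Eastmoney list page:

```python
import pymysql

# Hypothetical schema; only id/bStockNo are named in the assignment,
# the rest mirror the visible page columns and are my own naming.
DDL = """
CREATE TABLE IF NOT EXISTS stocks (
    id INT AUTO_INCREMENT PRIMARY KEY,  -- 序号
    bStockNo VARCHAR(16),               -- 股票代码
    bStockName VARCHAR(32),             -- 股票名称
    latestPrice DECIMAL(10, 2),         -- 最新价
    changeRate VARCHAR(16),             -- 涨跌幅
    changeAmount DECIMAL(10, 2),        -- 涨跌额
    volume VARCHAR(32),                 -- 成交量
    turnover VARCHAR(32),               -- 成交额
    amplitude VARCHAR(16),              -- 振幅
    highest DECIMAL(10, 2),             -- 最高
    lowest DECIMAL(10, 2),              -- 最低
    openPrice DECIMAL(10, 2),           -- 今开
    prevClose DECIMAL(10, 2),           -- 昨收
    board VARCHAR(16)                   -- 板块
) DEFAULT CHARSET = utf8mb4;
"""

# Connection parameters are placeholders for a local MySQL instance.
conn = pymysql.connect(host="localhost", user="root", password="***",
                       database="stocks_db", charset="utf8mb4")
with conn.cursor() as cur:
    cur.execute(DDL)
conn.commit()
conn.close()
```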
First, find the XPaths. Open the site, press F12, click the element-picker arrow at the top of the DevTools panel, then click the 沪深京个股 tab area and search for the tab names in the Elements panel; this reveals the XPath of each board tab: 沪深京A股, 上证A股, and 深证A股 (the ">" link is the next-page control).

(screenshot: board tabs located in the Elements panel)

Applying the same method to the next-page button yields its XPath as well.

(screenshot: next-page button located in the Elements panel)
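With those XPaths in hand, a minimal wait-and-click sketch looks like the following. The XPath strings here are illustrative; substitute the ones found via the Elements search described above:

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# `driver` is an already-created Chrome WebDriver (see the setup sketch below).
wait = WebDriverWait(driver, 10)

# Wait until the "上证A股" tab is clickable, then switch boards.
# Illustrative XPath; use the one found in the Elements panel.
tab = wait.until(EC.element_to_be_clickable(
    (By.XPATH, '//a[contains(text(), "上证A股")]')))
tab.click()

# The table is filled in by Ajax, so wait for real rows before parsing.
wait.until(EC.presence_of_all_elements_located((By.XPATH, '//table/tbody/tr')))
```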
Chrome must be installed before the program is run.
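A minimal driver-setup sketch, assuming Selenium 4 or later (whose Selenium Manager fetches a matching ChromeDriver automatically); the anti-automation options shown are common tweaks, not requirements:

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
# options.add_argument("--headless=new")  # uncomment to run without a window
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option("excludeSwitches", ["enable-automation"])

driver = webdriver.Chrome(options=options)  # Selenium Manager resolves the driver
driver.implicitly_wait(5)  # fallback wait for element lookups
```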
Core code
```python
# Excerpt: a method of the scraper class (self.driver, self.base_url,
# self.random_delay, self.wait_for_table_load and self.extract_stock_info
# are defined elsewhere in the class).
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    StaleElementReferenceException,
    ElementClickInterceptedException,
    NoSuchElementException,
)

def scrape_board(self, board_code, board_name):
    """Scrape the stock table of one board."""
    logger.info(f"\n{'=' * 60}")
    logger.info(f"Scraping {board_name} ({board_code})")
    logger.info(f"{'=' * 60}")

    # Build the board URL
    url = f"{self.base_url}#{board_code}"
    logger.info(f"Visiting URL: {url}")

    try:
        self.driver.get(url)
        # Wait for the page to finish loading
        self.random_delay(3, 5)

        # Wait for the data table
        table_element = self.wait_for_table_load()
        if not table_element:
            logger.error(f"Table failed to load: {board_name}")
            return []

        # Scroll the page to make sure content is loaded
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight/3);")
        self.random_delay(2, 3)
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
        self.random_delay(2, 3)

    except Exception as e:
        logger.error(f"Failed to open page: {e}")
        return []

    stock_data = []
    page_num = 1
    max_pages = 10  # cap the page count to avoid an endless loop

    while page_num <= max_pages:
        logger.info(f"Processing {board_name} page {page_num}...")

        # Try several XPath strategies for the data rows
        row_selectors = [
            '//div[@class="listview"]//table/tbody/tr',
            '//table[contains(@class, "table")]/tbody/tr',
            '//div[contains(@class, "dataview")]//tr',
            '//table/tbody/tr',
            '//tbody/tr',
            '//tr[contains(@class, "row")]'
        ]

        rows = []
        for selector in row_selectors:
            try:
                rows = self.driver.find_elements(By.XPATH, selector)
                if rows and len(rows) > 5:  # make sure enough rows were found
                    logger.info(f"Selector {selector} matched {len(rows)} rows")
                    break
            except Exception:
                continue

        if not rows or len(rows) < 5:
            logger.warning(f"Page {page_num}: not enough data rows, retrying...")
            # Reload and try again
            try:
                self.driver.refresh()
                self.random_delay(3, 5)
                rows = self.driver.find_elements(By.XPATH, row_selectors[0])
            except Exception:
                break

        # Parse every row
        page_records = 0
        for i, row in enumerate(rows):
            try:
                # Skip header rows
                row_class = row.get_attribute("class") or ""
                if "head" in row_class.lower() or "header" in row_class.lower():
                    continue

                # Collect the cells
                cells = row.find_elements(By.XPATH, './/td')

                if len(cells) < 10:  # a data row needs at least 10 cells
                    # Fall back to another locator
                    cells = row.find_elements(By.TAG_NAME, "td")

                if len(cells) < 10:
                    continue

                # Extract the fields; cell indices follow the live page structure
                stock_info = self.extract_stock_info(cells, board_name)

                if stock_info:
                    stock_data.append(stock_info)
                    page_records += 1

            except StaleElementReferenceException:
                logger.warning(f"Row {i + 1} went stale, skipping...")
                continue
            except Exception as e:
                logger.warning(f"Error parsing row {i + 1}: {e}")
                continue

        logger.info(f"Page {page_num} done, {page_records} records")

        # Find and click the next-page control; the data-pi="2" selector is
        # tried first. The Chinese strings match the page's own labels
        # ("下一页" = next page).
        try:
            next_button = None
            next_selectors = [
                '//a[@data-pi="2"]',  # preferred selector
                '//a[@title="下一页"]',
                '//a[contains(text(), "下一页")]',
                '//a[contains(@class, "next")]',
                '//button[contains(text(), "下一页")]',
                '//a[text()=">"]'
            ]

            for selector in next_selectors:
                try:
                    next_button = self.driver.find_element(By.XPATH, selector)
                    if next_button.is_displayed() and next_button.is_enabled():
                        logger.info(f"Next-page button found via: {selector}")
                        break
                    else:
                        next_button = None
                except Exception:
                    continue

            if not next_button:
                logger.info(f"{board_name}: no next-page button, finished")
                break

            # Check whether the button is disabled
            button_class = next_button.get_attribute("class") or ""
            if "disabled" in button_class or "禁" in button_class:
                logger.info(f"{board_name}: reached the last page")
                break

            # Click next page
            logger.info("Clicking next page...")
            self.driver.execute_script("arguments[0].scrollIntoView();", next_button)
            self.random_delay(1, 2)
            next_button.click()

            # Wait for the next page to load
            self.random_delay(4, 6)
            page_num += 1

            # Extra pause every 3 pages
            if page_num % 3 == 0:
                self.random_delay(5, 8)

        except ElementClickInterceptedException:
            logger.warning("Next-page click intercepted, retrying via JavaScript")
            try:
                self.driver.execute_script("arguments[0].click();", next_button)
                self.random_delay(4, 6)
                page_num += 1
            except Exception:
                logger.error("JavaScript click failed too, stopping pagination")
                break
        except NoSuchElementException:
            logger.info(f"{board_name}: next-page button not found, finished")
            break
        except Exception as e:
            logger.error(f"Pagination error: {e}")
            break

    logger.info(f"{board_name}: scraping done, {len(stock_data)} records total")
    return stock_data
```
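The scraped records then need to go into MySQL. A minimal storage sketch with pymysql, assuming the `stocks` table sketched earlier and that `extract_stock_info` returns dicts keyed by those column names (the connection parameters are placeholders):

```python
import pymysql

def save_to_mysql(records):
    """Bulk-insert one board's records; assumes the `stocks` table above."""
    conn = pymysql.connect(host="localhost", user="root", password="***",
                           database="stocks_db", charset="utf8mb4")
    sql = ("INSERT INTO stocks (bStockNo, bStockName, latestPrice, changeRate, "
           "changeAmount, volume, turnover, amplitude, highest, lowest, "
           "openPrice, prevClose, board) "
           "VALUES (%(bStockNo)s, %(bStockName)s, %(latestPrice)s, %(changeRate)s, "
           "%(changeAmount)s, %(volume)s, %(turnover)s, %(amplitude)s, "
           "%(highest)s, %(lowest)s, %(openPrice)s, %(prevClose)s, %(board)s)")
    try:
        with conn.cursor() as cur:
            cur.executemany(sql, records)  # records: list of dicts from scrape_board
        conn.commit()
    finally:
        conn.close()
```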

Run results
(screenshots of the run results)
Assignment ②:
- Requirements:
  - Become proficient with Selenium: locating HTML elements, simulating a user login, scraping Ajax-rendered pages, and waiting for HTML elements.
  - Use the Selenium framework + MySQL to scrape course information from 中国mooc网 (course ID, course name, school, lead teacher, team members, enrollment count, course progress, and course description).
- Candidate site: 中国mooc网: https://www.icourse163.org
Obtain the XPaths for the login entry, the course information fields, and so on, using the same F12 element-picker method as in assignment ①.

(screenshot: locating the XPaths in the Elements panel)
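For the simulated login, one workable approach is to open the site, click the login entry, and pause for a manual QR-code scan before scraping. A sketch; the XPath of the login entry is illustrative and should be taken from the Elements panel as above:

```python
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def login(driver, wait_seconds=30):
    """Open icourse163 and wait for a manual QR-code login.

    The XPath below is illustrative; locate the real login entry via F12.
    A fully automated credential login would additionally require
    switching into the login iframe.
    """
    driver.get("https://www.icourse163.org")
    wait = WebDriverWait(driver, 10)
    entry = wait.until(EC.element_to_be_clickable(
        (By.XPATH, '//a[contains(text(), "登录")]')))
    entry.click()
    time.sleep(wait_seconds)  # scan the QR code by hand within this window
```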
Core code

```python
# Excerpt: methods of the MOOC scraper class (self.driver and self.wait are
# created in the class's __init__). The first method's signature and the
# course_data initialisation are reconstructed from the surrounding code;
# the original excerpt begins mid-method, so the method name is assumed.
import re
import time

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

def scrape_course_detail(self, course_url):
    """Scrape the detail page of one course."""
    try:
        # Default values; keys match the fields required by the assignment
        course_data = {
            'course_id': '', 'course_name': '', 'university_name': '',
            'main_teacher': '', 'team_members': '', 'participants_count': 0,
            'course_progress': '', 'course_description': ''
        }
        self.driver.get(course_url)  # reconstructed: open the detail page
        time.sleep(3)

        # Extract the course ID from the URL
        if '/course/' in course_url:
            parts = course_url.split('/course/')
            if len(parts) > 1:
                course_data['course_id'] = parts[1].split('?')[0]

        # Scrape the course name
        try:
            # Try several plausible XPath selectors
            name_selectors = [
                "//h1[contains(@class, 'course-title')]",
                "//div[contains(@class, 'course-title')]",
                "//h1[@class='u-course-title']",
                "//h1"
            ]

            for selector in name_selectors:
                try:
                    course_name_elem = self.driver.find_element(By.XPATH, selector)
                    if course_name_elem.text.strip():
                        course_data['course_name'] = course_name_elem.text.strip()
                        break
                except Exception:
                    continue

            if not course_data['course_name']:
                # Fall back to deriving a name from the URL
                course_data['course_name'] = course_url.split('/')[-1].replace('-', ' ')

        except Exception as e:
            logger.warning(f"Failed to get course name: {e}")

        # Scrape the school name (the Chinese strings match on-page text:
        # 大学 = university, 学院 = college)
        try:
            university_selectors = [
                "//a[contains(@class, 'u-course-university')]",
                "//div[contains(@class, 'university')]",
                "//span[contains(text(), '大学') or contains(text(), '学院')]",
                "//a[contains(@href, 'university')]"
            ]

            for selector in university_selectors:
                try:
                    uni_elem = self.driver.find_element(By.XPATH, selector)
                    if uni_elem.text.strip():
                        course_data['university_name'] = uni_elem.text.strip()
                        break
                except Exception:
                    continue
        except Exception as e:
            logger.warning(f"Failed to get school name: {e}")

        # Scrape the lead teacher and team members (教授 = professor, 老师 = teacher)
        try:
            teacher_selectors = [
                "//div[contains(@class, 'teacher')]",
                "//span[contains(text(), '教授') or contains(text(), '老师')]",
                "//div[contains(@class, 'lecturer')]"
            ]

            teachers = []
            for selector in teacher_selectors:
                try:
                    teacher_elems = self.driver.find_elements(By.XPATH, selector)
                    for elem in teacher_elems:
                        if elem.text.strip() and len(elem.text.strip()) < 50:  # skip overly long text
                            teachers.append(elem.text.strip())
                except Exception:
                    continue

            if teachers:
                course_data['main_teacher'] = teachers[0]
                course_data['team_members'] = '; '.join(teachers[1:]) if len(teachers) > 1 else ''

        except Exception as e:
            logger.warning(f"Failed to get teacher info: {e}")

        # Scrape the enrollment count (人参加 = "people enrolled")
        try:
            participants_selectors = [
                "//span[contains(text(), '人参加')]",
                "//div[contains(text(), '人参加')]",
                "//span[contains(@class, 'participants')]"
            ]

            for selector in participants_selectors:
                try:
                    participants_elem = self.driver.find_element(By.XPATH, selector)
                    text = participants_elem.text.strip()
                    # Pull the digits out of text like "1,234人参加"
                    numbers = re.findall(r'\d+', text)
                    if numbers:
                        course_data['participants_count'] = int(''.join(numbers[:3]))
                        break
                except Exception:
                    continue
        except Exception as e:
            logger.warning(f"Failed to get enrollment count: {e}")

        # Scrape the course progress (进行至 = "progressed to", 第…周 = "week N")
        try:
            progress_selectors = [
                "//span[contains(text(), '进行至')]",
                "//div[contains(text(), '第') and contains(text(), '周')]",
                "//span[contains(@class, 'progress')]"
            ]

            for selector in progress_selectors:
                try:
                    progress_elem = self.driver.find_element(By.XPATH, selector)
                    if progress_elem.text.strip():
                        course_data['course_progress'] = progress_elem.text.strip()
                        break
                except Exception:
                    continue
        except Exception as e:
            logger.warning(f"Failed to get course progress: {e}")

        # Scrape the course description
        try:
            # Selectors based on the observed HTML structure
            description_selectors = [
                "//span[contains(@style, 'font-size: 16px; font-family: 宋体')]",
                "//div[contains(@class, 'course-description')]",
                "//div[contains(@class, 'overview')]",
                "//meta[@name='description']"  # content read via get_attribute below
            ]

            for selector in description_selectors:
                try:
                    if selector.startswith('//meta'):
                        desc_elem = self.driver.find_element(By.XPATH, "//meta[@name='description']")
                        description = desc_elem.get_attribute('content')
                    else:
                        desc_elem = self.driver.find_element(By.XPATH, selector)
                        description = desc_elem.text.strip()

                    if description:
                        course_data['course_description'] = description[:1000]  # cap the length
                        break
                except Exception:
                    continue
        except Exception as e:
            logger.warning(f"Failed to get course description: {e}")

        return course_data

    except Exception as e:
        logger.error(f"Failed to scrape course detail {course_url}: {e}")
        return None

def scrape_course_list(self, start_url="https://www.icourse163.org"):
    """Scrape the list of course links."""
    try:
        self.driver.get(start_url)
        time.sleep(3)

        # Wait for the page to load
        self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        # Collect course links
        course_links = []

        # Several plausible selectors for course links
        link_selectors = [
            "//a[contains(@href, '/course/')]",
            "//a[contains(@class, 'course-card')]",
            "//div[contains(@class, 'course-item')]//a"
        ]

        for selector in link_selectors:
            try:
                links = self.driver.find_elements(By.XPATH, selector)
                for link in links:
                    href = link.get_attribute('href')
                    if href and '/course/' in href and href not in course_links:
                        course_links.append(href)
                if course_links:  # stop once links are found
                    break
            except Exception:
                continue

        logger.info(f"Found {len(course_links)} course links")
        return course_links[:10]  # cap the count to avoid getting blocked

    except Exception as e:
        logger.error(f"Failed to scrape course list: {e}")
        return []
```
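Putting the two methods together with MySQL storage might look like the sketch below. `MoocScraper` is a stand-in name for the class these methods belong to, the `courses` table is analogous to the `stocks` one, and the connection parameters are placeholders:

```python
import pymysql

scraper = MoocScraper()   # hypothetical class holding driver/wait and the methods above
login(scraper.driver)     # manual QR login as sketched earlier

# Visit each course page and collect the detail dicts
courses = []
for url in scraper.scrape_course_list():
    detail = scraper.scrape_course_detail(url)
    if detail:
        courses.append(detail)

# Insert into a `courses` table whose columns match the course_data keys
conn = pymysql.connect(host="localhost", user="root", password="***",
                       database="mooc_db", charset="utf8mb4")
sql = ("INSERT INTO courses (course_id, course_name, university_name, "
       "main_teacher, team_members, participants_count, course_progress, "
       "course_description) VALUES (%(course_id)s, %(course_name)s, "
       "%(university_name)s, %(main_teacher)s, %(team_members)s, "
       "%(participants_count)s, %(course_progress)s, %(course_description)s)")
with conn.cursor() as cur:
    cur.executemany(sql, courses)
conn.commit()
conn.close()
```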

Run results

(screenshot of the run results)

posted @ 2026-01-14 15:40  102102134陈凯