bs4 解析不成功,select 无论怎么写都返回空列表

之后找到了原因和解决办法:https://www.cnblogs.com/wkhzwmr/p/15230518.html

import requests
from bs4 import BeautifulSoup
if __name__ == '__main__':
    # Fetch the index page of the book and print the chapter links found on it.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'

    # A timeout keeps the script from hanging forever if the site never answers.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of quietly parsing an error page
    # into an empty result (the original author suspected the site was down).
    response.raise_for_status()
    response.encoding = 'utf-8'
    page_text = response.text

    # Load the page source into a BeautifulSoup object so it can be queried.
    soup = BeautifulSoup(page_text, 'lxml')
    # find_all() interprets a string argument as a single tag *name*, so a CSS
    # selector such as 'div ul li a' never matches and always yields [].
    # select() is the method that understands CSS selectors.
    li_list = soup.select('div ul li a')
    print(li_list)
    # Disabled follow-up step: visit every chapter page and save its text.
    # li_list = soup.select('.book-mulu > ul > li')
    # fp = open('./sanguo.txt', 'w', encoding = 'utf-8')
    # for li in li_list:
    #     title = li.a.string
    #     detail_url ='http://www.shicimingju.com' + li.a['href'] # `a` is the tag name; [] reads an attribute
    #     # Request the detail page to extract the chapter content
    #     detail_response = requests.get(url = detail_url, headers = headers)
    #     detail_response.encoding = 'utf-8'
    #     detail_page_text = detail_response.text
    #     # Parse the chapter content out of the detail page
    #     a = BeautifulSoup('提取的数据对象或者文件名','lxml')
    #     detail_soup = BeautifulSoup(detail_page_text, 'lxml') # the page embeds the text directly and uses <br> for line breaks
    #     div_tag = detail_soup.find('div', class_ = 'chapter_content') # note: class_, not class, because class is a Python keyword
    #     # The chapter content has been extracted
    #     content = div_tag.text
    #     fp.write(title + ':' + content + '\n')
    #     print(title, '爬取成功!')



posted @ 2021-07-01 20:01  索匣  阅读(674)  评论(0)    收藏  举报