
Python 3 crawler for second-level (detail) pages

from bs4 import BeautifulSoup
import requests

# title:    文娱数据库 (entertainment industry database)
# target:   http://wydb.leshanvc.com/
# author:   不想长大a

website = 'http://wydb.leshanvc.com/'               # target site (note: no stray space inside the URL)
url_file = 'out.txt'                                 # file that collects the scraped links
headers = {
    'user-agent': 'Mozilla/5.0'                      # minimal user-agent spoofing
}
count = 0                                            # number of links written

for page in range(1, 1388):
    # list (first-level) page, e.g. http://wydb.leshanvc.com/page-1/
    basic_url = website + 'page-' + str(page) + '/'
    html = requests.get(basic_url, headers=headers)             # fetch the list page
    soup = BeautifulSoup(html.text, 'html.parser')              # parse the list page
    # print(soup)
    for item in soup.find_all('div', 'list'):
        # link to the detail (second-level) page
        link = item.find('div', 'info').find('div').find('span', 'companyname').find('a')['href']
        # print(link)
        xq_html = requests.get(website + link, headers=headers)  # fetch the detail page
        xq_soup = BeautifulSoup(xq_html.text, 'html.parser')     # parse the detail page
        try:
            # CSS selector copied from the browser ("Copy selector"), pointing at the wanted field
            url_list = xq_soup.select('#wydb > div.right > div.bor.con > div:nth-child(7) > span:nth-child(2) > a')
        except Exception:
            url_list = []                                         # nothing matched: skip this entry
        for a_tag in url_list:
            text = a_tag.get_text().strip()
            # keep only values without Chinese characters, i.e. the URLs
            if not any('\u4e00' <= ch <= '\u9fa5' for ch in text):
                with open(url_file, "a+") as f:
                    f.write(text)                                 # append to out.txt
                    f.write('\n')
                print(text)
                count += 1

print("Scraped", count, "links in total")
print("Done")

 
