Python爬虫之爬取笑话(一)

爬取“糗事百科”笑话

import sys
import requests
from bs4 import BeautifulSoup
# Request headers sent with every page fetch; the User-Agent is a desktop
# Chrome 70 identification string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.102 Safari/537.36"
    ),
}


# Fetch one listing page and return its text.
def get_url(url, num, timeout=10):
    """Download page ``num`` of a paginated listing and return its text.

    Args:
        url: Base URL; the page number is appended to it directly.
        num: Page number; converted with ``str`` and appended to ``url``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled connection would block indefinitely.

    Returns:
        The response body as text, with every character whose code point
        is above U+FFFF replaced by U+FFFD.
    """
    r = requests.get(url + str(num), headers=headers, timeout=timeout)
    # Replace characters one by one instead of building a translation table
    # covering every code point from 0x10000 to sys.maxunicode on each call.
    return ''.join(ch if ord(ch) <= 0xFFFF else '\ufffd' for ch in r.text)

# Extract the joke container nodes from a page.
def ext_info(content):
    """Parse HTML text and return the matching nodes.

    Args:
        content: HTML text of one listing page.

    Returns:
        The result of ``find_all`` on the parsed document, searching with
        ``class_`` set to ``'article block untagged mb15 typs_hot'``.
    """
    article_class = 'article block untagged mb15 typs_hot'
    parsed = BeautifulSoup(content, 'lxml')
    return parsed.find_all(class_=article_class)

# Extract the joke text from each node and append it to the output file.
def get_info(divs, num):
    """Append the text of each node's first ``<span>`` to the output file.

    Args:
        divs: Iterable of parsed elements; each is accessed as
            ``div.span.get_text()``.
        num: Page number, used only in the progress message.

    Elements without a ``<span>`` child (``div.span`` is ``None``) are
    skipped rather than raising ``AttributeError`` and aborting the page.
    """
    # Append-only: the file is never read back, so 'a' is sufficient.
    with open('糗事百科笑话.txt', 'a', encoding='utf-8') as file:
        for div in divs:
            span = div.span
            if span is None:
                continue
            file.write(span.get_text())
    print(f'第{num}页已经爬取完毕!')

# Run the whole crawl: fetch, parse, and save each page in turn.
def main(url='https://www.qiushibaike.com/text/page/', first_page=1, last_page=13):
    """Crawl a range of listing pages and save the jokes found on them.

    Args:
        url: Base URL to which each page number is appended.
        first_page: First page number to crawl (inclusive).
        last_page: Last page number to crawl (inclusive).

    The defaults reproduce the original fixed crawl of pages 1 through 13.
    """
    for num in range(first_page, last_page + 1):
        res = get_url(url, num)
        tsc = ext_info(res)
        get_info(tsc, num)

# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()
posted @ 2020-09-23 10:11  chchcharlie、  阅读(322)  评论(0)    收藏  举报