Python爬虫之爬取笑话(一)
爬取“糗事百科”笑话
"""Scrape the text-joke pages of qiushibaike.com and append them to a file."""
import re
import sys

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
}

# Listing URL prefix; the page number is appended to it.
BASE_URL = 'https://www.qiushibaike.com/text/page/'
# File the extracted jokes are appended to.
OUTPUT_FILE = '糗事百科笑话.txt'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# Matches every code point outside the Basic Multilingual Plane. Compiled
# once here instead of rebuilding a ~1M-entry translation table per page.
_NON_BMP = re.compile('[\U00010000-\U0010ffff]')


def get_url(url, num):
    """Download one listing page and return its HTML as text.

    Args:
        url: URL prefix of the listing pages.
        num: Page number, appended to ``url``.

    Returns:
        The response body with every character above U+FFFF replaced by
        U+FFFD (the Unicode replacement character).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    response = requests.get(
        url + str(num), headers=headers, timeout=REQUEST_TIMEOUT
    )
    # Fail loudly instead of parsing an error page as if it held jokes.
    response.raise_for_status()
    return _NON_BMP.sub('\ufffd', response.text)


def ext_info(content):
    """Parse the page HTML and return the joke container elements.

    Args:
        content: HTML text of one listing page.

    Returns:
        The result of ``find_all`` for the joke container class string.
    """
    soup = BeautifulSoup(content, 'lxml')
    # NOTE(review): a multi-class string is compared against the exact value
    # of the ``class`` attribute, so containers carrying a different class
    # list are not returned - confirm this is intended.
    return soup.find_all(class_='article block untagged mb15 typs_hot')


def get_info(divs, num, path=OUTPUT_FILE):
    """Append the text of each joke container to the output file.

    Args:
        divs: Iterable of parsed elements as returned by ``ext_info``.
        num: Page number, used only in the progress message.
        path: File to append to; defaults to ``OUTPUT_FILE``.
    """
    with open(path, 'a', encoding='utf-8') as file:
        for div in divs:
            span = div.span
            if span is None:
                # Container without a <span>: nothing to extract.
                continue
            joke = span.get_text()
            if not joke:
                continue
            # Guarantee that consecutive jokes never run together.
            if not joke.endswith('\n'):
                joke += '\n'
            file.write(joke)
    print('第{}页已经爬取完毕!'.format(num))


def main(first_page=1, last_page=13):
    """Run the whole crawl: fetch, parse and store each listing page.

    Args:
        first_page: First page number to fetch (inclusive).
        last_page: Last page number to fetch (inclusive).
    """
    for num in range(first_page, last_page + 1):
        try:
            html = get_url(BASE_URL, num)
        except requests.RequestException as exc:
            # One bad page should not abort the remaining pages.
            print('第{}页爬取失败: {}'.format(num, exc), file=sys.stderr)
            continue
        get_info(ext_info(html), num)


if __name__ == '__main__':
    main()

浙公网安备 33010602011771号