2025.2.18
今天学习了网站信息的爬取,学会了用 Python 爬取网页的基本信息。比如下面的代码就是爬取博客园新闻推荐栏目一百页的标题热词。
可以把爬取的信息保存到 test1.txt 文件中(通过 xlwt 写入,实际内容是 Excel 工作簿格式),但是还不会清洗数据。
import re

import requests
import xlwt

url = 'https://news.cnblogs.com/n/recommend'
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
}

# Captures the link text of every headline inside an <h2 class="news_entry">.
# Compiled once at module level so the crawl loop does not re-parse it.
# re.S lets ".*?" span the newline between the <h2> tag and its <a> tag.
TITLE_PATTERN = re.compile(
    r'<h2 class="news_entry">.*?<a href=".*?" target="_blank">(.*?)</a>',
    re.S,
)

# Number of listing pages to crawl.
PAGE_COUNT = 100

# Seconds to wait for the server before giving up on a single page, so one
# stalled connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10


def get_page(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, timeout, or any other request
        error; the error is printed, not raised).
    """
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(e)
        return None

    if response.status_code == 200:
        print('获取网页成功')
        print(response.encoding)
        return response.text

    print('获取网页失败')
    return None


def _extract_titles(page):
    """Return the headline strings found in *page* (empty list if no page)."""
    # get_page() yields None on failure; the original passed that straight to
    # re.findall, which raised TypeError and aborted the whole crawl.
    if not page:
        return []
    return TITLE_PATTERN.findall(page)


def main():
    """Crawl the listing pages and write every headline to column 0 of a sheet."""
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet01 = workbook.add_sheet(u'sheet1', cell_overwrite_ok=True)
    sheet01.write(0, 0, '热词')  # header cell: first row, first column

    # The original used range(100), i.e. page=0..99. Pagination is assumed to
    # be 1-based, so pages 1..PAGE_COUNT are requested — TODO confirm.
    page_urls = [
        '{}?page={}'.format(url, page_no)
        for page_no in range(1, PAGE_COUNT + 1)
    ]

    next_row = 1  # row 0 holds the header
    for page_no, page_url in enumerate(page_urls, start=1):
        print(page_url)
        items = _extract_titles(get_page(page_url))
        print(len(items))
        print(items)
        for offset, title in enumerate(items):
            sheet01.write(next_row + offset, 0, title)
        next_row += len(items)
        print("已打印完第" + str(page_no) + "页")

    print("打印完!!!")
    # NOTE: xlwt writes Excel (.xls) workbook data regardless of the file
    # extension; the original output path is kept unchanged on purpose.
    workbook.save('test1.txt')


if __name__ == '__main__':
    main()