# Batch-crawl Tieba page data
# Page 1: https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&ie=utf-8&pn=0
# Page 2: https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&ie=utf-8&pn=50
# Page 3: https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&ie=utf-8&pn=100
# Page 4: pn=150
# In general, pn = page_index * 50, counting pages from 0.
# Basic version: hard-code the tieba name and print the URL of each of the first 10 pages
# base_url = "https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&ie=utf-8&pn="
# for page in range(10):
#     new_url = base_url + str(page * 50)
#     print(new_url)
# Advanced version: read the tieba name and page count from the keyboard,
# then crawl the requested pages and save each one to an HTML file
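# A quick sketch of how urllib.parse.urlencode builds the query string used below;
# the input dict is an assumed example (the percent-encoded value is "旅行青蛙" in UTF-8):
# >>> from urllib import parse
# >>> parse.urlencode({"kw": "旅行青蛙", "ie": "utf-8", "pn": 0})
# 'kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&ie=utf-8&pn=0'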
from urllib import request, parse

base_url = "https://tieba.baidu.com/f?"
name = input("Enter the tieba name: ")
page = input("Enter the number of pages: ")

headers = {
    # The header name must be "User-Agent"; a key like "user_agent" is not a valid header
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
}

for i in range(int(page)):
    # Build the query string: kw=<name>&ie=utf-8&pn=0, 50, 100, ...
    qs = {
        "kw": name,
        "ie": "utf-8",
        "pn": i * 50
    }
    qs_data = parse.urlencode(qs)
    url = base_url + qs_data

    # Request the URL just built (not base_url) and decode the body as UTF-8
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    html = response.read().decode("utf-8")

    # Save each page to its own file, numbering pages from 1
    with open(name + "_page_" + str(i + 1) + ".html", "w", encoding="utf-8") as f:
        f.write(html)
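
# Example of a run (hypothetical input; the tieba name and page count are assumptions):
#   Enter the tieba name: 旅行青蛙
#   Enter the number of pages: 3
# This would fetch pn=0, 50 and 100 and write 旅行青蛙_page_1.html,
# 旅行青蛙_page_2.html and 旅行青蛙_page_3.html to the current directory.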