Python 学习第二天
今天学习的是简单的 Python 操作:爬取百度贴吧中指定贴吧、指定页码范围内的页面内容。
import random
import time
import urllib.parse
import urllib.request
# Pool of browser User-Agent strings; one is chosen at random when the module
# loads. Each entry needs its own trailing comma: adjacent string literals
# without a comma are silently concatenated into a single string.
ugList = [
    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1 Edg/101.0.4951.54",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39",
]
# The HTTP header name is "User-Agent" (hyphen). With an underscore the key is
# a different header, so urllib would still send its own default User-Agent.
header = {"User-Agent": random.choice(ugList)}
# print(header)
# https://tieba.baidu.com/f?ie=utf-8&kw=%E7%A8%8B%E5%BA%8F%E5%91%98
# https://tieba.baidu.com/f?kw=%E7%A8%8B%E5%BA%8F%E5%91%98&ie=utf-8&pn=50
def loadpage(allurl, headers=None):
    """Download the resource at *allurl* and return its raw body.

    Args:
        allurl: Full URL to request.
        headers: Optional mapping of HTTP request headers. When omitted, the
            module-level ``header`` dict is used.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated unchanged from ``urlopen`` when the
            request fails.
    """
    print("正在下载-----")
    if headers is None:
        headers = header
    request = urllib.request.Request(allurl, headers=headers)
    # The context manager closes the response (and its connection) even if
    # read() raises, instead of leaving it open until garbage collection.
    with urllib.request.urlopen(request) as response:
        return response.read()
def savepage(html, fileName):
    """Write the downloaded page to disk.

    Args:
        html: Page content as ``bytes`` (the file is opened in binary mode).
        fileName: Destination path; an existing file is overwritten.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    print("正在保存-----")
    # "with" guarantees the handle is closed even when write() fails.
    with open(fileName, "wb") as out_file:
        out_file.write(html)
def getallurl(url, kw, starpage, endpage,
              save_dir=r"C:\Users\97807\Desktop\冷轧退火对CoCrNi中熵合金组织及力学性能的影响—张莹"):
    """Download every page in an inclusive page range and save each to disk.

    Args:
        url: Base URL ending with ``?`` so the query string can be appended.
        kw: Already URL-encoded query fragment (e.g. ``kw=...``).
        starpage: First page number to fetch.
        endpage: Last page number to fetch (inclusive). If it is smaller than
            ``starpage`` nothing is downloaded.
        save_dir: Directory that receives the saved pages. Defaults to the
            location the script originally hard-coded; it is created when
            missing.
    """
    import os  # local import: the file's import block does not provide os

    os.makedirs(save_dir, exist_ok=True)
    for i in range(starpage, endpage + 1):
        # The "pn" query value advances by 50 per page (page 1 -> 0).
        allurl = url + kw + "&pn=" + str((i - 1) * 50)
        # One output file per page, named after the page number.
        fileName = os.path.join(save_dir, "第" + str(i) + "页爬虫.html")
        html = loadpage(allurl)
        savepage(html, fileName)
if __name__ == "__main__":
    # Ask for the forum name and the inclusive page range to download.
    tieba_name = input("请输入贴吧")
    first_page = int(input("请输入起始页码"))
    last_page = int(input("请输入终止页码"))
    # URL-encode the forum name as the "kw" query parameter.
    query = urllib.parse.urlencode({"kw": tieba_name})
    getallurl("https://tieba.baidu.com/f?", query, first_page, last_page)
    # Pause for 10 seconds once all pages are processed, before exiting.
    time.sleep(10)
# url="https://fanyi.youdao.com/"
# html = loadpage(url)
# fileName = r"C:\Users\97807\Desktop\冷轧退火对CoCrNi中熵合金组织及力学性能的影响—张莹"+"爬虫.html"
# savepage(html,fileName)
浙公网安备 33010602011771号