python学习第二天

今天学的是简单的 Python 操作：爬取百度贴吧中指定贴吧、指定页码范围内的页面内容。

import os
import random
import time
import urllib.parse
import urllib.request

# Pool of browser User-Agent strings; one is picked at random per run.
# The comma between the entries matters: without it Python joins adjacent
# string literals into a single element and there is nothing to choose from.
ugList = [
    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1 Edg/101.0.4951.54",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39",
]

# Request headers shared by every download. The HTTP header name is
# "User-Agent" (hyphen); with an underscore it is sent as an unrelated header
# and urllib still adds its own default "Python-urllib/x.y" agent.
header = {"User-Agent": random.choice(ugList)}
# print(header)
# https://tieba.baidu.com/f?ie=utf-8&kw=%E7%A8%8B%E5%BA%8F%E5%91%98
# https://tieba.baidu.com/f?kw=%E7%A8%8B%E5%BA%8F%E5%91%98&ie=utf-8&pn=50


def loadpage(allurl):
    """Download one page and return its raw body.

    Args:
        allurl: Complete URL of the page to fetch.

    Returns:
        The response body as undecoded ``bytes``.

    Raises:
        urllib.error.URLError: Propagated unchanged from ``urlopen`` when the
            request fails.
    """
    print("正在下载-----")
    # Send the module-level ``header`` dict with the request.
    urr = urllib.request.Request(allurl, headers=header)
    # The context manager closes the HTTP response (and its socket) as soon
    # as the body has been read, instead of leaving that to garbage collection.
    with urllib.request.urlopen(urr) as response:
        return response.read()


def savepage(html, fileName):
    """Write downloaded page bytes to a file.

    Args:
        html: Page content as ``bytes`` (the file is opened in binary mode).
        fileName: Path of the file to create; an existing file is overwritten.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    print("正在保存-----")
    # ``with`` guarantees the handle is closed even when ``write`` raises,
    # which the manual open()/close() pair did not.
    with open(fileName, "wb") as f:
        f.write(html)


def getallurl(url, kw, starpage, endpage, savedir=None):
    """Download pages ``starpage``..``endpage`` (inclusive) and save each one.

    For page number ``i`` the request URL is ``url + kw + "&pn=" + (i-1)*50``
    and the result is written to a file named ``第<i>页爬虫.html``.

    Args:
        url: Base URL ending with ``?`` so the query string can be appended.
        kw: Already URL-encoded query fragment (e.g. ``kw=...``).
        starpage: First page number to fetch.
        endpage: Last page number to fetch (inclusive). Nothing is fetched
            when it is smaller than ``starpage``.
        savedir: Directory to write the files into. When omitted, the
            original hard-coded Windows directory is used so existing callers
            get exactly the same file names as before.
    """
    if savedir is None:
        # Legacy default: keep the exact prefix (including the backslash
        # separator) that used to be hard-coded here.
        prefix = r"C:\Users\97807\Desktop\冷轧退火对CoCrNi中熵合金组织及力学性能的影响—张莹" + "\\"
    else:
        # Joining with an empty component appends the platform's separator.
        prefix = os.path.join(savedir, "")

    for i in range(starpage, endpage + 1):
        # The offset advances by 50 per page: page 1 -> pn=0, page 2 -> pn=50.
        allurl = url + kw + "&pn=" + str((i - 1) * 50)
        # Output path for this page.
        fileName = prefix + "第" + str(i) + "页爬虫.html"
        # Download the page, then write it to disk.
        html = loadpage(allurl)
        savepage(html, fileName)


if __name__ == "__main__":
    # Ask for the forum name and the inclusive page range to download.
    forum = input("请输入贴吧")
    first_page = int(input("请输入起始页码"))
    last_page = int(input("请输入终止页码"))
    # URL-encode the forum name into a "kw=..." query fragment, then fetch
    # and save every page in the requested range.
    getallurl(
        "https://tieba.baidu.com/f?",
        urllib.parse.urlencode({"kw": forum}),
        first_page,
        last_page,
    )
    # Pause for ten seconds before the script exits.
    time.sleep(10)
    # url="https://fanyi.youdao.com/"
    # html = loadpage(url)
    # fileName = r"C:\Users\97807\Desktop\冷轧退火对CoCrNi中熵合金组织及力学性能的影响—张莹"+"爬虫.html"
    # savepage(html,fileName)

  

posted on 2022-06-01 07:57  付治齐吖  阅读(32)  评论(0)    收藏  举报