webcralertest

My own beginner experiments with web crawlers

Bing keyword search

import requests

try:
    url = "https://cn.bing.com/search"
    keyword = "计算机组成原理"
    wk = {'q': keyword}  # Bing takes the search keyword in the q parameter
    r = requests.get(url, params=wk, timeout=30)
    r.raise_for_status()  # raise an exception for non-2xx status codes
    r.encoding = r.apparent_encoding  # guess the encoding from the body to avoid garbled text
    print(r.text[:1000])
except IOError as e:  # requests' own exceptions are subclasses of IOError
    print(str(e))

timeout is how long (in seconds) requests waits for the server before giving up and raising an error.
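The timeout can also be given as a (connect, read) pair, and a request that runs out of time raises requests.exceptions.Timeout, which can be caught separately from other errors. A small sketch:

import requests

try:
    # up to 3 seconds to connect, up to 10 seconds to receive the body
    r = requests.get("https://cn.bing.com/search",
                     params={'q': '计算机组成原理'},
                     timeout=(3, 10))
    r.raise_for_status()
    print(r.status_code)
except requests.exceptions.Timeout:
    print("request timed out")
except requests.exceptions.RequestException as e:
    print(e)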

A book page on Dangdang

import requests

try:
    url = "https://product.dangdang.com/29526048.html"
    kv = {'User-Agent': 'Mozilla/5.0'}  # build headers; without a browser-like User-Agent the site refuses the request
    r = requests.get(url, headers=kv, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[:1000])
except IOError as e:
    print(str(e))
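The comment above says the site turns away requests that do not look like they come from a browser; a quick way to check (just a sketch, the exact behaviour depends on the site's current anti-bot rules) is to compare the status codes with and without the User-Agent header:

import requests

url = "https://product.dangdang.com/29526048.html"
bare = requests.get(url, timeout=30)                                              # no User-Agent
browser = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)    # browser-like User-Agent
print(bare.status_code, browser.status_code)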

Downloading an image from a website

import requests
import os

url = "https://alifei02.cfp.cn/creative/vcg/800/new/VCG41N1397194266.jpg"
save_dir = "D://ai绘画//test//"
path = save_dir + url.split('/')[-1]  # save into save_dir, keeping the original file name
try:
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not os.path.exists(path):
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        with open(path, "wb") as f:  # the with block closes the file automatically
            f.write(r.content)
        print("file saved")
    else:
        print("file already exists")
except IOError as e:
    print(str(e))
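For bigger files it may be better not to hold the whole body in memory at once; requests can stream the download with stream=True and iter_content. A minimal sketch, saving into the current directory instead of a fixed folder:

import requests

url = "https://alifei02.cfp.cn/creative/vcg/800/new/VCG41N1397194266.jpg"
path = url.split('/')[-1]  # same file name as the original, saved next to the script
with requests.get(url, stream=True, timeout=30) as r:
    r.raise_for_status()
    with open(path, "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):  # write 8 KB at a time
            f.write(chunk)
print("saved", path)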

A first try at a POST request

import requests
import json  # not used in the function itself; handy for inspecting the raw response (see the sketch below)

def baidutranslate(words=None):
    url = "https://fanyi.baidu.com/sug"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
    }
    data = {
        "kw": words  # the sug endpoint expects the query word in the kw form field
    }
    res = requests.post(url, headers=header, data=data)
    content = res.json()
    if content.get('data'):
        print(content['data'][0]['v'])  # translation of the first suggestion
    else:
        print("no translation found")

if __name__ == "__main__":
    st = input("Enter a word to translate: ")
    baidutranslate(st)
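json is imported above but never really used; one place it comes in handy is pretty-printing the whole response to see what fields the sug endpoint actually returns. A small inspection sketch (the word "hello" is just an example):

import json
import requests

res = requests.post("https://fanyi.baidu.com/sug",
                    headers={"User-Agent": "Mozilla/5.0"},
                    data={"kw": "hello"})
# ensure_ascii=False keeps Chinese text readable instead of \uXXXX escapes
print(json.dumps(res.json(), ensure_ascii=False, indent=2))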

Scraping a novel from a website (it feels too slow)

import requests
import os
from bs4 import BeautifulSoup
import time
import random

def Reverse(lst):  # reverse a list (not actually used below)
    new_lst = lst[::-1]
    return new_lst

def get_novel_chapters(url):  # collect every chapter link and title from the table of contents
    res = requests.get(url)
    res.encoding = 'utf-8'
    data = []
    soup = BeautifulSoup(res.text, 'html.parser')
    for dd in soup.find_all("dd"):
        link = dd.find("a")
        if not link:
            continue
        data.append(("{}{}".format(url, link['href']), link.get_text()))
    return data

def get_chapters_content(url):  # fetch one chapter page and pull the text out of the content div
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    contentbox = soup.find('div', id="content")
    return contentbox.get_text(separator="\n").replace('\xa0', ' ')

rooturl = input("Enter the book's URL: ")
name = input("Enter the book title: ")
novel_chapters = get_novel_chapters(rooturl)
total_chapters = len(novel_chapters)
now_chapter = 0
for chapter in novel_chapters:
    now_chapter += 1
    url, title = chapter
    # errors='ignore' drops characters that gbk cannot encode
    with open("C:\\Users\\蔡汉林\\Desktop\\novels\\{}.txt".format(name), 'a', encoding='gbk', errors='ignore') as fout:
        fout.write("{}\n".format(title) + get_chapters_content(url) + '\n')
    # if now_chapter==100:
    #     break
    if now_chapter % 10 == 0:  # print progress and sleep briefly every 10 chapters
        # os.system("clear")
        print(now_chapter, total_chapters)
        time.sleep(random.random() * 3)

print("爬取完成o(* ̄︶ ̄*)o")