webcralertest
My own beginner experiments with web scraping.
Bing keyword search
import requests

try:
    url = "https://cn.bing.com/search"
    keyword = "计算机组成原理"  # the search query ("Principles of Computer Organization")
    wk = {'q': keyword}  # requests URL-encodes this dict into the query string
    r = requests.get(url, params=wk, timeout=30)
    r.raise_for_status()  # raise an HTTPError on 4xx/5xx status codes
    r.encoding = r.apparent_encoding  # guess the real encoding from the body
    print(r.text[:1000])
except IOError as e:  # requests' own exceptions subclass IOError
    print(str(e))
timeout=30 caps the whole request at 30 seconds; without it, a stalled server can hang the script indefinitely.
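The timeout can also be given as a (connect, read) tuple, and hitting the limit raises requests.exceptions.Timeout, which can be caught separately. A minimal sketch (the 5 s / 25 s split is an arbitrary choice):

import requests

try:
    # wait at most 5 s for the TCP connect and 25 s for the response body
    r = requests.get("https://cn.bing.com/search",
                     params={'q': 'python'}, timeout=(5, 25))
    r.raise_for_status()
    print(r.status_code)
except requests.exceptions.Timeout:
    print("request timed out")
except requests.exceptions.RequestException as e:
    print(str(e))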
Dangdang book page
import requests

try:
    url = "https://product.dangdang.com/29526048.html"
    kv = {'User-Agent': 'Mozilla/5.0'}  # build the headers: without a browser-like User-Agent the site rejects the request
    r = requests.get(url, headers=kv, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[:1000])
except IOError as e:
    print(str(e))
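When several requests go to the same site, a requests.Session can carry the headers once instead of repeating them on every call, and it also reuses the underlying connection. A small sketch of the same fetch through a Session:

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})  # applied to every request from this session

try:
    r = session.get("https://product.dangdang.com/29526048.html", timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[:1000])
except requests.exceptions.RequestException as e:
    print(str(e))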
Downloading an image from a website
import requests
import os

url = "https://alifei02.cfp.cn/creative/vcg/800/new/VCG41N1397194266.jpg"
save_dir = "D:/ai绘画/test/"  # 'dir' would shadow the built-in, so call it save_dir
path = save_dir + url.split('/')[-1]  # save under the original file name taken from the URL
try:
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not os.path.exists(path):
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        with open(path, "wb") as f:  # r.content holds the raw image bytes
            f.write(r.content)  # the with block closes the file; no explicit close() needed
        print("File saved")
    else:
        print("File already exists")
except IOError as e:
    print(str(e))
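For large files, r.content pulls the whole download into memory first. A streamed variant writes the file in chunks instead; a sketch with an assumed 8 KB chunk size:

import os
import requests

url = "https://alifei02.cfp.cn/creative/vcg/800/new/VCG41N1397194266.jpg"
path = os.path.join("D:/ai绘画/test", url.split('/')[-1])
os.makedirs(os.path.dirname(path), exist_ok=True)

try:
    # stream=True defers the body download; iter_content yields it piecewise
    with requests.get(url, stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print("File saved")
except requests.exceptions.RequestException as e:
    print(str(e))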
First attempt at a POST request
import requests

def baidutranslate(words=None):
    url = "https://fanyi.baidu.com/sug"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
    }
    data = {
        "kw": words
    }
    res = requests.post(url, headers=header, data=data)  # data= sends a form-encoded body
    content = res.json()  # parse the JSON response (no need to import json for this)
    print(content['data'][0]['v'])  # translation of the first suggestion

if __name__ == "__main__":
    st = input("Enter a word to translate: ")
    baidutranslate(st)
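For APIs that expect a JSON body rather than a form, requests.post takes a json= argument that serializes the payload and sets the Content-Type header by itself. A sketch against the public echo service httpbin.org (not the Baidu endpoint above):

import requests

payload = {"kw": "hello"}
# json= serializes payload and sends Content-Type: application/json;
# httpbin echoes the parsed body back under the "json" key
r = requests.post("https://httpbin.org/post", json=payload, timeout=30)
r.raise_for_status()
print(r.json()["json"])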
Scraping a web novel (it feels too slow)
import requests
import time
import random
from bs4 import BeautifulSoup

def get_novel_chapters(url):  # collect every chapter link and chapter title
    res = requests.get(url)
    res.encoding = 'utf-8'
    data = []
    soup = BeautifulSoup(res.text, 'html.parser')
    for dd in soup.find_all("dd"):
        link = dd.find("a")
        if not link:
            continue
        # chapter hrefs are relative, so join them onto the book's root URL
        data.append(("{}{}".format(url, link['href']), link.get_text()))
    return data

def get_chapters_content(url):  # fetch the body text of one chapter
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    contentbox = soup.find('div', id="content")
    return contentbox.get_text(separator="\n").replace('\xa0', ' ')

rooturl = input("Enter the novel's URL: ")
name = input("Enter the book title: ")
novel_chapters = get_novel_chapters(rooturl)
total_chapters = len(novel_chapters)
now_chapter = 0
for chapter in novel_chapters:
    now_chapter += 1
    url, title = chapter
    # utf-8 instead of gbk with errors='ignore', which silently drops characters
    with open("C:\\Users\\蔡汉林\\Desktop\\novels\\{}.txt".format(name), 'a', encoding='utf-8') as fout:
        fout.write("{}\n".format(title) + get_chapters_content(url) + '\n')
    if now_chapter % 10 == 0:  # report progress and sleep every 10 chapters
        print(now_chapter, total_chapters)
        time.sleep(random.random() * 3)  # random pause to go easy on the server
print("Scraping finished o(* ̄︶ ̄*)o")
