Web Crawling
Introduction to Crawlers
1 Crawler workflow
Simulate an HTTP request -----> parse the data (clean it) ----> store it
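A minimal sketch of that three-step flow, assuming an arbitrary example page and output file (the URL, the fields extracted, and the filename are all just illustrative choices):

# A minimal fetch -> parse -> store pipeline (illustrative only)
import requests
from bs4 import BeautifulSoup

res = requests.get('https://www.cnblogs.com/')            # 1. simulate an HTTP request
soup = BeautifulSoup(res.text, 'html.parser')             # 2. parse / clean the data
titles = [a.text.strip() for a in soup.find_all('a')]     #    extract whatever fields you need
with open('titles.txt', 'w', encoding='utf-8') as f:      # 3. store it (a file here; could be MySQL, Redis, ...)
    f.write('\n'.join(titles))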
2 Baidu and Google are themselves crawlers
- Baidu search: you type a keyword ---> it searches Baidu's own database ---> results appear on the page ---> clicking a result ---> jumps to some page elsewhere on the internet
- SEO: get Baidu to crawl your site, or submit it to Baidu yourself
- SEM: pay for ads on keywords
3 Robots protocol (robots.txt)
- declares which parts of a site may be crawled and which may not
- https://www.csdn.net/robots.txt
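A small sketch of checking a robots.txt programmatically with the standard library's urllib.robotparser; the URL and the path being tested below are just examples:

# Check whether a path may be fetched according to robots.txt (illustrative example)
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url('https://www.csdn.net/robots.txt')
rp.read()
print(rp.can_fetch('*', 'https://www.csdn.net/some/article'))  # True/False per the site's rules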
4 Crawling-related pieces in Python
- Simulate HTTP requests (requests, selenium) -----> parse the data (clean it) (json, bs4, ...) ----> store it (files, MySQL, Redis, Excel, MongoDB)
- Anti-crawling measures and how to counter them (a proxy-pool sketch follows this list):
  - IP bans: rotate through a proxy pool
  - Account bans: rotate through a cookie pool
  - Special validation fields in request headers: figure out which fields are checked and reproduce them
  - Encrypted payloads: read the site's JS to find the encryption scheme and assemble the data yourself
  - HTML-level tricks: CSS obfuscation, font obfuscation
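A minimal sketch of countering IP bans by rotating proxies with requests. The proxy addresses below are made-up placeholders; a real pool would come from a paid provider or a project such as https://github.com/jhao104/proxy_pool:

# Rotate requests across a small pool of proxies (addresses are hypothetical placeholders)
import random
import requests

proxy_pool = [
    'http://10.0.0.1:3128',   # hypothetical proxy
    'http://10.0.0.2:3128',   # hypothetical proxy
]

def fetch(url):
    proxy = random.choice(proxy_pool)
    # route both http and https traffic through the chosen proxy
    return requests.get(url, proxies={'http': proxy, 'https': proxy}, timeout=5)

# print(fetch('https://httpbin.org/ip').text)  # shows which exit IP the server saw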
Introduction to the requests library
The requests module is a wrapper around urllib3 that makes sending HTTP requests convenient.
pip3 install requests
# Various request methods: the most common are requests.get() and requests.post()
>>> import requests
>>> r = requests.get('https://api.github.com/events')
>>> r = requests.post('http://httpbin.org/post', data = {'key':'value'})
>>> r = requests.put('http://httpbin.org/put', data = {'key':'value'})
>>> r = requests.delete('http://httpbin.org/delete')
>>> r = requests.head('http://httpbin.org/get')
>>> r = requests.options('http://httpbin.org/get')
Sending GET requests
Add request headers: pass a dict via the headers keyword argument
Add extra query parameters to a GET request: pass a dict via the params keyword argument
Carry cookies in the request when the site requires them
res = requests.get('https://www.cnblogs.com/xiaoyuanqujing/articles/11805698.html')
print(res.text)  # the response body
search = input('Enter the search term: ')
res = requests.get('https://www.baidu.com/s?wd=' + search,
                   headers={
                       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36',
                       'Host': 'www.baidu.com',
                   })
print(res.text)
with open('search.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
res = requests.get('https://www.baidu.com/s?', headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
    'Referer': 'https://www.baidu.com/s?wd=python'
}, params={'wd': 'python'})
# URL encoding and decoding
from urllib.parse import quote, unquote
# res = quote('美女')
# print(res)  # %E7%BE%8E%E5%A5%B3
res=unquote('%E7%BE%8E%E5%A5%B3')
print(res)
from urllib.parse import urlencode
res=urlencode({'wd':'美女','age':19},encoding='utf-8')
print(res)
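For context, urlencode builds exactly the query string that the params argument produces under the hood; a minimal sketch of composing a full URL from it (the parameters are arbitrary examples):

# Build a complete search URL from a dict of parameters (sketch)
from urllib.parse import urlencode

params = {'wd': '美女', 'pn': 10}
url = 'https://www.baidu.com/s?' + urlencode(params)
print(url)  # e.g. https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3&pn=10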
Sending POST requests
# Automatically log into a website
res = requests.post('http://www.aa7a.cn/user.php', data={
    'username': '9@qq.com',
    'password': '',
    'captcha': 'zxv7',
    'remember': 1,
    'ref': 'http://www.aa7a.cn/',
    'act': 'act_login'
})
# print(res.text)

## Grab the cookie set by the successful login
cookie = res.cookies  # a CookieJar object
print(cookie)
res2 = requests.get('http://www.aa7a.cn/', cookies=cookie)
# res2 = requests.get('http://www.aa7a.cn/')
print('616564099@qq.com' in res2.text)

# Carrying data in the request body
# res = requests.post('', data={})                      # urlencoded form data
# res = requests.post('', json='json-format string')    # application/json
# res = requests.post('', json='', headers={
#     'content-type': 'application/json;charset=utf-8'
# })
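A runnable sketch of the json= form above, using httpbin.org as a neutral echo endpoint (the target URL and payload are just examples):

# POST a JSON body; requests sets Content-Type: application/json automatically
import requests

res = requests.post('http://httpbin.org/post', json={'key': 'value'})
print(res.json()['json'])                    # httpbin echoes the parsed JSON body back
print(res.request.headers['Content-Type'])   # application/json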
Automatic cookie handling
#### Use requests.session() to handle cookies automatically
import requests
session=requests.session()
### Handling the cookie yourself
# res = requests.post('http://www.aa7a.cn/user.php', data={
#     'username': '69@qq.com',
#     'password': '1',
#     'captcha': 'zxv7',
#     'remember': 1,
#     'ref': 'http://www.aa7a.cn/',
#     'act': 'act_login'
# })
# cookie = res.cookies
# res2 = requests.get('http://www.aa7a.cn/', cookies=cookie)
# print('616564099@qq.com' in res2.text)

### Letting the session handle the cookie automatically
# res = session.post('http://www.aa7a.cn/user.php', data={
#     'username': '9@qq.com',
#     'password': '3',
#     'captcha': 'zxv7',
#     'remember': 1,
#     'ref': 'http://www.aa7a.cn/',
#     'act': 'act_login'
# })
res2=session.get('http://www.aa7a.cn/')
print('616564099@qq.com' in res2.text)
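A minimal standalone demonstration that a Session remembers cookies between requests; httpbin.org is used here only as an example server with cookie endpoints:

# The session stores cookies set by the first response and sends them on the next request
import requests

session = requests.session()
session.get('http://httpbin.org/cookies/set/token/abc123')   # server sets a cookie
print(session.get('http://httpbin.org/cookies').json())      # the cookie is sent back automatically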
Response object attributes
# print(response.text)                 # response body as a string
# print(response.content)              # response body as bytes (images, video, pages)
# print(response.status_code)          # status code
# print(response.headers)              # response headers
# print(response.cookies)              # cookies returned by the server
# print(response.cookies.get_dict())   # CookieJar object converted to a dict
# print(response.cookies.items())      # like a dict's items()
# print(response.url)                  # URL of this request
# print(response.history)              # only populated after a redirect
# print(response.encoding)             # encoding of the response

# Closing the response: response.close()
# from contextlib import closing
# with closing(requests.get('xxx', stream=True)) as response:
#     for line in response.iter_content():
#         pass
# Encoding issues:
# response.text may print as mojibake even though the page looks fine in the browser
response = requests.get('http://www.aa7a.cn/')
response.encoding = 'gbk'  # specify the encoding directly
# response.encoding = response.apparent_encoding  # use the encoding detected from the page
print(response.text)

# Fetching binary content
res = requests.get('http://www.aa7a.cn/data/afficheimg/20201102gophex.png')
print(res.content)
with open('致命诱惑.png', 'wb') as f:
    f.write(res.content)

# Or write it in chunks
with open('致命诱惑.png', 'wb') as f:
    for line in res.iter_content(1024):
        f.write(line)

# Parsing JSON
import json
res=requests.get('https://api.luffycity.com/api/v1/course/category/actual/?courseType=actual')
print(json.loads(res.text))
print(res.json())
Crawling videos
Crawling videos from haokan.baidu.com
# https://haokan.baidu.com/tab/zongyi_new
# https://haokan.baidu.com/web/video/feed?tab=zongyi_new&act=pcFeed&pd=pc&num=20&shuaxin_id=1628075696379
import re
import requests
import json

res = requests.get('https://haokan.baidu.com/web/video/feed?tab=zongyi_new&act=pcFeed&pd=pc&num=20&shuaxin_id=1628075696379')
# print(type(json.loads(res.text)))
# print(json.loads(res.text).get('data').get('response').get('videos'))
videos_list = json.loads(res.text).get('data').get('response').get('videos')
for videos_obj in videos_list:
    print(videos_obj.get('play_url'))
    res1 = requests.get(videos_obj.get('play_url'))
    name = videos_obj.get('play_url').split('/')[-2]
    with open(f'{name}.mp4', 'wb') as f:
        for line in res1.iter_content(1024):
            f.write(line)
# Analysis notes
# Referer: the previously visited URL; sites use it for image/video hotlink protection
header = {
    'Referer': 'https://www.pearvideo.com/video_1737590'
}
res = requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=1737590&mrd=0.5165499193941832', headers=header)
print(res.text)
Crawling with a thread pool
from concurrent.futures import ThreadPoolExecutor
import requests
import json


def task(url):
    name = url.split('/')[-2]
    print(f'{name}.mp4 started')
    res = requests.get(url)
    with open(f'{name}.mp4', 'wb') as f:
        for line in res.iter_content(1024):
            f.write(line)
    print(f'{name}.mp4 finished')


if __name__ == '__main__':
    res = requests.get(
        'https://haokan.baidu.com/web/video/feed?tab=zongyi_new&act=pcFeed&pd=pc&num=20&shuaxin_id=1628075696379')
    videos_list = json.loads(res.text).get('data').get('response').get('videos')
    pool_p = ThreadPoolExecutor(5)
    for videos_obj in videos_list:
        play_url = videos_obj.get('play_url')
        pool_p.submit(task, url=play_url)
    pool_p.shutdown(wait=True)
Crawling Pear Video (pearvideo.com)
# Crawl the videos
# https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=5&start=0
# import re
# res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=5&start=0')
#
# # print(res.text)
# # This would be much simpler with bs4
#
# video_list = re.findall('<a href="(.*?)" class="vervideo-lilink actplay">', res.text)
# # print(video_list)
# for video in video_list:
#     video_url = 'https://www.pearvideo.com/' + video
#     # print(video_url)
#     video_id = video.split('_')[-1]
#
#     header = {
#         'Referer': video_url
#     }
#
#     res2 = requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=%s&mrd=0.5165499193941832' % video_id, headers=header)
#
#     video_f_url = res2.json()['videoInfo']['videos']['srcUrl']
#     video_real_url = video_f_url.replace(video_f_url.rsplit('/')[-1].split('-')[0], 'cont-%s' % video_id)
#     print(video_real_url)
#
#     res3 = requests.get(video_real_url)
#     with open('%s.mp4' % video_id, 'wb') as f:
#         for line in res3.iter_content(1024):
#             f.write(line)

# Analysis notes
# Referer: the previously visited URL; sites use it for hotlink protection
# header = {
#     'Referer': 'https://www.pearvideo.com/video_1737590'
# }
#
# res = requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=1737590&mrd=0.5165499193941832', headers=header)
# print(res.text)

## A URL that plays
# 'https://video.pearvideo.com/mp4/short/20210729/cont-1736870-15732687-hd.mp4'
## A URL that does not play
# 'https://video.pearvideo.com/mp4/short/20210729/1628062847275-15732687-hd.mp4'
#
# The only difference is the first segment of the file name:
# 'https://video.pearvideo.com/mp4/short/20210729/ cont-1736870 -15732687-hd.mp4'
# 'https://video.pearvideo.com/mp4/short/20210729/ 1628062847275 -15732687-hd.mp4'
#
# s = 'https://video.pearvideo.com/mp4/short/20210729/ 1628062847275 -15732687-hd.mp4'
# s.replace(s.rsplit('/')[-1].split('-')[0], 'cont-%s' % video_id)
Advanced requests usage
1 SSL certificate verification (for reference)
2 Using a proxy
3 Timeouts
4 Authentication (like logging into an old-style router)
5 Exception handling
6 File upload

1 SSL Cert Verification (for reference)
## Skip certificate verification
# import requests
# response = requests.get('https://www.12306.cn', verify=False)  # skips verification; logs a warning, returns 200
# print(response.status_code)

## Carry a certificate
# import requests
# response = requests.get('https://www.12306.cn',
#                         cert=('/path/server.crt',
#                               '/path/key'))
# print(response.status_code)

2 Using a proxy
# import requests
# proxies = {
#     'http': 'http://117.69.230.132:3256',
# }
# response = requests.get('https://www.12306.cn', proxies=proxies)
# print(response.status_code)
# Getting proxies: pay for them, find free ones, or build your own pool with a third-party project
# https://github.com/jhao104/proxy_pool

3 Timeouts
# response = requests.get('https://www.baidu.com', timeout=0.0001)

4 Authentication (like logging into an old-style router)
# import requests
# from requests.auth import HTTPBasicAuth
# r = requests.get('xxx', auth=HTTPBasicAuth('user', 'password'))
# print(r.status_code)

5 Exception handling
# import requests
# from requests.exceptions import *  # see requests.exceptions for the exception types
# try:
#     r = requests.get('http://www.baidu.com', timeout=0.00001)
# # except ReadTimeout:
# #     print('===:')
# # except ConnectionError:  # network unreachable
# #     print('-----')
# # except Timeout:
# #     print('aaaaa')
# except Exception:
#     print('Error')

6 File upload
# import requests
# files = {'myfile': open('1 自动处理cookie.py', 'rb')}
# response = requests.post('http://127.0.0.1:8000/upload_file/', files=files)
# print(response.text)
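Building on items 3 and 5 above, a small sketch of retrying a flaky request on timeout or connection errors; the retry count, timeout, and URL are arbitrary choices, not part of the original notes:

# Retry a request a few times on timeout/connection errors (illustrative sketch)
import requests
from requests.exceptions import RequestException

def get_with_retry(url, retries=3, timeout=3):
    for attempt in range(1, retries + 1):
        try:
            return requests.get(url, timeout=timeout)
        except RequestException as e:
            print(f'attempt {attempt} failed: {e}')
    return None

# r = get_with_retry('https://www.baidu.com')
# print(r.status_code if r is not None else 'all attempts failed')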
Auto-upvoting on Chouti (dig.chouti.com)
import requests

header = {
    'Cookie': '',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
}
# res = requests.post('https://dig.chouti.com/link/vote', data={'linkId': '31857081'}, headers=header)
# print(res.text)

# Upvote everything ----> parse out the ids ---> bs4 module (parses xml/html)
res = requests.get('https://dig.chouti.com/top/24hr?_=1628136305346', headers=header).json()
for item in res['data']:
    id = item['id']
    res = requests.post('https://dig.chouti.com/link/vote', data={'linkId': '%s' % id}, headers=header)
    print(res.text)
Crawling Autohome news
# pip3 install beautifulsoup4
import requests
from bs4 import BeautifulSoup

for i in range(1, 100):
    res = requests.get('https://www.autohome.com.cn/news/%s/#liststart' % i)
    # print(res.text)
    # First argument: the content to parse; second argument: the parser to use
    # html.parser is bs4's built-in parser; lxml is an alternative (pip3 install lxml)
    soup = BeautifulSoup(res.text, 'html.parser')
    # soup = BeautifulSoup(res.text, 'lxml')
    # find_all finds every match
    ul_list = soup.find_all(name='ul', class_='article')
    # ul_list = soup.find_all(name='ul')
    # print(len(ul_list))
    for ul in ul_list:
        li_list = ul.find_all(name='li')
        for li in li_list:
            h3 = li.find(name='h3')
            if h3:
                title = h3.text  # get a tag's text content with tag.text
                # print(title)
                desc = li.find(name='p').text
                # print(desc)
                img_url = li.find(name='img')['src']
                if not img_url.startswith('http'):
                    img_url = 'https:' + img_url
                # print(img_url)
                url = 'https:' + li.find(name='a')['href']
                print(url)
                print('''
                News title:   %s
                News summary: %s
                News image:   %s
                News URL:     %s
                ''' % (title, desc, img_url, url))
bs4: traversing the document tree
Prettify: soup.prettify()
Traverse with .tag_name, e.g. soup.html.head.title
Get a tag's name: tag.name
Get a tag's attributes: tag['attribute_name']
Get a tag's content: tag.text
    tag.string (only works when the tag contains exactly one text node)
    tag.strings (yields each text node under the tag, as a generator)
Nested selection: soup.html.body.p
Child nodes: soup.p.contents
Descendant nodes: soup.p.descendants
Parent node: soup.a.parent
Ancestor nodes: soup.a.parents
Sibling nodes:
    previous: soup.a.previous_sibling
    next: soup.a.next_sibling
    all previous (generator): soup.a.previous_siblings
    all next (generator): soup.a.next_siblings

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story <span>111</span></b><span>111</span></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

# res = soup.prettify()  # prettify the markup
# print(res)

# 1. Basic traversal
# html = soup.html
# title = soup.html.head.title
# title = soup.title
# print(title)

# 2. Get a tag's name ---> tag.name
# a = soup.body.a
# a = soup.a.name
# print(a)
# print(soup.body.name)

# 3. Get a tag's attributes ----> tag['attribute_name']
# href = soup.body.a['href']
# attrs = soup.body.a.attrs  # all attributes ---> dict
# href = soup.body.a.attrs['href']
# print(attrs['class'])
# c = soup.p.attrs['class']
# print(c)

# 4. Get a tag's content
# res = soup.b.text    # text of the tag and all its descendants
# res = soup.p.text
# res = soup.p.string  # only works when the tag has exactly one text node
# res = soup.b.string
# res = soup.p.strings # all descendant text nodes, as a generator
# print(list(res))

# 5. Nested selection
# res = soup.html.body.p
# print(type(res))  # bs4.element.Tag
from bs4.element import Tag

#### For reference
# 6. Child and descendant nodes
# print(soup.p.contents)     # all children of p, in a list
# print(soup.p.children)     # an iterator over all children of p
# for i, child in enumerate(soup.p.children):
#     print(i, child)
# print(soup.p.descendants)  # all descendants of p: every tag below it
# for i, child in enumerate(soup.p.descendants):
#     print(i, child)

# 7. Parent and ancestor nodes
# print(soup.a.parent)       # parent of the a tag
# print(soup.body.parent)
# print(soup.a.parents)      # all ancestors of the a tag: parent, parent's parent, ...
# print(list(soup.a.parents))
# print(len(list(soup.a.parents)))

# 8. Sibling nodes
# print(soup.a.next_sibling)            # next sibling
# print(soup.a.previous_sibling)        # previous sibling
# print(list(soup.a.next_siblings))     # all following siblings => generator
# print(list(soup.a.previous_siblings)) # all preceding siblings => generator
bs4: searching the document tree
Search with the two methods find and find_all
Five kinds of filters: string, regular expression, list, method, boolean (True)
Recommendation: mix traversing the tree with searching it
Other parameters: limit (cap the number of results), recursive (whether to search recursively)

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body id='body'>
<p class="title"><b>The Dormouse's story <span>111</span></b><span>egon</span></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

# Searching the document tree: find, find_all
# Five kinds of filters: string, regular expression, list, True, method

##### String
# res = soup.find(name='body')
# res = soup.find(name='p', class_='story')
# Find the tag with id link2
# res = soup.find(id='link2', name='a', class_='sister', href='http://example.com/lacie')
# res = soup.find(href='http://example.com/lacie')
# print(res)
# res = soup.find(attrs={'class': ['sister']})
# print(res)

#### Regular expression
import re
# res = soup.find_all(name=re.compile('^b'))  # tags starting with b: body and b
# res = soup.find(name=re.compile('^b'))
# res = soup.find_all(class_=re.compile('^s'))
# res = soup.find_all(href=re.compile('^http'))
# res = soup.find_all(id=re.compile('^l'))
# print(res)

#### List
# res = soup.find_all(name=['body', 'b'])
# res = soup.find_all(id=['link1', 'link2'])
# res = soup.find_all(attrs={'id': ['link1', 'link2']})
# print(res)

#### True
# links = soup.find_all(href=True)
# print(links)
# res = soup.find_all(name=True)
# res = soup.find_all(id=True)
# print(res)

#### Method
# def has_class_but_no_id(tag):
#     return tag.has_attr('class') and not tag.has_attr('id')
#
# print(len(soup.find_all(name=has_class_but_no_id)))

# Grab every image on the current page (note: img tags actually carry src, not href)
soup.find_all(name='img', href=True)

## Recommendation: mix traversing and searching
# soup.body.div.find

### Other parameters of find / find_all
# limit
# res = soup.find_all(name='a', href=True, limit=2)  # cap the number of results
# print(res)
# recursive: whether to search recursively
# res = soup.find_all(name='a', recursive=False)
# res = soup.find_all(name='html', recursive=False)
# print(res)
CSS selectors
res = soup.p.select('.sister')

Selector syntax:
    #id
    .classname
    tag
    tag>tag    (direct child)
    tag tag    (descendant)

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
    <b>The Dormouse's story <p>asdfasdf</p></b>
    Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">
        <span>Elsie</span>
    </a>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    <div class='panel-1'>
        <ul class='list' id='list-1'>
            <li class='element'>Foo</li>
            <li class='element'>Bar</li>
            <li class='element'>Jay</li>
        </ul>
        <ul class='list list-small' id='list-2'>
            <li class='element'><h1 class='yyyy'>Foo</h1></li>
            <li class='element xxx'>Bar</li>
            <li class='element'>Jay</li>
        </ul>
    </div>
    and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

'''
Selector syntax:
    #id
    .classname
    tag
    tag>tag
    tag tag
'''
# res = soup.p.select('.sister')   # CSS selector
# res = soup.p.select('#link1')    # CSS selector
# res = soup.select('body>p')      # CSS selector: p tags that are direct children of body
res = soup.select('body p')        # CSS selector: p tags anywhere under body
print(len(res))

### CSS selectors are universal: bs4 and lxml-based parsing both support them
## What if you can't write the CSS selector yourself? Copy it from the browser's dev tools:
'#maincontent > div:nth-child(3) > table > tbody > tr:nth-child(13) > td:nth-child(3)'
## Or copy the XPath instead:
'//*[@id="maincontent"]/div[2]/table/tbody/tr[18]/td[2]'
Using selenium
# If you use the requests module alone, the data you get back can be incomplete because it cannot execute JS
# selenium: drive a real browser from code, simulating a human user

## To drive a particular browser you need that browser's driver
# http://npm.taobao.org/mirrors/chromedriver/   Taobao mirror of the Chrome driver
# The Chrome version must match the driver version
## e.g. Chrome 92.0.4515.131: download the matching driver and put it next to the project code
# pip3 install selenium

# from selenium import webdriver
# import time
# # Open a Chrome browser
# bro = webdriver.Chrome(executable_path='chromedriver.exe')
#
# # Type an address into the address bar
# bro.get('https://www.cnblogs.com/')
#
# time.sleep(2)
#
# print(bro.page_source)  # the HTML of the current page
#
# bro.close()  # close the browser

# import requests
#
# res = requests.get('https://dig.chouti.com/', headers={
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
# })
# print(res.text)
Basic usage
from selenium import webdriver
import time

# Browser object
bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.implicitly_wait(10)  # implicit wait: element lookups wait up to 10s for the element to appear
bro.get('https://www.baidu.com/')

# sub_button = bro.find_element_by_css_selector('#s-top-loginbtn')
sub_button = bro.find_element_by_id('s-top-loginbtn')  # if the element has an id, prefer it
# Click it
sub_button.click()

# Find the "log in with username and password" button
user_btn = bro.find_element_by_xpath('//*[@id="TANGRAM__PSP_11__footerULoginBtn"]')
# user_btn = bro.find_element_by_id('TANGRAM__PSP_11__footerULoginBtn')
user_btn.click()

username = bro.find_element_by_id('TANGRAM__PSP_11__userName')
password = bro.find_element_by_id('TANGRAM__PSP_11__password')
# Type into the input boxes
username.send_keys('666@qq.com')
password.send_keys('45')
sumbit_btn = bro.find_element_by_id('TANGRAM__PSP_11__submit')
time.sleep(3)
sumbit_btn.click()
time.sleep(3)
bro.close()
Headless browser
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000')              # browser resolution
chrome_options.add_argument('--disable-gpu')                      # works around a bug mentioned in the Chrome docs
chrome_options.add_argument('--hide-scrollbars')                  # hide scrollbars, for some special pages
chrome_options.add_argument('blink-settings=imagesEnabled=false') # don't load images, for speed
chrome_options.add_argument('--headless')                         # no visible window; required on Linux without a display

driver = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=chrome_options)
driver.get('https://www.baidu.com')
print(driver.page_source)
driver.close()
Getting an element's position, attributes, and size
from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://kyfw.12306.cn/otn/resources/login.html')
driver.implicitly_wait(10)

user_login = driver.find_element_by_css_selector('.login-hd-account>a')
user_login.click()
time.sleep(2)
img = driver.find_element_by_id('J-loginImg')
print(img)
print(img.id)        # selenium's internal id, ignore it
print(img.tag_name)  # tag name
print('-----')
print(img.location)  # position of the img tag
print(img.size)      # size of the img tag
# Get attributes
# print(img.get_attribute('src'))
print(img.get_attribute('class'))
driver.close()
Waiting for elements to load
from selenium import webdriver

# Two kinds of waits
# Explicit wait
# Implicit wait: one line that applies to every element lookup

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')

'''
# Two kinds of waits
# Explicit wait (can be skipped)
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    contents = browser.find_element(By.CSS_SELECTOR, '#content_left')
# Implicit wait:
    - driver.implicitly_wait(10)
    - driver.find_element_by_css_selector()
    - one line that applies to every element lookup
'''
driver.implicitly_wait(10)
print(driver.page_source)
# Subsequent lookups wait for the element to load, up to 10s
driver.close()
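For completeness, a runnable sketch of the explicit-wait style quoted in the docstring above; the element id 'kw' (Baidu's search box) is just an example target:

# Explicit wait: block until one specific element appears (sketch)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')
wait = WebDriverWait(driver, 10)                                  # wait up to 10 seconds
elem = wait.until(EC.presence_of_element_located((By.ID, 'kw')))  # the search input on Baidu's home page
print(elem.tag_name)
driver.close()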
Element operations
from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')
driver.implicitly_wait(10)

## Click, clear, and type
input_search = driver.find_element_by_id('kw')
input_search.send_keys('美女')  # type
time.sleep(3)
input_search.clear()            # clear
time.sleep(2)
input_search.send_keys('性感美女')
time.sleep(2)
btn = driver.find_element_by_id('su')
btn.click()                     # click
time.sleep(10)
driver.close()
Executing JavaScript
from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('http://127.0.0.1:8000/')
driver.implicitly_wait(10)

driver.execute_script("name='egon';")  # write JS code here
driver.execute_script("alert(name)")   # write JS code here
time.sleep(5)
# driver.close()
Switching tabs
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')
print(browser.window_handles)  # all tab handles
# browser.switch_to_window(browser.window_handles[1])  # older, deprecated spelling
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(5)
# browser.switch_to_window(browser.window_handles[0])
browser.switch_to.window(browser.window_handles[0])
browser.get('https://www.sina.com.cn')
browser.close()
Simulating forward and back
import time
from selenium import webdriver

browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.get('https://www.baidu.com')
browser.get('https://www.taobao.com')
browser.get('http://www.sina.com.cn/')
browser.back()
time.sleep(3)
browser.forward()
browser.close()
Exception handling
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, NoSuchFrameException

browser = webdriver.Chrome()
try:
    browser.get('http://www.baidu.com')
except Exception as e:
    print(e)
finally:
    browser.close()
Logging into cnblogs with selenium to get cookies
# Log into cnblogs with selenium and save the cookies
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, NoSuchFrameException
import time
import json

browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.implicitly_wait(10)

#### Login flow
# try:
#     browser.get('http://www.cnblogs.com')
#     submit_btn = browser.find_element_by_link_text('登录')  # matches the a tag's text
#     submit_btn.click()
#
#     username = browser.find_element_by_id('mat-input-0')
#     password = browser.find_element_by_id('mat-input-1')
#     username.send_keys('9@qq.com')
#     password.send_keys('1111')
#     input('wait')
#     sub_btn = browser.find_element_by_css_selector('body > app-root > mat-sidenav-container > mat-sidenav-content > div > div > app-sign-in > app-content-container > div > div > div > form > div > button > span.mat-button-wrapper')
#     sub_btn.click()
#
#     # Human in the loop: solve the slider captcha manually
#     input('wait')
#
#     # Grab the cookies set after logging in
#     print(browser.get_cookies())
#
#     with open('cookie.json', 'w') as f:
#         json.dump(browser.get_cookies(), f)
#
# except Exception as e:
#     print(e)
# finally:
#     browser.close()

### Skip logging in next time: load the saved cookies into the browser
# browser.get('http://www.cnblogs.com')
# with open('cookie.json', 'r') as f:
#     cookie = json.load(f)
# time.sleep(5)
# for item in cookie:  # add_cookie takes a dict; the JSON file holds a list, so loop over it
#     browser.add_cookie(item)
#
# browser.refresh()  # reload the page
# time.sleep(5)
# browser.close()
Semi-automatic upvoting on Chouti
from selenium import webdriver
import json
import time

#### Login flow
# bro = webdriver.Chrome(executable_path='chromedriver.exe')
# bro.implicitly_wait(10)
# bro.get('https://dig.chouti.com/')
# try:
#     sub_btn = bro.find_element_by_id('login_btn')
#     print(sub_btn)
#
#     # sub_btn.click()  # raises an error; click it via JS instead
#     bro.execute_script('arguments[0].click();', sub_btn)
#
#     # username = bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-body > div.form-item.login-item.clearfix.phone-item.mt24 > div.input-item.input-item-short.left.clearfix > input')
#     username = bro.find_element_by_css_selector('div.input-item>input.login-phone')
#     username.send_keys('675221')
#     # password = bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div.form-item.login-item.clearfix.mt24 > div')
#     password = bro.find_element_by_css_selector('div.input-item>input.pwd-password-input')
#     password.send_keys('111')
#
#     time.sleep(3)
#     btn = bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div:nth-child(4) > button')
#     btn.click()
#
#     input('wait')
#
#     with open('chouti.json', 'w') as f:
#         json.dump(bro.get_cookies(), f)
#
# finally:
#     bro.close()

import requests

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.implicitly_wait(10)
bro.get('https://dig.chouti.com/')
# Scroll to the bottom of the page
bro.execute_script('window.scrollTo(0, document.body.scrollHeight);')
# bro.find_elements_by_css_selector('.link-item')

cookie = {}
## Read the cookies back from the file
with open('chouti.json', 'r') as f:
    res = json.load(f)
for item in res:
    cookie[item['name']] = item['value']
print(cookie)  # a cookie dict that requests can use

div = bro.find_element_by_class_name('link-con')
time.sleep(2)
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
div_list = div.find_elements_by_class_name('link-item')
for div in div_list:
    article_id = div.get_attribute('data-id')
    print(article_id)
    # Upvote via requests
    res = requests.post('https://dig.chouti.com/link/vote', data={'linkId': article_id}, cookies=cookie, headers=header)
    print(res.text)
bro.close()
Using a captcha-solving platform
# Manual solving
# Image-recognition modules ---> work for simple digit/letter captchas
# Captcha-solving platforms ---> e.g. 云打码, 超级鹰: send them an image ---> they return the answer (paid services)
#!/usr/bin/env python
# coding:utf-8
import requests
from hashlib import md5


class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: image ID of a misrecognized captcha
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()


if __name__ == '__main__':
    chaojiying = Chaojiying_Client('111', '111', '111')  # username, password, software ID (generated in the user center)
    im = open('a.jpg', 'rb').read()                      # path to a local image file; Windows paths may need //
    print(chaojiying.PostPic(im, 1902))                  # 1902 is the captcha type, see the site's pricing page
Using XPath
1 XPath is a language for locating data in HTML/XML
2 Syntax worth remembering:
    /    select from the current path
    //   select from anywhere below
    .    current node
    ..   parent node
    @    select an attribute
3 The lxml parsing module provides xpath

doc = '''
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html' name='sss'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html' name='111'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
   <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
  </div>
 </body>
</html>
'''
from lxml import etree

# Pass in the content to parse
html = etree.HTML(doc)
# res = html.xpath('//body')
# print(res)

# 1 All nodes
# a = html.xpath('//*')
# 2 A specific node (result is a list)
# a = html.xpath('//head')
# 3 Children and descendants
# a = html.xpath('//div/a')
# a = html.xpath('//body//a')  # no data
# a = html.xpath('//body//a')
# 4 Parent
# a = html.xpath('//body//a[@href="image1.html"]/..')
# a = html.xpath('//body//a')
# a = html.xpath('//body//a[@href="image1.html"]')
# a = html.xpath('//body//a[1]/..')
# Also works:
# a = html.xpath('//body//a[1]/parent::*')
# a = html.xpath('//body//a[1]/parent::p')
# 5 Attribute matching
# a = html.xpath('//a[@href="image1.html"]')
# a = html.xpath('//a[@name="sss"]')
# 6 Getting text: text()
# a = html.xpath('//a[@href="image1.html"]/text()')
# a = html.xpath('//a/text()')
# 7 Getting attributes
# a = html.xpath('//a/@href')
# a = html.xpath('//a[1]/@name')
# Note: indexing starts at 1, not 0
# a = html.xpath('//body//a[2]/@href')
# 8 Multi-valued attributes
# When an a tag has several classes, an exact match no longer works; use contains
# a = html.xpath('//a[@class="li"]')
# a = html.xpath('//a[contains(@class,"li")]')
# a = html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 Matching on several attributes
# a = html.xpath('//body//a[contains(@class,"li") or @name="items"]')
# a = html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
# a = html.xpath('//body//a[contains(@class,"li")]/text()')
# 10 Selecting by position
# a = html.xpath('//a[2]/text()')
# a = html.xpath('//a[2]/@href')
# a = html.xpath('//a[2]/@name')
# The last one
# a = html.xpath('//a[last()]/@href')
# Position less than 3
# a = html.xpath('//a[position()<3]/@href')
# Third from the end
# a = html.xpath('//a[last()-2]/@href')
# 11 Axis selection
# ancestor: ancestor nodes
# a = html.xpath('//a/ancestor::*')    # all ancestors, via *
# a = html.xpath('//a/ancestor::div')  # ancestors that are divs
# attribute: attribute values
# a = html.xpath('//a[1]/attribute::*')
# child: direct children
# a = html.xpath('//a[1]/child::*')
# a = html.xpath('//a[1]/child::img/@src')
# descendant: all descendants
# a = html.xpath('//a[6]/descendant::*')
# following: everything after the current node
# a = html.xpath('//a[1]/following::*')
# a = html.xpath('//a[1]/following::*[1]/@href')
# following-sibling: siblings after the current node
# a = html.xpath('//a[1]/following-sibling::*')
# a = html.xpath('//a[1]/following-sibling::a')
# a = html.xpath('//a[1]/following-sibling::*[2]/text()')
a = html.xpath('//a[1]/following-sibling::*[2]/@href')
print(a)
