Web Crawling
Introduction to Crawlers
1 Crawler workflow
Simulate an HTTP request -----> parse the data (clean it) ----> store it
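A minimal sketch of that three-step flow, assuming an arbitrary example page and output file (the URL, the fields extracted, and the filename are all just illustrative choices):

# A minimal fetch -> parse -> store pipeline (illustrative only)
import requests
from bs4 import BeautifulSoup

res = requests.get('https://www.cnblogs.com/')            # 1. simulate an HTTP request
soup = BeautifulSoup(res.text, 'html.parser')             # 2. parse / clean the data
titles = [a.text.strip() for a in soup.find_all('a')]     #    extract whatever fields you need
with open('titles.txt', 'w', encoding='utf-8') as f:      # 3. store it (a file here; could be MySQL, Redis, ...)
    f.write('\n'.join(titles))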
2 Baidu and Google are themselves crawlers
- Baidu search: you type a keyword ---> it searches Baidu's own database ---> results appear on the page ---> clicking a result ---> jumps to some page elsewhere on the internet
- SEO: get Baidu to crawl your site, or submit it to Baidu yourself
- SEM: pay for ads on keywords
3 Robots protocol (robots.txt)
- declares which parts of a site may be crawled and which may not
- https://www.csdn.net/robots.txt
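A small sketch of checking a robots.txt programmatically with the standard library's urllib.robotparser; the URL and the path being tested below are just examples:

# Check whether a path may be fetched according to robots.txt (illustrative example)
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url('https://www.csdn.net/robots.txt')
rp.read()
print(rp.can_fetch('*', 'https://www.csdn.net/some/article'))  # True/False per the site's rules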
4 Crawling-related pieces in Python
- Simulate HTTP requests (requests, selenium) -----> parse the data (clean it) (json, bs4, ...) ----> store it (files, MySQL, Redis, Excel, MongoDB)
- Anti-crawling measures and how to counter them (a proxy-pool sketch follows this list):
  - IP bans: rotate through a proxy pool
  - Account bans: rotate through a cookie pool
  - Special validation fields in request headers: figure out which fields are checked and reproduce them
  - Encrypted payloads: read the site's JS to find the encryption scheme and assemble the data yourself
  - HTML-level tricks: CSS obfuscation, font obfuscation
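A minimal sketch of countering IP bans by rotating proxies with requests. The proxy addresses below are made-up placeholders; a real pool would come from a paid provider or a project such as https://github.com/jhao104/proxy_pool:

# Rotate requests across a small pool of proxies (addresses are hypothetical placeholders)
import random
import requests

proxy_pool = [
    'http://10.0.0.1:3128',   # hypothetical proxy
    'http://10.0.0.2:3128',   # hypothetical proxy
]

def fetch(url):
    proxy = random.choice(proxy_pool)
    # route both http and https traffic through the chosen proxy
    return requests.get(url, proxies={'http': proxy, 'https': proxy}, timeout=5)

# print(fetch('https://httpbin.org/ip').text)  # shows which exit IP the server saw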
Introduction to the requests library
The requests module is a wrapper around urllib3 that makes sending HTTP requests convenient.
pip3 install requests
# Various request methods: the most common are requests.get() and requests.post()
>>> import requests
>>> r = requests.get('https://api.github.com/events')
>>> r = requests.post('http://httpbin.org/post', data = {'key':'value'})
>>> r = requests.put('http://httpbin.org/put', data = {'key':'value'})
>>> r = requests.delete('http://httpbin.org/delete')
>>> r = requests.head('http://httpbin.org/get')
>>> r = requests.options('http://httpbin.org/get')
Sending GET requests
Add request headers: pass a dict via the headers keyword argument
Add extra query parameters to a GET request: pass a dict via the params keyword argument
Carry cookies in the request when the site requires them
res = requests.get('https://www.cnblogs.com/xiaoyuanqujing/articles/11805698.html')
print(res.text)  # the response body
search = input('Enter the search term: ')
res = requests.get('https://www.baidu.com/s?wd=' + search,
                   headers={
                       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36',
                       'Host': 'www.baidu.com',
                   })
print(res.text)
with open('search.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
res = requests.get('https://www.baidu.com/s?', headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
    'Referer': 'https://www.baidu.com/s?wd=python'
}, params={'wd': 'python'})
# URL encoding and decoding
from urllib.parse import quote, unquote
# res = quote('美女')
# print(res)  # %E7%BE%8E%E5%A5%B3
res=unquote('%E7%BE%8E%E5%A5%B3')
print(res)
from urllib.parse import urlencode
res=urlencode({'wd':'美女','age':19},encoding='utf-8')
print(res)
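For context, urlencode builds exactly the query string that the params argument produces under the hood; a minimal sketch of composing a full URL from it (the parameters are arbitrary examples):

# Build a complete search URL from a dict of parameters (sketch)
from urllib.parse import urlencode

params = {'wd': '美女', 'pn': 10}
url = 'https://www.baidu.com/s?' + urlencode(params)
print(url)  # e.g. https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3&pn=10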
Sending POST requests
# Automatically log into a website
res = requests.post('http://www.aa7a.cn/user.php', data={
    'username': '9@qq.com',
    'password': '',
    'captcha': 'zxv7',
    'remember': 1,
    'ref': 'http://www.aa7a.cn/',
    'act': 'act_login'
})
# print(res.text)

## Grab the cookie set by the successful login
cookie = res.cookies  # a CookieJar object
print(cookie)
res2 = requests.get('http://www.aa7a.cn/', cookies=cookie)
# res2 = requests.get('http://www.aa7a.cn/')
print('616564099@qq.com' in res2.text)

# Carrying data in the request body
# res = requests.post('', data={})                      # urlencoded form data
# res = requests.post('', json='json-format string')    # application/json
# res = requests.post('', json='', headers={
#     'content-type': 'application/json;charset=utf-8'
# })
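A runnable sketch of the json= form above, using httpbin.org as a neutral echo endpoint (the target URL and payload are just examples):

# POST a JSON body; requests sets Content-Type: application/json automatically
import requests

res = requests.post('http://httpbin.org/post', json={'key': 'value'})
print(res.json()['json'])                    # httpbin echoes the parsed JSON body back
print(res.request.headers['Content-Type'])   # application/json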
Automatic cookie handling
#### Use requests.session() to handle cookies automatically
import requests
session=requests.session()
### Handling the cookie yourself
# res = requests.post('http://www.aa7a.cn/user.php', data={
#     'username': '69@qq.com',
#     'password': '1',
#     'captcha': 'zxv7',
#     'remember': 1,
#     'ref': 'http://www.aa7a.cn/',
#     'act': 'act_login'
# })
# cookie = res.cookies
# res2 = requests.get('http://www.aa7a.cn/', cookies=cookie)
# print('616564099@qq.com' in res2.text)

### Letting the session handle the cookie automatically
# res = session.post('http://www.aa7a.cn/user.php', data={
#     'username': '9@qq.com',
#     'password': '3',
#     'captcha': 'zxv7',
#     'remember': 1,
#     'ref': 'http://www.aa7a.cn/',
#     'act': 'act_login'
# })
res2=session.get('http://www.aa7a.cn/')
print('616564099@qq.com' in res2.text)
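A minimal standalone demonstration that a Session remembers cookies between requests; httpbin.org is used here only as an example server with cookie endpoints:

# The session stores cookies set by the first response and sends them on the next request
import requests

session = requests.session()
session.get('http://httpbin.org/cookies/set/token/abc123')   # server sets a cookie
print(session.get('http://httpbin.org/cookies').json())      # the cookie is sent back automatically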
Response object attributes
# print(response.text)                 # response body as a string
# print(response.content)              # response body as bytes (images, video, pages)
# print(response.status_code)          # status code
# print(response.headers)              # response headers
# print(response.cookies)              # cookies returned by the server
# print(response.cookies.get_dict())   # CookieJar object converted to a dict
# print(response.cookies.items())      # like a dict's items()
# print(response.url)                  # URL of this request
# print(response.history)              # only populated after a redirect
# print(response.encoding)             # encoding of the response

# Closing the response: response.close()
# from contextlib import closing
# with closing(requests.get('xxx', stream=True)) as response:
#     for line in response.iter_content():
#         pass
# Encoding issues:
# response.text may print as mojibake even though the page looks fine in the browser
response = requests.get('http://www.aa7a.cn/')
response.encoding = 'gbk'  # specify the encoding directly
# response.encoding = response.apparent_encoding  # use the encoding detected from the page
print(response.text)

# Fetching binary content
res = requests.get('http://www.aa7a.cn/data/afficheimg/20201102gophex.png')
print(res.content)
with open('致命诱惑.png', 'wb') as f:
    f.write(res.content)

# Or write it in chunks
with open('致命诱惑.png', 'wb') as f:
    for line in res.iter_content(1024):
        f.write(line)

# Parsing JSON
import json
res=requests.get('https://api.luffycity.com/api/v1/course/category/actual/?courseType=actual')
print(json.loads(res.text))
print(res.json())
Crawling videos
Crawling videos from haokan.baidu.com
# https://haokan.baidu.com/tab/zongyi_new
# https://haokan.baidu.com/web/video/feed?tab=zongyi_new&act=pcFeed&pd=pc&num=20&shuaxin_id=1628075696379
import re
import requests
import json

res = requests.get('https://haokan.baidu.com/web/video/feed?tab=zongyi_new&act=pcFeed&pd=pc&num=20&shuaxin_id=1628075696379')
# print(type(json.loads(res.text)))
# print(json.loads(res.text).get('data').get('response').get('videos'))
videos_list = json.loads(res.text).get('data').get('response').get('videos')
for videos_obj in videos_list:
    print(videos_obj.get('play_url'))
    res1 = requests.get(videos_obj.get('play_url'))
    name = videos_obj.get('play_url').split('/')[-2]
    with open(f'{name}.mp4', 'wb') as f:
        for line in res1.iter_content(1024):
            f.write(line)
# Analysis notes
# Referer: the previously visited URL; sites use it for image/video hotlink protection
header = {
    'Referer': 'https://www.pearvideo.com/video_1737590'
}
res = requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=1737590&mrd=0.5165499193941832', headers=header)
print(res.text)
Crawling with a thread pool
from concurrent.futures import ThreadPoolExecutor
import requests
import json


def task(url):
    name = url.split('/')[-2]
    print(f'{name}.mp4 started')
    res = requests.get(url)
    with open(f'{name}.mp4', 'wb') as f:
        for line in res.iter_content(1024):
            f.write(line)
    print(f'{name}.mp4 finished')


if __name__ == '__main__':
    res = requests.get(
        'https://haokan.baidu.com/web/video/feed?tab=zongyi_new&act=pcFeed&pd=pc&num=20&shuaxin_id=1628075696379')
    videos_list = json.loads(res.text).get('data').get('response').get('videos')
    pool_p = ThreadPoolExecutor(5)
    for videos_obj in videos_list:
        play_url = videos_obj.get('play_url')
        pool_p.submit(task, url=play_url)
    pool_p.shutdown(wait=True)
Crawling Pear Video (pearvideo.com)
# Crawl the videos
# https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=5&start=0
# import re
# res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=5&start=0')
#
# # print(res.text)
# # This would be much simpler with bs4
#
# video_list = re.findall('<a href="(.*?)" class="vervideo-lilink actplay">', res.text)
# # print(video_list)
# for video in video_list:
#     video_url = 'https://www.pearvideo.com/' + video
#     # print(video_url)
#     video_id = video.split('_')[-1]
#
#     header = {
#         'Referer': video_url
#     }
#
#     res2 = requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=%s&mrd=0.5165499193941832' % video_id, headers=header)
#
#     video_f_url = res2.json()['videoInfo']['videos']['srcUrl']
#     video_real_url = video_f_url.replace(video_f_url.rsplit('/')[-1].split('-')[0], 'cont-%s' % video_id)
#     print(video_real_url)
#
#     res3 = requests.get(video_real_url)
#     with open('%s.mp4' % video_id, 'wb') as f:
#         for line in res3.iter_content(1024):
#             f.write(line)

# Analysis notes
# Referer: the previously visited URL; sites use it for hotlink protection
# header = {
#     'Referer': 'https://www.pearvideo.com/video_1737590'
# }
#
# res = requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=1737590&mrd=0.5165499193941832', headers=header)
# print(res.text)

## A URL that plays
# 'https://video.pearvideo.com/mp4/short/20210729/cont-1736870-15732687-hd.mp4'
## A URL that does not play
# 'https://video.pearvideo.com/mp4/short/20210729/1628062847275-15732687-hd.mp4'
#
# The only difference is the first segment of the file name:
# 'https://video.pearvideo.com/mp4/short/20210729/ cont-1736870 -15732687-hd.mp4'
# 'https://video.pearvideo.com/mp4/short/20210729/ 1628062847275 -15732687-hd.mp4'
#
# s = 'https://video.pearvideo.com/mp4/short/20210729/ 1628062847275 -15732687-hd.mp4'
# s.replace(s.rsplit('/')[-1].split('-')[0], 'cont-%s' % video_id)
Advanced requests usage
1 SSL certificate verification (for reference)
2 Using a proxy
3 Timeouts
4 Authentication (like logging into an old-style router)
5 Exception handling
6 File upload

1 SSL Cert Verification (for reference)
## Skip certificate verification
# import requests
# response = requests.get('https://www.12306.cn', verify=False)  # skips verification; logs a warning, returns 200
# print(response.status_code)

## Carry a certificate
# import requests
# response = requests.get('https://www.12306.cn',
#                         cert=('/path/server.crt',
#                               '/path/key'))
# print(response.status_code)

2 Using a proxy
# import requests
# proxies = {
#     'http': 'http://117.69.230.132:3256',
# }
# response = requests.get('https://www.12306.cn', proxies=proxies)
# print(response.status_code)
# Getting proxies: pay for them, find free ones, or build your own pool with a third-party project
# https://github.com/jhao104/proxy_pool

3 Timeouts
# response = requests.get('https://www.baidu.com', timeout=0.0001)

4 Authentication (like logging into an old-style router)
# import requests
# from requests.auth import HTTPBasicAuth
# r = requests.get('xxx', auth=HTTPBasicAuth('user', 'password'))
# print(r.status_code)

5 Exception handling
# import requests
# from requests.exceptions import *  # see requests.exceptions for the exception types
# try:
#     r = requests.get('http://www.baidu.com', timeout=0.00001)
# # except ReadTimeout:
# #     print('===:')
# # except ConnectionError:  # network unreachable
# #     print('-----')
# # except Timeout:
# #     print('aaaaa')
# except Exception:
#     print('Error')

6 File upload
# import requests
# files = {'myfile': open('1 自动处理cookie.py', 'rb')}
# response = requests.post('http://127.0.0.1:8000/upload_file/', files=files)
# print(response.text)
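Building on items 3 and 5 above, a small sketch of retrying a flaky request on timeout or connection errors; the retry count, timeout, and URL are arbitrary choices, not part of the original notes:

# Retry a request a few times on timeout/connection errors (illustrative sketch)
import requests
from requests.exceptions import RequestException

def get_with_retry(url, retries=3, timeout=3):
    for attempt in range(1, retries + 1):
        try:
            return requests.get(url, timeout=timeout)
        except RequestException as e:
            print(f'attempt {attempt} failed: {e}')
    return None

# r = get_with_retry('https://www.baidu.com')
# print(r.status_code if r is not None else 'all attempts failed')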
Auto-upvoting on Chouti (dig.chouti.com)
import requests

header = {
    'Cookie': '',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
}
# res = requests.post('https://dig.chouti.com/link/vote', data={'linkId': '31857081'}, headers=header)
# print(res.text)

# Upvote everything ----> parse out the ids ---> bs4 module (parses xml/html)
res = requests.get('https://dig.chouti.com/top/24hr?_=1628136305346', headers=header).json()
for item in res['data']:
    id = item['id']
    res = requests.post('https://dig.chouti.com/link/vote', data={'linkId': '%s' % id}, headers=header)
    print(res.text)
Crawling Autohome news
# pip3 install beautifulsoup4
import requests
from bs4 import BeautifulSoup

for i in range(1, 100):
    res = requests.get('https://www.autohome.com.cn/news/%s/#liststart' % i)
    # print(res.text)
    # First argument: the content to parse; second argument: the parser to use
    # html.parser is bs4's built-in parser; lxml is an alternative (pip3 install lxml)
    soup = BeautifulSoup(res.text, 'html.parser')
    # soup = BeautifulSoup(res.text, 'lxml')
    # find_all finds every match
    ul_list = soup.find_all(name='ul', class_='article')
    # ul_list = soup.find_all(name='ul')
    # print(len(ul_list))
    for ul in ul_list:
        li_list = ul.find_all(name='li')
        for li in li_list:
            h3 = li.find(name='h3')
            if h3:
                title = h3.text  # get a tag's text content with tag.text
                # print(title)
                desc = li.find(name='p').text
                # print(desc)
                img_url = li.find(name='img')['src']
                if not img_url.startswith('http'):
                    img_url = 'https:' + img_url
                # print(img_url)
                url = 'https:' + li.find(name='a')['href']
                print(url)
                print('''
                News title:   %s
                News summary: %s
                News image:   %s
                News URL:     %s
                ''' % (title, desc, img_url, url))
bs4: traversing the document tree
Prettify: soup.prettify()
Traverse with .tag_name, e.g. soup.html.head.title
Get a tag's name: tag.name
Get a tag's attributes: tag['attribute_name']
Get a tag's content: tag.text
    tag.string (only works when the tag contains exactly one text node)
    tag.strings (yields each text node under the tag, as a generator)
Nested selection: soup.html.body.p
Child nodes: soup.p.contents
Descendant nodes: soup.p.descendants
Parent node: soup.a.parent
Ancestor nodes: soup.a.parents
Sibling nodes:
    previous: soup.a.previous_sibling
    next: soup.a.next_sibling
    all previous (generator): soup.a.previous_siblings
    all next (generator): soup.a.next_siblings

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story <span>111</span></b><span>111</span></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

# res = soup.prettify()  # prettify the markup
# print(res)

# 1. Basic traversal
# html = soup.html
# title = soup.html.head.title
# title = soup.title
# print(title)

# 2. Get a tag's name ---> tag.name
# a = soup.body.a
# a = soup.a.name
# print(a)
# print(soup.body.name)

# 3. Get a tag's attributes ----> tag['attribute_name']
# href = soup.body.a['href']
# attrs = soup.body.a.attrs  # all attributes ---> dict
# href = soup.body.a.attrs['href']
# print(attrs['class'])
# c = soup.p.attrs['class']
# print(c)

# 4. Get a tag's content
# res = soup.b.text    # text of the tag and all its descendants
# res = soup.p.text
# res = soup.p.string  # only works when the tag has exactly one text node
# res = soup.b.string
# res = soup.p.strings # all descendant text nodes, as a generator
# print(list(res))

# 5. Nested selection
# res = soup.html.body.p
# print(type(res))  # bs4.element.Tag
from bs4.element import Tag

#### For reference
# 6. Child and descendant nodes
# print(soup.p.contents)     # all children of p, in a list
# print(soup.p.children)     # an iterator over all children of p
# for i, child in enumerate(soup.p.children):
#     print(i, child)
# print(soup.p.descendants)  # all descendants of p: every tag below it
# for i, child in enumerate(soup.p.descendants):
#     print(i, child)

# 7. Parent and ancestor nodes
# print(soup.a.parent)       # parent of the a tag
# print(soup.body.parent)
# print(soup.a.parents)      # all ancestors of the a tag: parent, parent's parent, ...
# print(list(soup.a.parents))
# print(len(list(soup.a.parents)))

# 8. Sibling nodes
# print(soup.a.next_sibling)            # next sibling
# print(soup.a.previous_sibling)        # previous sibling
# print(list(soup.a.next_siblings))     # all following siblings => generator
# print(list(soup.a.previous_siblings)) # all preceding siblings => generator
bs4: searching the document tree
Search with the two methods find and find_all
Five kinds of filters: string, regular expression, list, method, boolean (True)
Recommendation: mix traversing the tree with searching it
Other parameters: limit (cap the number of results), recursive (whether to search recursively)

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body id='body'>
<p class="title"><b>The Dormouse's story <span>111</span></b><span>egon</span></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

# Searching the document tree: find, find_all
# Five kinds of filters: string, regular expression, list, True, method

##### String
# res = soup.find(name='body')
# res = soup.find(name='p', class_='story')
# Find the tag with id link2
# res = soup.find(id='link2', name='a', class_='sister', href='http://example.com/lacie')
# res = soup.find(href='http://example.com/lacie')
# print(res)
# res = soup.find(attrs={'class': ['sister']})
# print(res)

#### Regular expression
import re
# res = soup.find_all(name=re.compile('^b'))  # tags starting with b: body and b
# res = soup.find(name=re.compile('^b'))
# res = soup.find_all(class_=re.compile('^s'))
# res = soup.find_all(href=re.compile('^http'))
# res = soup.find_all(id=re.compile('^l'))
# print(res)

#### List
# res = soup.find_all(name=['body', 'b'])
# res = soup.find_all(id=['link1', 'link2'])
# res = soup.find_all(attrs={'id': ['link1', 'link2']})
# print(res)

#### True
# links = soup.find_all(href=True)
# print(links)
# res = soup.find_all(name=True)
# res = soup.find_all(id=True)
# print(res)

#### Method
# def has_class_but_no_id(tag):
#     return tag.has_attr('class') and not tag.has_attr('id')
#
# print(len(soup.find_all(name=has_class_but_no_id)))

# Grab every image on the current page (note: img tags actually carry src, not href)
soup.find_all(name='img', href=True)

## Recommendation: mix traversing and searching
# soup.body.div.find

### Other parameters of find / find_all
# limit
# res = soup.find_all(name='a', href=True, limit=2)  # cap the number of results
# print(res)
# recursive: whether to search recursively
# res = soup.find_all(name='a', recursive=False)
# res = soup.find_all(name='html', recursive=False)
# print(res)
CSS selectors
res = soup.p.select('.sister')

Selector syntax:
    #id
    .classname
    tag
    tag>tag    (direct child)
    tag tag    (descendant)

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
    <b>The Dormouse's story <p>asdfasdf</p></b>
    Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">
        <span>Elsie</span>
    </a>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    <div class='panel-1'>
        <ul class='list' id='list-1'>
            <li class='element'>Foo</li>
            <li class='element'>Bar</li>
            <li class='element'>Jay</li>
        </ul>
        <ul class='list list-small' id='list-2'>
            <li class='element'><h1 class='yyyy'>Foo</h1></li>
            <li class='element xxx'>Bar</li>
            <li class='element'>Jay</li>
        </ul>
    </div>
    and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

'''
Selector syntax:
    #id
    .classname
    tag
    tag>tag
    tag tag
'''
# res = soup.p.select('.sister')   # CSS selector
# res = soup.p.select('#link1')    # CSS selector
# res = soup.select('body>p')      # CSS selector: p tags that are direct children of body
res = soup.select('body p')        # CSS selector: p tags anywhere under body
print(len(res))

### CSS selectors are universal: bs4 and lxml-based parsing both support them
## What if you can't write the CSS selector yourself? Copy it from the browser's dev tools:
'#maincontent > div:nth-child(3) > table > tbody > tr:nth-child(13) > td:nth-child(3)'
## Or copy the XPath instead:
'//*[@id="maincontent"]/div[2]/table/tbody/tr[18]/td[2]'
Using selenium
# If you use the requests module alone, the data you get back can be incomplete because it cannot execute JS
# selenium: drive a real browser from code, simulating a human user

## To drive a particular browser you need that browser's driver
# http://npm.taobao.org/mirrors/chromedriver/   Taobao mirror of the Chrome driver
# The Chrome version must match the driver version
## e.g. Chrome 92.0.4515.131: download the matching driver and put it next to the project code
# pip3 install selenium

# from selenium import webdriver
# import time
# # Open a Chrome browser
# bro = webdriver.Chrome(executable_path='chromedriver.exe')
#
# # Type an address into the address bar
# bro.get('https://www.cnblogs.com/')
#
# time.sleep(2)
#
# print(bro.page_source)  # the HTML of the current page
#
# bro.close()  # close the browser

# import requests
#
# res = requests.get('https://dig.chouti.com/', headers={
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
# })
# print(res.text)
Basic usage
from selenium import webdriver
import time

# Browser object
bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.implicitly_wait(10)  # implicit wait: element lookups wait up to 10s for the element to appear
bro.get('https://www.baidu.com/')

# sub_button = bro.find_element_by_css_selector('#s-top-loginbtn')
sub_button = bro.find_element_by_id('s-top-loginbtn')  # if the element has an id, prefer it
# Click it
sub_button.click()

# Find the "log in with username and password" button
user_btn = bro.find_element_by_xpath('//*[@id="TANGRAM__PSP_11__footerULoginBtn"]')
# user_btn = bro.find_element_by_id('TANGRAM__PSP_11__footerULoginBtn')
user_btn.click()

username = bro.find_element_by_id('TANGRAM__PSP_11__userName')
password = bro.find_element_by_id('TANGRAM__PSP_11__password')
# Type into the input boxes
username.send_keys('666@qq.com')
password.send_keys('45')
sumbit_btn = bro.find_element_by_id('TANGRAM__PSP_11__submit')
time.sleep(3)
sumbit_btn.click()
time.sleep(3)
bro.close()
Headless browser
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000')              # browser resolution
chrome_options.add_argument('--disable-gpu')                      # works around a bug mentioned in the Chrome docs
chrome_options.add_argument('--hide-scrollbars')                  # hide scrollbars, for some special pages
chrome_options.add_argument('blink-settings=imagesEnabled=false') # don't load images, for speed
chrome_options.add_argument('--headless')                         # no visible window; required on Linux without a display

driver = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=chrome_options)
driver.get('https://www.baidu.com')
print(driver.page_source)
driver.close()
Getting an element's position, attributes, and size
from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://kyfw.12306.cn/otn/resources/login.html')
driver.implicitly_wait(10)

user_login = driver.find_element_by_css_selector('.login-hd-account>a')
user_login.click()
time.sleep(2)
img = driver.find_element_by_id('J-loginImg')
print(img)
print(img.id)        # selenium's internal id, ignore it
print(img.tag_name)  # tag name
print('-----')
print(img.location)  # position of the img tag
print(img.size)      # size of the img tag
# Get attributes
# print(img.get_attribute('src'))
print(img.get_attribute('class'))
driver.close()
Waiting for elements to load
from selenium import webdriver

# Two kinds of waits
# Explicit wait
# Implicit wait: one line that applies to every element lookup

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')

'''
# Two kinds of waits
# Explicit wait (can be skipped)
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    contents = browser.find_element(By.CSS_SELECTOR, '#content_left')
# Implicit wait:
    - driver.implicitly_wait(10)
    - driver.find_element_by_css_selector()
    - one line that applies to every element lookup
'''
driver.implicitly_wait(10)
print(driver.page_source)
# Subsequent lookups wait for the element to load, up to 10s
driver.close()
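For completeness, a runnable sketch of the explicit-wait style quoted in the docstring above; the element id 'kw' (Baidu's search box) is just an example target:

# Explicit wait: block until one specific element appears (sketch)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')
wait = WebDriverWait(driver, 10)                                  # wait up to 10 seconds
elem = wait.until(EC.presence_of_element_located((By.ID, 'kw')))  # the search input on Baidu's home page
print(elem.tag_name)
driver.close()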
Element operations
from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')
driver.implicitly_wait(10)

## Click, clear, and type
input_search = driver.find_element_by_id('kw')
input_search.send_keys('美女')  # type
time.sleep(3)
input_search.clear()            # clear
time.sleep(2)
input_search.send_keys('性感美女')
time.sleep(2)
btn = driver.find_element_by_id('su')
btn.click()                     # click
time.sleep(10)
driver.close()
Executing JavaScript
from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('http://127.0.0.1:8000/')
driver.implicitly_wait(10)

driver.execute_script("name='egon';")  # write JS code here
driver.execute_script("alert(name)")   # write JS code here
time.sleep(5)
# driver.close()
Switching tabs
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')
print(browser.window_handles)  # all tab handles
# browser.switch_to_window(browser.window_handles[1])  # older, deprecated spelling
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(5)
# browser.switch_to_window(browser.window_handles[0])
browser.switch_to.window(browser.window_handles[0])
browser.get('https://www.sina.com.cn')
browser.close()
Simulating forward and back
import time
from selenium import webdriver

browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.get('https://www.baidu.com')
browser.get('https://www.taobao.com')
browser.get('http://www.sina.com.cn/')
browser.back()
time.sleep(3)
browser.forward()
browser.close()
Exception handling
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, NoSuchFrameException

browser = webdriver.Chrome()
try:
    browser.get('http://www.baidu.com')
except Exception as e:
    print(e)
finally:
    browser.close()
Logging into cnblogs with selenium to get cookies
# Log into cnblogs with selenium and save the cookies
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, NoSuchFrameException
import time
import json

browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.implicitly_wait(10)

#### Login flow
# try:
#     browser.get('http://www.cnblogs.com')
#     submit_btn = browser.find_element_by_link_text('登录')  # matches the a tag's text
#     submit_btn.click()
#
#     username = browser.find_element_by_id('mat-input-0')
#     password = browser.find_element_by_id('mat-input-1')
#     username.send_keys('9@qq.com')
#     password.send_keys('1111')
#     input('wait')
#     sub_btn = browser.find_element_by_css_selector('body > app-root > mat-sidenav-container > mat-sidenav-content > div > div > app-sign-in > app-content-container > div > div > div > form > div > button > span.mat-button-wrapper')
#     sub_btn.click()
#
#     # Human in the loop: solve the slider captcha manually
#     input('wait')
#
#     # Grab the cookies set after logging in
#     print(browser.get_cookies())
#
#     with open('cookie.json', 'w') as f:
#         json.dump(browser.get_cookies(), f)
#
# except Exception as e:
#     print(e)
# finally:
#     browser.close()

### Skip logging in next time: load the saved cookies into the browser
# browser.get('http://www.cnblogs.com')
# with open('cookie.json', 'r') as f:
#     cookie = json.load(f)
# time.sleep(5)
# for item in cookie:  # add_cookie takes a dict; the JSON file holds a list, so loop over it
#     browser.add_cookie(item)
#
# browser.refresh()  # reload the page
# time.sleep(5)
# browser.close()
Semi-automatic upvoting on Chouti
from selenium import webdriver
import json
import time

#### Login flow
# bro = webdriver.Chrome(executable_path='chromedriver.exe')
# bro.implicitly_wait(10)
# bro.get('https://dig.chouti.com/')
# try:
#     sub_btn = bro.find_element_by_id('login_btn')
#     print(sub_btn)
#
#     # sub_btn.click()  # raises an error; click it via JS instead
#     bro.execute_script('arguments[0].click();', sub_btn)
#
#     # username = bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-body > div.form-item.login-item.clearfix.phone-item.mt24 > div.input-item.input-item-short.left.clearfix > input')
#     username = bro.find_element_by_css_selector('div.input-item>input.login-phone')
#     username.send_keys('675221')
#     # password = bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div.form-item.login-item.clearfix.mt24 > div')
#     password = bro.find_element_by_css_selector('div.input-item>input.pwd-password-input')
#     password.send_keys('111')
#
#     time.sleep(3)
#     btn = bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div:nth-child(4) > button')
#     btn.click()
#
#     input('wait')
#
#     with open('chouti.json', 'w') as f:
#         json.dump(bro.get_cookies(), f)
#
# finally:
#     bro.close()

import requests

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.implicitly_wait(10)
bro.get('https://dig.chouti.com/')
# Scroll to the bottom of the page
bro.execute_script('window.scrollTo(0, document.body.scrollHeight);')
# bro.find_elements_by_css_selector('.link-item')

cookie = {}
## Read the cookies back from the file
with open('chouti.json', 'r') as f:
    res = json.load(f)
for item in res:
    cookie[item['name']] = item['value']
print(cookie)  # a cookie dict that requests can use

div = bro.find_element_by_class_name('link-con')
time.sleep(2)
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
div_list = div.find_elements_by_class_name('link-item')
for div in div_list:
    article_id = div.get_attribute('data-id')
    print(article_id)
    # Upvote via requests
    res = requests.post('https://dig.chouti.com/link/vote', data={'linkId': article_id}, cookies=cookie, headers=header)
    print(res.text)
bro.close()
Using a captcha-solving platform
# Manual solving
# Image-recognition modules ---> work for simple digit/letter captchas
# Captcha-solving platforms ---> e.g. 云打码, 超级鹰: send them an image ---> they return the answer (paid services)
#!/usr/bin/env python
# coding:utf-8
import requests
from hashlib import md5


class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: image ID of a misrecognized captcha
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()


if __name__ == '__main__':
    chaojiying = Chaojiying_Client('111', '111', '111')  # username, password, software ID (generated in the user center)
    im = open('a.jpg', 'rb').read()                      # path to a local image file; Windows paths may need //
    print(chaojiying.PostPic(im, 1902))                  # 1902 is the captcha type, see the site's pricing page
Using XPath
1 XPath is a language for locating data in HTML/XML
2 Syntax worth remembering:
    /    select from the current path
    //   select from anywhere below
    .    current node
    ..   parent node
    @    select an attribute
3 The lxml parsing module provides xpath

doc = '''
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html' name='sss'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html' name='111'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
   <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
  </div>
 </body>
</html>
'''
from lxml import etree

# Pass in the content to parse
html = etree.HTML(doc)
# res = html.xpath('//body')
# print(res)

# 1 All nodes
# a = html.xpath('//*')
# 2 A specific node (result is a list)
# a = html.xpath('//head')
# 3 Children and descendants
# a = html.xpath('//div/a')
# a = html.xpath('//body//a')  # no data
# a = html.xpath('//body//a')
# 4 Parent
# a = html.xpath('//body//a[@href="image1.html"]/..')
# a = html.xpath('//body//a')
# a = html.xpath('//body//a[@href="image1.html"]')
# a = html.xpath('//body//a[1]/..')
# Also works:
# a = html.xpath('//body//a[1]/parent::*')
# a = html.xpath('//body//a[1]/parent::p')
# 5 Attribute matching
# a = html.xpath('//a[@href="image1.html"]')
# a = html.xpath('//a[@name="sss"]')
# 6 Getting text: text()
# a = html.xpath('//a[@href="image1.html"]/text()')
# a = html.xpath('//a/text()')
# 7 Getting attributes
# a = html.xpath('//a/@href')
# a = html.xpath('//a[1]/@name')
# Note: indexing starts at 1, not 0
# a = html.xpath('//body//a[2]/@href')
# 8 Multi-valued attributes
# When an a tag has several classes, an exact match no longer works; use contains
# a = html.xpath('//a[@class="li"]')
# a = html.xpath('//a[contains(@class,"li")]')
# a = html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 Matching on several attributes
# a = html.xpath('//body//a[contains(@class,"li") or @name="items"]')
# a = html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
# a = html.xpath('//body//a[contains(@class,"li")]/text()')
# 10 Selecting by position
# a = html.xpath('//a[2]/text()')
# a = html.xpath('//a[2]/@href')
# a = html.xpath('//a[2]/@name')
# The last one
# a = html.xpath('//a[last()]/@href')
# Position less than 3
# a = html.xpath('//a[position()<3]/@href')
# Third from the end
# a = html.xpath('//a[last()-2]/@href')
# 11 Axis selection
# ancestor: ancestor nodes
# a = html.xpath('//a/ancestor::*')    # all ancestors, via *
# a = html.xpath('//a/ancestor::div')  # ancestors that are divs
# attribute: attribute values
# a = html.xpath('//a[1]/attribute::*')
# child: direct children
# a = html.xpath('//a[1]/child::*')
# a = html.xpath('//a[1]/child::img/@src')
# descendant: all descendants
# a = html.xpath('//a[6]/descendant::*')
# following: everything after the current node
# a = html.xpath('//a[1]/following::*')
# a = html.xpath('//a[1]/following::*[1]/@href')
# following-sibling: siblings after the current node
# a = html.xpath('//a[1]/following-sibling::*')
# a = html.xpath('//a[1]/following-sibling::a')
# a = html.xpath('//a[1]/following-sibling::*[2]/text()')
a = html.xpath('//a[1]/following-sibling::*[2]/@href')
print(a)
