Web Scraping, Part 2
1: Image lazy-loading example
import requests
from lxml import etree
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}
"""
图片懒加载:当用户刷新页面的时候,页面中的图片只会加载出现局部的而不是所有,只有当满足了指定的条件
才可以将剩余的图片加载出来.
如何决定图片是否要加载出来?使用伪属性
"""
url = 'https://sc.chinaz.com/tupian/hunsha.html'
response = requests.get(url,headers=headers)
response.encoding='utf-8'
page_text = response.text
tree = etree.HTML(page_text)
div_list = tree.xpath('//*[@id="container"]/div')
for div in div_list:
    img_src = 'https:' + div.xpath('./div/a/img/@src2')[0]
    img_name = div.xpath('./div/a/img/@alt')[0] + '.jpg'
    print(img_name, img_src)
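The loop above only prints each image's name and real URL. Persisting the downloads could look like the sketch below (a minimal sketch reusing the same headers; the output directory ./imgs is just an assumed example path):
import os
os.makedirs('./imgs', exist_ok=True)  # assumed output directory
for div in div_list:
    img_src = 'https:' + div.xpath('./div/a/img/@src2')[0]
    img_name = div.xpath('./div/a/img/@alt')[0] + '.jpg'
    img_data = requests.get(img_src, headers=headers).content  # binary image data
    with open('./imgs/' + img_name, 'wb') as fp:
        fp.write(img_data)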
2: Cookie anti-scraping example
import requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}
# Create a Session object
sess = requests.Session()
# Hit the home page first so the session captures the cookie
sess.get('https://xueqiu.com/',headers=headers)
url = 'https://xueqiu.com/statuses/hot/listV2.json?since_id=-1&max_id=195129&size=15'
result=sess.get(url,headers=headers).json()
print(result)
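The Session object stores the cookie set by the home page and carries it on the follow-up request automatically. For comparison, the cookie can also be forwarded by hand (a sketch, assuming the cookie only needs to be copied from the first response; if the site also sets cookies through JavaScript this simpler variant will not be enough):
first = requests.get('https://xueqiu.com/', headers=headers)
result = requests.get(url, headers=headers, cookies=first.cookies).json()
print(result)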
3: Using proxies
import requests
from lxml import etree
import random
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}
# Proxy pool
# Fill in usable proxy IPs below
ips = [
    # ......
]
url = 'https://www.sogou.com/web?query=ip'
# Send the request through a randomly chosen proxy
page_text = requests.get(url,headers=headers,proxies=random.choice(ips)).text
tree = etree.HTML(page_text)
print(tree.xpath('//*[@id="ipsearchresult"]/strong/text()')[0])
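Each element of ips should be a dict in the format requests expects for the proxies parameter. A sketch of what the pool might contain (the addresses below are hypothetical placeholders, not working proxies):
ips = [
    {'https': 'https://111.111.111.111:8888'},  # hypothetical placeholder
    {'https': 'https://222.222.222.222:8888'},  # hypothetical placeholder
]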
4: Automatic CAPTCHA recognition
import base64
import json
import requests
# The following CAPTCHA types can be recognised (ttshitu typeid values)
# 1. Image/text types (default 3, mixed digits and letters):
# 1    : digits only
# 1001 : digits only, variant 2
# 2    : letters only
# 1002 : letters only, variant 2
# 3    : mixed digits and letters
# 1003 : mixed digits and letters, variant 2
# 4    : animated GIF
# 7    : no-perception learning (exclusive)
# 11   : arithmetic question
# 1005 : fast arithmetic question
# 16   : Chinese characters
# 32   : general text recognition (ID cards, receipts)
# 66   : question & answer
# 49   : reCAPTCHA image recognition, see https://shimo.im/docs/RPGcTpxdVgkkdQdY
# 2. Image rotation type:
# 29   : rotation
#
# 3. Coordinate click types:
# 19   : 1 coordinate
# 20   : 3 coordinates
# 21   : 3-5 coordinates
# 22   : 5-8 coordinates
# 27   : 1-4 coordinates
# 48   : trajectory type
#
# 4. Gap recognition:
# 18   : gap (slider notch) recognition
# 5. Jigsaw recognition:
# 53   : jigsaw recognition
def base64_api(uname, pwd, img, typeid):
    with open(img, 'rb') as f:
        base64_data = base64.b64encode(f.read())
        b64 = base64_data.decode()
    data = {"username": uname, "password": pwd, "typeid": typeid, "image": b64}
    result = json.loads(requests.post("http://api.ttshitu.com/predict", json=data).text)
    if result['success']:
        return result["data"]["result"]
    else:
        return result["message"]
    return ""
if __name__ == "__main__":
    img_path = "./code.jpg"  # path of the image to recognise; the platform account name and password go below
    result = base64_api(uname='bb328410948', pwd='bb328410948', img=img_path, typeid=3)
    print(result)
5: Simulated login
# CAPTCHA recognition
import base64
import json
import requests
from lxml import etree
# Create the session
sess = requests.Session()
# UA spoofing
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}
def base64_api(uname, pwd, img, typeid):
    with open(img, 'rb') as f:
        base64_data = base64.b64encode(f.read())
        b64 = base64_data.decode()
    data = {"username": uname, "password": pwd, "typeid": typeid, "image": b64}
    result = json.loads(requests.post("http://api.ttshitu.com/predict", json=data).text)
    if result['success']:
        return result["data"]["result"]
    else:
        return result["message"]
    return ""
# Fetch the CAPTCHA image dynamically
main_url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
page_text =sess.get(main_url,headers=headers).text
tree = etree.HTML(page_text)
img_src = 'https://so.gushiwen.cn'+tree.xpath('//*[@id="imgCode"]/@src')[0]
img_data = sess.get(img_src,headers=headers).content
with open('./code.jpg', 'wb') as fp:
    fp.write(img_data)
# Recognition step (the CAPTCHA image saved above is passed to the recognition function)
img_path = "./code.jpg"  # the image to recognise; the platform account name and password go below
result = base64_api(uname='bb328410948', pwd='bb328410948', img=img_path, typeid=3)
print(result)
# Capture the dynamically changing request parameter from the page
__VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
login_url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'
data = {
    # dynamically changing request parameter
    '__VIEWSTATE': __VIEWSTATE,
    '__VIEWSTATEGENERATOR': 'C93BE1AE',
    'from': 'http://so.gushiwen.cn/user/collect.aspx',
    'email': '15027900535',
    'pwd': 'bobo@15027900535',
    'code': result,
    'denglu': '登录'
}
page_text = sess.post(login_url,headers=headers,data=data).text
with open('../gushiwen.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
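__VIEWSTATEGENERATOR is hard-coded in the form data above; if it ever changes it can be pulled from the login page in the same way as __VIEWSTATE (a sketch, assuming the page exposes it as a hidden input with that id, which is typical for ASP.NET forms):
__VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]
data['__VIEWSTATEGENERATOR'] = __VIEWSTATEGENERATOR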
6: Coroutine basics
import asyncio
import time
async def get_request(url):
    print('Requesting:', url)
    time.sleep(2)
    print('Request finished!')

def func(task):  # must take exactly one parameter: the task object on which add_done_callback was called
    data = task.result()  # result() returns the return value of the coroutine function wrapped by the task
    print(data)
# coroutine object
c = get_request('www.1.com')
# create a task object from the coroutine
task = asyncio.ensure_future(c)
# bind the callback function to the task object
task.add_done_callback(func)
# create an event loop object
loop = asyncio.get_event_loop()
# register the task object and start the event loop
loop.run_until_complete(task)
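Note that time.sleep(2) inside the coroutine blocks the whole event loop; the next section replaces it with await asyncio.sleep(2). On Python 3.7+ the event-loop boilerplate above can also be written with asyncio.run (a sketch of an equivalent entry point, reusing get_request and func from above):
async def main():
    task = asyncio.ensure_future(get_request('www.1.com'))
    task.add_done_callback(func)
    await task

asyncio.run(main())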
7: Multi-task async coroutines
import asyncio
import time
start = time.time()
async def get_request(url):
    print('Requesting:', url)
    await asyncio.sleep(2)
    print('Request finished!')
urls = [
    'www.1.com',
    'www.2.com',
    'www.3.com'
]
tasks = []
for url in urls:
    c = get_request(url)
    task = asyncio.ensure_future(c)
    tasks.append(task)
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
print('Total time:', time.time() - start)
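Because the three coroutines suspend on await asyncio.sleep(2) concurrently, the total time printed is roughly 2 seconds rather than 6. asyncio.gather is an equivalent way to drive the task list (a sketch of the alternative final lines):
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))
print('Total time:', time.time() - start)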
8: Multi-task async crawler
import asyncio
import time
import requests
import aiohttp
from lxml import etree
start = time.time()
# requests does not support async, so the version below would still run the requests sequentially:
# async def get_request(url):
#     page_text = requests.get(url).text
#     return page_text
async def get_request(url):
    # create a session (request) object
    async with aiohttp.ClientSession() as req:
        # send the request
        # note: the difference from get/post in requests is the proxies parameter;
        # in aiohttp a proxy is set with proxy='http://ip:port'
        async with await req.get(url) as response:
            # text() returns the response body as a string, read() returns it as bytes
            page_text = await response.text()
            return page_text
def parse(task):
    page_text = task.result()
    tree = etree.HTML(page_text)
    data = tree.xpath('//a[@id="feng"]/text()')[0]
    print(data)
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/tom',
    'http://127.0.0.1:5000/jay'
]
tasks = []
for url in urls:
    c = get_request(url)
    task = asyncio.ensure_future(c)
    task.add_done_callback(parse)
    tasks.append(task)
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
print('Total time:', time.time() - start)
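The URLs above point at a local test server. A minimal sketch of what such a server could look like (assuming Flask; the 2-second delay and the <a id="feng"> element are only there so that parse() has something to extract and the speed-up is visible):
from flask import Flask
import time

app = Flask(__name__)
PAGE = '<html><body><a id="feng">hello %s</a></body></html>'

@app.route('/bobo')
def bobo():
    time.sleep(2)  # simulate a slow response
    return PAGE % 'bobo'

@app.route('/tom')
def tom():
    time.sleep(2)
    return PAGE % 'tom'

@app.route('/jay')
def jay():
    time.sleep(2)
    return PAGE % 'jay'

if __name__ == '__main__':
    app.run(port=5000)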
9: Multithreading
from multiprocessing.dummy import Pool  # thread pool
import requests
import time
start = time.time()
def get_request(url):
    page_text = requests.get(url).text
    print(len(page_text))
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/tom',
    'http://127.0.0.1:5000/jay'
]
pool = Pool(3)
# map applies get_request to every element of the urls list
pool.map(get_request, urls)
print('Total time:', time.time() - start)
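multiprocessing.dummy.Pool is simply a thread pool with the multiprocessing interface; the standard-library concurrent.futures module offers the same pattern (a sketch of the equivalent call):
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=3) as pool:
    pool.map(get_request, urls)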