爬虫之requests

requests

requests介绍

#介绍：使用requests可以模拟浏览器的请求，比起之前用到的urllib，requests模块的api更加便捷（本质就是封装了urllib3）

#注意：requests库发送请求将网页内容下载下来以后，并不会执行js代码，这需要我们自己分析目标站点然后发起新的request请求

#安装：pip3 install requests

#各种请求方式：常用的就是requests.get()和requests.post()

# 建议在正式学习requests前，先熟悉下HTTP协议

# requests模块不仅仅用来做爬虫,也可以和第三方通信

官网链接：http://docs.python-requests.org/en/master/

requests模块发送GET请求

# Fetch a page with a plain GET request.
import requests
res = requests.get('https://www.cnblogs.com/')
print(res.text)	# response body decoded to text

requests请求带参数

"""
GET 请求带参数 有两种方法：
	1. 直接拼
	2.放在params参数中
"""
# Option 1: write the query string into the URL by hand.
import requests
res=requests.get('https://zzk.cnblogs.com/s?w=python')
print(res.text)

# Option 2: pass the query as ``params``; requests URL-encodes the values
# (including the non-ASCII text) and appends them to the URL.
import requests
res = requests.get('https://zzk.cnblogs.com/s?', params={'w': 'python开发'})
print(res.text)


"""
url 编码和解码
"""

# URL encoding/decoding helpers come from the standard library; without this
# import every ``parse.*`` call below raises NameError.
from urllib import parse

# Encode: a dict becomes a percent-encoded query string.
dict1 = {'wd': '百度翻译', 'age': 19}
url_data = parse.urlencode(dict1)
print(url_data)  # wd=%E7%99%BE%E5%BA%A6%E7%BF%BB%E8%AF%91&age=19
# A single string (e.g. Chinese text) can also be percent-encoded directly.
res = parse.quote('清秋')
print(res)  # %E6%B8%85%E7%A7%8B

# Decode: turn percent-encoded text back into the original string.
s = '%E6%B8%85%E7%A7%8B'
res = parse.unquote(s)
print(res)  # 清秋

"""
请求头中带header
请求头由key-value组成

重要的请求头：
user-agent：客户端是什么
referer:上一次访问的地址
    -Referer: https://www.lagou.com/gongsi/
    -图片防盗链 ：后端判断referer信息，如果不是自己的网站，直接禁止
cookie：登陆信息
"""

import requests
# Send a browser-like User-Agent so the request identifies itself as a browser.
header = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'}
res = requests.get('https://www.sogou.com/web?query=%E7%BE%8E%E5%A5%B3', headers=header)
s = res.text

# Re-encode the decoded text as UTF-8 bytes and save it as a local HTML file.
s = s.encode('utf-8')
with open('meinv.html', 'wb') as f:
    f.write(s)

requests带cookie

"""
如果cookie是登陆的，可以 看到登陆后的信息，如果不是登陆的，就看不到登陆后的信息
1.直接放在请求头中
2.用cookie参数传递
"""

# 方式一:直接放在请求头中
header = {
    'cookie':'Hm_lvt_c29657ca36c6c88e02fed9a397826038=1647351162; UM_distinctid=17f8dc7f8c6135-054d035d4c493b-977173c-144000-17f8dc7f8c7b26; CNZZDATA4603183=cnzz_eid=1344889251-1647346854-null&ntime=1647346854; _jzqa=1.947821944457220600.1647351168.1647351168.1647351168.1; _jzqc=1; _jzqy=1.1647351168.1647351168.1.jzqsr=baidu.-; _jzqckmp=1; CNZZDATA1260462072=1724340417-1647346855-|1647346855; Qs_lvt_201322=1647351507; __xsptplusUT_422=1; mediav={"eid":"179539","ep":"","vid":"Q(#EN)(mrx8]j`Uie1`z","ctn":"","vvid":"Q(#EN)(mrx8]j`Uie1`z","_mvnf":1,"_mvctn":0,"_mvck":0,"_refnf":0}; Hm_lpvt_c29657ca36c6c88e02fed9a397826038=1647351852; cto_bundle=MJCSmV9SaVFQY0paU285YjJqb3JJOXZZSlc2aU4xVHpYYjExJTJGWWxWZnB2T0RrWjdwRHlFSmZZVjRuWHJUTnFwMEhvWDhSRHBycG0zU0E2Y1Y5UjRNTnNldXM4eFZiZ2tLSnVqVUhmNU1GNGtzSVpGVnklMkY3ZWU2ZG9rZFJFR3RpemxEdU93cXFoUjg4N1lyM0pPbkFraFlabXJ3JTNEJTNE; Qs_pv_201322=1209103881048477000,845297458786979100,2017492991513516500,3297133156938577400,1930333815762069200; __xsptplus422=422.1.1647351177.1647351863.5#4|||||##fQCIy6rCZJYgftShWP7WiELT6hVUzWwY#; _qzja=1.662462923.1647351172787.1647351172787.1647351172787.1647351852474.1647351863528.673344554%40qq_com.1.0.12.1; _qzjb=1.1647351172787.12.0.0.0; _qzjc=1; _qzjto=12.1.0; _jzqb=1.34.10.1647351168.1; ECS_ID=9ea8ebba2ff892009285d8adc7177b00557c5f23; ECS[visit_times]=10; ECS[username]=673344554@qq.com; ECS[user_id]=67458; ECS[password]=1068c8376518c265238bfd2548f78350'
}
res=requests.get('http://www.aa7a.cn/',headers=header)
print('673344554@qq.com' in res.text) 

# Option 2: pass cookies through the ``cookies`` argument as a name -> value
# dict. Every cookie the site needs has to be listed; only one is given here.
res=requests.get('http://www.aa7a.cn/',cookies={'Hm_lvt_c29657ca36c6c88e02fed9a397826038':"1647351162"})
print('673344554@qq.com' in res.text)  # the original notes record False for this call

post请求模拟登陆

# Simulate a login by sending the form fields in a POST request.
# NOTE(review): real-looking credentials are hard-coded here; load them from
# the environment or a config file instead of committing them.
body = {
    "username": "673344554@qq.com",
    "password": "zhq123.",
    "captcha": "YMFG",
    "remember": "1",
    "ref": "http://www.aa7a.cn/",
    "act": "act_login",
}
# ``data`` is sent as the request body.
res = requests.post('http://www.aa7a.cn/user.php', data=body)
# The cookies set by the login response are the logged-in cookies.
print(res.cookies)
# Visit the home page with those cookies attached by hand.
res1 = requests.get('http://www.aa7a.cn/', cookies=res.cookies)
print('673344554@qq.com' in res1.text)  # the original notes record True



# Let a Session manage cookies: once logged in, every later request made
# through the same session sends the stored cookies automatically.
# NOTE(review): real-looking credentials are hard-coded here; load them from
# the environment or a config file instead of committing them.
body = {
    "username": "673344554@qq.com",
    "password": "zhq123.",
    "captcha": "YMFG",
    "remember": "1",
    "ref": "http://www.aa7a.cn/",
    "act": "act_login",
}
# Create the session object.
session = requests.Session()
# Send the login through the session so it stores the cookies it receives.
res = session.post('http://www.aa7a.cn/user.php', data=body)
print(res.cookies)
# Visit the home page; no ``cookies=`` argument is needed because the session
# already carries the login cookies (passing them by hand defeats the point).
res1 = session.get('http://www.aa7a.cn/')
print('673344554@qq.com' in res1.text)  # expected True once the login succeeded

补充 post方法参数 json和data

'''
data=None,     使用的编码是urlencoded
json=None      使用的编码是 application/json
'''

# No Content-Type header is given: a dict passed as ``data=`` is sent
# form-encoded, i.e. Content-Type: application/x-www-form-urlencoded.
requests.post(url='xxxxxxxx',
              data={'xxx':'yyy'})

# Declaring application/json while still passing ``data=`` sends a
# form-encoded body under a JSON content type, so the server cannot read it.
requests.post(url='',
              data={'':1,},
              headers={
                  'content-type':'application/json'
              })

# ``json=`` serialises the payload as JSON and sets
# Content-Type: application/json automatically.
requests.post(url='',
              json={'':1,},
              )

响应Response

respone=requests.get('https://www.cnblogs.com/')
print(len(respone.text))  # body decoded to text
print(len(respone.content)) # raw body bytes (use this for images, video, ...)

print(respone.status_code) # HTTP status code, e.g. 200 on success
print(respone.headers)    # response headers (dict-like)
print(respone.cookies)   # cookies set by the response (a cookie jar object)
print(respone.cookies.get_dict()) # the cookie jar converted to a plain dict
print(respone.cookies.items())  # list of (name, value) pairs, not just the values

print(respone.url)          # URL of the response
print(respone.history)     # earlier responses of the redirect chain, if any redirect was followed

print(respone.encoding)   # encoding used to decode ``.text``

关闭：response.close()

response.iter_content()   # 图片，视频，不要一次性存储，分片存储到本地，使用它

编码问题

"""
有些网站不是使用utf8编码,可能使用GBK编码，需要使用response.encoding改编码
"""
import requests
response=requests.get('http://www.autohome.com/news')
# Per the original notes this site serves gb2312 content while requests falls
# back to ISO-8859-1; set the encoding before reading ``.text`` to avoid
# garbled Chinese.
response.encoding='gbk'
print(response.text)

获取二进制内容

# Save the whole body in one go: ``res.content`` holds all the bytes in memory.
res=requests.get('https://gimg2.baidu.com/image_search/src=http%3A%2F%2Fup.enterdesk.com%2Fedpic%2F09%2F3a%2Fbc%2F093abce7b31f4c8ffdbf345375ff4abb.jpg&refer=http%3A%2F%2Fup.enterdesk.com&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=auto?sec=1649909879&t=0654ccfaf735ff2591dd9f246b9afe22')
with open('a.jpg','wb') as f:
    f.write(res.content)
 
"""
如果一个文件很大 一次性存放会很吃内存，所以我们可以分段存放
res.iter_content(1024) ：每次读取1024字节（需要在请求时加上 stream=True，否则响应体仍会先被一次性下载到内存）
"""
# Chunked download. ``stream=True`` defers fetching the body, so
# ``iter_content(1024)`` really reads it 1024 bytes at a time; without it the
# whole body is already in memory before the loop starts.
# The ``with`` block releases the connection once the file is written.
with requests.get('https://gimg2.baidu.com/image_search/src=http%3A%2F%2Fup.enterdesk.com%2Fedpic%2F09%2F3a%2Fbc%2F093abce7b31f4c8ffdbf345375ff4abb.jpg&refer=http%3A%2F%2Fup.enterdesk.com&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=auto?sec=1649909879&t=0654ccfaf735ff2591dd9f246b9afe22', stream=True) as res:
    with open('b.jpg','wb') as f:
        for chunk in res.iter_content(1024):
            f.write(chunk)

解析json

import requests
response=requests.get('http://httpbin.org/get')
# Manual alternative (more verbose): import json, then json.loads(response.text).
res2=response.json() # parse the JSON body directly
print(res2)

ssl认证

"""
有些https网站 ca证书不是正规机构签发的 浏览器会弹出不安全连接，浏览器访问可以点击继续
但是我们的代码在运行时是不能点击的，由此就有了ssl认证
"""

# Certificate verification (most sites are served over HTTPS).
import requests
respone=requests.get('https://www.12306.cn') # for HTTPS the certificate is checked first; an invalid one raises an error and the program stops



# Improvement 1: no error, but a warning is still printed.
import requests
respone=requests.get('https://www.12306.cn',verify=False) # skip certificate verification; a warning is printed (the original notes record status 200)
print(respone.status_code)

# Improvement 2: no error and no warning.
import requests
from requests.packages import urllib3
urllib3.disable_warnings() # silence the warning
respone=requests.get('https://www.12306.cn',verify=False)
print(respone.status_code)

#改进3:手动携带证书
"""
很多网站都是https,但是不用证书也可以访问,大多数情况都是可以携带也可以不携带证书
知乎\百度等都是可带可不带
有硬性要求的,则必须带，比如对于定向的用户,拿到证书后才有权限访问某个特定网站
"""
import requests
# ``cert`` takes a (certificate file, private key file) pair that is presented
# to the server as the client certificate.
respone=requests.get('https://www.12306.cn',
                     cert=('/path/server.crt',
                           '/path/key'))
print(respone.status_code)

使用代理

"""
 后端都会限制ip的访问频率，爬虫速度太快，超过了限速，这时我们只需要切换一下ip地址即可
 使用代理IP 限制的是我代理的而不是我本机的
"""
import requests
# The proxies dict is keyed by the scheme of the *target* URL. The request
# below goes to an https URL, so an 'https' entry is required; with only an
# 'http' entry the proxy would not be used for it.
# A proxy that needs credentials is written as 'http://user:password@host:port'.
proxies={
    'http':'http://220.168.52.245:53548',
    'https':'http://220.168.52.245:53548',
}
respone=requests.get('https://www.12306.cn',
                     proxies=proxies)
print(respone.status_code)

# SOCKS proxies are supported too; install the extra with: pip install requests[socks]
import requests
# Placeholder URLs: replace user, pass, host and port with real values.
proxies = {
    'http': 'socks5://user:pass@host:port',
    'https': 'socks5://user:pass@host:port'
}
respone=requests.get('https://www.12306.cn',
                     proxies=proxies)

print(respone.status_code)

搭建代理池

"""
搭建代理池---》开源的（参照：python，flask+爬虫）
原理：
	1.通过爬虫去免费网站爬取 免费代理
	2. 验证一下，存到库中（redis）
	3.以后要用它 就是请求一个接口，随机返回一个代理地址
"""

# 地址 ：https://github.com/jhao104/proxy_pool

# 下载
git clone git@github.com:jhao104/proxy_pool.git
    
# 安装依赖
pip install -r requirements.txt

搭建步骤

# 下载
git clone git@github.com:jhao104/proxy_pool.git
    
# 安装依赖
pip install -r requirements.txt

# 第三步：修改配置--》项目路径下的setting.py
	# 配置API服务
    HOST = "0.0.0.0"               # IP
    PORT = 5000                    # 监听端口
    # 配置数据库
    DB_CONN = 'redis://127.0.0.1:8888/0'
    # 配置 ProxyFetcher--->配置爬取哪几个免费代理的网站
    PROXY_FETCHER = [
        "freeProxy01",      # 这里是启用的代理抓取方法名，所有fetch方法位于fetcher/proxyFetcher.py
        "freeProxy02",
        # ....
    ]
    
# 启动项目
	# 启动调度程序（爬取代理）
    python3 proxyPool.py schedule
    # 启动webApi服务（提供接口）
    python3 proxyPool.py server

使用代理池

import requests
# Ask the local proxy_pool API for a random proxy. The port in this URL must
# match the PORT configured for the proxy_pool web API.
res=requests.get('http://127.0.0.1:5010/get/')
# Parse the JSON once and reuse the value.
# NOTE(review): assumes the 'proxy' field is a 'host:port' string - confirm.
proxy=res.json()['proxy']
print(proxy)


# Route the crawler's request through that proxy. The target URL is https, so
# an 'https' entry is needed; with only 'http' the proxy would not be used.
proxies={
    'http':proxy,
    'https':proxy,
}
respone=requests.get('https://www.12306.cn',
                     proxies=proxies)

print(respone.status_code)

requests 超时设置，认证设置，异常处理，上传文件

超时设置

"""
两种超时:float or tuple
timeout=0.1 #单个数值同时作为链接超时和接收数据的超时时间
timeout=(0.1,0.2)#0.1代表链接超时  0.2代表接收数据的超时时间
"""

## Timeout
import requests
# Raise an exception if the server has not responded within one second.
respone=requests.get('https://www.baidu.com',timeout=1)
print(respone.status_code)

## 认证设置
"""
特别古老的项目中(老版本的路由器)
一访问就弹出输入用户名密码的框
"""
import requests
from requests.auth import HTTPBasicAuth
# HTTP Basic authentication; 'xxx' is a placeholder for the real URL.
r=requests.get('xxx',auth=HTTPBasicAuth('user','password'))
print(r.status_code)

## 异常处理

认证设置

"""
官网链接：http://docs.python-requests.org/en/master/user/authentication/

认证设置:登陆网站时,弹出一个框,要求你输入用户名密码（与alert很类似），此时是无法获取html的
但本质原理是拼接成请求头发送
        r.headers['Authorization'] = _basic_auth_str(self.username, self.password)
一般的网站都不用默认的加密方式，都是自己写
那么我们就需要按照网站的加密方式，自己写一个类似于_basic_auth_str的方法
得到加密字符串后添加到请求头
        r.headers['Authorization'] =func('.....')
"""

# The built-in Basic auth scheme; per the notes above, most sites use their own
# scheme instead. 'xxx' is a placeholder for the real URL.
import requests
from requests.auth import HTTPBasicAuth
r=requests.get('xxx',auth=HTTPBasicAuth('user','password'))
print(r.status_code)

# Shorthand: a (user, password) tuple is treated as HTTP Basic auth.
import requests
r=requests.get('xxx',auth=('user','password'))
print(r.status_code)

异常处理

import requests
# Import only the exception classes that are used (requests.exceptions lists
# them all). Note that this ConnectionError is requests' own class and
# deliberately replaces the built-in name in this snippet.
from requests.exceptions import ConnectionError, ReadTimeout, Timeout

try:
    r=requests.get('http://www.baidu.com',timeout=0.00001)
except ReadTimeout:
    # The server accepted the connection but sent no data in time.
    print('读数据超时')
except Timeout:
    # Must come before ConnectionError: ConnectTimeout derives from both
    # ConnectionError and Timeout, so the old order reported a connect
    # timeout as a network failure.
    print('超时')
except ConnectionError:
    # Network unreachable, DNS failure, connection refused, ...
    print('网络不通')
except Exception as e:
    print('未知错误',e)

上传文件

import requests
# Open the file in a ``with`` block so the handle is closed once the upload
# has finished instead of being leaked.
with open('a.jpg','rb') as fh:
    files={'file':fh}
    respone=requests.post('http://httpbin.org/post',files=files)
print(respone.status_code)

posted @ 2022-03-16 21:21 yang_night 阅读(178) 评论(0) 收藏举报

刷新页面返回顶部

栽了清秋

爬虫之requests

requests

requests介绍

requests模块发送GET请求

requests请求带参数

requests带header

requests带cookie

post请求模拟登陆

补充 post方法参数 json和data

响应Response

编码问题

获取二进制内容

解析json

ssl认证

使用代理

搭建代理池

搭建步骤

使用代理池

requests 超时设置，认证设置，异常处理，上传文件

超时设置

认证设置

异常处理

上传文件

公告