Python requests模块

一、介绍

介绍:使用requests可以模拟浏览器的请求,比起之前用到的urllib,requests模块的api更加便捷(本质就是封装了urllib3)

注意:requests库发送请求将网页内容下载下来以后,并不会执行js代码;由js动态加载的数据需要我们自己分析目标站点,然后针对相应的地址发起新的请求

安装:pip3 install requests

二、基于GET请求

import requests

# GET demo: send query-string parameters, a browser-like User-Agent header
# and a cookie in a single request, then print the decoded response body.
target_url = 'https://www.baidu.com'

request_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
}
# Passed via `params`, so these end up in the URL query string.
query_string = {'query': 'xxx'}
session_cookie = {'user_session': 'wGMHFJKgDcmRIVvcA14_Wrt_3xaUyJNsBnPbYzEL6L0bHcfc'}

reply = requests.get(
    url=target_url,
    params=query_string,
    headers=request_headers,
    cookies=session_cookie,
)
print(reply.text)

三、基于POST请求

import requests

# POST demo: submit form fields in the request body together with a
# browser-like User-Agent header and a cookie, then print the response body.
endpoint = 'https://www.baidu.com'

browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
}
# Passed via `data`, so these are sent as the form-encoded request body.
form_fields = {'query': 'xxx'}
login_cookie = {'user_session': 'wGMHFJKgDcmRIVvcA14_Wrt_3xaUyJNsBnPbYzEL6L0bHcfc'}

reply = requests.post(
    url=endpoint,
    data=form_fields,
    headers=browser_headers,
    cookies=login_cookie,
)
print(reply.text)

四、response属性

import requests

# Tour of the most commonly used attributes of a requests.Response object.
response = requests.get('http://www.jianshu.com')

# Body: decoded text and raw bytes.
print(response.text)
print(response.content)

# Status code, response headers and cookies (as a jar, a dict and item pairs).
print(response.status_code)
print(response.headers)
print(response.cookies)
print(response.cookies.get_dict())
print(response.cookies.items())

# URL of this response and the list of earlier responses that led to it.
print(response.url)
print(response.history)

# Encoding requests uses to decode `response.text`.
print(response.encoding)

# json() raises a ValueError subclass when the body is not valid JSON, which
# is the normal case for an HTML page; report that instead of crashing.
try:
    print(response.json())
except ValueError:
    print('response body is not valid JSON')

五、处理cookie

基于session自动处理cookie

1、创建一个空白的session对象

2、需要使用session对象发起请求,自动捕获服务器产生的cookie并存储在session对象中

3、使用携带cookie的session对象,对目标网址发起请求,此时请求将携带cookie

import requests

# Demo: let a requests.Session collect cookies from a first request and
# replay them automatically on a second request to a JSON endpoint.
if __name__ == '__main__':
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    url = 'https://xueqiu.com/statuses/hot/listV2.json'
    params = {
        'since_id': '-1',
        'max_id': '460965',
        'size': '15',
    }
    # Start from an empty session object.
    session = requests.Session()
    # First request: hit the home page so the cookies set by the server are
    # stored on the session.
    session.get(url='https://xueqiu.com/', headers=header)
    # Second request: reuse the session so the stored cookies are sent to the
    # target endpoint.
    response = session.get(url=url, headers=header, params=params)
    # Fail with a clear HTTP error instead of an obscure parsing error below.
    response.raise_for_status()
    response.encoding = 'utf-8'
    # 'items' can be absent from the payload; treat that as an empty result
    # rather than iterating over None.
    for item in response.json().get('items') or []:
        status_user = item['original_status']['user']
        user = status_user['screen_name']
        # NOTE(review): this reads the *user's* description, although the
        # variable is called `content` — confirm this is the intended field.
        content = status_user['description']
        print(f'{user}:{content}')

注意:session对象至少需要发起两次请求——第一次请求用于获取服务器返回的cookie并保存在session对象中,第二次请求才会携带该cookie访问目标网址

六、代理操作

import random

import requests

from lxml import etree

# Demo: build a pool of proxies from a proxy-provider API and use a randomly
# chosen proxy for every request to the target page.
if __name__ == '__main__':
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    # URL of the API that hands out proxy IPs.
    proxy_url = 'http://webapi.http.zhimacangku.com/getip'
    proxy_json = requests.get(url=proxy_url, headers=header).json()
    # Build the pool as a list of requests-style proxy mappings,
    # i.e. {'http/https': 'ip:port'}. A payload without 'data' yields an
    # empty pool instead of a KeyError.
    proxy_pool = [
        {'https': f"{proxy['ip']}:{proxy['port']}"}
        for proxy in proxy_json.get('data') or []
    ]
    # random.choice() raises an opaque IndexError on an empty list, so stop
    # early with an explicit message.
    if not proxy_pool:
        raise SystemExit(f'no proxies returned by {proxy_url}: {proxy_json}')

    # The target URL never changes, so define it once outside the loop.
    url = 'https://www.kuaidaili.com/free/inha/1/'
    # NOTE(review): this loop never exits and has no delay between requests —
    # confirm that endless polling is intended for this demo.
    while True:
        # Pick a random proxy from the pool for this request.
        response = requests.get(url=url, headers=header, proxies=random.choice(proxy_pool))
        response.encoding = 'utf-8'
        tree = etree.HTML(response.text)
        # Every table row except the first one (position()>1).
        tr_list = tree.xpath('//*[@id="list"]/table//tr[position()>1]')
        for tr in tr_list:
            address = tr.xpath('./td[5]/text()')[0]
            ip = tr.xpath('./td[1]/text()')[0]
            port = tr.xpath('./td[2]/text()')[0]
            print(f'{address} {ip}:{port}')

七、编码问题

# Encoding issue
import requests

response = requests.get('http://www.autohome.com/news')
# Per the original author's note, this site returns gb2312-encoded pages while
# requests defaults to ISO-8859-1, so the Chinese text is garbled unless the
# encoding is set to gbk before reading `response.text`.
response.encoding = 'gbk'
print(response.text)

八、案例

(一)古诗词网模拟登陆

import requests

from lxml import etree

from verification_code import verification_img_code

# Demo: simulated login to so.gushiwen.cn. A single session is used for the
# login page, the captcha image and the login POST so that they share cookies.
if __name__ == '__main__':
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    url = 'https://so.gushiwen.cn/user/login.aspx'

    # Start from an empty session object.
    session = requests.Session()
    # Load the login page; cookies returned by the server stay on the session.
    main_response = session.get(url=url, headers=header)
    main_response.encoding = 'utf-8'
    tree = etree.HTML(main_response.text)
    # Read the dynamic form parameter from the page.
    view_state = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
    # Download the captcha image through the same session.
    img_code_url = 'https://so.gushiwen.cn' + tree.xpath('//*[@id="imgCode"]/@src')[0]
    img_code_content = session.get(url=img_code_url, headers=header).content
    with open('./img_code.jpg', 'wb') as fp:
        fp.write(img_code_content)
    # Recognise the captcha with the project's helper.
    # NOTE(review): the helper is defined elsewhere; the meaning of the second
    # argument (3) is not visible here — confirm against its definition.
    img_code = verification_img_code('./img_code.jpg', 3)

    data = {
        '__VIEWSTATE': view_state,
        # NOTE(review): hard-coded, unlike __VIEWSTATE — confirm this value is
        # stable for the login page.
        '__VIEWSTATEGENERATOR': 'C93BE1AE',
        'from': '',
        'email': '*******',
        'pwd': '*********',
        'code': img_code,
        'denglu': '登录'
    }
    # Send the login POST and save the returned page for manual inspection.
    login_response = session.post(url=url, data=data, headers=header)
    with open('./test.html', 'w', encoding='utf-8') as fp:
        fp.write(login_response.text)

(二)爬取防盗链图片

import os

import requests

from lxml import etree

# Demo: download hotlink-protected images from a Sina blog post by sending a
# Referer header with every request.
if __name__ == '__main__':
    url = 'https://blog.sina.com.cn/s/blog_19d48b90a01030dym.html?tj=1'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        # Sending a Referer is what allows the hotlink-protected images to be
        # fetched; reuse the page URL so the two can never drift apart.
        'Referer': url
    }
    response = requests.get(url=url, headers=header)
    response.encoding = 'utf-8'
    tree = etree.HTML(response.text)
    div_list = tree.xpath('//*[@id="sina_keyword_ad_area2"]/div')
    # Collect the real download address of every image (the `real_src`
    # attribute of each <img>).
    real_link_list = []
    for div in div_list:
        real_link_list.extend(div.xpath('./a/img/@real_src'))

    # Make sure the output directory exists; open() would otherwise raise
    # FileNotFoundError on the first image.
    save_dir = './sina_img'
    os.makedirs(save_dir, exist_ok=True)

    for i, img in enumerate(real_link_list):
        img_content = requests.get(url=img, headers=header).content
        with open(f'{save_dir}/{i}.jpg', 'wb') as fp:
            fp.write(img_content)
        # Files are numbered from 0, the progress message counts from 1.
        print(f'{i + 1} 下载成功!')
posted @ 2023-03-04 03:28  与鹿逐秋  阅读(36)  评论(0编辑  收藏  举报