Python Network Communication

0x01 The urllib.request Module

Sending a GET Request

To send an HTTP/HTTPS GET request, you can use the Request object from the urllib.request module.

from urllib import request

url = "http://www.baidu.com"              # target URL
req = request.Request(url)                # build the request object
with request.urlopen(req) as response:    # send the request
    data = response.read()                # read the response body
    data = data.decode("utf-8")           # decode it, otherwise the output is garbled
    code = response.getcode()             # get the response status code
    print(code)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~')
    rep_url = response.geturl()           # get the URL the response came from
    print(rep_url)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print(data)
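The Request object also accepts a headers dictionary, which helps when a site rejects the default Python user agent. A minimal sketch, reusing the URL above; the User-Agent string is just an example value:

from urllib import request

url = "http://www.baidu.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}
req = request.Request(url, headers=headers)   # attach headers to the request object
with request.urlopen(req) as response:
    print(response.getcode())
    print(response.read().decode("utf-8"))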

Sending a POST Request

To send an HTTP/HTTPS POST request, the flow is very similar to sending a GET request.

from urllib import request   # send the HTTP request and handle the response
from urllib import parse     # encode a dict of parameters into a URL query string

url = "http://xx.xx.xx.xx/vul/xss/xsspost/post_login.php"   # target URL
params = {"username": "admin", "password": "123456"}        # POST parameters
params_str = parse.urlencode(params)                        # URL-encode the parameters
print(params_str)
params_bytes = params_str.encode()              # convert to bytes: the POST body must be sent as a byte sequence
req = request.Request(url, data=params_bytes)   # build the request object; passing data makes it a POST
with request.urlopen(req) as response:          # send the request
    code = response.getcode()                   # response status code
    print(code)
    data = response.read().decode()             # response body
    print(data)
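urlopen raises an exception when the server replies with an error status or the connection fails, so in practice the call is often wrapped in try/except. A minimal sketch with a placeholder URL:

from urllib import request, error

url = "http://www.baidu.com"   # placeholder URL
try:
    with request.urlopen(url, timeout=5) as response:
        print(response.getcode())
except error.HTTPError as e:    # server answered with a 4xx/5xx status
    print("HTTP error:", e.code)
except error.URLError as e:     # network problem: DNS failure, refused connection, timeout...
    print("URL error:", e.reason)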

Downloading a File

from urllib import request   # send the request and handle the response
import os                    # interact with the OS: path handling, file management

url = "http://www.test.com"                            # target URL (GET request)
base_path = os.path.abspath('.')                       # current working directory
file_path = os.path.join(base_path, "teacher.html")    # path to save the downloaded file
request.urlretrieve(url, filename=file_path)
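urlretrieve also accepts a reporthook callback that is called after each block is fetched, which is handy for printing download progress. A minimal sketch, reusing the placeholder URL and filename from above:

from urllib import request

def progress(block_num, block_size, total_size):
    """Called by urlretrieve after each block is downloaded."""
    downloaded = block_num * block_size
    if total_size > 0:
        percent = min(downloaded / total_size * 100, 100)
        print(f"downloaded {percent:.1f}%")

request.urlretrieve("http://www.test.com", filename="teacher.html", reporthook=progress)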

0x02 The requests Module

We introduced the urllib module above. There is a more "user-friendly" library than urllib: the requests library, which makes it even more convenient to send all kinds of requests.
Install the requests module: pip install requests

Sending a GET Request

A simple request

import requests

url = "http://www.test.com"
res = requests.get(url)
data = res.text
print(data)

Adding request headers

import requests

url = "https://www.test.com"
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
res = requests.get(url, headers=header)   # pass the headers, otherwise they are never sent
data = res.text
print(data)

Adding request parameters

import requests

url = "http://www.test.com/views/detailsp.php"
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
params = {
    'id': 1
}
res = requests.get(url=url, params=params, headers=header)
data = res.text
print(data)

Setting the response character encoding

import requests

url = "http://www.test.com"
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
params = {
    'id': 10
}
res = requests.get(url=url, params=params, headers=header)
data = res.text                      # decoded with the encoding requests guessed (res.encoding)
print(data)
print(res.content.decode("utf-8"))   # raw bytes decoded with an encoding you choose yourself
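If res.text comes out garbled, you can tell requests which encoding to use before reading it, either explicitly or from apparent_encoding (the encoding requests detects from the body itself). A minimal sketch with the same placeholder URL:

import requests

res = requests.get("http://www.test.com")   # placeholder URL
print(res.encoding)                    # encoding guessed from the response headers
res.encoding = res.apparent_encoding   # or set it explicitly, e.g. res.encoding = "utf-8"
print(res.text)                        # now decoded with the encoding we set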

Inspecting the request URL, response encoding, and status code

import requests

url = "http://www.test.com"
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
params = {
    'id': 10
}
res = requests.get(url=url, params=params, headers=header)
print(res.url)           # final URL, including the query string built from params
print(res.encoding)      # response encoding
print(res.status_code)   # HTTP status code

Getting the response cookies

import requests

url = "http://www.test.com"
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
params = {
    'id': 10
}
res = requests.get(url=url, params=params, headers=header)
print(res.url)
print(res.encoding)
print(res.status_code)
cookie_data = list(res.cookies)   # res.cookies is a RequestsCookieJar
for cookie in cookie_data:
    print(cookie.value)
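To send cookies back on a later request, pass them through the cookies parameter, or use a Session, which stores them between requests automatically. A minimal sketch (the /profile path is a made-up example):

import requests

# Option 1: pass the cookie jar from a previous response explicitly
res = requests.get("http://www.test.com")                       # placeholder URL
res2 = requests.get("http://www.test.com/profile", cookies=res.cookies)

# Option 2: a Session keeps cookies between requests automatically
with requests.Session() as s:
    s.get("http://www.test.com")            # cookies set here...
    s.get("http://www.test.com/profile")    # ...are sent again here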

Sending a POST Request

The flow is the same as for GET; only the method name changes to post, and the form fields go in the data parameter.

import requests

url = "http://www.baidu.com"
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
data = {
    'wd': 'python'
}
res = requests.post(url=url, headers=header, data=data)
print(res.url)
print(res.encoding)
print(res.status_code)
print(res.content.decode("utf-8"))
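When the server expects a JSON body rather than form fields, requests can serialize a dict for you via the json parameter, which also sets the Content-Type header. A minimal sketch against the public httpbin echo endpoint:

import requests

payload = {"username": "admin", "password": "123456"}
res = requests.post("http://httpbin.org/post", json=payload)   # body sent as JSON
print(res.status_code)
print(res.json())   # parse the JSON response body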

Using a proxy

When a proxy is used, the request is first sent to the proxy server, the proxy server requests the target server, the target server returns the data to the proxy server, and the proxy server passes the data back to the crawler.
Direct request

import requests

url = "http://httpbin.org/ip"
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
res = requests.get(url=url, headers=header)
print(res.text)
# Result: shows your own IP

Request through a proxy

import requests

url = "http://httpbin.org/ip"
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
proxy = {
    'http': 'http://127.0.0.1:7890',    # proxy used for http:// URLs
    'https': 'http://127.0.0.1:7890'    # proxy used for https:// URLs
}
res = requests.get(url=url, proxies=proxy, headers=header)
print(res.text)
# Result: shows the proxy's IP
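A proxy that is down makes the request hang or fail, so it is worth adding a timeout and catching the exception. A minimal sketch using the same local proxy address as above:

import requests

proxy = {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'}
try:
    res = requests.get("http://httpbin.org/ip", proxies=proxy, timeout=5)
    print(res.text)
except requests.exceptions.RequestException as e:   # covers ProxyError, ConnectTimeout, etc.
    print("request failed:", e)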

A crawling example

Scrape the instructor names and titles from the official site and write them into a CSV file. The code below extracts the data; a sketch of writing it to CSV follows after it.

import requests
import re

teachers = []

def get_teachers(url):
    """
    Fetch a page and extract (name, title) pairs for each teacher.
    :param url: page URL
    :return: list of (name, title) tuples
    """
    res = requests.get(url)
    pattern = r'<h6>(\w+)<span>(\w+)</span></h6>'
    res = re.findall(pattern, res.content.decode('utf-8'))
    return res

for page in range(1, 4):
    url = 'http://www.test.com/teacher'
    if page > 1:
        url = url + '-' + str(page) + '.html'   # pages 2+ are teacher-2.html, teacher-3.html, ...
    else:
        url = url + '.html'                     # page 1 is teacher.html
    teachers += get_teachers(url)

for item in teachers:
    print("{0}:{1}".format(item[0], item[1]))