urllib模块
1、urllib介绍
除了requests模块可以发送请求之外, urllib模块也可以实现请求的发送,只是操作方法略有不同!
urllib在python2中分为urllib和urllib2,在python3中统一为urllib(包含urllib.request、urllib.parse等子模块)
2、urllib的基本方法介绍
2.1 urllib.request.Request
构造简单请求
import urllib.request

url = 'http://www.baidu.com'

# Send a GET request. Using the response as a context manager guarantees the
# underlying connection is closed once we are done reading from it.
with urllib.request.urlopen(url) as response:
    # HTTP status code of the response
    print(response.getcode())
    # URL of the resource that was retrieved
    print(response.geturl())
    # Response headers (not request headers) as (name, value) pairs
    print(response.getheaders())
    # read() returns the body as bytes, so decode it to obtain text
    print(response.read().decode('UTF-8'))

# Download the page and save it to a local file named baidu.html
urllib.request.urlretrieve(url, filename='baidu.html')
传入headers参数
# A bare "import urllib" does not load the urllib.request submodule, so the
# submodule has to be imported explicitly.
import urllib.request

url = 'http://www.baidu.com'
# Build the request headers
headers = {"User-Agent" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
# Build a request object that carries the custom headers
request = urllib.request.Request(url, headers=headers)
# Send the request (call response.close() once the response is no longer needed)
response = urllib.request.urlopen(request)
传入data参数 实现发送post请求(示例)
import urllib.request
import urllib.parse
import json

# Target address of the POST request
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
headers = {
    'User-Agent':' Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}
# Form fields that the browser submits with this request
form_data = {
    'cname': '',
    'pid': '',
    'keyword': '北京',
    'pageIndex': 1,
    'pageSize': 10,
}
# URL-encode the form and convert it to bytes; urllib sends a POST request
# whenever a data argument is supplied.
encoded_form = urllib.parse.urlencode(form_data).encode('UTF-8')
request = urllib.request.Request(url, data=encoded_form, headers=headers)
# The context manager closes the connection after the body has been read
with urllib.request.urlopen(request) as response:
    # The body is JSON text; parse it into Python objects
    data = json.loads(response.read().decode('UTF-8'))
# Print every item found under the 'Table1' key on its own line
for row in data['Table1']:
    print(row)
抓取多页数据-方法一
import urllib.request
import urllib.parse
import json

# Target address of the POST request
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
headers = {
    'User-Agent':' Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}
# Request pages 1 through 10, one POST per page
for page in range(1, 11):
    # Form fields for this page; only pageIndex changes between iterations
    form_data = {
        'cname': '',
        'pid': '',
        'keyword': '北京',
        'pageIndex': page,
        'pageSize': 10,
    }
    # URL-encode the form and convert it to bytes; supplying data makes
    # urllib send a POST request.
    encoded_form = urllib.parse.urlencode(form_data).encode('UTF-8')
    request = urllib.request.Request(url, data=encoded_form, headers=headers)
    # The context manager closes each connection before the next page is fetched
    with urllib.request.urlopen(request) as response:
        # The body is JSON text; parse it into Python objects
        data = json.loads(response.read().decode('UTF-8'))
    print(f'第{page}页数据')
    # Print every item found under the 'Table1' key on its own line
    for row in data['Table1']:
        print(row)
抓取多页数据-方法二
import urllib.request
import urllib.parse
import json

# Target address of the POST request
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
headers = {
    'User-Agent':' Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}
# Form fields that the browser submits; instead of looping over pages, a
# single request asks for 100 records at once via pageSize.
form_data = {
    'cname': '',
    'pid': '',
    'keyword': '北京',
    'pageIndex': 1,
    'pageSize': 100,
}
# URL-encode the form and convert it to bytes; supplying data makes urllib
# send a POST request.
encoded_form = urllib.parse.urlencode(form_data).encode('UTF-8')
request = urllib.request.Request(url, data=encoded_form, headers=headers)
# The context manager closes the connection after the body has been read
with urllib.request.urlopen(request) as response:
    # The body is JSON text; parse it into Python objects
    data = json.loads(response.read().decode('UTF-8'))
# Show the whole 'Table1' value first, then each of its items on its own line
print(data['Table1'])
for row in data['Table1']:
    print(row)
URL解码(unquote)-把百分号编码后的字符串还原成原始文本
import urllib.parse
import urllib.request

url = 'https://www.baidu.com/s?wd=%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4'
# unquote() is documented in urllib.parse; it replaces every %XX escape with
# the character it encodes (decoded as UTF-8 by default).
decoded_url = urllib.parse.unquote(url)
print(decoded_url)
URL编码(quote)-把原始文本转成百分号编码后的字符串
import urllib.parse
import urllib.request

base_url = 'https://www.baidu.com/s?wd='
key_word = '迪丽热巴'
# quote() is documented in urllib.parse; it percent-encodes the keyword so
# the non-ASCII text can be placed safely in the query string.
url = base_url + urllib.parse.quote(key_word)
headers = {
    'User-Agent':' Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}
# Build the request object
request = urllib.request.Request(url, headers=headers)
# The context manager closes the connection after the body has been read
with urllib.request.urlopen(request) as response:
    print(response.read().decode('utf-8'))
    # To save the page instead of printing it, write the bytes returned by
    # response.read() to a file such as 'baidu.html' opened in 'wb' mode.
2.2 response.read()
获取响应的html内容,返回值为bytes类型,需要调用decode()才能得到字符串
import urllib.request

# Send the request; the context manager closes the connection afterwards
with urllib.request.urlopen("http://www.baidu.com") as response:
    # read() returns the response body as bytes
    html_bytes = response.read()
3、urllib请求百度首页的完整例子
# A bare "import urllib" does not load the urllib.request submodule, so the
# submodule has to be imported explicitly.
import urllib.request
import json

url = 'http://www.baidu.com'
# Build the request headers
headers = {"User-Agent" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
# Build a request object that carries the custom headers
request = urllib.request.Request(url, headers=headers)
# Send the request; the context manager closes the connection afterwards
with urllib.request.urlopen(request) as response:
    # read() returns bytes, so decode them to obtain the HTML text
    html_str = response.read().decode('utf-8')
print(html_str)
浙公网安备 33010602011771号