Python Crawler #002 urllib
2.1 - Basic usage
# urllib is part of the Python standard library
import urllib
from urllib import request
url = 'http://www.baidu.com/'
response = urllib.request.urlopen(url)
# decode() specifies the character encoding
text = response.read().decode('utf-8')
print(text)
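The response object also carries the status code and the response headers; below is a minimal sketch of reading them for the same URL as above, using the standard response.status and response.getheaders() attributes.
import urllib
from urllib import request
url = 'http://www.baidu.com/'
response = urllib.request.urlopen(url)
# The response object exposes the HTTP status code and the response headers
print(response.status)        # e.g. 200
print(response.getheaders())  # list of (name, value) tuples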
2.2 - Request headers
Method 1:
import urllib
from urllib import request
url = 'https://movie.douban.com/top250'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3730.400 QQBrowser/10.5.3805.400'
}
# Build a Request object that carries the headers
request1 = request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request1)
print(response.read().decode())
Method 2:
import urllib
from urllib import request
url = 'https://movie.douban.com/top250'
request1 = request.Request(url)
# Note: add_header() takes the header name and value as two separate arguments, not a dict
request1.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3730.400 QQBrowser/10.5.3805.400')
response = urllib.request.urlopen(request1)
print(response.read().decode())
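To check that the custom User-Agent is actually sent, one option is to request httpbin.org/headers, which echoes the request headers back; a minimal sketch (the User-Agent string here is just a placeholder):
import urllib
from urllib import request
url = 'http://httpbin.org/headers'
request1 = request.Request(url)
request1.add_header('User-Agent', 'my-test-agent/1.0')  # placeholder UA string
response = urllib.request.urlopen(request1)
# httpbin echoes the headers it received, so the custom User-Agent should appear in the output
print(response.read().decode())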
2.3 - GET with parameters
import urllib
from urllib import request
from urllib import parse
# httpbin.org echoes back GET and POST requests, which makes it handy for testing
url = 'http://httpbin.org/get?%s'
params = {'name':'刘', 'age':20}
# urlencode() converts the key-value pairs into a standard URL-encoded query string
params = urllib.parse.urlencode(params)
# In a GET request the parameters are exposed in the URL itself
response = urllib.request.urlopen(url=url%(params))
print(response.read().decode())
# Tip: a packet-capture tool such as Charles shows the data being sent
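For reference, urlencode() percent-encodes non-ASCII values (UTF-8 by default), so the Chinese name above ends up URL-safe; a quick sketch of just the encoding step:
from urllib import parse
params = {'name': '刘', 'age': 20}
# urlencode() percent-encodes each key and value and joins the pairs with &
print(parse.urlencode(params))  # name=%E5%88%98&age=20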
2.4 - POST with parameters
import urllib
from urllib import request
from urllib import parse
# Note that the URL differs from the GET example
url = 'http://httpbin.org/post'
params = {'name':'刘', 'age':'20'}
# encode() converts the str to bytes, as required by the data argument
params = urllib.parse.urlencode(params).encode()
response = urllib.request.urlopen(url=url, data=params)
print(response.read().decode())
# Tip: a packet-capture tool shows the data sent with the request
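httpbin returns its response as JSON, so the echoed form data can be extracted with the standard json module; a minimal sketch building on the POST example above:
import json
import urllib
from urllib import request
from urllib import parse
url = 'http://httpbin.org/post'
params = urllib.parse.urlencode({'name': '刘', 'age': '20'}).encode()
response = urllib.request.urlopen(url=url, data=params)
# httpbin echoes form-encoded POST fields under the "form" key of its JSON body
data = json.loads(response.read().decode())
print(data['form'])  # {'name': '刘', 'age': '20'}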
2.5 - IP proxies
import urllib
from urllib import request
url = 'http://httpbin.org/ip'
# Without a proxy IP
response = urllib.request.urlopen(url=url)
print('ip:',response.read().decode())
# With a proxy IP (free proxies can be found on sites such as Xici Proxy). Tip: proxies come in HTTP and HTTPS flavors, and the port is separated from the host by a colon
# Build a proxy handler
ph = urllib.request.ProxyHandler({'http':'222.95.240.191:3000'})
opener = urllib.request.build_opener(ph)
# Open the URL through the proxy
response = opener.open(url)
print('proxy ip:', response.read().decode())
# Tip: if the request fails (e.g. the remote host refuses the connection), the proxy IP is usually the problem.
# Some anti-scraping measures ban IPs, which is why proxy IPs are needed.
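If every subsequent request should go through the proxy, the opener can also be installed globally with install_opener(); a minimal sketch (the proxy address is the same placeholder as above and is likely no longer alive):
import urllib
from urllib import request
ph = urllib.request.ProxyHandler({'http': '222.95.240.191:3000'})  # placeholder proxy, likely dead
opener = urllib.request.build_opener(ph)
# install_opener() makes this opener the default for every later urlopen() call
urllib.request.install_opener(opener)
response = urllib.request.urlopen('http://httpbin.org/ip')
print(response.read().decode())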
This article is from cnblogs, author: 枫_Null. Please cite the original link when reposting: https://www.cnblogs.com/fengNull/articles/15488745.html
