Python -- Web Scraping Basics

urllib2 (merged into urllib.request in Python 3)

Using urllib2

from urllib.request import *

# Set request headers; two ways
# 1. Pass a headers argument directly to Request
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"}
url = "http://www.baidu.com"    # the URL must start with http:// or https://
request = Request(url, headers=header)

# 2. Call add_header() on the Request object
request = Request(url)
request.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36")
request.get_header("User-Agent")    # read back the User-Agent

response = urlopen(request)

html = response.read()
print(html)
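
The response object carries more than the body; a quick sketch of its status and header accessors, reusing the response above:

print(response.getcode())    # HTTP status code, e.g. 200
print(response.geturl())     # final URL after any redirects
print(response.info())       # response headers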

URL encoding and decoding

When a URL contains Chinese characters, the browser encodes them automatically; when writing a scraper, we have to encode them ourselves.

from urllib.parse import *
# Encode a dict of query parameters
items = {"name": "张三"}
print(urlencode(items))    # result: name=%E5%BC%A0%E4%B8%89

# Decode
print(unquote("name=%E5%BC%A0%E4%B8%89"))    # result: name=张三
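
urlencode only takes key/value pairs; to encode a bare string, say a Chinese word embedded in a URL path, quote is the usual tool. A small sketch:

from urllib.parse import quote, unquote

word = quote("张三")
print(word)             # %E5%BC%A0%E4%B8%89
print(unquote(word))    # 张三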

Skipping HTTPS certificate verification

By default, urlopen verifies the server's SSL certificate when reading an HTTPS site; for certificates not issued by a trusted CA (the 12306 site used to be a classic example), it raises an error, so SSL verification has to be skipped.

from urllib.request import *
import ssl

# Create an SSL context that skips certificate verification
context = ssl._create_unverified_context()
request = Request("https://www.12306.cn")
response = urlopen(request, context=context)

print(response.read())
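
Passing context on every call gets repetitive; a common shortcut (my assumption, not from the original post) is to swap in the unverified context as the process-wide default:

import ssl

# Every urlopen() call after this line skips certificate verification
ssl._create_default_https_context = ssl._create_unverified_context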

Handlers

Because urlopen by itself does not support features such as proxies and cookies, we turn to Handler objects.

  • Adding a proxy
from urllib.request import *

# Proxy on/off switch
proxySwitch = True

# Username and password are optional; the format is "user:password@host:port"
httpProxy = ProxyHandler({"http": "username:password@112.95.224.58:8118"})

# Handler object with no proxy
nullProxy = ProxyHandler({})

if proxySwitch:
    opener = build_opener(httpProxy)
else:
    opener = build_opener(nullProxy)

# Install a global opener; afterwards every request can be sent with urlopen() and still gets the Handler's functionality
install_opener(opener)
request = Request("http://www.baidu.com")
response = urlopen(request)    # without install_opener(), send the request with response = opener.open(request) instead

print(response.read())
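
build_opener accepts any number of handlers, so proxy support can be combined with others; a minimal sketch (not from the original post) pairing the proxy with the cookie handler introduced in the next section:

from http.cookiejar import CookieJar
from urllib.request import ProxyHandler, HTTPCookieProcessor, build_opener

# One opener that routes through the proxy and also tracks cookies
opener = build_opener(
    ProxyHandler({"http": "112.95.224.58:8118"}),
    HTTPCookieProcessor(CookieJar()),
)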

Using CookieJar to simulate a browser login

from http.cookiejar import CookieJar
from urllib.parse import urlencode
from urllib.request import *

cookie = CookieJar()

handler = HTTPCookieProcessor(cookie)
opener = build_opener(handler)
# Add a default header to the opener
opener.addheaders = [("User-Agent", "xxx")]

url = "xxxxx"
# Login username and password
data = {"username": "xxx", "password": "xxx"}
data = urlencode(data).encode("utf-8")    # POST data must be bytes in Python 3
request = Request(url, data=data)
response = opener.open(request)
print(response.read())
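
After the login request succeeds, the CookieJar holds the session cookie, so any later request sent through the same opener carries it automatically; a sketch with a hypothetical profile URL:

# Hypothetical URL for illustration; replace with a page that requires login
profile = opener.open("http://example.com/profile")
print(profile.read())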