[Web Scraping] Using the Basic Library: urllib

urllib

'''
    The urllib library consists of four modules:
        request: the most basic HTTP request module
        error: the exception handling module
        parse: a utility module for parsing and manipulating URLs
        robotparser: parses robots.txt to decide which pages a site allows you to crawl; rarely used
'''
import urllib.request
import urllib.parse
# 1. Sending requests
'''
    urlopen()
    * The response it returns is an HTTPResponse object
    Parameters:
        url: the URL (required)
        data=None: if supplied, it must be bytes; convert the parameters into a byte stream first
                   (e.g. with urlencode() plus bytes()). Passing data switches the request to POST.
        timeout: timeout in seconds; if no response arrives within this time, an exception is raised

'''
response = urllib.request.urlopen("https://www.runoob.com/python/att-string-decode.html")
# read() returns the response body; the status attribute gives the response status code
print(response.read().decode("utf-8"))
print(response.status)
print(response.getheaders())
print(response.getheader('Server'))
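# A few more useful members of the HTTPResponse object, beyond those shown above
# (an extra note, not from the original):
print(response.geturl())     # the URL that was actually fetched
print(response.reason)       # reason phrase, e.g. 'OK'
print(response.version)      # HTTP protocol version, e.g. 11 for HTTP/1.1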
# Parameter: data
data = bytes(urllib.parse.urlencode({'name':'germey'}), encoding='utf-8')
response = urllib.request.urlopen('https://www.httpbin.org/post', data=data)
print(response.read().decode('utf-8'))
# Parameter: timeout
response = urllib.request.urlopen("https://www.httpbin.org/post", timeout=0.1)
print(response.read())              # such a short timeout raises a timeout error (urllib.error.URLError)

'''
    The Request class
    Constructor parameters:
        url: required, the URL
        data: if passed, must be bytes; if the data is a dict, encode it first with
              urllib.parse.urlencode
        headers: request headers (a dict); headers can also be added with add_header()
        origin_req_host: host name or IP address of the requesting side
        unverifiable: whether the request is unverifiable; defaults to False
        method: the HTTP method to use, as a string, e.g. GET or POST
'''
# Basic usage
request = urllib.request.Request(
    "https://www.runoob.com/python/att-string-decode.html")
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
# Building a Request with several parameters
url = "https://www.httpbin.org/post"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29',
    'Host': 'www.httpbin.org'
}
params = {'name': 'germey'}
data = bytes(urllib.parse.urlencode(params), encoding='utf-8')
req = urllib.request.Request(
    url=url, data=data, headers=headers, method='POST')
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))
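# The Request docstring above also mentions add_header(): headers can be attached to an
# existing Request one at a time instead of passing a dict. A minimal sketch reusing the
# url and data defined above (the User-Agent string is just a placeholder):
req = urllib.request.Request(url=url, data=data, method='POST')
req.add_header('User-Agent', 'Mozilla/5.0 (compatible; ExampleCrawler/1.0)')
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))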

'''
    Advanced usage (e.g. login authentication, cookies, proxy settings)
    Use Handlers.
    Handlers take care of login authentication, cookie handling and proxy settings, among them:
    * HTTPDefaultErrorHandler handles HTTP error responses
    * HTTPRedirectHandler handles redirects
    * HTTPCookieProcessor handles cookies
    * ProxyHandler handles proxy settings
    * HTTPPasswordMgr manages passwords; it keeps a table of usernames and passwords
    * HTTPBasicAuthHandler handles basic authentication
    All of these classes inherit from BaseHandler
'''


# Authentication: this uses another important class, OpenerDirector (returned by build_opener)
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
from urllib.error import URLError

username = 'admin'
password = 'admin'
url = 'https://ssr3.scrape.center/'

p = HTTPPasswordMgrWithDefaultRealm()
p.add_password(None, url, username, password)
auth_handler = HTTPBasicAuthHandler(p)
opener = build_opener(auth_handler)

try:
    result = opener.open(url)
    html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)

# Proxies: a proxy can be added like this.
# (Doesn't work on my machine at the moment; error:
#  [WinError 10061] No connection could be made because the target machine actively refused it.)
from urllib.request import ProxyHandler

proxy_handler = ProxyHandler({
    'http':'http://127.0.0.1:12220',
    'https':'https://127.0.0.1:12220'
})

opener = build_opener(proxy_handler)
try:
    response = opener.open('https://www.baidu.com')
    print(response.read().decode('utf-8'))
except URLError as e:
    print(e.reason)

# Cookie
import http.cookiejar, urllib.request
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = build_opener(handler)
response = opener.open('https://www.cnblogs.com/RickSchanze/')
for item in cookie:
    print(item.name + "=" + item.value)

'''
    Writing cookies to a file
    Use MozillaCookieJar (a subclass of CookieJar) when the cookies need to be saved to a file
'''
import urllib.request, http.cookiejar
filename = 'cookie.txt'
# MozillaCookieJar writes the Mozilla/Netscape cookie file format:
# cookie = http.cookiejar.MozillaCookieJar(filename)
# To save in LWP format instead, use LWPCookieJar:
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("https://blog.csdn.net/holyjesus/article/details/100835712")
cookie.save(ignore_discard=True, ignore_expires=True)
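# To reuse the cookies in a later session, load the file back into a cookie jar.
# A minimal sketch, assuming cookie.txt was saved in LWP format as above:
cookie = http.cookiejar.LWPCookieJar()
cookie.load(filename, ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("https://blog.csdn.net/holyjesus/article/details/100835712")
print(response.status)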

'''
    Handling exceptions
    URLError: inherits from OSError and is the base class of exceptions in the urllib.error module
    HTTPError: a subclass of URLError, dedicated to HTTP request errors
        HTTPError has three attributes:
            code: the HTTP status code
            reason: the reason for the error
            headers: the response headers
'''

# URLError
from urllib import request, error
try:
    response = request.urlopen("https://cuiqingcai.com/404")
except error.URLError as e:
    print(e.reason)     # Not Found

# HTTPError
try:
    response = request.urlopen("https://cuiqingcai.com/404")
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep="\n")

# A better pattern: catch HTTPError first and fall back to URLError, since URLError is the parent class of HTTPError
try:
    response = request.urlopen("https://cuiqingcai.com/404")
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep="\n")
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')

# Sometimes reason is not a string but an object (e.g. socket.timeout)
import socket
try:
    response = urllib.request.urlopen("https://www.baidu.com", timeout=0.01)
except urllib.error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

Parsing URLs

'''
    Parsing URLs
        Use urllib.parse
    urlparse()
        Return value: a named tuple; fields can be accessed by attribute name or by index
        urlstring: required, the URL to parse
        scheme: default scheme, used only if the URL being parsed does not carry one itself
        allow_fragments: whether to parse the fragment; if False, the fragment part is not parsed
        separately but treated as part of path, params or query, and the fragment field stays empty
    urlunparse()
        Constructs a URL; accepts an iterable whose length must be exactly 6
    urlsplit()
        Very similar to urlparse(), but does not parse params separately; params are merged into
        path, so only 5 components are returned
    urlunsplit()
        Very similar to urlunparse(); the only difference is that the iterable must have length 5
    urljoin()
        First argument is base_url, second is the new link
        base_url supplies three components: scheme, netloc and path; whichever of these the new
        link is missing is filled in from base_url; if the new link has them, its own values win
        and those in base_url have no effect
    urlencode()
        Serializes a dict into a query string
    parse_qs()
        Converts GET request parameters back into a dict
    parse_qsl()
        Converts GET request parameters into a list of tuples
    quote()
        Percent-encodes a parameter (e.g. Chinese text → URL encoding)
    unquote()
        Decodes URL encoding back to the original text (e.g. back to Chinese)
'''

# urlparse()
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result))
print(result)

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)

result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)

# urlunparse()
from urllib.parse import urlunparse

data = ['https', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))

# urlsplit()
from urllib.parse import urlsplit
result = urlsplit('http://www.baidu.com/index.html;user?id=5#comment')
print(result)

# urlunsplit()
from urllib.parse import urlunsplit

data = ['https', 'www.baidu.com', 'index.html', 'a=6', 'comment']
print(urlunsplit(data))
print("\n")

# urljoin()
from urllib.parse import urljoin

print(urljoin('https://www.baidu.com', 'FAQ.html'))
print(urljoin('https://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin("https://www.baidu.com/about.html", "https://cuiqingcai.com/FAQ.html"))
print(urljoin("https://www.baidu.com/about.html", "https://cuiqingcai.com/FAQ.html?question=2"))
print(urljoin("https://www.baidu.com?wd=abc", "https://cuiqingcai.com/index.php"))
print(urljoin("https://www.baidu.com", "?category=2#comment"))
print(urljoin("www.baidu.com", "?category=2#comment"))
print(urljoin("www.baidu.com#comment", "?category=2"))

# urlencode()
from urllib.parse import urlencode
params = {
    'name':'germey',
    'age':25
}
base_url = "https://www.baidu.com?"
url = base_url + urlencode(params)
print(url)
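# An extra note, not from the original: when a value is a sequence, pass doseq=True so each
# element becomes its own key=value pair in the query string.
params = {'name': 'germey', 'hobby': ['reading', 'coding']}
print(urlencode(params, doseq=True))    # name=germey&hobby=reading&hobby=coding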

# parse_qs
from urllib.parse import parse_qs
query = "name=germey&age=25"
print(parse_qs(query))

# parse_qsl
from urllib.parse import parse_qsl
query = "name=germey&age=25"
print(parse_qsl(query))

# quote()
from urllib.parse import quote
keyword = '壁纸'
url = 'https://www.baidu.com/s?wd=' + quote(keyword)
print(url)

# unquote()
from urllib.parse import unquote
url = "https:www.baidu.com/s?wd=%E5%A3%81%E7%BA%B8"
print(unquote(url))
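# An extra note, not from the original: quote() leaves '/' unescaped by default; pass safe=''
# to percent-encode every reserved character as well.
print(quote('/s?wd=壁纸'))              # '/' is kept; the other characters are encoded
print(quote('/s?wd=壁纸', safe=''))     # everything is percent-encoded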

Analyzing the Robots Protocol

'''
    Analyzing the Robots protocol
    Use urllib.robotparser.RobotFileParser(url='')
    Common methods:
        set_url: sets the URL of the robots.txt file
        read: fetches and reads robots.txt
        parse: parses the lines of a robots.txt file
        can_fetch: first argument is a User-Agent, second is the URL to crawl; returns whether
        the crawler identified by that User-Agent is allowed to fetch the URL
'''
from urllib.robotparser import RobotFileParser
rp = RobotFileParser()
rp.set_url("https://www.baidu.com/robots.txt")
rp.read()
print(rp.can_fetch("Baiduspider", "https://www.baidu.com"))
print(rp.can_fetch("Baiduspider", "https://www.baidu.com/homepage/"))
print(rp.can_fetch("Googlebot", "https://www.baidu.com/homepage/"))

# Alternatively, fetch robots.txt yourself and feed its lines to parse()
rp = RobotFileParser()
rp.parse(urllib.request.urlopen("https://www.baidu.com/robots.txt").read().decode('utf-8').split("\n"))
print(rp.can_fetch("Baiduspider", "https://www.baidu.com"))
print(rp.can_fetch("Baiduspider", "https://www.baidu.com/homepage/"))
print(rp.can_fetch("Googlebot", "https://www.baidu.com/homepage/"))
print(urllib.request.urlopen("https://www.baidu.com/robots.txt").read().decode('utf-8'))
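# Not covered in the original notes: RobotFileParser also provides crawl_delay() and
# request_rate() (Python 3.6+), which return the Crawl-delay / Request-rate rules for a
# given User-Agent, or None if robots.txt does not define them.
print(rp.crawl_delay("Baiduspider"))
print(rp.request_rate("Baiduspider"))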

  I've realized that keeping exhaustive notes like this isn't practical; from now on I'll only post hands-on code.
