urllib的一些资料

import urllib.request
# Send the request. The response stays bound to `file` because the snippets
# further down call info()/getcode()/geturl() on it.
file = urllib.request.urlopen('http://www.baidu.com')
# read() consumes the whole body as bytes. readline() returns one line and
# readlines() returns all lines as a list, but all three share one stream
# position, so a readline() issued after read() can only ever return b"".
data = file.read()
# Take the first line (up to and including the first b"\n") from the bytes
# already read instead of re-reading the exhausted stream; fall back to the
# whole body when it contains no newline.
dataline = data[:data.find(b"\n") + 1] or data
# Save the page locally under a web-page extension such as *.html.
# The context manager closes the file even if write() raises.
with open("D:/Python35/myweb/part4/1.html", "wb") as fhandle:
    fhandle.write(data)

"""或者使用urllib.request.urlretrieve(url, filename='本地文件地址')"""
# urlretrieve() downloads the URL straight into a local file; `filename`
# receives urlretrieve()'s return value.
filename = urllib.request.urlretrieve(
    url='http://edu.51cto.com',
    filename='D://Python35/myweb/part4/2.html',
)
# urlretrieve() can leave cached temporary data behind; urlcleanup() clears it.
urllib.request.urlcleanup()

"""除此之外，urllib中还有一些常见的用法"""
# Return the meta information (headers) attached to the response
file.info() # <http.client.HTTPMessage object at 0x000000003623D68>
# Return the HTTP status code
file.getcode() # 200
# Return the URL that was fetched
file.geturl() # 'http://www.baidu.com'
# quote() percent-encodes characters that are not allowed verbatim in a URL.
# "/" is in quote()'s default safe set, so only the ":" is escaped here.
# (Both helpers are documented under urllib.parse.)
encoded = urllib.request.quote("http://www.sina.com.cn")  # "http%3A//www.sina.com.cn"
# unquote() reverses the encoding; it is fed the real output of quote() so the
# round trip restores the original URL.
decoded = urllib.request.unquote(encoded)  # "http://www.sina.com.cn"

"""爬取csdn的网站，设置headers"""
import urllib.request
import urllib.error
url = 'http://blog.csdn.net/weiwei_pig/article/details/51178225'
# A plain urlopen() sends urllib's default headers and is expected to be
# rejected by this site. Report the failure instead of letting it abort the
# script before the working approaches below get a chance to run.
try:
    file = urllib.request.urlopen(url)
except urllib.error.URLError as e:
    print('出现异常-->'+str(e))
# Method 1: build an opener and replace its header list with a browser-like
# User-Agent before opening the URL.
headers = ('User-Agent','Mozilla/5.0.............')
opener = urllib.request.build_opener()
opener.addheaders = [headers]
data = opener.open(url).read()
# Saved as 3.html, following the numbered *.html naming of the other snippets.
# The context manager closes the file even if write() raises.
with open('D:/Python35/myweb/part4/3.html', 'wb') as fhandle:
    fhandle.write(data)
# Method 2: create a Request object and attach the header with add_header().
import urllib.request
url = 'http://blog.csdn.net/weiwei_pig/article/details/51178226'
# The class is spelled Request (capital R); urllib.request has no attribute
# named `request`, so the lowercase spelling raises AttributeError.
req = urllib.request.Request(url)
req.add_header('User-Agent', 'Mozilla/5.0...................')
data = urllib.request.urlopen(req).read()

"""超时设置timeout"""
import urllib.request
# Request the same page 99 times with a one-second timeout and print either
# the body length or the error that occurred.
for i in range(1,100):
    try:
        # timeout=1 bounds how long urlopen() may block, in seconds. The
        # `with` block closes the response as soon as the body has been read.
        with urllib.request.urlopen('http://yum.iqianyue.com', timeout=1) as file:
            data = file.read()
        print(len(data))
    except Exception as e:
        # Deliberately broad: the demo's purpose is to display whatever goes
        # wrong (timeout, connection error, ...) and move on to the next try.
        print('出现异常-->'+str(e))

"""HTTP请求实战"""
"""GET请求实例，请求百度"""
import urllib.request
# GET request: the query field is appended directly to the URL.
keywd = 'hello'
url = 'http://www.baidu.com/s?wd=' + keywd
req = urllib.request.Request(url)
data = urllib.request.urlopen(req).read()
# Directory is part4, like every other snippet's output path. The context
# manager closes the file even if write() raises.
with open('D://Python35/myweb/part4/4.html', 'wb') as fhandle:
    fhandle.write(data)
# If the keyword is changed to non-ASCII text such as '中文', putting it into
# the URL as-is fails with an encoding error. Percent-encode it with quote()
# first, as shown below.
import urllib.request
url = 'http://www.baidu.com/s?wd='
key = '中文'
key_code = urllib.request.quote(key)
url_all = url + key_code
req = urllib.request.Request(url_all)
data = urllib.request.urlopen(req).read()
# The context manager closes the file even if write() raises.
with open('D:/Python35/myweb/part4/5.html', 'wb') as fh:
    fh.write(data)

"""
总结一下就是：
1.构建对应的URL地址，该URL地址包含GET请求的字段名和字段信息，并且URL地址满足GET请求的格式，即“http://网址?字段名1=字段内容1&字段名2=字段内容2”
2.以对应的URL为参数，构建Request对象。
3.通过urlopen()打开构建Request对象
4.按需进行后续的处理操作，比如读取网页的内容read()、将内容写入文件
"""

"""POST请求实例"""
# 测试使用：http://www.iqianyue.com/mypost/
# 1.设置好URL网址
# 2.构建表单数据，并使用urllib.parse.urlencode对数据进行编码处理
# 3.创建Request对象，参数包括URL地址和要传递的数据。
# 4.使用add_header()添加头信息，模拟浏览器进行爬取
# 5.使用urllib.request.urlopen()打开相应的Request对象，完成信息的传递
# 6.后续处理，比如读取网页内容、将内容写入文件
import urllib.request
import urllib.parse
url = 'http://iqianyue.com/mypost/'
# urlencode() turns the form fields into a query string; encode('utf-8')
# converts it to bytes, which is what Request needs for a POST body.
postdata = urllib.parse.urlencode({"name":"ceo@iqianyue.com","pass":"aA123456"}).encode('utf-8')
# Passing data to Request makes this a POST.
req = urllib.request.Request(url, postdata)
req.add_header('User-Agent','Mozilla/5.0.............')
# In Python 3 urlopen lives in urllib.request; there is no urllib.urlopen.
data = urllib.request.urlopen(req).read()
# The body is bytes, so the file must be opened for binary writing (the
# default mode is read-only text). Directory is part4, like the other snippets.
with open('D:/Python35/myweb/part4/6.html', 'wb') as fhandle:
    fhandle.write(data)

"""代理服务器的设置：http://yum.iqianyue.com/proxy"""
def use_proxy(proxy_addr, url):
    """Fetch *url* through an HTTP proxy and return the body as text.

    Args:
        proxy_addr: Proxy server address, e.g. "202.75.210.45:7777". It is
            registered for the 'http' scheme only.
        url: The URL to open.

    Returns:
        The response body decoded as UTF-8.

    Note:
        The opener is installed globally with install_opener(), so every
        later urllib.request.urlopen() call in the process uses it as well.
    """
    import urllib.request
    proxy = urllib.request.ProxyHandler({'http':proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    # Close the response once the body has been read.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')
# Example: fetch the Baidu home page through the proxy and print the length of
# the decoded text.
proxy_addr = "202.75.210.45:7777"
data = use_proxy(proxy_addr, 'http://www.baidu.com')
print(len(data))
# urllib.request.ProxyHandler() holds the proxy settings; the format is urllib.request.ProxyHandler({'http': 'proxy server address'})
# urllib.request.build_opener() creates a custom opener object; the first argument is the proxy handler, the second is the urllib.request.HTTPHandler class
# urllib.request.install_opener() makes that opener the global default, so urlopen() uses it too; afterwards just call urllib.request.urlopen() on the target URL

"""DebugLog实战"""
# DebugLog recipe:
#   1. create an HTTPHandler and an HTTPSHandler, each with debuglevel set to 1;
#   2. pass both handlers to build_opener() to get a custom opener;
#   3. install_opener() makes it the global default, so urlopen() uses it too;
#   4. carry on as usual, e.g. call urlopen().
import urllib.request
httphd, httpshd = (
    handler_cls(debuglevel=1)
    for handler_cls in (urllib.request.HTTPHandler, urllib.request.HTTPSHandler)
)
opener = urllib.request.build_opener(httphd, httpshd)
urllib.request.install_opener(opener)
# `data` is bound to the response object itself; the body is not read here.
data = urllib.request.urlopen('http://edu.51cto.com')

"""异常处理神器——URLError实战"""
# Two exception classes: URLError, and its subclass HTTPError.
import urllib.request
import urllib.error
try:
    urllib.request.urlopen('http://blog.csdn.net')
except urllib.error.URLError as e:
    # Only HTTPError carries a status code; a plain URLError (for example a
    # failed connection) has just .reason, so check before printing.
    if hasattr(e, 'code'):
        print(e.code)
    if hasattr(e, 'reason'):
        print(e.reason)
# 一般来说产生URLError的原因有：
# 1.连接不上服务器
# 2.远程URL不存在
# 3.无网络
# 4.触发了HTTPError
# 我们可以将上面的URLError替换为HTTPError
"""
200 OK 一切正常
301 Moved Permanently 重新定向到新的URL，永久性
302 Found 重新定向到临时的URL，非永久性
304 Not Modified 请求的资源未更新
400 Bad Request 非法请求
401 Unauthorized 请求未经授权
403 Forbidden 禁止访问
404 Not Found 没有找到对应页面
500 Internal Server Error 服务器内部出现错误
501 Not Implemented 服务器不支持实现请求所需要的功能
"""

# Regular expressions: one minimal working call per function (each of them
# needs at least a pattern and a string; calling with no arguments raises
# TypeError).
import re

sample_text = "python3 and python2"
# re.match() only matches at the start of the string; .span() returns the
# (start, end) indexes of the match. When the start does not match, the
# result is None.
match_span = re.match(r"python", sample_text).span()  # (0, 6)
no_match = re.match(r"and", sample_text)  # None
# re.search() scans the whole string and returns the first match.
search_span = re.search(r"and", sample_text).span()  # (8, 11)
# re.compile() precompiles the pattern; findall() on the compiled pattern
# returns every match as a list.
compiled_pattern = re.compile(r"python\d")
found_all = compiled_pattern.findall(sample_text)  # ['python3', 'python2']
# re.sub(pattern, replacement, source string, count) replaces matches;
# count limits how many replacements are made.
replaced = re.sub(r"python", "py", sample_text, count=1)  # 'py3 and python2'

四个模块
urllib.request 请求模块
urllib.error 异常处理模块
urllib.parse url解析模块
urllib.robotparser robots.txt解析模块
Python2：import urllib2
response = urllib2.urlopen('http://www.baidu.com')
Python3 import urllib.request
response = urllib.request.urlopen('http://www.baidu.com')

urlopen（url，data，timeout）
url:目标网址
data:post请求传输的数据
timeout:超时设置

response：
获取响应数据：read(),读取到网页源码的2进制数据response.read().decode()
响应类型，HTTPResponse
状态码：response.status
响应头：response.getheaders() 获取列表（列表内元组（元组内是对应的数据））
response.getheader('Server') 获取对应的值

request：
创建request对象：request = urllib.request.Request('http://www.baidu.com')
response = urllib.request.urlopen(request)
添加请求头，数据以及method
# 方法一：
url = ''
headers = {'UA':'Mozila'}
dict = {'name':'Germey'}
data = bytes(parse.urlencode(dict), encoding='utf8')
req = request.Request(url,data=data,headers=headers,method='POST')
response = request.urlopen(req)
# 方法二：
req.add_header('UA','Mozilla....')

handler:
proxy_handler = urllib.request.ProxyHandler({'http': 'http://代理地址:端口', 'https': 'https://代理地址:端口'})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://www.baidu.com')

cookie:
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
http.cookiejar.MozillaCookieJar / LWPCookieJar 有save方法（保存到文件）和load方法（从文件读取）

异常处理：

posted @ 2018-12-22 13:04 Ksitigarbha 阅读(124) 评论(0) 收藏举报

Ksitigarbha

urllib的一些资料

公告