Network Requests with the urllib Library

# Another approach:
# quote( ) and unquote( )

from urllib import parse
str1 = 'https://www.baidu.com/s?wd=中国'

# quote() percent-encodes a string; safe= lists characters to leave untouched
str2 = parse.quote(str1, safe=':/?=')
print(str2)
# >>>https://www.baidu.com/s?wd=%E4%B8%AD%E5%9B%BD

str3 = parse.unquote(str2)  # unquote() decodes the percent-encoded string
print(str3)
# >>>https://www.baidu.com/s?wd=中国
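
A closely related pair is quote_plus()/unquote_plus(), which additionally encode spaces as '+' the way HTML form data expects; a minimal sketch (not from the original post):

from urllib import parse

s = parse.quote_plus('hello world 中国')   # spaces become '+', non-ASCII is percent-encoded
print(s)                                    # >>>hello+world+%E4%B8%AD%E5%9B%BD
print(parse.unquote_plus(s))                # >>>hello world 中国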

 

Contents:

 1 - urllib: the urlopen function

 2 - urllib: the urlretrieve function

 3 - urllib: parameter encoding and decoding functions

 4 - urllib: the urlparse and urlsplit functions

 5 - [Hands-on] Scraping Lagou job listings with Request

 6 - Using ProxyHandler for proxy IPs

 7 - Simulating login with cookies

 8 - Loading and saving cookie data

 

 1 - urllib: the urlopen function

# urllib ships with Python 3, so there is nothing to install
from urllib import request

rep = request.urlopen('http://www.baidu.com')
# urlopen returns a file-like response object, e.g. <http.client.HTTPResponse object at 0x0000000002CFF048>
print(rep)
# The response object supports read(), readline() and readlines()
print(rep.read(10))
# getcode() returns the HTTP status code
print(rep.getcode())
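
The response object also exposes the status code and headers directly; a small sketch of the same request:

from urllib import request

resp = request.urlopen('http://www.baidu.com')
print(resp.status)                       # HTTP status code, same value as getcode()
print(resp.getheader('Content-Type'))    # a single response header
print(resp.headers)                      # all response headers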

 2 - urllib: the urlretrieve function

# urlretrieve downloads the response straight to a local file (retrieve = fetch back)
request.urlretrieve('http://www.baidu.com','baidu.html')
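
urlretrieve also takes an optional reporthook callback that is invoked after each block is downloaded, which can serve as a simple progress indicator (a hedged sketch; the callback name is my own):

from urllib import request

def show_progress(block_num, block_size, total_size):
    # block_num: blocks downloaded so far; block_size: bytes per block; total_size: total bytes (-1 if unknown)
    if total_size > 0:
        print('%.1f%%' % (block_num * block_size * 100 / total_size))

request.urlretrieve('http://www.baidu.com', 'baidu.html', reporthook=show_progress)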

 3 - urllib: parameter encoding and decoding functions

from urllib import parse

# urlencode encodes a dict of URL parameters into a query string
params = {'name':'张三',"age":18,'greet':'hello world'}
result = parse.urlencode(params)
print(result)


# parse_qs decodes a URL query string into a dict
params = 'wd=%E5%AE%B6%E5%B8%B8%E8%8F%9C'
result = parse.parse_qs(params)
print(result)

>>>name=%E5%BC%A0%E4%B8%89&age=18&greet=hello+world
>>>{'wd': ['家常菜']}
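
If you prefer the parameters as an ordered list of (key, value) pairs instead of a dict of lists, parse_qsl does the same decoding; a quick sketch:

from urllib import parse

qs = 'wd=%E5%AE%B6%E5%B8%B8%E8%8F%9C&pn=2'
print(parse.parse_qsl(qs))   # >>>[('wd', '家常菜'), ('pn', '2')]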

 

 4 - urllib: the urlparse and urlsplit functions

from urllib import parse

# urlparse and urlsplit both split a URL into its components; the only difference is that urlparse has an extra params field, which is rarely used
url = 'http://www.baidu.com/s;hello?wd=python&username=abc#1'

result1 = parse.urlparse(url)
result2 = parse.urlsplit(url)

print(result1)
print(result2)

# print('scheme:', result1.scheme)
# print('netloc:', result1.netloc)
# print('path:', result1.path)
# print('params:', result1.params)   # only available on the urlparse result
# print('query:', result1.query)
# print('fragment:', result1.fragment)

>>>ParseResult(scheme='http', netloc='www.baidu.com', path='/s', params='hello', query='wd=python&username=abc', fragment='1')
>>>SplitResult(scheme='http', netloc='www.baidu.com', path='/s', query='wd=python&username=abc', fragment='1')
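
The split result can also be put back together, and relative URLs can be resolved against a base; a short sketch with urlunsplit and urljoin:

from urllib import parse

parts = parse.urlsplit('http://www.baidu.com/s;hello?wd=python&username=abc#1')
print(parse.urlunsplit(parts))                                  # reassembles the original URL
print(parse.urljoin('http://www.baidu.com/s', '/index.html'))   # >>>http://www.baidu.com/index.html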

5 - [Hands-on] Scraping Lagou job listings with Request

Tip: json.cn can pretty-print JSON data.

from urllib import request,parse

url = '...'
headers = '...'
data = '...'

# Note: data and headers need careful handling; Lagou's anti-scraping mechanism has since changed, so this request may no longer return data and another approach is needed
req = request.Request(url,
                    headers=headers,
                    data=parse.urlencode(data).encode('utf-8'),
                    method='POST')

# Request only builds a request object; it still has to be passed to urlopen to actually fetch the response
resp = request.urlopen(req)
print(resp.read().decode('utf-8'))
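
For a plain GET request, data and method can simply be omitted; headers can also be added to an existing Request with add_header (a minimal sketch):

from urllib import request

req = request.Request('http://www.baidu.com')
req.add_header('User-Agent', 'Mozilla/5.0')   # same effect as passing headers= to Request
resp = request.urlopen(req)
print(resp.getcode())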

6 - Using ProxyHandler for proxy IPs

# For this part, searching for Lagou scraping examples on Baidu helps
# Reference: https://blog.csdn.net/qq_14998713/article/details/79134312
# Quick summary: randomize the request headers, carry cookies and use proxy IPs;
# the site may change again in a few days, so adapt as needed and keep the code modular

from urllib import request, parse


# Test the proxy
url = 'http://httpbin.org/ip'
handler = request.ProxyHandler({"http": "http://119.101.114.32:9999"})
opener = request.build_opener(handler, request.HTTPHandler)
request.install_opener(opener)
print(request.urlopen(url).read())
# Target URL
url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false'
# POST data
data = {
    'first':'true',
    'pn':2,
    'kd':'python'
}
# Request headers
headers = {
    'User-Agent':'"Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0)"',
    # 'Referer':'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    "Cookie": "user_trace_token=20180122190616-4411246f-ff64-11e7-b4a1-525400f775ce; LGUID=20180122190616-44112bfa-ff64-11e7-b4a1-525400f775ce; X_HTTP_TOKEN=2d753a5ac2b3e6e2423fcb4022407518; gate_login_token=197ff91d128acf987e1608ff3bf9333c3c2c1b88eabfedfb; index_location_city=%E6%9D%AD%E5%B7%9E; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=bzclk.baidu.com; PRE_SITE=http%3A%2F%2Fbzclk.baidu.com%2Fadrc.php%3Ft%3D06KL00c00f7Ghk60yUKm0FNkUsKKdyNp00000PW4pNb00000LbFd7H.THL0oUh11x60UWdBmy-bIy9EUyNxTAT0T1Y3nh7bmvcLmH0snj0LryRk0ZRqPjNKwH0LwbN7fH7Awbw7PjKafRDsfbc3PDPKf1I7n1b0mHdL5iuVmv-b5Hnsn1nznjR1njfhTZFEuA-b5HDv0ARqpZwYTZnlQzqLILT8UA7MULR8mvqVQ1qdIAdxTvqdThP-5ydxmvuxmLKYgvF9pywdgLKW0APzm1YzP10LPf%26tpl%3Dtpl_10085_15730_11224%26l%3D1500117464%26attach%3Dlocation%253D%2526linkName%253D%2525E6%2525A0%252587%2525E9%2525A2%252598%2526linkText%253D%2525E3%252580%252590%2525E6%25258B%252589%2525E5%25258B%2525BE%2525E7%2525BD%252591%2525E3%252580%252591%2525E5%2525AE%252598%2525E7%2525BD%252591-%2525E4%2525B8%252593%2525E6%2525B3%2525A8%2525E4%2525BA%252592%2525E8%252581%252594%2525E7%2525BD%252591%2525E8%252581%25258C%2525E4%2525B8%25259A%2525E6%25259C%2525BA%2526xp%253Did%28%252522m6c247d9c%252522%29%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FH2%25255B1%25255D%25252FA%25255B1%25255D%2526linkType%253D%2526checksum%253D220%26ie%3Dutf-8%26f%3D3%26tn%3Dbaiduhome_pg%26wd%3D%25E6%258B%2589%25E9%2592%25A9%25E7%25BD%2591%26oq%3D%2525E6%25258B%252589%2525E5%25258B%2525BE%2525E7%2525BD%252591%2525E7%252588%2525AC%2525E8%252599%2525AB%26rqlang%3Dcn%26prefixsug%3D%2525E6%25258B%252589%2525E9%252592%2525A9%2525E7%2525BD%252591%26rsp%3D1%26inputT%3D277; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F%3Futm_source%3Dm_cf_cpt_baidu_pc; fromsite=bzclk.baidu.com; utm_source=""; JSESSIONID=ABAAABAAAFDABFG399ECCCCEDB8F54778C63A5054EDD7B0; _putrc=D71643F76AF6F41F; login=true; unick=%E6%9D%A8%E5%87%8C%E9%94%8B; _ga=GA1.2.253347541.1516619174; _gid=GA1.2.2037776006.1516619174; _ga=GA1.3.253347541.1516619174; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1516619174,1516619191,1516623170,1516623176; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1516623224; LGSID=20180122201252-91c897a5-ff6d-11e7-a5bc-5254005c3644; LGRID=20180122201345-b1ca9540-ff6d-11e7-a5bc-5254005c3644"
           }
# Build a Request object
rep = request.Request(url=url, data=parse.urlencode(data).encode('utf-8'), headers=headers, method='POST')
# Read and decode the response
response = request.urlopen(rep).read().decode('utf-8')
print(response)
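
Free proxies like the one above die quickly, so it is worth wrapping the request in error handling; a small sketch (not part of the original code):

from urllib import request, error

handler = request.ProxyHandler({"http": "http://119.101.114.32:9999"})  # sample proxy from above, likely expired
opener = request.build_opener(handler)
try:
    resp = opener.open('http://httpbin.org/ip', timeout=10)
    print(resp.read().decode('utf-8'))
except error.URLError as e:
    print('proxy failed:', e.reason)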

7 - Simulating login with cookies

#encoding: utf-8

# Da Peng (Dong Chengpeng)'s profile page: http://www.renren.com/880151247/profile
# Renren login URL: http://www.renren.com/PLogin.do

from urllib import request
from urllib import parse
from http.cookiejar import CookieJar

headers = {
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
}

def get_opener():
    # 1. Log in
    # 1.1 Create a CookieJar object
    # The main classes in http.cookiejar are CookieJar, FileCookieJar, MozillaCookieJar and LWPCookieJar:
    # 1. CookieJar: manages HTTP cookie values, stores cookies generated by HTTP requests and attaches
    #    them to outgoing requests. Everything lives in memory, so the cookies are lost once the
    #    CookieJar instance is garbage-collected.
    # 2. FileCookieJar(filename, delayload=None, policy=None): derived from CookieJar; retrieves cookie
    #    information and stores it in a file. filename is the file to store cookies in; delayload=True
    #    enables lazy access, i.e. the file is only read or written when actually needed.
    # 3. MozillaCookieJar(filename, delayload=None, policy=None): derived from FileCookieJar; creates a
    #    FileCookieJar instance compatible with Mozilla's cookies.txt format.
    # 4. LWPCookieJar(filename, delayload=None, policy=None): derived from FileCookieJar; creates a
    #    FileCookieJar instance compatible with the libwww-perl Set-Cookie3 file format.
    cookiejar = CookieJar()
    # 1.2 Use the cookiejar to create an HTTPCookieProcessor object
    handler = request.HTTPCookieProcessor(cookiejar)
    # 1.3 Use the handler from the previous step to create an opener
    opener = request.build_opener(handler)
    return opener


def login_renren(opener):
    # 1.4 Use the opener to send the login request (Renren email and password)
    data = {
        'email':"970138074@qq.com",
        'password': "pythonspider"
    }
    login_url = "http://www.renren.com/PLogin.do"
    req = request.Request(login_url,data=parse.urlencode(data).encode('utf-8'),headers=headers)
    opener.open(req)


def visit_profile(opener):
    # 2. Visit the profile page
    dapeng_url = "http://www.renren.com/880151247/profile"
    # When fetching the profile page, do not build a new opener;
    # reuse the previous one, because it already carries the
    # cookie information needed for the logged-in session
    req = request.Request(dapeng_url,headers=headers)
    resp = opener.open(req)
    with open('renren.html','w',encoding='utf-8') as fp:
        fp.write(resp.read().decode('utf-8'))


if __name__ == '__main__':
    opener = get_opener()
    login_renren(opener)
    visit_profile(opener)
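
To inspect which cookies the login actually produced, you could keep a reference to the CookieJar next to the opener; a hedged sketch (the two-value return is my own variation, not the original code):

from urllib import request
from http.cookiejar import CookieJar

def get_opener_with_jar():
    cookiejar = CookieJar()
    opener = request.build_opener(request.HTTPCookieProcessor(cookiejar))
    return opener, cookiejar

opener, jar = get_opener_with_jar()
opener.open('http://www.baidu.com')   # any request that sets cookies populates the jar
for cookie in jar:
    print(cookie.name, '=', cookie.value)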

8 - Loading and saving cookie data

Save the cookies to a file so they can be read back and reused later.

from urllib import request
from http.cookiejar import MozillaCookieJar  # Mozilla/Firefox cookies.txt format


cookiejar = MozillaCookieJar('cookie.txt')
# Load cookies from the file
cookiejar.load(ignore_discard=True)
handler = request.HTTPCookieProcessor(cookiejar)
opener = request.build_opener(handler)

resp = opener.open('http://www.baidu.com')

for cookie in cookiejar:
    print(cookie)

# Save: if the filename was already given to MozillaCookieJar above, save() does not need it again;
# ignore_discard=True also saves session cookies that would otherwise be discarded
# cookiejar.save(ignore_discard=True)
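
For completeness, a minimal save-side sketch: run something like this once to produce cookie.txt, which the loading code above can then read:

from urllib import request
from http.cookiejar import MozillaCookieJar

cookiejar = MozillaCookieJar('cookie.txt')
opener = request.build_opener(request.HTTPCookieProcessor(cookiejar))
opener.open('http://www.baidu.com')
# ignore_discard keeps session cookies, ignore_expires keeps already-expired ones
cookiejar.save(ignore_discard=True, ignore_expires=True)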

