The urllib Library -- Hands-On Practice

Using request.urlopen to fetch the Baidu homepage

from urllib.request import urlopen

# urlopen returns an http.client.HTTPResponse object
resp = urlopen('http://www.baidu.com')
print(resp.read().decode('utf8'))
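Besides read(), the response object exposes a few other useful attributes. A minimal sketch (method names are from the standard http.client.HTTPResponse API):

from urllib.request import urlopen

resp = urlopen('http://www.baidu.com')
print(resp.status)                     # HTTP status code, e.g. 200
print(resp.getheader('Content-Type'))  # read a single response header
print(resp.geturl())                   # final URL after any redirects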

Using request.Request to send a User-Agent header that mimics a real browser (the header value is copied from Chrome DevTools: F12 → Network → Doc → Headers)

from urllib.request import urlopen, Request

# Build a Request object so headers can be attached before sending
req = Request('http://www.baidu.com')
# Pretend to be a desktop Chrome browser; some sites block urllib's default User-Agent
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36')
resp = urlopen(req)
print(resp.read().decode('utf8'))
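The same header can also be supplied in one step via the Request constructor's headers parameter; an equivalent sketch:

from urllib.request import urlopen, Request

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'}
req = Request('http://www.baidu.com', headers=headers)
print(urlopen(req).read().decode('utf8'))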

Writing a simple crawler with urllib.request.Request, urllib.request.urlopen, and urllib.parse

from urllib.request import Request
from urllib.request import urlopen
from urllib import parse

req = Request("http://www.thsrc.com.tw/tw/TimeTable/SearchResult")   # Taiwan High Speed Rail timetable search page
# Use parse.urlencode to build the HTTP POST body; the field names and values
# come from Chrome DevTools: F12 --> Network --> Doc --> Headers --> the submitted form data
postData = parse.urlencode([
    ('StartStation', '2f940836-cedc-41ef-8e28-c2336ac8fe68'),
    ('EndStation', 'a7a04c89-900b-4798-95a3-c01c455622f4'),
    ('SearchDate', '2017/01/06'),
    ('SearchTime', '06:00'),
    ('SearchWay', 'DepartureInMandarin')
])
# Add two request headers to mimic a real browser; some sites filter out crawlers
# by inspecting these headers
req.add_header('Origin', 'http://www.thsrc.com.tw')
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36')
# Passing data to urlopen turns the request into a POST; the body must be bytes
resp = urlopen(req, data=postData.encode('utf-8'))
print(resp.read().decode('utf-8'))
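In practice a request like this can fail (an HTTP error status, DNS problems, a timeout), so it is worth wrapping the call in urllib.error handling. A minimal sketch around the same POST, with the form data abbreviated for illustration:

from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
from urllib import parse

postData = parse.urlencode([('SearchDate', '2017/01/06')])  # abbreviated form data for illustration
req = Request("http://www.thsrc.com.tw/tw/TimeTable/SearchResult")

try:
    resp = urlopen(req, data=postData.encode('utf-8'), timeout=10)
    print(resp.read().decode('utf-8'))
except HTTPError as e:    # the server returned a 4xx/5xx status
    print('HTTP error:', e.code, e.reason)
except URLError as e:     # network-level failure (DNS, refused connection, timeout)
    print('URL error:', e.reason)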

posted @ 2017-01-04 22:55  Vincen_shen