The urllib Library -- in Practice
request.urlopen: requesting the Baidu homepage
from urllib.request import urlopen

# Send a GET request and print the decoded response body
resp = urlopen('http://www.baidu.com')
print(resp.read().decode('utf8'))
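For HTTP URLs, urlopen returns an http.client.HTTPResponse, which also works as a context manager, so the connection is closed automatically. A minimal sketch of the same request with automatic closing and a status check (the status print is an illustrative addition, not part of the original example):

from urllib.request import urlopen

# The with-block closes the response when it exits
with urlopen('http://www.baidu.com') as resp:
    print(resp.status)                 # HTTP status code, e.g. 200
    print(resp.read().decode('utf8'))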
request.Request: carrying a User-Agent header to mimic a real browser (the header value is copied from Chrome DevTools: F12 --> Network --> Doc --> Headers)
from urllib.request import urlopen, Request

req = Request('http://www.baidu.com')
# User-Agent string copied from a real Chrome session
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36')
resp = urlopen(req)
print(resp.read().decode('utf8'))
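As an alternative to calling add_header, the same headers can be passed to the Request constructor as a dict. A minimal sketch of an equivalent request:

from urllib.request import urlopen, Request

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
}
# Request accepts headers directly: Request(url, data=None, headers={})
req = Request('http://www.baidu.com', headers=headers)
resp = urlopen(req)
print(resp.read().decode('utf8'))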
Writing a simple crawler with urllib.request.Request, urllib.request.urlopen, and urllib.parse
from urllib.request import Request
from urllib.request import urlopen
from urllib import parse

req = Request("http://www.thsrc.com.tw/tw/TimeTable/SearchResult")  # Taiwan High Speed Rail timetable query page
# parse.urlencode builds the body of an HTTP POST request; the field names and
# values are taken from Chrome DevTools: F12 --> Network --> Doc --> Headers --> Form Data
postData = parse.urlencode([
    ('StartStation', '2f940836-cedc-41ef-8e28-c2336ac8fe68'),
    ('EndStation', 'a7a04c89-900b-4798-95a3-c01c455622f4'),
    ('SearchDate', '2017/01/06'),
    ('SearchTime', '06:00'),
    ('SearchWay', 'DepartureInMandarin')
])
# urlencode yields a query string such as 'StartStation=2f940836-...&EndStation=...'
# Add two request headers to mimic a real browser; otherwise some sites
# filter out crawlers based on the request headers
req.add_header('Origin', 'http://www.thsrc.com.tw')
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36')
resp = urlopen(req, data=postData.encode('utf-8'))
print(resp.read().decode('utf-8'))
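Real requests fail regularly (timeouts, blocked crawlers, dead links), so it is worth wrapping the call in error handling with urllib.error. Below is a minimal self-contained sketch of the same script; the timeout value is an assumption added for illustration, not part of the original:

from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError
from urllib import parse

# Same request as above, rebuilt here so the sketch runs on its own
req = Request("http://www.thsrc.com.tw/tw/TimeTable/SearchResult")
req.add_header('Origin', 'http://www.thsrc.com.tw')
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36')
postData = parse.urlencode([
    ('StartStation', '2f940836-cedc-41ef-8e28-c2336ac8fe68'),
    ('EndStation', 'a7a04c89-900b-4798-95a3-c01c455622f4'),
    ('SearchDate', '2017/01/06'),
    ('SearchTime', '06:00'),
    ('SearchWay', 'DepartureInMandarin')
])

try:
    # timeout=10 is an assumed value; tune it for your network
    resp = urlopen(req, data=postData.encode('utf-8'), timeout=10)
    print(resp.read().decode('utf-8'))
except HTTPError as e:
    # The server answered with an error status, e.g. 403 when a crawler is blocked
    print('HTTP error:', e.code, e.reason)
except URLError as e:
    # The request never reached a server: DNS failure, refused connection, timeout
    print('URL error:', e.reason)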