二.爬虫的开端:
爬虫的一些小的例子:(5)
利用python的原始的库写爬虫:
1.urlopen()
from urllib import request

# Fetch the Baidu homepage and print the raw response body (bytes).
resp = request.urlopen('http://www.baidu.com')
# Was print(read()): `read` is a method of the response object, not a
# free function, so the original raised NameError.
print(resp.read())
2.urlretrieve()
from urllib import request

# Download the Baidu homepage and save it to a local file named baidu.html.
page_url = 'http://www.baidu.com'
request.urlretrieve(page_url, 'baidu.html')
查找刘德华
from urllib import request
from urllib import parse

# Search Baidu for "刘德华" (Andy Lau): URL-encode the query dict,
# append it to the search endpoint, then fetch the result page.
url = "https://www.baidu.com/s"
params = {'wd': "刘德华"}
qs = parse.urlencode(params)  # percent-encodes the non-ASCII query value
url = url + "?" + qs
# Was request.urlretrieve(url): urlretrieve returns a (filename, headers)
# tuple, which has no .read(); urlopen returns a response object that does,
# which is what the following print(resp.read()) needs.
resp = request.urlopen(url)
print(resp.read())
parse_qs函数
from urllib import parse

# Round-trip a dict through urlencode and parse_qs.
# Note: parse_qs returns every value as a list of strings,
# so the round trip is not an exact inverse.
params = {'name': '张三', 'age': 18, 'greet': 'hello world'}
qs = parse.urlencode(params)
print(qs)
result = parse.parse_qs(qs)
print(result)
例子
# urlparse vs urlsplit: urlsplit has no .params attribute
# (it does not split out the ";params" segment after the path).
from urllib import request, parse

url = 'http://www.baidu.com/s;hello?username=zhiliao'
result = parse.urlsplit(url)
# result = parse.urlparse(url)  # alternative: urlparse also exposes .params
print('scheme', result.scheme)  # fixed: original line had a stray trailing "s" (syntax error)
print('netloc', result.netloc)
print('path', result.path)
print('query', result.query)

# Second example: urlparse on a URL with query and fragment.
url = 'http://www.baidu.com/s?wd=python&username=abc#1'
result = parse.urlparse(url)
print(result)
# Request demo against lagou.com:
# https://www.lagou.com/landing-page/pc/search.html?utm_source=m_cf_cpt_baidu_pcbt
from urllib import request

url = 'https://www.lagou.com/landing-page/pc/search.html?utm_source=m_cf_cpt_baidu_pcbt'
# NOTE(review): the commented-out plain urlopen below presumably got
# bot-blocked without a browser User-Agent — confirm against the site.
# resp = request.urlopen(url)
# print(resp)
headers = {
    # Fixed typo: was 'Use-Agent', which urllib would not recognize, so the
    # default 'Python-urllib/x.y' agent was still sent and the spoof had no effect.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0'
}
req = request.Request(url, headers=headers)
resp = request.urlopen(req)
print(resp.read())
浙公网安备 33010602011771号