Python爬虫 #002 urllib

2.1-普通用法

# urllib ships with the Python standard library, so no install is needed.
import urllib
from urllib import request

url = 'http://www.baidu.com/'

# 'with' guarantees the HTTP response object is closed after reading
# (the original left it open -- a small resource leak).
with urllib.request.urlopen(url) as response:
    # decode() converts the raw response bytes into text with the given encoding.
    text = response.read().decode('utf-8')
print(text)

2.2-请求头

方法一:

import urllib
from urllib import request

url = 'https://movie.douban.com/top250'

# Douban rejects urllib's default user agent, so we impersonate a
# desktop browser by sending a browser-like User-Agent header.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3730.400 QQBrowser/10.5.3805.400'
}
# Build a Request object that carries the custom headers.
request1 = request.Request(url=url, headers=headers)

# 'with' ensures the HTTP response is closed even if reading fails.
with urllib.request.urlopen(request1) as response:
    print(response.read().decode())

方法二:

import urllib
from urllib import request

url = 'https://movie.douban.com/top250'

# Build a bare Request first, then attach the header afterwards.
# NOTE: add_header() takes two positional arguments (name, value) --
# it is not a key/value dict entry.
req = request.Request(url)
req.add_header(
    'User-Agent',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3730.400 QQBrowser/10.5.3805.400',
)

resp = urllib.request.urlopen(req)
print(resp.read().decode())

2.3-get 带参

import urllib
from urllib import request
from urllib import parse

# httpbin echoes back whatever it receives -- handy for testing GET and POST.
url = 'http://httpbin.org/get?%s'

params = {'name':'刘', 'age':20}
# urlencode() turns the mapping into a standard percent-encoded query
# string, e.g. 'name=%E5%88%98&age=20'.
query = urllib.parse.urlencode(params)
# With GET, the parameters are exposed right in the URL.
# 'with' closes the HTTP response once we are done reading it.
with urllib.request.urlopen(url=url%(query)) as response:
    print(response.read().decode())

# tip: a capture tool such as Charles shows the data actually sent.

2.4-post带参

import urllib
from urllib import request
from urllib import parse

# Note: the endpoint differs from the one used in the GET example.
url = 'http://httpbin.org/post'

form = {'name':'刘', 'age':'20'}
# urlencode() builds the query string; encode() then converts
# str -> bytes, which is what urlopen()'s data argument requires.
body = urllib.parse.urlencode(form).encode()

# Supplying data= makes urllib issue a POST instead of a GET.
resp = urllib.request.urlopen(url=url, data=body)
print(resp.read().decode())

# tip: a packet-capture tool shows the data sent with the request.

2.5-ip代理

import urllib
from urllib import error
from urllib import request

# httpbin.org/ip echoes the IP address the request came from.
url = 'http://httpbin.org/ip'

# Without a proxy: the response shows our real public IP.
# 'with' closes the response after reading.
with urllib.request.urlopen(url=url) as response:
    print('ip:', response.read().decode())

# With a proxy IP (e.g. taken from a free proxy list such as xicidaili).
# NOTE: the scheme key ('http' vs 'https') must match the target URL,
# and host and port are separated by ':'.
ph = urllib.request.ProxyHandler({'http':'222.95.240.191:3000'})
opener = urllib.request.build_opener(ph)

# Free proxies die quickly; handle the failure explicitly instead of
# letting the whole script crash with an unhandled URLError.
try:
    response = opener.open(url)
    print('代理ip:', response.read().decode())
except error.URLError as e:
    print('proxy request failed:', e)

# tip: a failed request here usually means the proxy IP is dead.
# Some sites ban scraping IPs, which is why proxy IPs are needed.
posted @ 2023-06-28 22:53  枫_Null  阅读(9)  评论(0)    收藏  举报