Web Crawlers, Part 1
1. Sending GET and POST requests with the requests library.
"""使用requests库,发送请求""" import requests from fake_useragent import UserAgent # GET 请求 def requests_get(headers): url = "https://www.baidu.com/s" params = { "wd": "快代理" } r = requests.get(url, headers=headers, params=params) print('---响应值为: ', r.status_code, '---编码为: ', r.encoding) r.encoding = r.apparent_encoding print(r.request.headers) print(r.text) # POST 请求 def requests_post(headers): url = "https://www.kuaidaili.com/login/" login_info = { "username": "...", "passwd": "..." } r = requests.post(url, headers=headers, data=login_info) print('---响应值为: ', r.status_code, '---编码为: ', r.encoding) print(r.text) def main(): headers = { 'User-Agent': UserAgent().chrome } # 发送get请求 requests_get(headers) # 发送post请求 requests_post(headers) if __name__ == '__main__': main()
A simple example: scraping the image data from one page of pic.netbian.com.
"""爬取 唯美女生 网站图片""" import requests import re import os import time # 1. 请求网页 myheaders = {'User-Agent': 'Mozilla/5.0'} # url = "http://pic.netbian.com" url = 'http://pic.netbian.com/4kmeinv' response = requests.get(url, headers=myheaders) # 2. 处理响应数据, 正则匹配 html = response.text img_urls = re.findall('<img src="(.*?)" alt=".*?">', html) print(img_urls) # 3. 下载图片 if not os.path.exists('彼岸图片'): os.mkdir('彼岸图片') for img_url in img_urls: time.sleep(1) img_name = img_url.split('/')[-1] response = requests.get((url + img_url), headers=myheaders) with open('彼岸图片/' + img_name, 'wb') as f: f.write(response.content)
2. Sending GET and POST requests with urllib. When query parameters have to be built by hand, quote and urlencode from urllib.parse percent-encode Chinese text, as the short snippet below shows.
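A quick look at what the two encoders produce (expected output in the comments):

from urllib.parse import quote, urlencode

# quote percent-encodes a single value (UTF-8 by default)
print(quote("科技"))                             # %E7%A7%91%E6%8A%80

# urlencode builds a whole query string from a dict
print(urlencode({'wd': '科技', 'ie': 'utf-8'}))  # wd=%E7%A7%91%E6%8A%80&ie=utf-8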
"""使用 urllib 创建爬虫""" from urllib.request import urlopen from urllib.request import Request # 包装爬虫 url = 'https://www.baidu.com' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36' } # 创建Resquest对象,来包装请求 request = Request(url, headers=headers, method='GET', data=None) # 发送请求 response = urlopen(request) # 打印信息 print('状态码:', response.getcode(), '真实请求地址:', response.geturl(), '状态码:', response.status, '请求头:', response.getheaders()) info = response.read().decode() print(info)
a. GET requests
"""GET 请求,当需要手动传参,且参数为中文时,需要将参数转码 单个参数时可以使用:quote,转码 多个参数时:urlencode, 转码 """ from urllib.request import urlopen, Request from urllib.parse import quote # 单个参数时可以使用:quote,转码 # url = "https://www.baidu.com/s?wd={}".format(quote("科技")) headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36" } # request = Request(url, headers=headers) # response = urlopen(request, timeout=3) # info = response.read().decode() # print(info) # 多个参数时:urlencode, 转码 from urllib.parse import urlencode args = { 'wd': "科技", 'ie': 'utf-8' # ... } url = 'https://www.baidu.com/s?{}'.format(urlencode(args)) print(url) request2 = Request(url, headers=headers) response2 = urlopen(request2, timeout=3) info2 = response2.read().decode() print(info2)
b. POST requests: the form parameters are packed into the Request object when it is created.
"""爬虫, 简单的post请求""" from urllib.request import Request, urlopen from urllib.parse import urlencode from fake_useragent import UserAgent url = "http://www.sxt.cn/index/login/login.html" # 参数(此处为登录的账号和密码) form_data = { 'user': '...', 'password': '...' } f_data = urlencode(form_data) # 转码,防止其中有中文而报错 # 修改请求头 headers = { "User-Agent": UserAgent().chrome } # 创建request对象,发送请求 try: request = Request(url, data=f_data.encode(), headers=headers, ) response = urlopen(request) info = response.read() # print(info.decode()) except Exception as err: print('err', err)
