爬虫-requests库的使用

一、爬虫数据采集

1、按照采集对象分类

1、全网采集

2、全站采集

3、具体网站的指定数据采集

2、采集方案分类

1、利用http协议采集-页面分析

2、利用api接口采集-app数据采集

3、利用目标网站的api采集-微博、github、twitter、facebook

二、requests库爬虫

首先需要安装requests依赖包

进入虚拟环境,执行 pip install requests 安装requests包

 三、requests实例

实例一:获取百度源码

 request_test.py

import requests

# Fetch the Baidu home page and print its HTML source.
# A timeout keeps the script from blocking forever on a stalled connection.
res = requests.get("http://www.baidu.com", timeout=10)
print(res.text)

运行结果:

实例二:发送GET和POST请求

http_server.py

# socket服务端
import socket
import threading
import json

server = socket.socket()
# Allow quick restarts of the demo: without SO_REUSEADDR a re-run can fail
# with "Address already in use" while the old port is still being released.
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
# Listen on every interface (0.0.0.0), port 8000.
server.bind(('0.0.0.0', 8000))
server.listen()


# 获取客户端连接并启动线程去处理

# Login form returned as the body of every GET response.
_LOGIN_PAGE = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
'''

# Course records returned as the JSON body of every POST response.
_COURSES = [
    {"name": "django打造在线教育", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/78.html"},
    {"name": "python高级编程", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/200.html"},
    {"name": "scrapy分布式爬虫", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/92.html"},
    {"name": "diango rest framework打造生鲜电商", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/131.html"},
    {"name": "tornado从入门到精通", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/290.html"},
]


def _build_response(status, content_type, body, extra_headers=()):
    """Serialize a complete HTTP/1.1 response with CRLF line endings."""
    payload = body.encode("utf8")
    header_lines = [
        "HTTP/1.1 {}".format(status),
        "Content-Type: {}".format(content_type),
        # Content-Length + "Connection: close" tell the client exactly where
        # the body ends, so it never waits for more data.
        "Content-Length: {}".format(len(payload)),
        "Connection: close",
    ]
    header_lines.extend(extra_headers)
    return "\r\n".join(header_lines).encode("utf8") + b"\r\n\r\n" + payload


def handle_sock(sock, addr):
    """Serve one HTTP request on ``sock`` and then close the connection.

    GET is answered with the login form, POST with the JSON course list and
    any other method with ``405 Method Not Allowed``. The raw request and
    its request line are printed so they can be inspected while following
    the tutorial.

    Args:
        sock: Connected client socket; it is always closed before returning.
        addr: Peer address from ``accept()``. Unused, kept so the function
            can be used as the thread target with ``(sock, addr)``.
    """
    try:
        # NOTE: only the first recv() is parsed; a request larger than the
        # buffer (or split across packets) is seen truncated.
        raw = sock.recv(1024)
        if not raw:
            # Peer closed the connection without sending a request.
            return
        tmp_data = raw.decode("utf8", errors="replace")
        print(tmp_data)
        # Skip leading blank lines so they are not mistaken for the request line.
        request_line = next(
            (line for line in tmp_data.splitlines() if line.strip()), "")
        print(request_line)
        fields = request_line.split()
        if not fields:
            return
        method = fields[0]
        if method == "GET":
            response = _build_response("200 OK", "text/html", _LOGIN_PAGE)
        elif method == "POST":
            response = _build_response(
                "200 OK", "application/json", json.dumps(_COURSES))
        else:
            response = _build_response(
                "405 Method Not Allowed", "text/plain", "",
                extra_headers=("Allow: GET, POST",))
        # sendall() retries until every byte is written; send() may stop early.
        sock.sendall(response)
    finally:
        sock.close()


while True:
    # Block until the next client connects. The result is bound to
    # client_sock (not "socket") so the imported socket module stays usable.
    client_sock, client_addr = server.accept()

    # Handle each connection on its own thread so accept() is not delayed.
    client_thread = threading.Thread(target=handle_sock, args=(client_sock, client_addr))
    client_thread.start()

 request_test.py  

import requests

# res=requests.get("http://www.baidu.com")
# print(res.text)
url = "http://127.0.0.1:8000"
params = {
    "username": "bobby",
    "password": "bobby",
}
# response = requests.get(url, params=params, timeout=10)
# print(response.text)
# POST the dict as a form body; the timeout keeps the script from hanging
# forever if the demo server never finishes its response.
res = requests.post(url, data=params, timeout=10)
print(res.text)
print(res.encoding)
print(res.json())

 先运行http_server.py,再运行request_test.py

运行结果如下:

 实例三:打印请求状态码

import requests

# Request the page and print the HTTP status code of the response.
# A timeout keeps the script from blocking forever on a stalled connection.
response = requests.get("https://www.baidu.com", timeout=10)
print(response.status_code)

  输出结果:

 实例四:自定义请求头(headers)并打印响应头

http_server.py

# socket服务端
import socket
import threading
import json

server = socket.socket()
# Allow quick restarts of the demo: without SO_REUSEADDR a re-run can fail
# with "Address already in use" while the old port is still being released.
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
# Listen on every interface (0.0.0.0), port 8000.
server.bind(('0.0.0.0', 8000))
server.listen()


# 获取客户端连接并启动线程去处理

# Login form returned as the body of every GET response.
_LOGIN_PAGE = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
'''

# Course records returned as the JSON body of every POST response.
_COURSES = [
    {"name": "django打造在线教育", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/78.html"},
    {"name": "python高级编程", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/200.html"},
    {"name": "scrapy分布式爬虫", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/92.html"},
    {"name": "diango rest framework打造生鲜电商", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/131.html"},
    {"name": "tornado从入门到精通", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/290.html"},
]


def _build_response(status, content_type, body, extra_headers=()):
    """Serialize a complete HTTP/1.1 response with CRLF line endings."""
    payload = body.encode("utf8")
    header_lines = [
        "HTTP/1.1 {}".format(status),
        "Content-Type: {}".format(content_type),
        # Content-Length + "Connection: close" tell the client exactly where
        # the body ends, so it never waits for more data.
        "Content-Length: {}".format(len(payload)),
        "Connection: close",
    ]
    header_lines.extend(extra_headers)
    return "\r\n".join(header_lines).encode("utf8") + b"\r\n\r\n" + payload


def handle_sock(sock, addr):
    """Serve one HTTP request on ``sock`` and then close the connection.

    GET is answered with the login form, POST with the JSON course list and
    any other method with ``405 Method Not Allowed``. The raw request and
    its request line are printed so they can be inspected while following
    the tutorial.

    Args:
        sock: Connected client socket; it is always closed before returning.
        addr: Peer address from ``accept()``. Unused, kept so the function
            can be used as the thread target with ``(sock, addr)``.
    """
    try:
        # NOTE: only the first recv() is parsed; a request larger than the
        # buffer (or split across packets) is seen truncated.
        raw = sock.recv(1024 * 10)
        if not raw:
            # Peer closed the connection without sending a request.
            return
        tmp_data = raw.decode("utf8", errors="replace")
        print(tmp_data)
        # Skip leading blank lines so they are not mistaken for the request line.
        request_line = next(
            (line for line in tmp_data.splitlines() if line.strip()), "")
        print(request_line)
        fields = request_line.split()
        if not fields:
            return
        method = fields[0]
        if method == "GET":
            # The standard CORS header is Access-Control-Allow-Origin; the
            # previous spelling ("Access-Controller-...") is not a CORS header.
            response = _build_response(
                "200 OK", "text/html", _LOGIN_PAGE,
                extra_headers=(
                    "Access-Control-Allow-Origin: https://localhost:63342",))
        elif method == "POST":
            response = _build_response(
                "200 OK", "application/json", json.dumps(_COURSES))
        else:
            response = _build_response(
                "405 Method Not Allowed", "text/plain", "",
                extra_headers=("Allow: GET, POST",))
        # sendall() retries until every byte is written; send() may stop early.
        sock.sendall(response)
    finally:
        sock.close()


while True:
    # Block until the next client connects. The result is bound to
    # client_sock (not "socket") so the imported socket module stays usable.
    client_sock, client_addr = server.accept()

    # Handle each connection on its own thread so accept() is not delayed.
    client_thread = threading.Thread(target=handle_sock, args=(client_sock, client_addr))
    client_thread.start()

request_test.py

import requests

url = "http://127.0.0.1:8000"
# Custom headers sent along with the request.
my_headers = {
    "user-agent": "requests",
    "imooc_uid": "321",
}
# The timeout keeps the script from hanging forever if the server stalls.
response = requests.get(url=url, headers=my_headers, timeout=10)
# response.headers holds the headers of the server's response.
print(response.headers)

 在http_server.py上打上断点

 可以看到requests.get方法传递过来的headers头部参数值

 输出结果如下:

访问百度的headers

import requests

# Print the response headers returned by Baidu.
# A timeout keeps the script from blocking forever on a stalled connection.
response = requests.get("https://www.baidu.com", timeout=10)
print(response.headers)

输出结果如下:

实例五:POST请求使用data参数传递dict时,默认的Content-Type为 application/x-www-form-urlencoded

http_server.py

# socket服务端
import socket
import threading
import json

server = socket.socket()
# Allow quick restarts of the demo: without SO_REUSEADDR a re-run can fail
# with "Address already in use" while the old port is still being released.
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
# Listen on every interface (0.0.0.0), port 8000.
server.bind(('0.0.0.0', 8000))
server.listen()


# 获取客户端连接并启动线程去处理

# Login form returned as the body of every GET response.
_LOGIN_PAGE = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
'''

# Course records returned as the JSON body of every POST response.
_COURSES = [
    {"name": "django打造在线教育", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/78.html"},
    {"name": "python高级编程", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/200.html"},
    {"name": "scrapy分布式爬虫", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/92.html"},
    {"name": "diango rest framework打造生鲜电商", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/131.html"},
    {"name": "tornado从入门到精通", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/290.html"},
]


def _build_response(status, content_type, body, extra_headers=()):
    """Serialize a complete HTTP/1.1 response with CRLF line endings."""
    payload = body.encode("utf8")
    header_lines = [
        "HTTP/1.1 {}".format(status),
        "Content-Type: {}".format(content_type),
        # Content-Length + "Connection: close" tell the client exactly where
        # the body ends, so it never waits for more data.
        "Content-Length: {}".format(len(payload)),
        "Connection: close",
    ]
    header_lines.extend(extra_headers)
    return "\r\n".join(header_lines).encode("utf8") + b"\r\n\r\n" + payload


def handle_sock(sock, addr):
    """Serve one HTTP request on ``sock`` and then close the connection.

    GET is answered with the login form, POST with the JSON course list and
    any other method with ``405 Method Not Allowed``. The raw request and
    its request line are printed so they can be inspected while following
    the tutorial.

    Args:
        sock: Connected client socket; it is always closed before returning.
        addr: Peer address from ``accept()``. Unused, kept so the function
            can be used as the thread target with ``(sock, addr)``.
    """
    try:
        # NOTE: only the first recv() is parsed; a request larger than the
        # buffer (or split across packets) is seen truncated.
        raw = sock.recv(1024 * 10)
        if not raw:
            # Peer closed the connection without sending a request.
            return
        tmp_data = raw.decode("utf8", errors="replace")
        print(tmp_data)
        # Skip leading blank lines so they are not mistaken for the request line.
        request_line = next(
            (line for line in tmp_data.splitlines() if line.strip()), "")
        print(request_line)
        fields = request_line.split()
        if not fields:
            return
        method = fields[0]
        if method == "GET":
            # The standard CORS header is Access-Control-Allow-Origin; the
            # previous spelling ("Access-Controller-...") is not a CORS header.
            response = _build_response(
                "200 OK", "text/html", _LOGIN_PAGE,
                extra_headers=(
                    "Access-Control-Allow-Origin: https://localhost:63342",))
        elif method == "POST":
            response = _build_response(
                "200 OK", "application/json", json.dumps(_COURSES))
        else:
            response = _build_response(
                "405 Method Not Allowed", "text/plain", "",
                extra_headers=("Allow: GET, POST",))
        # sendall() retries until every byte is written; send() may stop early.
        sock.sendall(response)
    finally:
        sock.close()


while True:
    # Block until the next client connects. The result is bound to
    # client_sock (not "socket") so the imported socket module stays usable.
    client_sock, client_addr = server.accept()

    # Handle each connection on its own thread so accept() is not delayed.
    client_thread = threading.Thread(target=handle_sock, args=(client_sock, client_addr))
    client_thread.start()

request_test.py

import requests

url = "http://127.0.0.1:8000"
params = {
    "username": "bobby",
    "password": "bobby",
}
# A dict passed via data= is sent as a form-encoded body.
# The timeout keeps the script from hanging forever if the server stalls.
res = requests.post(url, data=params, timeout=10)
print(res.encoding)

断点输出结果如下:

四、data和json参数都可以传入两种数据类型:1、字符串 2、dict

当为data时

request_test.py

import json
import requests

url = "http://127.0.0.1:8000"
params = {
    "username": "bobby",
    "password": "bobby",
}
# A string passed via data= is sent as the request body as-is; here the
# body is the JSON text produced by json.dumps().
# The timeout keeps the script from hanging forever if the server stalls.
res = requests.post(url, data=json.dumps(params), timeout=10)
print(res.encoding)

   http_server.py

# socket服务端
import socket
import threading
import json

server = socket.socket()
# Allow quick restarts of the demo: without SO_REUSEADDR a re-run can fail
# with "Address already in use" while the old port is still being released.
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
# Listen on every interface (0.0.0.0), port 8000.
server.bind(('0.0.0.0', 8000))
server.listen()


# 获取客户端连接并启动线程去处理

# Login form returned as the body of every GET response.
_LOGIN_PAGE = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
'''

# Course records returned as the JSON body of every POST response.
_COURSES = [
    {"name": "django打造在线教育", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/78.html"},
    {"name": "python高级编程", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/200.html"},
    {"name": "scrapy分布式爬虫", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/92.html"},
    {"name": "diango rest framework打造生鲜电商", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/131.html"},
    {"name": "tornado从入门到精通", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/290.html"},
]


def _build_response(status, content_type, body, extra_headers=()):
    """Serialize a complete HTTP/1.1 response with CRLF line endings."""
    payload = body.encode("utf8")
    header_lines = [
        "HTTP/1.1 {}".format(status),
        "Content-Type: {}".format(content_type),
        # Content-Length + "Connection: close" tell the client exactly where
        # the body ends, so it never waits for more data.
        "Content-Length: {}".format(len(payload)),
        "Connection: close",
    ]
    header_lines.extend(extra_headers)
    return "\r\n".join(header_lines).encode("utf8") + b"\r\n\r\n" + payload


def handle_sock(sock, addr):
    """Serve one HTTP request on ``sock`` and then close the connection.

    GET is answered with the login form, POST with the JSON course list and
    any other method with ``405 Method Not Allowed``. The raw request and
    its request line are printed so they can be inspected while following
    the tutorial.

    Args:
        sock: Connected client socket; it is always closed before returning.
        addr: Peer address from ``accept()``. Unused, kept so the function
            can be used as the thread target with ``(sock, addr)``.
    """
    try:
        # NOTE: only the first recv() is parsed; a request larger than the
        # buffer (or split across packets) is seen truncated.
        raw = sock.recv(1024 * 10)
        if not raw:
            # Peer closed the connection without sending a request.
            return
        tmp_data = raw.decode("utf8", errors="replace")
        print(tmp_data)
        # Skip leading blank lines so they are not mistaken for the request line.
        request_line = next(
            (line for line in tmp_data.splitlines() if line.strip()), "")
        print(request_line)
        fields = request_line.split()
        if not fields:
            return
        method = fields[0]
        if method == "GET":
            # The standard CORS header is Access-Control-Allow-Origin; the
            # previous spelling ("Access-Controller-...") is not a CORS header.
            response = _build_response(
                "200 OK", "text/html", _LOGIN_PAGE,
                extra_headers=(
                    "Access-Control-Allow-Origin: https://localhost:63342",))
        elif method == "POST":
            response = _build_response(
                "200 OK", "application/json", json.dumps(_COURSES))
        else:
            response = _build_response(
                "405 Method Not Allowed", "text/plain", "",
                extra_headers=("Allow: GET, POST",))
        # sendall() retries until every byte is written; send() may stop early.
        sock.sendall(response)
    finally:
        sock.close()


while True:
    # Block until the next client connects. The result is bound to
    # client_sock (not "socket") so the imported socket module stays usable.
    client_sock, client_addr = server.accept()

    # Handle each connection on its own thread so accept() is not delayed.
    client_thread = threading.Thread(target=handle_sock, args=(client_sock, client_addr))
    client_thread.start()

 执行结果如下:

当为json时

request_test.py

import json
import requests

url = "http://127.0.0.1:8000"
params = {
    "username": "bobby",
    "password": "bobby",
}
# NOTE: json= serializes whatever it is given, so passing an already
# serialized string encodes it a second time (the body becomes a quoted
# JSON string, not a JSON object). Kept as-is because the surrounding text
# contrasts this call with json=params.
# The timeout keeps the script from hanging forever if the server stalls.
res = requests.post(url, json=json.dumps(params), timeout=10)
print(res.encoding)

 http_server.py

# socket服务端
import socket
import threading
import json

server = socket.socket()
# Allow quick restarts of the demo: without SO_REUSEADDR a re-run can fail
# with "Address already in use" while the old port is still being released.
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
# Listen on every interface (0.0.0.0), port 8000.
server.bind(('0.0.0.0', 8000))
server.listen()


# 获取客户端连接并启动线程去处理

# Login form returned as the body of every GET response.
_LOGIN_PAGE = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
'''

# Course records returned as the JSON body of every POST response.
_COURSES = [
    {"name": "django打造在线教育", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/78.html"},
    {"name": "python高级编程", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/200.html"},
    {"name": "scrapy分布式爬虫", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/92.html"},
    {"name": "diango rest framework打造生鲜电商", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/131.html"},
    {"name": "tornado从入门到精通", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/290.html"},
]


def _build_response(status, content_type, body, extra_headers=()):
    """Serialize a complete HTTP/1.1 response with CRLF line endings."""
    payload = body.encode("utf8")
    header_lines = [
        "HTTP/1.1 {}".format(status),
        "Content-Type: {}".format(content_type),
        # Content-Length + "Connection: close" tell the client exactly where
        # the body ends, so it never waits for more data.
        "Content-Length: {}".format(len(payload)),
        "Connection: close",
    ]
    header_lines.extend(extra_headers)
    return "\r\n".join(header_lines).encode("utf8") + b"\r\n\r\n" + payload


def handle_sock(sock, addr):
    """Serve one HTTP request on ``sock`` and then close the connection.

    GET is answered with the login form, POST with the JSON course list and
    any other method with ``405 Method Not Allowed``. The raw request and
    its request line are printed so they can be inspected while following
    the tutorial.

    Args:
        sock: Connected client socket; it is always closed before returning.
        addr: Peer address from ``accept()``. Unused, kept so the function
            can be used as the thread target with ``(sock, addr)``.
    """
    try:
        # NOTE: only the first recv() is parsed; a request larger than the
        # buffer (or split across packets) is seen truncated.
        raw = sock.recv(1024 * 10)
        if not raw:
            # Peer closed the connection without sending a request.
            return
        tmp_data = raw.decode("utf8", errors="replace")
        print(tmp_data)
        # Skip leading blank lines so they are not mistaken for the request line.
        request_line = next(
            (line for line in tmp_data.splitlines() if line.strip()), "")
        print(request_line)
        fields = request_line.split()
        if not fields:
            return
        method = fields[0]
        if method == "GET":
            # The standard CORS header is Access-Control-Allow-Origin; the
            # previous spelling ("Access-Controller-...") is not a CORS header.
            response = _build_response(
                "200 OK", "text/html", _LOGIN_PAGE,
                extra_headers=(
                    "Access-Control-Allow-Origin: https://localhost:63342",))
        elif method == "POST":
            response = _build_response(
                "200 OK", "application/json", json.dumps(_COURSES))
        else:
            response = _build_response(
                "405 Method Not Allowed", "text/plain", "",
                extra_headers=("Allow: GET, POST",))
        # sendall() retries until every byte is written; send() may stop early.
        sock.sendall(response)
    finally:
        sock.close()


while True:
    # Block until the next client connects. The result is bound to
    # client_sock (not "socket") so the imported socket module stays usable.
    client_sock, client_addr = server.accept()

    # Handle each connection on its own thread so accept() is not delayed.
    client_thread = threading.Thread(target=handle_sock, args=(client_sock, client_addr))
    client_thread.start()

 当使用json参数时,结果如下,数据会被序列化为JSON

 request_test.py

import json
import requests

url = "http://127.0.0.1:8000"
params = {
    "username": "bobby",
    "password": "bobby",
}
# A dict passed via json= is serialized to a JSON object by requests.
# The timeout keeps the script from hanging forever if the server stalls.
res = requests.post(url, json=params, timeout=10)
print(res.encoding)

  http_server.py

# socket服务端
import socket
import threading
import json

server = socket.socket()
# Allow quick restarts of the demo: without SO_REUSEADDR a re-run can fail
# with "Address already in use" while the old port is still being released.
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
# Listen on every interface (0.0.0.0), port 8000.
server.bind(('0.0.0.0', 8000))
server.listen()


# 获取客户端连接并启动线程去处理

# Login form returned as the body of every GET response.
_LOGIN_PAGE = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
'''

# Course records returned as the JSON body of every POST response.
_COURSES = [
    {"name": "django打造在线教育", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/78.html"},
    {"name": "python高级编程", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/200.html"},
    {"name": "scrapy分布式爬虫", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/92.html"},
    {"name": "diango rest framework打造生鲜电商", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/131.html"},
    {"name": "tornado从入门到精通", "teacher": "bobby",
     "url": "https://coding.imooc.com/class/290.html"},
]


def _build_response(status, content_type, body, extra_headers=()):
    """Serialize a complete HTTP/1.1 response with CRLF line endings."""
    payload = body.encode("utf8")
    header_lines = [
        "HTTP/1.1 {}".format(status),
        "Content-Type: {}".format(content_type),
        # Content-Length + "Connection: close" tell the client exactly where
        # the body ends, so it never waits for more data.
        "Content-Length: {}".format(len(payload)),
        "Connection: close",
    ]
    header_lines.extend(extra_headers)
    return "\r\n".join(header_lines).encode("utf8") + b"\r\n\r\n" + payload


def handle_sock(sock, addr):
    """Serve one HTTP request on ``sock`` and then close the connection.

    GET is answered with the login form, POST with the JSON course list and
    any other method with ``405 Method Not Allowed``. The raw request and
    its request line are printed so they can be inspected while following
    the tutorial.

    Args:
        sock: Connected client socket; it is always closed before returning.
        addr: Peer address from ``accept()``. Unused, kept so the function
            can be used as the thread target with ``(sock, addr)``.
    """
    try:
        # NOTE: only the first recv() is parsed; a request larger than the
        # buffer (or split across packets) is seen truncated.
        raw = sock.recv(1024 * 10)
        if not raw:
            # Peer closed the connection without sending a request.
            return
        tmp_data = raw.decode("utf8", errors="replace")
        print(tmp_data)
        # Skip leading blank lines so they are not mistaken for the request line.
        request_line = next(
            (line for line in tmp_data.splitlines() if line.strip()), "")
        print(request_line)
        fields = request_line.split()
        if not fields:
            return
        method = fields[0]
        if method == "GET":
            # The standard CORS header is Access-Control-Allow-Origin; the
            # previous spelling ("Access-Controller-...") is not a CORS header.
            response = _build_response(
                "200 OK", "text/html", _LOGIN_PAGE,
                extra_headers=(
                    "Access-Control-Allow-Origin: https://localhost:63342",))
        elif method == "POST":
            response = _build_response(
                "200 OK", "application/json", json.dumps(_COURSES))
        else:
            response = _build_response(
                "405 Method Not Allowed", "text/plain", "",
                extra_headers=("Allow: GET, POST",))
        # sendall() retries until every byte is written; send() may stop early.
        sock.sendall(response)
    finally:
        sock.close()


while True:
    # Block until the next client connects. The result is bound to
    # client_sock (not "socket") so the imported socket module stays usable.
    client_sock, client_addr = server.accept()

    # Handle each connection on its own thread so accept() is not delayed.
    client_thread = threading.Thread(target=handle_sock, args=(client_sock, client_addr))
    client_thread.start()

 此时dict值转换为json

 浏览器和requests最终都需要拼接出符合HTTP协议格式的字符串。

posted @ 2024-05-10 23:25  leagueandlegends  阅读(26)  评论(0)    收藏  举报