爬虫-requests库的使用
一、爬虫数据采集
1、按照采集对象分类
1、全网采集
2、全站采集
3、具体网站的指定数据采集
2、采集方案分类
1、利用http协议采集-页面分析
2、利用api接口采集-app数据采集
3、利用目标网站的api采集-微博、github、twitter、facebook
二、requests库爬虫
首先需要安装requests依赖包
进入虚拟环境,安装requests包(pip install requests)


三、requests实例
实例一:获取百度源码

request_test.py
import requests
# Fetch the Baidu home page; the timeout keeps the script from hanging
# forever if the site does not answer.
res = requests.get("http://www.baidu.com", timeout=10)
# When the response headers declare no charset, requests assumes ISO-8859-1
# for text content; use the encoding detected from the body instead so that
# Chinese text is printed correctly.
res.encoding = res.apparent_encoding
print(res.text)
运行结果:

实例二:发送GET和POST请求

http_server.py
# socket服务端
import socket
import threading
import json
server = socket.socket()
# Bind to port 8000 on all interfaces
server.bind(('0.0.0.0', 8000))
server.listen()

# Full response for GET: status line, headers, blank line, then an HTML form.
GET_RESPONSE = '''HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Title</title>
</head>
<body>
<form action="/" method="POST">
<input type="text" value="name" />
<input type="password" value="password">
<input type="submit" value="登录">
</form>
</body>
</html>
'''

# Status line and headers for POST. The trailing blank line ends the header
# section so that the client parses the JSON as the body, not as a header.
POST_HEADER = "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\n\r\n"

# Payload returned as JSON for every POST request.
COURSES = [
    {
        "name": "django打造在线教育",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/78.html"
    },
    {
        "name": "python高级编程",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/200.html"
    },
    {
        "name": "scrapy分布式爬虫",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/92.html"
    },
    {
        "name": "diango rest framework打造生鲜电商",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/131.html"
    },
    {
        "name": "tornado从入门到精通",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/290.html"
    },
]


def handle_sock(sock, addr):
    """Serve a single HTTP request on ``sock`` and close the connection.

    GET is answered with an HTML login form, POST with the course list as
    JSON, and any other method with ``405 Method Not Allowed``.

    Args:
        sock: Connected client socket returned by ``server.accept()``.
        addr: Client address returned by ``server.accept()``; not used.
    """
    # The responses carry no Content-Length, so closing the socket is what
    # tells the client that the body is complete.
    with sock:
        # A single recv() is assumed to return the whole request, which is
        # enough for the small local requests used in this example.
        tmp_data = sock.recv(1024).decode("utf8")
        print(tmp_data)
        lines = tmp_data.splitlines()
        if not lines or not lines[0].split():
            # The client disconnected or sent an empty request line.
            return
        request_line = lines[0]
        print(request_line)
        method = request_line.split()[0]
        if method == "GET":
            response = GET_RESPONSE
        elif method == "POST":
            response = POST_HEADER + json.dumps(COURSES) + "\n"
        else:
            response = "HTTP/1.1 405 Method Not Allowed\r\n\r\n"
        # sendall() retries until every byte is written; send() may not.
        sock.sendall(response.encode("utf8"))


while True:
    # Block until a client connects. The names must not rebind ``socket``,
    # which would shadow the imported module.
    client_sock, client_addr = server.accept()
    # Handle each new connection in its own thread
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()
request_test.py
import requests
# res=requests.get("http://www.baidu.com")
# print(res.text)
url = "http://127.0.0.1:8000"
params = {
    "username": "bobby",
    "password": "bobby",
}
# response = requests.get(url,params=params)
# print(response.text)
# POST the dict as form data; the timeout keeps the script from hanging
# if the local server never finishes its response.
res = requests.post(url, data=params, timeout=10)
print(res.text)
print(res.encoding)
print(res.json())
先运行http_server.py,再运行request_test.py
运行结果如下:



实例三:打印请求状态码
import requests
# Request the page and print the HTTP status code of the response; the
# timeout prevents an indefinite wait on a stalled connection.
response = requests.get("https://www.baidu.com", timeout=10)
print(response.status_code)
输出结果:

实例四:自定义请求头(headers)并打印响应头
http_server.py
# socket服务端
import socket
import threading
import json
server = socket.socket()
# Bind to port 8000 on all interfaces
server.bind(('0.0.0.0', 8000))
server.listen()

# Full response for GET: status line, headers, blank line, then an HTML form.
# The CORS header name is spelled Access-Control-Allow-Origin.
GET_RESPONSE = '''HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nAccess-Control-Allow-Origin:https://localhost:63342\r\n\r
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Title</title>
</head>
<body>
<form action="/" method="POST">
<input type="text" value="name" />
<input type="password" value="password">
<input type="submit" value="登录">
</form>
</body>
</html>
'''

# Status line and headers for POST. The trailing blank line ends the header
# section so that the client parses the JSON as the body.
POST_HEADER = "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\n\r\n"

# Payload returned as JSON for every POST request.
COURSES = [
    {
        "name": "django打造在线教育",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/78.html"
    },
    {
        "name": "python高级编程",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/200.html"
    },
    {
        "name": "scrapy分布式爬虫",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/92.html"
    },
    {
        "name": "diango rest framework打造生鲜电商",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/131.html"
    },
    {
        "name": "tornado从入门到精通",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/290.html"
    },
]


def handle_sock(sock, addr):
    """Serve a single HTTP request on ``sock`` and close the connection.

    The raw request (including the headers sent by the client) is printed.
    GET is answered with an HTML login form, POST with the course list as
    JSON, and any other method with ``405 Method Not Allowed``.

    Args:
        sock: Connected client socket returned by ``server.accept()``.
        addr: Client address returned by ``server.accept()``; not used.
    """
    # The responses carry no Content-Length, so closing the socket is what
    # tells the client that the body is complete.
    with sock:
        # A single recv() is assumed to return the whole request, which is
        # enough for the small local requests used in this example.
        tmp_data = sock.recv(1024 * 10).decode("utf8")
        print(tmp_data)
        lines = tmp_data.splitlines()
        if not lines or not lines[0].split():
            # The client disconnected or sent an empty request line.
            return
        request_line = lines[0]
        print(request_line)
        method = request_line.split()[0]
        if method == "GET":
            response = GET_RESPONSE
        elif method == "POST":
            response = POST_HEADER + json.dumps(COURSES) + "\n"
        else:
            response = "HTTP/1.1 405 Method Not Allowed\r\n\r\n"
        # sendall() retries until every byte is written; send() may not.
        sock.sendall(response.encode("utf8"))


while True:
    # Block until a client connects. The names must not rebind ``socket``,
    # which would shadow the imported module.
    client_sock, client_addr = server.accept()
    # Handle each new connection in its own thread
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()
request_test.py
import requests
url = "http://127.0.0.1:8000"
# Custom request headers sent along with the GET request.
my_headers = {
    "user-agent": "requests",
    "imooc_uid": "321",
}
# The timeout keeps the script from hanging if the local server never
# finishes its response.
response = requests.get(url=url, headers=my_headers, timeout=10)
# Print the headers returned by the server (the response headers).
print(response.headers)
在http_server.py上打上断点


可以看到requests.get方法传递过来的headers头部参数值
输出结果如下:

访问百度的headers
import requests
# Request the page and print the response headers; the timeout prevents an
# indefinite wait on a stalled connection.
response = requests.get("https://www.baidu.com", timeout=10)
print(response.headers)
输出结果如下:

实例五:POST请求通过data参数传递dict时,默认的Content-Type为application/x-www-form-urlencoded
http_server.py
# socket服务端
import socket
import threading
import json
server = socket.socket()
# Bind to port 8000 on all interfaces
server.bind(('0.0.0.0', 8000))
server.listen()

# Full response for GET: status line, headers, blank line, then an HTML form.
# The CORS header name is spelled Access-Control-Allow-Origin.
GET_RESPONSE = '''HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nAccess-Control-Allow-Origin:https://localhost:63342\r\n\r
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Title</title>
</head>
<body>
<form action="/" method="POST">
<input type="text" value="name" />
<input type="password" value="password">
<input type="submit" value="登录">
</form>
</body>
</html>
'''

# Status line and headers for POST. The trailing blank line ends the header
# section so that the client parses the JSON as the body.
POST_HEADER = "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\n\r\n"

# Payload returned as JSON for every POST request.
COURSES = [
    {
        "name": "django打造在线教育",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/78.html"
    },
    {
        "name": "python高级编程",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/200.html"
    },
    {
        "name": "scrapy分布式爬虫",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/92.html"
    },
    {
        "name": "diango rest framework打造生鲜电商",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/131.html"
    },
    {
        "name": "tornado从入门到精通",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/290.html"
    },
]


def handle_sock(sock, addr):
    """Serve a single HTTP request on ``sock`` and close the connection.

    The raw request is printed, which shows the Content-Type the client
    chose. GET is answered with an HTML login form, POST with the course
    list as JSON, and any other method with ``405 Method Not Allowed``.

    Args:
        sock: Connected client socket returned by ``server.accept()``.
        addr: Client address returned by ``server.accept()``; not used.
    """
    # The responses carry no Content-Length, so closing the socket is what
    # tells the client that the body is complete.
    with sock:
        # A single recv() is assumed to return the whole request, which is
        # enough for the small local requests used in this example.
        tmp_data = sock.recv(1024 * 10).decode("utf8")
        print(tmp_data)
        lines = tmp_data.splitlines()
        if not lines or not lines[0].split():
            # The client disconnected or sent an empty request line.
            return
        request_line = lines[0]
        print(request_line)
        method = request_line.split()[0]
        if method == "GET":
            response = GET_RESPONSE
        elif method == "POST":
            response = POST_HEADER + json.dumps(COURSES) + "\n"
        else:
            response = "HTTP/1.1 405 Method Not Allowed\r\n\r\n"
        # sendall() retries until every byte is written; send() may not.
        sock.sendall(response.encode("utf8"))


while True:
    # Block until a client connects. The names must not rebind ``socket``,
    # which would shadow the imported module.
    client_sock, client_addr = server.accept()
    # Handle each new connection in its own thread
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()
request_test.py
import requests
url = "http://127.0.0.1:8000"
params = {
    "username": "bobby",
    "password": "bobby",
}
# A dict passed via data= is sent form-encoded. The timeout keeps the script
# from hanging if the local server never finishes its response.
res = requests.post(url, data=params, timeout=10)
print(res.encoding)
断点输出结果如下:


四、data和json参数都可以传递两种数据类型:1、字符串 2、dict
当为data时
request_test.py
import json
import requests
url = "http://127.0.0.1:8000"
params = {
    "username": "bobby",
    "password": "bobby",
}
# A string passed via data= is sent as the request body unchanged, so the
# server receives the JSON text produced by json.dumps(). The timeout keeps
# the script from hanging if the local server never finishes its response.
res = requests.post(url, data=json.dumps(params), timeout=10)
print(res.encoding)
http_server.py
# socket服务端
import socket
import threading
import json
server = socket.socket()
# Bind to port 8000 on all interfaces
server.bind(('0.0.0.0', 8000))
server.listen()

# Full response for GET: status line, headers, blank line, then an HTML form.
# The CORS header name is spelled Access-Control-Allow-Origin.
GET_RESPONSE = '''HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nAccess-Control-Allow-Origin:https://localhost:63342\r\n\r
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Title</title>
</head>
<body>
<form action="/" method="POST">
<input type="text" value="name" />
<input type="password" value="password">
<input type="submit" value="登录">
</form>
</body>
</html>
'''

# Status line and headers for POST. The trailing blank line ends the header
# section so that the client parses the JSON as the body.
POST_HEADER = "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\n\r\n"

# Payload returned as JSON for every POST request.
COURSES = [
    {
        "name": "django打造在线教育",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/78.html"
    },
    {
        "name": "python高级编程",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/200.html"
    },
    {
        "name": "scrapy分布式爬虫",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/92.html"
    },
    {
        "name": "diango rest framework打造生鲜电商",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/131.html"
    },
    {
        "name": "tornado从入门到精通",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/290.html"
    },
]


def handle_sock(sock, addr):
    """Serve a single HTTP request on ``sock`` and close the connection.

    The raw request is printed so the body sent by the client can be
    inspected. GET is answered with an HTML login form, POST with the course
    list as JSON, and any other method with ``405 Method Not Allowed``.

    Args:
        sock: Connected client socket returned by ``server.accept()``.
        addr: Client address returned by ``server.accept()``; not used.
    """
    # The responses carry no Content-Length, so closing the socket is what
    # tells the client that the body is complete.
    with sock:
        # A single recv() is assumed to return the whole request, which is
        # enough for the small local requests used in this example.
        tmp_data = sock.recv(1024 * 10).decode("utf8")
        print(tmp_data)
        lines = tmp_data.splitlines()
        if not lines or not lines[0].split():
            # The client disconnected or sent an empty request line.
            return
        request_line = lines[0]
        print(request_line)
        method = request_line.split()[0]
        if method == "GET":
            response = GET_RESPONSE
        elif method == "POST":
            response = POST_HEADER + json.dumps(COURSES) + "\n"
        else:
            response = "HTTP/1.1 405 Method Not Allowed\r\n\r\n"
        # sendall() retries until every byte is written; send() may not.
        sock.sendall(response.encode("utf8"))


while True:
    # Block until a client connects. The names must not rebind ``socket``,
    # which would shadow the imported module.
    client_sock, client_addr = server.accept()
    # Handle each new connection in its own thread
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()
执行结果如下:

当为json时
request_test.py
import json
import requests
url = "http://127.0.0.1:8000"
params = {
    "username": "bobby",
    "password": "bobby",
}
# Deliberately passes an already-serialized string to json=: requests
# serializes the argument again, so the body is a JSON string literal with
# escaped quotes rather than a JSON object. The timeout keeps the script
# from hanging if the local server never finishes its response.
res = requests.post(url, json=json.dumps(params), timeout=10)
print(res.encoding)
http_server.py
# socket服务端
import socket
import threading
import json
server = socket.socket()
# Bind to port 8000 on all interfaces
server.bind(('0.0.0.0', 8000))
server.listen()

# Full response for GET: status line, headers, blank line, then an HTML form.
# The CORS header name is spelled Access-Control-Allow-Origin.
GET_RESPONSE = '''HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nAccess-Control-Allow-Origin:https://localhost:63342\r\n\r
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Title</title>
</head>
<body>
<form action="/" method="POST">
<input type="text" value="name" />
<input type="password" value="password">
<input type="submit" value="登录">
</form>
</body>
</html>
'''

# Status line and headers for POST. The trailing blank line ends the header
# section so that the client parses the JSON as the body.
POST_HEADER = "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\n\r\n"

# Payload returned as JSON for every POST request.
COURSES = [
    {
        "name": "django打造在线教育",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/78.html"
    },
    {
        "name": "python高级编程",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/200.html"
    },
    {
        "name": "scrapy分布式爬虫",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/92.html"
    },
    {
        "name": "diango rest framework打造生鲜电商",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/131.html"
    },
    {
        "name": "tornado从入门到精通",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/290.html"
    },
]


def handle_sock(sock, addr):
    """Serve a single HTTP request on ``sock`` and close the connection.

    The raw request is printed so the body sent by the client can be
    inspected. GET is answered with an HTML login form, POST with the course
    list as JSON, and any other method with ``405 Method Not Allowed``.

    Args:
        sock: Connected client socket returned by ``server.accept()``.
        addr: Client address returned by ``server.accept()``; not used.
    """
    # The responses carry no Content-Length, so closing the socket is what
    # tells the client that the body is complete.
    with sock:
        # A single recv() is assumed to return the whole request, which is
        # enough for the small local requests used in this example.
        tmp_data = sock.recv(1024 * 10).decode("utf8")
        print(tmp_data)
        lines = tmp_data.splitlines()
        if not lines or not lines[0].split():
            # The client disconnected or sent an empty request line.
            return
        request_line = lines[0]
        print(request_line)
        method = request_line.split()[0]
        if method == "GET":
            response = GET_RESPONSE
        elif method == "POST":
            response = POST_HEADER + json.dumps(COURSES) + "\n"
        else:
            response = "HTTP/1.1 405 Method Not Allowed\r\n\r\n"
        # sendall() retries until every byte is written; send() may not.
        sock.sendall(response.encode("utf8"))


while True:
    # Block until a client connects. The names must not rebind ``socket``,
    # which would shadow the imported module.
    client_sock, client_addr = server.accept()
    # Handle each new connection in its own thread
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()
当json参数传入字符串时,结果如下:该字符串会被再次序列化,请求体是一个带转义引号的JSON字符串

request_test.py
import json
import requests
url = "http://127.0.0.1:8000"
params = {
    "username": "bobby",
    "password": "bobby",
}
# A dict passed via json= is serialized to JSON by requests. The timeout
# keeps the script from hanging if the local server never finishes its
# response.
res = requests.post(url, json=params, timeout=10)
print(res.encoding)
http_server.py
# socket服务端
import socket
import threading
import json
server = socket.socket()
# Bind to port 8000 on all interfaces
server.bind(('0.0.0.0', 8000))
server.listen()

# Full response for GET: status line, headers, blank line, then an HTML form.
# The CORS header name is spelled Access-Control-Allow-Origin.
GET_RESPONSE = '''HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nAccess-Control-Allow-Origin:https://localhost:63342\r\n\r
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Title</title>
</head>
<body>
<form action="/" method="POST">
<input type="text" value="name" />
<input type="password" value="password">
<input type="submit" value="登录">
</form>
</body>
</html>
'''

# Status line and headers for POST. The trailing blank line ends the header
# section so that the client parses the JSON as the body.
POST_HEADER = "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\n\r\n"

# Payload returned as JSON for every POST request.
COURSES = [
    {
        "name": "django打造在线教育",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/78.html"
    },
    {
        "name": "python高级编程",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/200.html"
    },
    {
        "name": "scrapy分布式爬虫",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/92.html"
    },
    {
        "name": "diango rest framework打造生鲜电商",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/131.html"
    },
    {
        "name": "tornado从入门到精通",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/290.html"
    },
]


def handle_sock(sock, addr):
    """Serve a single HTTP request on ``sock`` and close the connection.

    The raw request is printed so the body sent by the client can be
    inspected. GET is answered with an HTML login form, POST with the course
    list as JSON, and any other method with ``405 Method Not Allowed``.

    Args:
        sock: Connected client socket returned by ``server.accept()``.
        addr: Client address returned by ``server.accept()``; not used.
    """
    # The responses carry no Content-Length, so closing the socket is what
    # tells the client that the body is complete.
    with sock:
        # A single recv() is assumed to return the whole request, which is
        # enough for the small local requests used in this example.
        tmp_data = sock.recv(1024 * 10).decode("utf8")
        print(tmp_data)
        lines = tmp_data.splitlines()
        if not lines or not lines[0].split():
            # The client disconnected or sent an empty request line.
            return
        request_line = lines[0]
        print(request_line)
        method = request_line.split()[0]
        if method == "GET":
            response = GET_RESPONSE
        elif method == "POST":
            response = POST_HEADER + json.dumps(COURSES) + "\n"
        else:
            response = "HTTP/1.1 405 Method Not Allowed\r\n\r\n"
        # sendall() retries until every byte is written; send() may not.
        sock.sendall(response.encode("utf8"))


while True:
    # Block until a client connects. The names must not rebind ``socket``,
    # which would shadow the imported module.
    client_sock, client_addr = server.accept()
    # Handle each new connection in its own thread
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()
此时dict会被序列化为JSON作为请求体,Content-Type为application/json

浏览器和requests最终都是需要拼接满足http的字符串。

浙公网安备 33010602011771号