flask十六:反爬虫--数据加密
一.创建页面数据
M:
from application.ext import db


class News(db.Model):
    """ORM model for one news entry (title plus short content)."""

    # Integer primary key, declared as auto-incrementing.
    id = db.Column(db.Integer, primary_key=True, autoincrement=True)
    # Headline; column is declared as String(32).
    title = db.Column(db.String(32))
    # Body text; column is declared as String(256).
    content = db.Column(db.String(256))
V:
import random

from flask import (Blueprint, render_template, redirect, url_for, request,
                   make_response, Response, abort, session)

from application.ext import db
from application import models as mod

# Every view in this module is mounted under the /news URL prefix.
news_blue = Blueprint('news_blue', __name__, url_prefix="/news")


@news_blue.route("/")
def index():
    """Render the blueprint's landing page."""
    return render_template('index.html', msg="News Views")


@news_blue.route('/addnews/')
def add_news():
    """Insert one News row with random demo data and echo it back.

    Returns:
        The new row's title concatenated with its content.
    """
    # Title is drawn first, then content, so the random sequence is
    # consumed in the same order as before.
    item = mod.News(
        title="发哥%d" % random.randrange(1000),
        content="福利社会%d" % random.randrange(10000),
    )
    db.session.add(item)
    db.session.commit()
    return "".join((item.title, item.content))


@news_blue.route('/getnews/')
def get_news():
    """Render the news list page with every stored News row."""
    return render_template('NewsList.html', news_list=mod.News.query.all())
T:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>NewsList</title>
</head>
<body>
<h2>今日头条</h2>
{# One <li> per entry of the news_list context variable, rendered server-side as plain HTML. #}
<ul>
    {% for news in news_list %}
        <li>{{ news.title }}: {{ news.content }}</li>
    {% endfor %}
</ul>
</body>
</html>
二.爬虫
import requests

# Default target: the news list page of the local development server.
NEWS_URL = "http://127.0.0.1:5000/news/getnews/"


def get_data(url=NEWS_URL, timeout=10):
    """Fetch a page and print its body decoded as UTF-8.

    Args:
        url: Page to download. Defaults to the local news list page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    print(response.content.decode('utf8'))


if __name__ == '__main__':
    get_data()
三.数据加密传输
1.使用JS动态加载数据
<body>
<h2>今日头条</h2>
<script type="text/javascript">
    // The heading below is written into the document by JavaScript when the
    // page loads; it is not present as static markup.
    document.write("<h2>这是JS加载的动态内容</h2>")
</script>
</body>
2.后端将返回的数据列表或对象等,转换为包含标签的字符串,前端改为接受包含标签的字符串
后端将原来传递到前端的news_list内容,转换为包含标签的字符串news_content
前端将原来接收的news_list内容,改为接收包含标签的字符串news_content
3.后端加密数据,前端解密数据
加密算法可以由前后端自行约定,使用任意算法均可。
4.示例:
1).前端修改:
a.修改NewsList.html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>NewsList</title>
</head>
<body>
<h2>今日头条</h2>
<script type="text/javascript">
    {#document.write("<h2>这是JS加载的动态内容</h2>")#}
    {# news_content is an HTML fragment rendered by the backend. The tojson
       filter emits it as a complete, script-safe JS string literal, so a
       backslash or line break in the data cannot corrupt or break the
       script (pasting it between hand-written quotes with |safe could). #}
    document.write({{ news_content|tojson }});
    {# Written out verbatim at this stage; it is empty until the backend
       supplies news_content_twice. #}
    document.write("{{ news_content_twice }}");
</script>
</body>
</html>
b.增加NewsContent.html
{# Fragment only: one <li> per entry of news_list. Kept on a single line so the rendered output contains no line breaks. #}{% for news in news_list %}<li>{{ news.title }}: {{ news.content }}</li>{% endfor %}
2).后端修改
@news_blue.route('/getnews/')
def get_news():
    """Render the news page with the list pre-rendered as an HTML string.

    The <li> fragment is produced from NewsContent.html first and then
    handed to NewsList.html as the ``news_content`` string, instead of
    passing the row objects to the page template.
    """
    rows = mod.News.query.all()
    fragment = render_template('NewsContent.html', news_list=rows)
    return render_template('NewsList.html', news_content=fragment)
请求页面,JS加载正常。
上面完成内容的动态加载。
通过HTML页面将不能爬取内容,但是通过JS接口还是可以获取内容。
在后端加密数据,前端解密数据
3).后端加密数据
def _salted_b64(text, salt):
    """Return ``salt`` followed by the Base64 encoding of ``text``.

    ``text`` is encoded to UTF-8 bytes first. ``standard_b64encode`` takes
    bytes and returns bytes, so the result is decoded back to ``str``
    before the salt is prepended.

    Example:
        "<li>发哥325: 福利社会3485</li>..." with salt "abc123" becomes
        "abc123PGxpPuWPkeWTpTMyNTog...".
    """
    encoded = base64.standard_b64encode(text.encode('utf-8')).decode('utf-8')
    return salt + encoded


@news_blue.route('/getnews/')
def get_news():
    """Render the news page with both the plain and the obfuscated list.

    The <li> fragment rendered from NewsContent.html is passed to the page
    template twice:

    * ``news_content``: the plain HTML string;
    * ``news_content_twice``: the same string put through two rounds of
      "Base64-encode, then prepend a salt" ("abc123" for the inner round,
      "xyz456" for the outer one). The front end must undo the rounds in
      reverse order: outer salt, decode, inner salt, decode.
    """
    news_list = mod.News.query.all()
    news_content = render_template('NewsContent.html', news_list=news_list)

    # Round 1: Base64 + inner salt. Round 2: the same again with the outer salt.
    inner = _salted_b64(news_content, "abc123")
    salt_encode_content_twice = _salted_b64(inner, "xyz456")

    return render_template('NewsList.html',
                           news_content=news_content,
                           news_content_twice=salt_encode_content_twice)
4).前端解密
修改NewsList.html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>NewsList</title>
    {# Provides showDeHtml(), used below to decode the obfuscated list. #}
    <script type="text/javascript" src="{{ url_for('static', filename='js/show.js') }}"></script>
</head>
<body>
<h2>今日头条</h2>
<script type="text/javascript">
    {#document.write("<h2>这是JS加载的动态内容</h2>")#}
    {# tojson emits news_content as a complete, script-safe JS string
       literal, so a backslash or line break in the data cannot corrupt or
       break the script (pasting it between hand-written quotes with |safe
       could). #}
    document.write({{ news_content|tojson }});
    {# news_content_twice is the salted, double-Base64 string; it is decoded
       in the browser by showDeHtml(). #}
    document.write(showDeHtml("{{ news_content_twice }}"));
</script>
</body>
</html>
增加scripts解密脚本static/js/show.js,增加解密函数showDeHtml() 、base64Decode()
/**
 * Decode a standard Base64 string into the UTF-8 text it encodes.
 *
 * atob() yields one character per byte, so the bytes are collected and
 * run through TextDecoder; otherwise multi-byte (e.g. Chinese) text would
 * come out garbled.
 *
 * @param {string} input Base64 text.
 * @returns {string} The decoded text.
 */
function base64Decode(input) {
    var binary = atob(input);
    var bytes = new Uint8Array(binary.length);
    for (var i = 0; i < binary.length; i++) {
        bytes[i] = binary.charCodeAt(i);
    }
    return new TextDecoder("utf-8").decode(bytes);
}

/**
 * Remove `prefix` from the start of `text`; return `text` unchanged when it
 * does not start with `prefix`.
 */
function stripPrefix(text, prefix) {
    return text.indexOf(prefix) === 0 ? text.slice(prefix.length) : text;
}

/**
 * Recover the HTML produced by the backend's two obfuscation rounds.
 *
 * The backend builds "xyz456" + b64("abc123" + b64(html)), so each salt has
 * to be stripped BEFORE the layer beneath it is Base64-decoded:
 * outer salt -> decode -> inner salt -> decode.
 *
 * @param {string} J1 Obfuscated string from the page.
 * @returns {string} The original HTML fragment.
 */
function showDeHtml(J1) {
    var inner = base64Decode(stripPrefix(J1, "xyz456"));
    return base64Decode(stripPrefix(inner, "abc123"));
}
通过HTML页面将不能爬取内容,但是通过JS接口还是可以获取内容,并可以从页面获取到解密算法。
5).隐藏JS接口和解密算法
JS解密脚本改为通过endpoint(视图函数)返回,而不是通过静态文件的url加载,因此外部看不到脚本的静态url
a.修改JS调用的页面
<head>
    <meta charset="UTF-8">
    <title>NewsList</title>
    {# Earlier version, disabled: loaded the script from its static file URL. #}
    {# <script type="text/javascript" src="{{ url_for('static', filename='js/show.js') }}"></script>#}
    {# The script URL is now resolved from the news_blue.get_show endpoint. #}
    <script type="text/javascript" src="{{ url_for('news_blue.get_show') }}"></script>
</head>
b.添加接口
@news_blue.route('/getshow/')
def get_show():
    """Serve the decryption script through a view instead of a static URL.

    Returns:
        The contents of application/static/js/show.js, sent with a
        JavaScript content type.
    """
    script_path = os.path.join(BASE_DIR, 'application/static/js/show.js')
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(script_path, 'r', encoding='utf-8') as file:
        js_content = file.read()
    # A bare string would go out as text/html; label it as a script so that
    # browsers enforcing strict MIME checking still execute it.
    return js_content, 200, {'Content-Type': 'application/javascript; charset=utf-8'}
测试ok
通过HTML页面将不能爬取内容,并隐藏了JS接口和解密算法。
6).给上面(5)的JS脚本增加参数:如时间戳
a.修改JS调用的页面
<head>
    <meta charset="UTF-8">
    <title>NewsList</title>
    {# <script type="text/javascript" src="{{ url_for('static', filename='js/show.js') }}"></script>#}
    {# <script type="text/javascript" src="{{ url_for('news_blue.get_show') }}"></script>#}
    <script type="text/javascript">
        // Request the decryption script with the current time (in ms) as ?t=.
        // Date.now() keeps millisecond precision; Date.parse(new Date())
        // round-trips through a string and drops the milliseconds, so the
        // stamp could already be up to 999 ms old when it is sent.
        document.write('<script type="text/javascript" src="/news/getshow/?t=' + Date.now() + '"><\/script>');
    </script>
</head>
b.修改/news/getshow/接口
@news_blue.route('/getshow/')
def get_show():
    """Serve the decryption script only for a fresh ``?t=`` timestamp.

    The ``t`` query parameter is a client timestamp in milliseconds.

    Returns:
        '1' when ``t`` is missing or not an integer;
        '2' when ``t`` is not strictly in the past or is 1000 ms old or more;
        otherwise the contents of application/static/js/show.js, sent with
        a JavaScript content type.
    """
    try:
        t = int(request.args.get('t'))
    except (TypeError, ValueError):
        # TypeError: parameter absent (None); ValueError: not a number.
        return '1'

    # NOTE: a client stamp truncated to whole seconds can already look up to
    # 999 ms old here, so the client should send millisecond resolution.
    age_ms = time.time() * 1000 - t
    if not 0 < age_ms < 1000:
        return '2'

    script_path = os.path.join(BASE_DIR, 'application/static/js/show.js')
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(script_path, 'r', encoding='utf-8') as file:
        js_content = file.read()
    # A bare string would go out as text/html; label it as a script so that
    # browsers enforcing strict MIME checking still execute it.
    return js_content, 200, {'Content-Type': 'application/javascript; charset=utf-8'}
posted on 2020-06-21 22:06 myworldworld 阅读(673) 评论(0) 收藏 举报