爬虫基础——示例:微信登录收发消息
原理
1. URL https://wx.qq.com/
1.1 获取uuid:https://login.wx.qq.com/jslogin?<(时间戳)>
response 返回 ==> window.QRLogin.code = 200; window.QRLogin.uuid = "QaL1LOI9WQ==";
1.2 使用uuid生成二维码 <img src="https://login.weixin.qq.com/qrcode/QaL1LOI9WQ==">
2. 长轮询,等待用户扫码。https://login.wx.qq.com/cgi-bin/mmwebwx-bin/login?<(uuid tip=1 时间戳)>
2.1 如果没有人扫码,response 返回 window.code=408; 继续轮询
2.2 有人扫码,response 返回 window.code=201;window.userAvatar = <头像>,等待用户确认 https://login.wx.qq.com/cgi-bin/mmwebwx-bin/login?<(uuid tip=0 时间戳)>
2.3 确认登录,https://login.wx.qq.com/cgi-bin/mmwebwx-bin/login?<(uuid tip=0 时间戳)> 的 response 返回 window.code=200; window.redirect_uri="<地址>"; 获取登录 cookie c1
2.4 获取凭证。 window.redirect_uri + &fun=new&version=v2 返回凭证 ,再次获取cookie c2
3. 获取用户信息。https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxinit?<凭证> response 返回用户信息 User 和 SyncKey
4. 获取所有联系人。https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxgetcontact?<凭证> ,该url带上 cookie(c1.update(c2)) response 返回所有联系人
5. 发送消息。post 发送 https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxsendmsg?<凭证>
由于请求体是一个字典嵌套字典的结构,发送时需要按内容是否包含中文分两种情况处理:
1. 如果字典中没有中文,发送的时候,直接使用 json 参数;
2. 如果有中文,要先把它序列化成二进制类型 json.dumps(send_data, ensure_ascii=False).encode(encoding="utf-8"),再使用 data 参数发送,并在请求头加 headers={"Content-Type": "application/json"}。(原因:json 参数内部使用默认的 ensure_ascii=True,中文会被转成 \uXXXX 转义序列。)
6. 接收消息。也是长轮询。https://webpush.wx.qq.com/cgi-bin/mmwebwx-bin/synccheck?<凭证+SyncKey> ,携带 cookie(c1.update(c2)),监听消息
6.1 如果返回 window.synccheck={retcode:"0",selector:"0"},则没有消息
6.2 如果返回 window.synccheck={retcode:"0",selector:"2"} 则有消息。 再发送 post https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxsync?<凭证+SyncKey> response 返回消息 和新的 SyncKey
代码实现
# URL routes for the WeChat web-client demo.
# Every regex route is anchored with both ^ and $ so that only the exact
# path matches; an unanchored pattern would also match any longer path that
# merely ends with the same suffix (e.g. "foo/contact_all/").
urlpatterns = [
    path('admin/', admin.site.urls),
    re_path(r'^login/$', views.login, name="login"),
    re_path(r'^index/$', views.index, name="index"),
    re_path(r'^check_login/$', views.check_login, name="check_login"),
    re_path(r'^contact_all/$', views.contact_all, name="contact_all"),
    re_path(r'^send_msg/$', views.send_msg, name="send_msg"),
    re_path(r'^check_msg/$', views.check_msg, name="check_msg"),
]
import json
import re
import time

import requests
from bs4 import BeautifulSoup
from django.shortcuts import render, HttpResponse

# Create your views here.

# (connect, read) timeout in seconds applied to every outbound request.
# The read timeout is generous because the login and synccheck endpoints are
# long-polled: the server holds the connection open while it waits for an event.
REQUEST_TIMEOUT = (5, 40)

# Patterns for the JavaScript snippets the WeChat endpoints answer with.
# Raw strings with escaped dots/braces and non-greedy groups, so that only the
# intended quoted value is captured.
_UUID_RE = re.compile(r'window\.QRLogin\.uuid = "(.*?)";')
_AVATAR_RE = re.compile(r"window\.code=201;window\.userAvatar = '(.*?)';")
_REDIRECT_RE = re.compile(r'window\.redirect_uri="(.*?)";')
_SYNCCHECK_RE = re.compile(
    r'window\.synccheck=\{retcode:"(\d+)",selector:"(\d+)"\}'
)


def _merged_cookies(req):
    """Return the login cookies overlaid with the ticket cookies from the session."""
    cookies = {}
    cookies.update(req.session['login_cookie'])
    cookies.update(req.session['ticket_cookie'])
    return cookies


def _base_request(req, device_id):
    """Build the ``BaseRequest`` credential dict from the session's ticket data."""
    ticket = req.session['ticket_dict']
    return {
        "DeviceID": device_id,
        "Sid": ticket['wxsid'],
        "Skey": ticket['skey'],
        "Uin": ticket['wxuin'],
    }


def login(req):
    """
    Render the page that shows the login QR code.

    Requests a fresh uuid from the jslogin endpoint, stores it in the session
    under ``qcode`` and passes it to ``login.html``.

    :param req: the incoming Django request
    :return: the rendered login page, or a 502 response when the upstream
        answer contains no uuid
    """
    # Example request:
    # https://login.wx.qq.com/jslogin?appid=wx782c26e4c19acffb&redirect_uri=https%3A%2F%2Fwx.qq.com%2Fcgi-bin%2Fmmwebwx-bin%2Fwebwxnewloginpage&fun=new&lang=zh_CN&_=1532602804064
    ctime = int(time.time() * 1000)
    base_uri = (
        "https://login.wx.qq.com/jslogin?appid=wx782c26e4c19acffb"
        "&redirect_uri=https%3A%2F%2Fwx.qq.com%2Fcgi-bin%2Fmmwebwx-bin%2Fwebwxnewloginpage"
        "&fun=new&lang=zh_CN&_={0}"
    )
    res1 = requests.get(url=base_uri.format(ctime), timeout=REQUEST_TIMEOUT)

    match = _UUID_RE.search(res1.text)
    if match is None:
        # Without a uuid there is no QR code to show; report the upstream
        # failure instead of crashing on a missing match.
        return HttpResponse("failed to obtain QR code uuid", status=502)

    qcode = match.group(1)
    req.session['qcode'] = qcode
    return render(req, "login.html", {"qcode": qcode})


def check_login(req):
    """
    Long-poll the login endpoint to see whether the QR code has been scanned.

    The JSON answer carries ``code`` 408 (nothing yet, poll again), 201
    (scanned; ``data`` is the avatar) or 200 (login confirmed; cookies and
    ticket values have been stored in the session).

    :param req: the incoming Django request; reads the ``TIP`` query parameter
    :return: an ``HttpResponse`` whose body is the JSON-encoded result
    :raises ValueError: when a confirmed login answer lacks the redirect URI
        or the ticket page lacks the ``<error>`` element
    """
    res_data = {"code": 408, "data": None}
    ctime = int(time.time() * 1000)
    # Default to "1" (waiting for a scan) so that a missing parameter does
    # not put the literal text "None" into the upstream URL.
    tip = req.GET.get('TIP', '1')
    base_uri = "https://login.wx.qq.com/cgi-bin/mmwebwx-bin/login?loginicon=true&uuid={0}&tip={1}&r=694021981&_={2}"
    url = base_uri.format(req.session['qcode'], tip, ctime)

    try:
        res1 = requests.get(url=url, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.Timeout:
        # A timed-out poll is equivalent to "nobody scanned yet".
        return HttpResponse(json.dumps(res_data))
    print(res1.text)

    if "window.code=201" in res1.text:
        # Somebody scanned the code; hand the avatar to the page.
        avatar = _AVATAR_RE.search(res1.text)
        res_data["code"] = 201
        res_data['data'] = avatar.group(1) if avatar else None
    elif "window.code=200" in res1.text:
        # The user confirmed the login.
        redirect = _REDIRECT_RE.search(res1.text)
        if redirect is None:
            raise ValueError("login confirmed but no redirect_uri in response")
        # Cookies issued on successful login.
        req.session['login_cookie'] = res1.cookies.get_dict()

        # Visit the redirect URI to obtain the credentials (ticket).
        redirect_url = "{0}&fun=new&version=v2&lang=zh_CN".format(redirect.group(1))
        res2 = requests.get(url=redirect_url, timeout=REQUEST_TIMEOUT)
        req.session['ticket_cookie'] = res2.cookies.get_dict()

        soup = BeautifulSoup(res2.text, "html.parser")
        error_node = soup.find(name="error")
        if error_node is None:
            raise ValueError("ticket response contains no <error> element")
        # Keep only element children; bare text nodes carry no tag name.
        ticket_dict = {}
        for item in error_node.children:
            if getattr(item, "name", None):
                ticket_dict[item.name] = item.text
        req.session['ticket_dict'] = ticket_dict

        res_data["code"] = 200
        req.session["is_login"] = True

    return HttpResponse(json.dumps(res_data))


def index(req):
    """
    Render the chat landing page with the logged-in user's data.

    Posts the credentials to webwxinit and stores the returned ``User`` and
    ``SyncKey`` in the session for the message views.

    :param req: the incoming Django request
    :return: the rendered ``index.html`` page
    """
    # Example request:
    # https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxinit?r=617941360&pass_ticket=VSSitrEOjrKhkJwzrepBNJZI7gz98fJcU3zLaKoRnYaaBMQF1XPJ76v%252FXUXXm5f4
    base_uri = "https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxinit?r=617941360&pass_ticket={0}"
    url = base_uri.format(req.session['ticket_dict']['pass_ticket'])
    req_data = {"BaseRequest": _base_request(req, "e641097429558556")}

    res = requests.post(url=url, json=req_data, timeout=REQUEST_TIMEOUT)
    res.encoding = "utf-8"
    user_data = json.loads(res.text)

    req.session['current_user_info'] = user_data['User']
    # The SyncKey is the credential used when listening for messages.
    req.session['init_sync_key'] = user_data['SyncKey']
    return render(req, "index.html", {"user_data": user_data})


def contact_all(req):
    """
    Render the page listing every contact.

    :param req: the incoming Django request
    :return: the rendered ``contact_all.html`` page
    """
    ctime = int(time.time() * 1000)
    ticket = req.session['ticket_dict']
    base_uri = "https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxgetcontact?lang=zh_CN&pass_ticket={0}&r={1}&seq=0&skey={2}"
    url = base_uri.format(ticket['pass_ticket'], ctime, ticket['skey'])

    res1 = requests.get(
        url=url,
        cookies=_merged_cookies(req),
        timeout=REQUEST_TIMEOUT,
    )
    res1.encoding = "utf-8"
    user_dict = json.loads(res1.text)
    return render(req, "contact_all.html", {"user_dict": user_dict})


def send_msg(req):
    """
    Send a text message to a contact.

    :param req: the incoming Django request; reads the ``recv`` (recipient
        UserName) and ``content`` (message text) query parameters
    :return: a placeholder ``HttpResponse``
    """
    ctime = str(int(time.time() * 1000))
    recv = req.GET.get("recv")
    content = req.GET.get("content")

    send_data = {
        "BaseRequest": _base_request(req, "e024995249607937"),
        "Msg": {
            'ClientMsgId': ctime,
            'Content': content,
            'FromUserName': req.session["current_user_info"]['UserName'],
            'LocalID': ctime,
            'ToUserName': recv,
            'Type': 1,
        },
        "Scene": 0,
    }
    # Serialise by hand with ensure_ascii=False so that non-ASCII text is sent
    # as UTF-8 rather than as \uXXXX escapes.
    byte_send_data = json.dumps(send_data, ensure_ascii=False).encode(encoding="utf-8")

    base_uri = "https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxsendmsg?lang=zh_CN&pass_ticket={0}"
    url = base_uri.format(req.session['ticket_dict']['pass_ticket'])
    res1 = requests.post(
        url=url,
        data=byte_send_data,
        # A pre-encoded body needs its content type declared explicitly.
        headers={"Content-Type": "application/json"},
        cookies=_merged_cookies(req),
        timeout=REQUEST_TIMEOUT,
    )
    print(res1.text)
    return HttpResponse("....")


def check_msg(req):
    """
    Listen for, or fetch, incoming messages depending on ``PENDING``.

    ``PENDING == "1"`` long-polls synccheck: code 200 means messages are
    waiting, 201 means keep polling, 500 means the sync check failed (for
    example a non-zero retcode). ``PENDING == "0"`` fetches the messages via
    webwxsync, stores the new SyncKey and answers code 202 with the messages
    joined by ``||``.

    :param req: the incoming Django request; reads the ``PENDING`` parameter
    :return: an ``HttpResponse`` whose body is the JSON-encoded result
    """
    res_data = {"code": 201, "data": None}
    ctime = str(int(time.time() * 1000))
    pending = req.GET.get("PENDING")
    ticket = req.session['ticket_dict']

    if pending == "1":
        base_uri = "https://webpush.wx.qq.com/cgi-bin/mmwebwx-bin/synccheck"
        # Example request:
        # https://webpush.wx.qq.com/cgi-bin/mmwebwx-bin/synccheck?r={0}&skey={1}&sid={2}&uin={3}&deviceid=e971783524980667&synckey=1_684933101%7C2_684933158%7C3_684933113%7C11_684932930%7C201_1532754877%7C1000_1532754362%7C1001_1532733674&_={5}
        sync_key = "|".join(
            "%s_%s" % (item['Key'], item['Val'])
            for item in req.session['init_sync_key']['List']
        )
        param_data = {
            "r": ctime,
            "skey": ticket['skey'],
            "sid": ticket['wxsid'],
            "uin": ticket['wxuin'],
            "deviceid": "e446581143835818",
            "synckey": sync_key,
            "_": ctime,
        }
        try:
            res1 = requests.get(
                url=base_uri,
                params=param_data,
                cookies=_merged_cookies(req),
                timeout=REQUEST_TIMEOUT,
            )
        except requests.exceptions.Timeout:
            # A timed-out poll simply means "nothing yet"; keep pending.
            return HttpResponse(json.dumps(res_data))
        print(res1.text)

        match = _SYNCCHECK_RE.search(res1.text)
        if match is None or match.group(1) != "0":
            # Unparseable answer or non-zero retcode: report the failure in
            # the payload rather than crashing on a missing regex match.
            res_data["code"] = 500
            res_data["data"] = "sync check failed"
        elif match.group(2) == "2":
            res_data["code"] = 200
            res_data["data"] = "get msg"
    elif pending == "0":
        base_get_msg_url = "https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxsync?sid={0}&skey={1}&lang=zh_CN&pass_ticket={2}"
        get_msg_url = base_get_msg_url.format(
            ticket['wxsid'], ticket['skey'], ticket['pass_ticket']
        )
        msg_data = {
            "BaseRequest": _base_request(req, "e994644199968030"),
            "SyncKey": req.session['init_sync_key'],
            "rr": 545089680,
        }
        res1 = requests.post(url=get_msg_url, json=msg_data, timeout=REQUEST_TIMEOUT)
        res1.encoding = "utf-8"
        friend_data = json.loads(res1.text)

        # Each sync returns a new SyncKey that the next poll must use.
        req.session['init_sync_key'] = friend_data['SyncKey']

        message_list = []
        for msg in friend_data['AddMsgList']:
            message_list.append(
                "{0}--->{1}:{2}".format(
                    msg["FromUserName"], msg['ToUserName'], msg['Content']
                )
            )
            print(msg["FromUserName"], "--->", msg['ToUserName'], ":", msg['Content'])
        res_data["code"] = 202
        res_data["data"] = "||".join(message_list)

    return HttpResponse(json.dumps(res_data))
{# Login page: shows the QR code and polls the server until the login is confirmed. #}
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>login</title>
</head>
<body>
<img src="https://login.weixin.qq.com/qrcode/{{ qcode }}" alt="">
<script type="text/javascript" src="/static/jquery-1.12.4.js"></script>
<script>
    // 1 while waiting for a scan, 0 once scanned and waiting for confirmation.
    var TIP = 1;

    function checkLogin() {
        $.ajax({
            // Trailing slash matches the ^check_login/$ route directly,
            // avoiding a redirect (or a 404) on every poll.
            url: "/check_login/",
            type: "get",
            data: {"TIP": TIP},
            dataType: "JSON",
            success: function (args) {
                console.log(args.code);
                if (args.code == 408) {
                    // Nobody has scanned yet: keep polling.
                    checkLogin();
                } else if (args.code == 201) {
                    // Scanned: show the avatar and wait for confirmation.
                    $("img").attr("src", args.data);
                    TIP = 0;
                    checkLogin();
                } else if (args.code == 200) {
                    // Confirmed: go to the chat page.
                    window.location.href = "/index/"
                }
            },
            error: function (xhr, status) {
                // Polling stops on a failed request; make the reason visible.
                console.log("check_login failed: " + status);
            }
        })
    }

    checkLogin();
</script>
</body>
</html>
{# Landing page after login: the user's nickname, recent contacts and subscription articles. #}
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
{# Profile summary and the contacts returned with the init data #}
<div>
    <h3>个人信息,{{ user_data.User.NickName }}</h3>
    <ol>
        {% for info in user_data.ContactList %}
        <li>{{ info.NickName }}</li>
        {% endfor %}
    </ol>
    <a href="/contact_all/">更多联系人</a>
</div>

{# One heading per subscribed account, followed by its article links #}
<h3>公众号信息</h3>
{% for msg in user_data.MPSubscribeMsgList %}
<h4>{{ msg.NickName }}</h4>
<ol>
    {% for item in msg.MPArticleList %}
    <li><a href="{{ item.Url }}">{{ item.Title }}</a></li>
    {% endfor %}
</ol>
{% endfor %}
</body>
</html>
{# Chat page: send a message, show received messages, and list every contact. #}
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
<div>
    <h3>发消息</h3>
    <p> 收件人:<input id="recv" type="text"> </p>
    <p> 内容:<input id="content" type="text"> </p>
    <p> <button id="btn">发送</button> </p>
</div>
<div>
    <h3>收消息</h3>
    <ol id="msg"> </ol>
</div>
<div>
    <h3>联系人</h3>
    <ol>
        {% for user in user_dict.MemberList %}
        <li>{{ user.NickName }}-------------->{{ user.UserName }}</li>
        {% endfor %}
    </ol>
</div>
<script type="application/javascript" src="/static/jquery-1.12.4.js"></script>
<script>
    $(function () {
        // Send the typed message to the chosen recipient.
        $("#btn").click(function () {
            console.log("test");
            var recv = $("#recv").val();
            var content = $("#content").val();
            $.ajax({
                url: "/send_msg/",
                type: "get",
                data: {"recv": recv, "content": content},
                dataType: "JSON",
                success: function (args) {
                }
            })
        });

        // 1 = listen for new messages, 0 = fetch the messages that arrived.
        var PENDING = 1;

        function checkMsg() {
            $.ajax({
                url: "/check_msg/",
                type: "get",
                data: {"PENDING": PENDING},
                dataType: "JSON",
                success: function (args) {
                    if (args.code == 200) {
                        // Messages are waiting: fetch them on the next call.
                        PENDING = 0;
                        checkMsg();
                    } else if (args.code == 201) {
                        PENDING = 1;
                        console.log("pending...");
                        checkMsg();
                    } else if (args.code == 202) {
                        // Message text comes from other users, so insert it
                        // as text, never as markup, to prevent HTML/script
                        // injection.
                        $("<li>").text(args.data).appendTo("#msg");
                        PENDING = 1;
                        console.log("pending...");
                        checkMsg();
                    }
                }
            })
        }

        checkMsg();
    })
</script>
</body>
</html>
浙公网安备 33010602011771号