protobuf逆向解析 Python 或者 C# application/grpc-web+proto

 

先说解决方案吧,总共有两个解决方法

 

一:Python 用 blackboxprotobuf 这个模块 它像个万能解析proto的程序,这个很简单 但有个问题 时不时的会遇到解析一个文件需要一个小时以后才解析完 太费时间

先放干货,再去解释,这个代码直接运行即可

import blackboxprotobuf
import requests
import os
import datetime

# 用 blackboxprotobuf 直接解码

os.environ["NO_PROXY"]='s.wanfangdata.com.cn'
#headers={"User-Agent":UserAgent().chrome}
headers={
    "Accept":"*/*",
    "Accept-Encoding":"gzip, deflate, br",
    "Accept-Language":"zh-CN,zh;q=0.9",
    "Content-Type":"application/grpc-web+proto",
"Origin":"https://s..com.cn",
"host":"s.wanfangdata.com.cn",
    "Referer": "https://s..com.cn/paper?q=%E4%BD%9C%E8%80%85%E5%8D%95%E4%BD%8D%3A%E5%8C%97%E4%BA%AC%E5%A4%A7%E5%AD%A6&p=4",
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}

deserialize_data={
  "1": {
    "1": "paper",
    "2": "作者单位:北京大学",
    "5": 1,
    "6": 20,
    "8": "\u0000"
  },
  "2": 1
}

message_type={'1': {'type': 'message', 'message_typedef': {
                   '1': {'type': 'bytes', 'name': ''},
                   '2': {'type': 'bytes', 'name': ''},
                   '5': {'type': 'int', 'name': ''},
                   '6': {'type': 'int', 'name': ''},
                   '8': {'type': 'bytes', 'name': ''}}, 'name': ''},
               '2': {'type': 'int', 'name': ''}}

form_data = bytes(blackboxprotobuf.encode_message(deserialize_data, message_type))
f_data = bytes([0,0,0,0,len(form_data)])+form_data

url='https://s..com.cn/SearchService.SearchService/search'

res = requests.post(url,data=f_data,headers=headers,timeout=10)


response_data,message_type=blackboxprotobuf.protobuf_to_json(res.content[5:-20])

print(response_data)
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print(message_type)

  

1.首先编写它

图1

 

 

 

 

 

 因为看不懂,所以才要用到接下来的宝贝软件: Fiddler

图2

 

 

 

 

 

 选中的这些右键保存为a.bin 文件,记住 前五位不要,后面再讲为什么

import blackboxprotobuf

with open('d://a.bin',mode="rb") as f:
    proto_info = f.read()
    print(proto_info)
    response_data2, message_type2 = blackboxprotobuf.protobuf_to_json(proto_info)
    print(response_data2)
    print(message_type2)

图3

 

 

 

看到这懂了吧

 

 

二:安装Proto,写一个检索类A,A里赋值 当成参数 post过去 得到返回数据流,  自己编写解析的一个类B,拿到传回来的bytes[] 后 用这个类B解一下;A与B怎么写下面讲

1.创建文件夹,request.proto 文件

 

 

 里面是这个

syntax = "proto3";


message SearchService {
    enum SearchFilter {
        B = 0;
    }

    //排序用
    message CommonRequest3{
        string a1=1;//出版时间
        int32 a2=2; //1
    }

    message CommonRequest4{
        string a1=1;
        string a2=2; //(Periodical OR Thesis OR Conference)
    }

    message CommonRequest {
        string searchType = 1;
        string searchWord = 2;
        CommonRequest3 a3=3;
        CommonRequest4 a4=4;
        int32 currentPage = 5;
        int32 pageSize = 6;
        repeated SearchFilter searchFilter = 8;
    }
    enum InterfaceType {
        // 定义了什么不知道,但是enum必须有一个值就是0
        DEFAUTL = 0;
    }
    message SearchRequest {
        CommonRequest commonrequest = 1; // 任意变量名
        int32 interfaceType = 2; // 任意变量名
    }
}

此处写入CMD回车

 2. 创建 info.proto

 

 

 

 

syntax = "proto3";


message SearchInfo{
int32 a1 = 1;
string a2 = 2;
int32 a3 = 3;
repeated mb4 a4=4;
mb5 a5=1002;
}

message mb5{
string a1=1;
string a2=2;
mb58 a8=8;
mb59 a9=9;
int32 a12=12;
int32 a13=13;
int32 a16=16;
int32 a18=18;
}

message mb58{
repeated string a1=1;
string a2=2;
string a3=3;
}

message mb59{
string a1=4;
}

message mb4{
string a1=1;
repeated mb42 a2=2;
string a3=3;
bytes a4=101;//期刊
bytes a5=104;//会议
bytes a6=102;//学位--暂时不查
}




message mb4101{
string a1=1;
string a2=2;
repeated string a3=3;
string a4=4;
repeated string a5=5;
repeated string a6=6;
string a8=8;
string a9=9;
string a10=10;
string a12=12;
repeated string a13=13;
repeated string a16=16;
repeated string a18=18;
string a20=20;
string a22=22;
repeated string a24=24;
string a25=25;
repeated string a27=27;
string a28=28;
string a29=29;
string a30=30;
int32 a31=31;
int32 a32=32;
int32 a33=33;
string a34=34;
string a35=35;
string a36=36;
string a37=37;
string a38=38;
repeated string a39=39;
string a40=40;
string a41=41;
repeated string a42=42;
string a43=43;
string a44=44;
string a45=45;
string a46=46;
int32 a47=47;
int32 a48=48;
int32 a49=49;
int32 a50=50;
string a53=53;
string a54=54;
string a55=55;
int32 a56=56;
string a57=57;
repeated string a58=58;
fixed32 a61=61;
int32 a62=62;
bytes a66=66;
string a67=67;
fixed32 a68=68;
fixed32 a69=69;
}


message mb42{
int32 a1=1;
int32 a2=2;
}

 

生成py文件,此处CMD

 

 

 

 

 

代码:test3.py  运行就可以了

import protobuf.info_pb2 as pbinfo
#import protobuf.journal_pb2 as pbjournal # 导包
import protobuf.request_pb2 as pb # 导包
import requests


search_request = pb.SearchService.SearchRequest()
search_request.commonrequest.searchType = "paper"
search_request.commonrequest.searchWord = '作者单位:(北京大学) or 作者单位:(湖北大学)'
search_request.commonrequest.pageSize = 20
search_request.commonrequest.currentPage = 1
search_request.commonrequest.a4.a1 = "Type"
search_request.commonrequest.a4.a2 = "(Periodical OR Thesis OR Conference)"
search_request.commonrequest.a3.a1="出版时间"
search_request.commonrequest.a3.a2=1

search_request.commonrequest.searchFilter.append(0)
search_request.interfaceType = 1


headers = {
    'Referer': 'xxxx',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Content-Type': 'application/grpc-web+proto',
}


bytes_body = search_request.SerializeToString()

# 构造字节序列的头部
bytes_head = bytes([0, 0, 0, 0, len(bytes_body)])

resp = requests.post(url="https://s..com.cn/SearchService.SearchService/search",
            data=bytes_head+bytes_body,
            headers=headers)

proto_info = resp.content

print('---------------3------------------- PB4 解析的内容和格式')
first_parsed2 = pbinfo.SearchInfo()
first_parsed2.ParseFromString(proto_info[5:-20])
print(first_parsed2)
#print(first_parsed2.a4[0].a5)
print(len(first_parsed2.a4))


print('---------------4.2------------------- pbjournal 解析的内容和格式')
#first_parsedj2 = pbjournal.JournalInfo()
#first_parsedj2.ParseFromString(first_parsed2.a4[1].a4)
#print(first_parsedj2)

 

 

三 C#解析

 

运行这两句 编译成cs文件

protoc -I=./ --csharp_out=./ request.proto

protoc -I=./ --csharp_out=./ info.proto

 

public class bll
    {#region  根据单位和年下载数据
        /// <summary>
        /// 根据单位和年下载数据
        /// </summary>
        /// <param name="stype">检索类型:issn,org,term 检索式检索</param>
        /// <param name="stxt">北京大学/1111-1111</param>
        /// <param name="year">空或者2018</param>
        public static void CatchInfo(string stype,string stxt, string year = "")
        {


Console.Write($"{stxt}====开始下载 获取总条数:"); int pageSize = 50; int rowCount = GetRowCount(stype,stxt, year, year); Console.WriteLine($"{rowCount}"); Console.WriteLine(); --后面的我给删了,不能写太全了 } /// <summary> /// 返回总条数 /// </summary> /// <param name="stype">检索类型:issn,org,term 检索式检索</param> /// <param name="stxt">北京大学/1111-1111</param> /// <param name="year">空或者2018</param> /// <returns></returns> static int GetRowCount(string stype, string stxt, string syear, string eyear) { SearchInfo info = GetInfo(stype, stxt, syear, eyear, 1); int count = info.A3; return count; } /// <summary> /// 编辑检索式去检索得到返回数据 /// </summary> /// <param name="stype">检索类型:issn,org,term 检索式检索</param> /// <param name="stxt">北京大学/1111-1111</param> /// <param name="syear"></param> /// <param name="eyear"></param> /// <param name="page"></param> /// <returns></returns> static SearchInfo GetInfo(string stype,string seartxt, string syear, string eyear, int page) { //1.参数 SearchRequest sr = new SearchRequest(); CommonRequest common = new CommonRequest(); common.SearchType = "paper"; string stypr2 = ""; stypr2 = $"作者单位:(\"{seartxt}\")"; common.SearchWord = term; common.CurrentPage = page; common.PageSize = 50; common.SearchFilter.Add(0); CommonRequest4 a4 = new CommonRequest4(); a4.A1 = "Type"; a4.A2 = "(Periodical OR Thesis OR Conference)";//(Periodical OR Thesis OR Conference) common.A4 = a4; CommonRequest3 a3 = new CommonRequest3(); a3.A1 = "出版时间"; a3.A2 = 1; common.A3 = a3; sr.InterfaceType = 1; sr.Commonrequest = common; byte[] byteData = sr.ToByteArray(); //2.参数前加5个0 int d = byteData.Length; byte[] b1 = new byte[] { 0, 0, 0, 0, Convert.ToByte(d) }; byte[] b2 = byteData; byte[] b3 = new byte[b1.Length + b2.Length]; b1.CopyTo(b3, 0); b2.CopyTo(b3, b1.Length); string byteDataStr = Encoding.UTF8.GetString(b3); string url = "https://s..com.cn/SearchService.SearchService/search"; Stream stream = GetHtmlByPostUrl3(url, b3); MemoryStream ms2 = StreamToMemoryStream(stream); //掐头去尾 前5和后20 byte[] btt1 = StreamToBytes(ms2); byte[] btt2 = btt1.Skip(5).Take(btt1.Length - 25).ToArray(); //byte[] btt3 = btt2.Take(btt1.Length - 25).ToArray(); SearchInfo dat2 = SearchInfo.Parser.ParseFrom(btt2);//反序列化 return dat2; } #endregion #region 工具 static Stream GetHtmlByPostUrl3(string url, byte[] byteData, int err = 1) { try { //实例化编码方式 //UTF8Encoding encoding = new UTF8Encoding(); Encoding encoding = Encoding.GetEncoding("utf-8"); //根据请求链接参数需求对参数字符串转为二进制字符组 //byte[] byteData = encoding.GetBytes(postData); //创建请求对象 HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url); request.Host = "s.wanfangdata.com.cn"; //请求用户代理 request.UserAgent = " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"; //请求头协议方式 request.Accept = "*/*"; //请求头语言 request.Headers.Add("Accept-Language", "zh-CN,zh;q=0.9"); //请求类型 request.ContentType = "application/grpc-web+proto"; //设置请求数据长度 request.ContentLength = byteData.Length; request.Method = "POST"; request.Referer = "https://s.wanfangdata.com.cn/advanced-search/paper"; request.CookieContainer = new CookieContainer(); //设置请求等待时间 request.Timeout = 100000; request.ReadWriteTimeout = 100000; //将请求到数据放入到流中 Stream reqStream = request.GetRequestStream(); //输出流 reqStream.Write(byteData, 0, byteData.Length); //返回对象 HttpWebResponse objResponse = (HttpWebResponse)request.GetResponse(); Stream streamResponse = objResponse.GetResponseStream(); //关闭流 reqStream.Close(); return streamResponse; } catch (Exception e) { } return null; } static MemoryStream StreamToMemoryStream(Stream stream) { MemoryStream memoryStream = new MemoryStream(); //将基础流写入内存流 const int bufferLength = 1024; byte[] buffer = new byte[bufferLength]; int actual = stream.Read(buffer, 0, bufferLength); while (actual > 0) { // 读、写过程中,流的位置会自动走。 memoryStream.Write(buffer, 0, actual); actual = stream.Read(buffer, 0, bufferLength); } memoryStream.Position = 0; return memoryStream; } static byte[] StreamToBytes(Stream stream) { byte[] bytes = new byte[stream.Length]; stream.Read(bytes, 0, bytes.Length); stream.Seek(0, SeekOrigin.Begin); return bytes; } #endregion }

 

 

 

我是遇到响应类型 application/grpc-web+proto 这个后没见过,才研究的

 

上面有个BUG,

byte[] b1 = new byte[] { 0, 0, 0, 0, Convert.ToByte(d) };
d当d>256的时候就报错了,也就是检索条件长的话就不能检索了,现在补充一下
取余数,并且位移8

 

posted @ 2022-07-25 13:04  荧屏  阅读(1404)  评论(0)    收藏  举报