# 爬虫起点css破解 (Qidian crawler: decoding the CSS font obfuscation)

#均为个人原创,转载请注明出处:https://www.cnblogs.com/HugJun/p/11506270.html

import requests,time,re,pprint
from fontTools.ttLib import TTFont
from io import BytesIO

def get_font(url, timeout=10):
    """Download a web font and return its code-point-to-glyph-name mapping.

    :param url: address of the font file to download.
    :param timeout: seconds to wait for the server; without a timeout
        ``requests`` may block forever on a stalled connection.
    :return: dict mapping Unicode code points (int) to glyph names (str) as
        produced by ``TTFont.getBestCmap()``; an empty dict when the font
        carries no Unicode cmap subtable.
    :raises requests.RequestException: on network failure, timeout or an
        HTTP error status.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the font parser.
    resp.raise_for_status()
    font = TTFont(BytesIO(resp.content))
    try:
        # getBestCmap() yields None when no Unicode subtable exists; normalise
        # to an empty dict so callers can always iterate over the result.
        return font.getBestCmap() or {}
    finally:
        # Release the font's underlying reader even if parsing the cmap fails.
        font.close()


def get_html_info(url, timeout=10):
    """Fetch a book page and locate the font file declared in its inline CSS.

    :param url: address of the book detail page.
    :param timeout: seconds to wait for the server before giving up.
    :return: tuple ``(font_url, html)`` - the font URL that precedes the
        ``truetype`` format marker in the first inline ``<style>`` block,
        and the page source text.
    :raises ValueError: if the page has no inline ``<style>`` block or the
        block does not contain the expected font declaration.
    """
    headers = {
        # Only the header value belongs here; prefixing it with "User-Agent:"
        # would send a malformed browser signature.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    html_data = requests.get(url, headers=headers, timeout=timeout)
    page = html_data.text

    # Raw strings keep "\s" a regex escape rather than an invalid string escape.
    style_blocks = re.findall(r'<style>(.*?)\s*</style>', page, re.S)
    if not style_blocks:
        raise ValueError('no inline <style> block found in page: ' + url)

    # Skip past the woff entry and capture the quoted URL that comes right
    # before the "truetype" format marker.
    font_match = re.search(r"woff.*?url.*?'(.+?)'.*?truetype", style_blocks[0])
    if font_match is None:
        raise ValueError('no truetype font URL found in page: ' + url)

    url_ttf = font_match.group(1)
    print(url_ttf)
    return url_ttf, page

def get_encode_font(data, web_font_relation):
    """Decode the font-obfuscated digits in a page and extract its statistics.

    Every ``&#<code point>;`` entity whose glyph name denotes a digit or the
    decimal point is replaced by that character; the three counters are then
    read from the decoded markup, printed and returned.

    :param data: page source text.
    :param web_font_relation: dict mapping code points (int) to glyph names
        (str), e.g. ``{100236: 'zero'}``.
    :return: tuple ``(total_num, total_recommend, week_recommend)`` of ints:
        word count, total recommendations and weekly recommendations.
    :raises ValueError: if one of the statistics cannot be found or parsed.
    """
    glyph_to_char = {
        'zero': '0',
        'one': '1',
        'two': '2',
        'three': '3',
        'four': '4',
        'five': '5',
        'six': '6',
        'seven': '7',
        'eight': '8',
        'nine': '9',
        'period': '.',
    }

    for code_point, glyph_name in web_font_relation.items():
        entity = '&#' + str(code_point) + ';'
        # Test for the full entity, not just the digits of the code point,
        # which can occur anywhere in the page by coincidence.
        if entity not in data:
            continue
        char = glyph_to_char.get(glyph_name)
        if char is None:
            # Glyph outside the digit set: leave the entity untouched
            # instead of failing on an unknown name.
            continue
        print(code_point, glyph_name)
        data = data.replace(entity, char)

    total_num = _parse_count(data, '字')  # total word count
    total_recommend = _parse_count(data, '总推荐')  # total recommendations
    week_recommend = _parse_count(data, '周推荐')  # weekly recommendations
    print(total_num, total_recommend, week_recommend)
    return total_num, total_recommend, week_recommend


def _parse_count(data, label):
    """Return the integer shown before ``label``, scaled when the unit is 万 (10,000)."""
    match = re.search(
        r'<span class="[^"]*"[^>]*>([^<]+)</span></em><cite>(万?)' + re.escape(label),
        data,
    )
    if match is None:
        raise ValueError('statistic not found in page: ' + label)
    number, unit = match.groups()
    # Scale only when the page actually shows the 万 unit.
    scale = 10000 if unit else 1
    # round() before int() so float error (x.xx * 10000 landing just below an
    # integer) cannot truncate the result by one.
    return int(round(float(number) * scale))


def main(url):
    """Run the whole pipeline for one book page.

    Fetches the page, downloads the font it references, then decodes and
    prints the page's statistics.

    :param url: address of the book detail page.
    """
    font_url, page_source = get_html_info(url)
    glyph_map = get_font(font_url)
    get_encode_font(page_source, glyph_map)

# Script entry point.
if __name__ == '__main__':
    # Pick one novel's detail page to process.
    url = 'https://book.qidian.com/info/1115277'
    main(url)

 

# posted @ 2019-09-11 14:55  小君~  阅读(374)  评论(0)  编辑  收藏  举报