网站更新内容:请访问: https://bigdata.ministep.cn/

html文本添加style格式

为 Python requests 读取的网页内容添加 CSS 样式

方法一:

# -*- coding: utf-8 -*-
"""
requests结果集的css格式化处理
"""
## common/handle_markdown_style.py
import sys, os
# Resolve the parent of the directory that contains this file and append it
# to sys.path so modules located there can be imported.
base_path = os.path.dirname(os.path.abspath(__file__))
base_path = os.path.dirname(base_path)
sys.path.append(base_path)

def handle_style_link(title,content,file='common/common.css'):
    """Prepend an HTML ``<head>`` with an inlined stylesheet to *content*.

    Args:
        title: Text placed inside the ``<title>`` element. It is converted
            to ``str`` and HTML-escaped so characters such as ``<`` or ``&``
            cannot break the generated markup.
        content: HTML fragment (``str``) that follows the generated head.
        file: Path of the CSS file whose text is inlined into ``<style>``.
            Defaults to ``'common/common.css'``, resolved against the
            current working directory.

    Returns:
        The head markup concatenated with *content*.

    Raises:
        OSError: If *file* cannot be opened.
    """
    # Function-scope import keeps the module's import block untouched.
    from html import escape

    # Read the stylesheet as UTF-8 explicitly: the generated head declares
    # charset UTF-8, and the platform default encoding may differ.
    with open(file, "r", encoding="utf-8") as f:
        style = f.read()
    # str.format only parses the template, so braces inside the CSS text
    # passed as a value are inserted verbatim.
    head_html = """
    <head>
      <meta charset="UTF-8">
      <meta name="viewport" content="width=device-width, initial-scale=1.0">
      <title>{title}</title>
      <script src="http://code.jquery.com/jquery-latest.js"></script>
      <style>{style}</style>
    </head>
    """.format(title=escape(str(title), quote=False),style=style)
    return head_html+content

#title = 'a'
#file='common.css'
#content ='<p>hello world<p>'
#html = handle_style_link(title,content,file)
#print(html)


def handle_style_css(content):
    """Return *content* prefixed with a fixed ``<head>`` block.

    The head carries the charset/viewport meta tags, a ``Document`` title,
    a jQuery script tag and an inline stylesheet for ``code``, ``pre``,
    ``pre code``, ``pre:before`` and ``p``.
    """
    # Alternative (disabled) background for pre:before:
    # url(https://images.cnblogs.com/cnblogs_com/echolun/1775155/t_20052715354197e4eed2-a992-4976-acf0-ccb6fb34d308.png?a=1590593770539)
    styled_head = """
    <head>
      <meta charset="UTF-8">
      <meta name="viewport" content="width=device-width, initial-scale=1.0">
      <title>Document</title>
      <script src="http://code.jquery.com/jquery-latest.js"></script>

        <style>
    
        code {
            font-family: "Courier New",sans-serif !important;
            font-size: 12px !important;
            line-height: 20px;
            background-color: #f5f5f5 !important;
            border: 1px solid #ccc !important;
            padding: 0 5px !important;
            border-radius: 3px !important;
            line-height: 1.8;
            margin: 1px 5px;
            vertical-align: middle;
            display: inline-block;
            overflow-x: auto;
        }
    pre{
      box-shadow: rgba(0, 0, 0, 0.55) 0px 0px 10px !important;
      margin:10px;
    }
    
    pre code{
        border: none !important;
        padding: 15px !important;
        display: block;
        overflow-x: auto;
        background-color: #f5f5f5 !important;
        margin: auto;
        height: auto;
    }

    pre:before{
            content: '';
            display: block;
            background: url(https://mmbiz.qpic.cn/mmbiz_png/TR2XGABn6Mo5uaCDliaPCyBgiaxaYyUj7Yv6IBUstpDjx81XffnGILwibd7jEmkaexGGck50OrIRkz8JQcJiakPgBg/640?wx_fmt=png);
            height: 35px;
            width: 100%;
            background-size: 40px;
            background-repeat: no-repeat;
            background-color: #fff;
            margin-bottom: -7px;
            border-radius: 5px;
            background-position: 10px 10px;
        }
        
    p{
            font-size:14px;
     }    
        </style>
    </head>

    """
    # Plain concatenation: the head always precedes the caller's markup.
    return styled_head + content

##cnblogs.py
from common import handle_markdown_style as handle_style

# NOTE(review): page_title, content and etree are not defined in this
# snippet; presumably they come from the URL-reading code (get_html_loop /
# get_content) — confirm against the full script.
title = page_title
# Only the first matched node is serialised.
post = content[0]
content = etree.tostring(post, encoding=str)
html = handle_style.handle_style_link(title,content)

# The generated head declares <meta charset="UTF-8">, so write the file as
# UTF-8 explicitly instead of relying on the platform default encoding.
with open("post4.html", "w", encoding="utf-8") as f:
    f.write(html)

另外一种方法:不读取外部 css 文件,直接把 CSS 样式内嵌写在代码里

def handle_style_css(content):
    """Prefix *content* with a hard-coded ``<head>`` and return the result.

    The head contains the charset/viewport meta tags, a ``Document`` title,
    a jQuery script tag and an inline stylesheet covering ``code``, ``pre``,
    ``pre code``, ``pre:before`` and ``p``.
    """
    # Alternative (disabled) background for pre:before:
    # url(https://images.cnblogs.com/cnblogs_com/echolun/1775155/t_20052715354197e4eed2-a992-4976-acf0-ccb6fb34d308.png?a=1590593770539)
    inline_head = """
    <head>
      <meta charset="UTF-8">
      <meta name="viewport" content="width=device-width, initial-scale=1.0">
      <title>Document</title>
      <script src="http://code.jquery.com/jquery-latest.js"></script>

        <style>
    
        code {
            font-family: "Courier New",sans-serif !important;
            font-size: 12px !important;
            line-height: 20px;
            background-color: #f5f5f5 !important;
            border: 1px solid #ccc !important;
            padding: 0 5px !important;
            border-radius: 3px !important;
            line-height: 1.8;
            margin: 1px 5px;
            vertical-align: middle;
            display: inline-block;
            overflow-x: auto;
        }
    pre{
      box-shadow: rgba(0, 0, 0, 0.55) 0px 0px 10px !important;
      margin:10px;
    }
    
    pre code{
        border: none !important;
        padding: 15px !important;
        display: block;
        overflow-x: auto;
        background-color: #f5f5f5 !important;
        margin: auto;
        height: auto;
    }

    pre:before{
            content: '';
            display: block;
            background: url(https://mmbiz.qpic.cn/mmbiz_png/TR2XGABn6Mo5uaCDliaPCyBgiaxaYyUj7Yv6IBUstpDjx81XffnGILwibd7jEmkaexGGck50OrIRkz8JQcJiakPgBg/640?wx_fmt=png);
            height: 35px;
            width: 100%;
            background-size: 40px;
            background-repeat: no-repeat;
            background-color: #fff;
            margin-bottom: -7px;
            border-radius: 5px;
            background-position: 10px 10px;
        }
        
    p{
            font-size:14px;
     }    
        </style>
    </head>

    """
    # The caller's markup is appended unchanged after the head.
    return inline_head + content

# NOTE(review): content and etree are not defined in this snippet;
# presumably content is the node list returned by get_content — confirm.
# Only the first matched node is serialised.
post = content[0]
content_html = etree.tostring(post, encoding=str)
content = handle_style_css(content_html)
#content = handle_style_code(content_html)
#content = handle_style_pre_code(content)

# The head added by handle_style_css declares <meta charset="UTF-8">, so
# write the file as UTF-8 explicitly rather than with the platform default.
with open("post4.html", "w", encoding="utf-8") as f:
    f.write(content)

补充:读取url文档

from bs4 import BeautifulSoup
from lxml import etree
import requests 
import re
import time

def get_html_loop(url,loop=2,timeout=10):
    """Fetch *url* and return its HTML text together with the page title.

    The request is attempted up to *loop* times; a new attempt is made only
    when the request itself fails (connection error, timeout, ...). A
    response with a non-200 status ends the loop without retrying.

    Args:
        url: Address to download. ``None`` or an empty string is rejected.
        loop: Maximum number of request attempts.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the call forever.

    Returns:
        A ``(response_html, page_title)`` tuple. Both items are ``None``
        when the URL is invalid, every attempt failed, or the status was
        not 200; ``page_title`` is also ``None`` when the page has no
        ``<title>`` element.
    """
    if not url:
        print('one invalid url is found!')
        # Always return a pair so callers can unpack the result safely.
        return None, None
    print('url:'+url)
    response_html = None
    page_title = None
    for _ in range(loop):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Only network-level failures are retried; anything else
            # propagates instead of being silently swallowed.
            print("Connection refused by the server..")
            print("Let me sleep for 1 second")
            print("ZZzzzz...")
            time.sleep(1)
            print("Was a nice sleep, now let me continue...")
            continue
        response.encoding = 'utf-8'
        if response.status_code == 200:
            response_html = response.text
            # Name the parser explicitly to avoid bs4's "no parser was
            # specified" warning; lxml is already a dependency of this file.
            soup = BeautifulSoup(response_html, 'lxml')
            #page_title = soup.html.head.title.string
            page_title = soup.title.string if soup.title else None
        else:
            print('response from %s is invalid!' % url)
        break
    return response_html,page_title
# url = "https://tv.cctv.com/lm/xwlb/day/20210222.shtml"
# response_html = get_html_loop(url)
# print(response_html)

def get_content(response_html,title_xpath,content_xpath):
    """Parse *response_html* and evaluate two XPath expressions on it.

    Returns a ``(title, content)`` tuple holding the result of
    *title_xpath* and *content_xpath* respectively.
    """
    tree = etree.HTML(response_html)
    title_nodes = tree.xpath(title_xpath)
    content_nodes = tree.xpath(content_xpath)
    return title_nodes, content_nodes

# Demo run: download one post, then print its page title followed by the
# XPath results for the heading text and the body node.
url = 'https://www.cnblogs.com/ministep/p/14548421.html'
title_xpath =  '//*[@id="topics"]/div/h1//text()'
content_xpath=  '//*[@id="topics"]/div/div[2]'

response_html,page_title = get_html_loop(url)
print(page_title)

title,content = get_content(response_html,title_xpath,content_xpath)
# One print call with a newline separator emits the same four lines as
# four separate print calls.
print('*'*30, title, '*'*30, content, sep='\n')
posted @ 2021-03-17 20:01  ministep88  阅读(480)  评论(1)    收藏  举报
网站更新内容:请访问:https://bigdata.ministep.cn/