为 Python requests 读取的网页内容添加 CSS 样式
方法一:
# -*- coding: utf-8 -*-
"""
Apply CSS styling to the HTML content fetched with requests.
"""
## common/handle_markdown_style.py
import sys, os
# Make the parent of this file's directory importable, so that package-style
# imports such as ``from common import ...`` resolve when run as a script.
base_path = os.path.dirname(os.path.abspath(__file__))  # directory of this file
base_path = os.path.dirname(base_path)  # one level up
sys.path.append(base_path)
def handle_style_link(title, content, file='common/common.css'):
    """Prepend an HTML ``<head>`` that inlines a CSS file to ``content``.

    Args:
        title: Text placed verbatim inside the ``<title>`` element.
        content: HTML fragment (str) that the generated head is prepended to.
        file: Path of the stylesheet to inline. Defaults to
            ``'common/common.css'``; a relative path is resolved against the
            current working directory.

    Returns:
        The generated head markup followed by ``content``.

    Raises:
        OSError: If the stylesheet cannot be opened.
    """
    # Read the stylesheet explicitly as UTF-8: the generated head declares
    # charset UTF-8, and the platform default encoding is not reliable.
    with open(file, "r", encoding="utf-8") as f:
        style = f.read()
    head_html = """
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{title}</title>
<script src="http://code.jquery.com/jquery-latest.js"></script>
<style>{style}</style>
</head>
""".format(title=title, style=style)
    return head_html + content
#title = 'a'
#file='common.css'
#content ='<p>hello world</p>'
#html = handle_style_link(title,content,file)
#print(html)
def handle_style_css(content):
    """Return ``content`` with a fixed ``<head>`` (meta tags, jQuery, inline CSS) in front.

    The stylesheet is embedded directly in the head markup, so no external
    CSS file is read.

    Args:
        content: HTML fragment (str) to place after the generated head.

    Returns:
        The head markup concatenated with ``content``.
    """
    # Alternative image for the ``pre:before`` background, kept for reference:
    # url(https://images.cnblogs.com/cnblogs_com/echolun/1775155/t_20052715354197e4eed2-a992-4976-acf0-ccb6fb34d308.png?a=1590593770539)
    styled_head = """
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Document</title>
<script src="http://code.jquery.com/jquery-latest.js"></script>
<style>
code {
font-family: "Courier New",sans-serif !important;
font-size: 12px !important;
line-height: 20px;
background-color: #f5f5f5 !important;
border: 1px solid #ccc !important;
padding: 0 5px !important;
border-radius: 3px !important;
line-height: 1.8;
margin: 1px 5px;
vertical-align: middle;
display: inline-block;
overflow-x: auto;
}
pre{
box-shadow: rgba(0, 0, 0, 0.55) 0px 0px 10px !important;
margin:10px;
}
pre code{
border: none !important;
padding: 15px !important;
display: block;
overflow-x: auto;
background-color: #f5f5f5 !important;
margin: auto;
height: auto;
}
pre:before{
content: '';
display: block;
background: url(https://mmbiz.qpic.cn/mmbiz_png/TR2XGABn6Mo5uaCDliaPCyBgiaxaYyUj7Yv6IBUstpDjx81XffnGILwibd7jEmkaexGGck50OrIRkz8JQcJiakPgBg/640?wx_fmt=png);
height: 35px;
width: 100%;
background-size: 40px;
background-repeat: no-repeat;
background-color: #fff;
margin-bottom: -7px;
border-radius: 5px;
background-position: 10px 10px;
}
p{
font-size:14px;
}
</style>
</head>
"""
    return styled_head + content
##cnblogs.py
from common import handle_markdown_style as handle_style
# Serialize the first matched content node, wrap it with the stylesheet head
# and save the result as a standalone HTML file.
title = page_title
post = content[0]
content = etree.tostring(post, encoding=str)
html = handle_style.handle_style_link(title, content)
# Write as UTF-8 so the bytes on disk match the <meta charset="UTF-8">
# declared in the generated head, whatever the platform default encoding is.
with open("post4.html", "w", encoding="utf-8") as f:
    f.write(html)
方法二:不读取外部 CSS 文件,直接在代码中内联样式
def handle_style_css(content):
    """Prefix ``content`` with a hard-coded ``<head>`` carrying inline CSS.

    The head contains the charset/viewport meta tags, a jQuery script tag and
    the style rules for ``code``, ``pre`` and ``p`` elements.

    Args:
        content: HTML fragment (str) that follows the generated head.

    Returns:
        The head markup concatenated with ``content``.
    """
    # Alternative image for the ``pre:before`` background, kept for reference:
    # url(https://images.cnblogs.com/cnblogs_com/echolun/1775155/t_20052715354197e4eed2-a992-4976-acf0-ccb6fb34d308.png?a=1590593770539)
    inline_head = """
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Document</title>
<script src="http://code.jquery.com/jquery-latest.js"></script>
<style>
code {
font-family: "Courier New",sans-serif !important;
font-size: 12px !important;
line-height: 20px;
background-color: #f5f5f5 !important;
border: 1px solid #ccc !important;
padding: 0 5px !important;
border-radius: 3px !important;
line-height: 1.8;
margin: 1px 5px;
vertical-align: middle;
display: inline-block;
overflow-x: auto;
}
pre{
box-shadow: rgba(0, 0, 0, 0.55) 0px 0px 10px !important;
margin:10px;
}
pre code{
border: none !important;
padding: 15px !important;
display: block;
overflow-x: auto;
background-color: #f5f5f5 !important;
margin: auto;
height: auto;
}
pre:before{
content: '';
display: block;
background: url(https://mmbiz.qpic.cn/mmbiz_png/TR2XGABn6Mo5uaCDliaPCyBgiaxaYyUj7Yv6IBUstpDjx81XffnGILwibd7jEmkaexGGck50OrIRkz8JQcJiakPgBg/640?wx_fmt=png);
height: 35px;
width: 100%;
background-size: 40px;
background-repeat: no-repeat;
background-color: #fff;
margin-bottom: -7px;
border-radius: 5px;
background-position: 10px 10px;
}
p{
font-size:14px;
}
</style>
</head>
"""
    return inline_head + content
# Serialize the first matched content node, prepend the inline-CSS head and
# save the result as a standalone HTML file.
post = content[0]
content_html = etree.tostring(post, encoding=str)
content = handle_style_css(content_html)
#content = handle_style_code(content_html)
#content = handle_style_pre_code(content)
# Write as UTF-8 so the bytes on disk match the <meta charset="UTF-8">
# declared in the generated head, whatever the platform default encoding is.
with open("post4.html", "w", encoding="utf-8") as f:
    f.write(content)
补充:读取url文档
from bs4 import BeautifulSoup
from lxml import etree
import requests
import re
import time
def get_html_loop(url, loop=2, timeout=10, retry_delay=1):
    """Fetch ``url`` and return its HTML text and page title, retrying on request errors.

    Args:
        url: Address to request. ``None`` or an empty string is rejected.
        loop: Maximum number of attempts.
        timeout: Seconds passed to ``requests.get`` so that a stalled server
            cannot block the call indefinitely.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        A ``(response_html, page_title)`` tuple. Both items are ``None`` when
        the URL is invalid, every attempt failed, or the server answered with
        a status other than 200. ``page_title`` is also ``None`` when the page
        has no ``<title>`` element.
    """
    response_html = None
    page_title = None
    if not url:
        print('one invalid url is found!')
        # Return a pair here too, so callers can always unpack two values.
        return response_html, page_title
    print('url:' + url)
    for attempt in range(loop):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            # Only network-level failures are retried; programming errors
            # are no longer hidden by a bare ``except``.
            print("Connection refused by the server.. (%s)" % err)
            if attempt + 1 < loop:
                print("Let me sleep for %s seconds" % retry_delay)
                print("ZZzzzz...")
                time.sleep(retry_delay)
                print("Was a nice sleep, now let me continue...")
            continue
        if response.status_code == 200:
            # The body is decoded as UTF-8 regardless of the response headers.
            response.encoding = 'utf-8'
            response_html = response.text
            # Name the parser explicitly; lxml is already imported by this file.
            soup = BeautifulSoup(response_html, 'lxml')
            title_tag = soup.title
            page_title = title_tag.string if title_tag is not None else None
        else:
            print('response from %s is invalid!' % url)
        # A response was received (valid or not): stop retrying.
        break
    return response_html, page_title
# url = "https://tv.cctv.com/lm/xwlb/day/20210222.shtml"
# response_html = get_html_loop(url)
# print(response_html)
def get_content(response_html, title_xpath, content_xpath):
    """Evaluate two XPath expressions against an HTML document.

    Args:
        response_html: HTML text to parse. ``None`` or an empty string (for
            example after a failed fetch) yields empty results.
        title_xpath: XPath expression selecting the title.
        content_xpath: XPath expression selecting the content.

    Returns:
        A ``(title, content)`` tuple holding the result of each XPath query;
        ``([], [])`` when there is nothing to parse.
    """
    if not response_html:
        return [], []
    html = etree.HTML(response_html)
    # NOTE(review): guard for the parser producing no root element for
    # input without usable markup -- confirm against the lxml version in use.
    if html is None:
        return [], []
    title = html.xpath(title_xpath)
    content = html.xpath(content_xpath)
    return title, content
# Demo run: download one cnblogs post, then pull out its title and body
# nodes with XPath and print both.
title_xpath = '//*[@id="topics"]/div/h1//text()'
content_xpath = '//*[@id="topics"]/div/div[2]'
url = 'https://www.cnblogs.com/ministep/p/14548421.html'

response_html, page_title = get_html_loop(url)
print(page_title)

title, content = get_content(response_html, title_xpath, content_xpath)
# Each result is preceded by a 30-asterisk separator line.
print('*' * 30, title, '*' * 30, content, sep='\n')