微信公众号文章:HTML 格式转为 Markdown 格式
注意
微信HTML格式是极其混乱的,可能会出现a标签里面出现h标签,span标签里面包含p标签等各种情况,所以请仔细核对转换后的md文件内容
核心代码
文件名wx_html_to_md.py
点击查看代码
import re
from bs4 import Tag, NavigableString
# Module-level switches configured by transform(); None until it runs.
__is_scrap_link = None  # truthy: keep <a> as markdown links; falsy: drop them
__is_scrap_img = None  # truthy: keep <img> as markdown images; falsy: drop them
__is_scrap_table = None  # truthy: keep <table> as markdown tables; falsy: drop them
def __process_space(pagr_text: str):
    """Escape markdown/LaTeX special characters in plain article text.

    Text that already looks like a markdown link (contains ")[") is only
    stripped, since escaping would corrupt the link syntax.
    """
    if ")[" in pagr_text:
        return pagr_text.strip()
    # Swap ASCII brackets/parens for full-width ones so they cannot form links.
    for ascii_ch, wide_ch in (("[", "【"), ("]", "】"), ("(", "("), (")", ")")):
        pagr_text = pagr_text.replace(ascii_ch, wide_ch)
    # Escape a lone $ (not part of $$) and any unescaped {, }, _, ` character.
    pagr_text = re.sub(r"([^$])([$])([^$])", r"\g<1>\\\g<2>\g<3>", pagr_text)
    pagr_text = re.sub(r"([^\\])([\{\}\_\`])", r"\g<1>\\\g<2>", pagr_text)
    pagr_text = pagr_text.replace(r"$*", r"\$\*")
    # cnblogs needs three backslashes to escape $, and {} escaped twice.
    pagr_text = pagr_text.replace(r"\$", r"\\\$")
    if r"$\{" in pagr_text:
        # cnblogs tables additionally need a space after each escaped }.
        pagr_text = pagr_text.replace(r"\}", r"\} ")
    return pagr_text
def md_toc(body: Tag):
    """Swap the table-of-contents container for the markdown [TOC] marker."""
    toc_node = body.select_one(".toc")
    if toc_node:  # bs4 Tag truthiness: an empty container is skipped
        toc_node.replace_with("[TOC]")
def md_u(body: Tag):
    """Render underlined text as an inline <u> element (kept verbatim in md)."""
    for css in ("u", '[style*="text-decoration: underline;"]'):
        for node in body.select(css):
            text = node.get_text(strip=True)
            if not text:
                node and _md_decompose(node)
                continue
            if len(text) > 50:
                # Too long to be a deliberate highlight; leave the tag alone.
                continue
            node.replace_with("<u>%s</u>" % __process_space(text))
def md_h(body: Tag):
    """Convert <h1>/<h2> headings to markdown, demoted by one level.

    h3-h6 are intentionally left out here; md_b() treats them as bold.
    """
    for heading in body.find_all(["h1", "h2"]):
        level = int(heading.name[1]) + 1  # demote: never emit a level-1 title
        text = heading.get_text(strip=True)
        if not text:
            _md_decompose(heading)
            continue
        if len(text) > 50:
            continue
        heading.replace_with("%s %s\n" % ("#" * level, __process_space(text)))
def md_table(body: Tag):
    """Convert each <table> to a markdown pipe table (or drop it).

    Alignment is taken from the inline style of each header cell
    ("left"/"right", anything else centers). A <caption><span> becomes a
    centered title line above the table.
    """
    for table_item in body.find_all("table"):
        if not __is_scrap_table:
            _md_decompose(table_item)
            continue
        desc = table_item.select_one("caption > span") or ""
        if desc:
            # Centered caption; extra blank lines are required or the table
            # below it will not render.
            desc = "\n\n <center> %s </center> \n\n" % desc.get_text()
        ths = table_item.select_one("thead > tr")  # header row
        trs = table_item.select("tbody > tr")  # body rows
        if not ths:  # malformed table without a <thead>
            if not trs:
                # No rows at all: nothing to render, drop the table.
                _md_decompose(table_item)
                continue
            ths, trs = trs[0], trs[1:]
        head = mid = "|"
        for th in ths.find_all(re.compile("th|td")):
            style = th.get("style") or "center"
            head += " " + th.get_text().strip() + " |"
            if "left" in style:
                mid += " :--- |"
            elif "right" in style:
                mid += " ---: |"
            else:
                mid += " :--: |"
        body_text = desc + head + "\n" + mid + "\n"
        for tr in trs:
            row = "|"
            for td in tr.find_all("td"):
                # BUG FIX: this used th.get("style") — the leaked header-loop
                # variable — so a cell's own style was never consulted.
                style = td.get("style") or "center"
                td_text = __process_space(td.get_text().strip())
                # Ends were stripped above, so any remaining \n is a line
                # break inside a cell; markdown cells need <br> instead.
                td_text = td_text.replace("\n\n", " <br> ").replace("\n", " <br> ")
                row += " " + td_text + " |"
            if row != "|":
                body_text += row + "\n"
        table_item.replace_with(body_text)
def md_pre(body: Tag):
    """Rewrite code blocks (<pre>) — intentionally a no-op.

    WeChat's <pre> markup has no reliable visual meaning and is extremely
    messy, so it is left for the paragraph passes to flatten.
    """
    return
def md_code(body: Tag):
    """Wrap inline <code> (anything not inside a <pre>) in backticks."""
    for code_tag in body.find_all("code"):
        if code_tag.parent.name != "pre":
            code_tag.replace_with("`%s`" % code_tag.get_text())
def md_img(body: Tag):
    """Convert <img> tags to markdown images, or drop them.

    Uses .get("src")/.get("data-src") — wx lazy-loads images via data-src.
    """
    for img in body.find_all("img"):
        if not img.decomposed:
            src = img.get("src") or img.get("data-src")
            if __is_scrap_img:
                # BUG FIX: the replacement string had been garbled to an
                # empty f-string, leaving `src` unused; emit the markdown
                # image it was clearly meant to produce.
                img.replace_with(f"![]({src})")
            else:
                _md_decompose(img)
def md_a(body: Tag):
    """Convert anchors to markdown links, or drop them entirely."""
    for anchor in body.find_all("a"):
        if anchor.decomposed:
            continue
        # Use .get("href"): attribute access (a.href) returns None in bs4,
        # same as img.src above.
        href = anchor.get("href")
        if href and href.startswith("#"):
            # In-page jump target (e.g. a CSDN table of contents): keep as-is.
            continue
        if __is_scrap_link:
            anchor.replace_with("[%s](%s)" % (anchor.text, href))
        else:
            _md_decompose(anchor)
def md_ol_ul(body: Tag):
    """Flatten <ul>/<ol> items into markdown list lines."""
    for list_tag in body.find_all(["ul", "ol"]):
        ordered = list_tag.name == "ol"
        counter = 1
        for item in list_tag.find_all("li"):
            text = __process_space(item.get_text(strip=True))
            if not text.strip():
                continue
            if ordered:
                item.replace_with("%s. %s \n" % (counter, text))
                counter += 1
            else:
                item.replace_with("* %s \n" % text)
def md_em(body: Tag):
    """Convert <em> tags to markdown italics."""
    marker = "*"
    for em_tag in body.find_all(["em"]):
        text = em_tag.get_text().strip()
        if not text:
            _md_decompose(em_tag)
            continue
        if text.startswith(marker):
            # Already marked up (e.g. by an earlier pass) — keep as-is.
            em_tag.replace_with(text)
        else:
            em_tag.replace_with("%s%s%s " % (marker, text, marker))
def md_b(body: Tag):
    """Convert bold-styled elements (and h3-h6 headings) to markdown bold."""
    selectors = (
        "strong",
        "b",
        "h3",
        "h4",
        "h5",
        "h6",
        '[style*="font-weight: bold;"]',
        '[style*="font-weight:bold;"]',
        '[style*="font-family:黑体;"]',
        # '[style*="font-family:微软雅黑;"]',  # not always bold in practice
    )
    # Deduplicate across selectors by object identity: the same node can
    # match several patterns (e.g. <strong style="font-weight: bold;">).
    seen_ids = set()
    bold_nodes = []
    for selector in selectors:
        for node in body.select(selector):
            if id(node) not in seen_ids:
                seen_ids.add(id(node))
                bold_nodes.append(node)
    for node in bold_nodes:
        # A bold-styled <em> keeps the single-star italic marker.
        marker = "*" if node.name == "em" else "**"
        text = node.get_text().strip()
        if not text:
            _md_decompose(node)
            continue
        if not text.startswith(marker):
            # Avoid doubling markers when a pass already added them.
            text = "%s%s%s " % (marker, text, marker)
        node.replace_with(text)
def md_span(body: Tag):
    """Replace leaf <span> tags with their escaped text."""
    for span in body.find_all("span"):
        if span.find_all(["p", "section"]):
            # A span that still wraps block elements is handled by md_p later.
            continue
        span.replace_with(__process_space(span.get_text()))
def has_sub_tag(node, tag_name):
    """Return True iff node is a <tag_name> tag with no same-named descendant.

    Note the inverted sense relative to the name: True means the node is a
    *leaf* of its tag type. Non-Tag nodes and tags of a different name
    return False immediately.
    """
    if not isinstance(node, Tag) or node.name != tag_name:
        return False
    # `child and ...` skips falsy nodes (empty strings / empty tags), matching
    # bs4 truthiness semantics.
    return not any(
        child and isinstance(child, Tag) and child.name == tag_name
        for child in node.descendants
    )
def md_p(body: Tag):
    """Convert paragraphs bottom-up so nested formatting survives.

    Working from the deepest <p>/<section> keeps bs4 from extracting text at
    an outer level, which would discard the formatting of child nodes.
    """
    targets = ["p", "section"]
    leaves = []
    for node in body.find_all(targets):
        if not node:
            continue
        if not node.get_text().strip():
            _md_decompose(node)
            continue
        if not node.find_all(name=targets):
            # No nested p/section => this is a leaf paragraph.
            leaves.append(node)
    for node in leaves:
        text = __process_space(node.get_text()).strip()
        if not text:
            _md_decompose(node)
            continue
        # Append a markdown hard break, except for marker-only stubs.
        if len(text.replace("*", "")) >= 2:
            text += " \n"
        node.replace_with(text)
def _md_decompose(tag: Tag):
    "Delete an empty tag, climbing leaf-to-root through single-child ancestors."
    current = tag
    current_parent = tag.parent
    # Climb while the parent holds only this node, the node carries no
    # visible text, and no <img> would be lost by deleting the chain.
    while (
        current_parent
        and len(current_parent.contents) == 1
        and not current.text.strip()
        and current.name != "img"
        and not current.find_all("img")
    ):
        current = current_parent
        current_parent = current.parent
    current.decompose()  # remove the whole empty chain in one call
def md_clear_tag(body: Tag):
    # Normalize the tree: <br> -> newline, drop empty tags and <noscript>.
    # NOTE(review): find_all(True) snapshots the tags up front, but a tag
    # removed by an earlier _md_decompose call may already be decomposed when
    # this loop revisits it — presumably harmless with the bs4 version in
    # use; confirm against bs4's decompose() semantics.
    for tag in body.find_all(True):
        if tag.name == "br":
            tag.replace_with("\n")
            continue
        if (
            tag.name != "img"
            and not tag.find_all("img")
            and not tag.get_text(strip=True)
        ):  # the tag renders no text and contains no image
            _md_decompose(tag)
        if tag.name == "noscript":
            _md_decompose(tag)
def transform(body: Tag, scrap_img, scrap_link, scrap_table) -> str:
    """Run every html->markdown pass over *body* and return markdown text.

    The pass order matters: when tags nest, inner tags must be converted
    before outer ones, so avoid reordering the calls below.

    :param body: root bs4 Tag of the article content
    :param scrap_img: truthy to keep images as markdown image links
    :param scrap_link: truthy to keep hyperlinks as markdown links
    :param scrap_table: truthy to keep tables as markdown tables
    :raises Exception: if *body* is not a bs4 Tag
    """
    global __is_scrap_img
    __is_scrap_img = scrap_img
    global __is_scrap_link
    __is_scrap_link = scrap_link
    global __is_scrap_table
    __is_scrap_table = scrap_table
    if not isinstance(body, Tag):
        # FIX: the bare Exception() carried no diagnostic at all.
        raise Exception("transform() expects a bs4 Tag, got %r" % type(body))
    md_clear_tag(body)
    md_toc(body)
    md_a(body)
    md_img(body)
    md_u(body)
    md_h(body)  # before md_b: some html adds a bold style to h tags too
    md_b(body)
    md_em(body)  # after md_b: some html adds an italic style to b tags too
    md_code(body)
    md_table(body)
    md_pre(body)  # currently a no-op: wx <pre> has no reliable visual effect
    md_ol_ul(body)
    md_span(body)
    md_p(body)
    article_content = body.get_text()
    # Merge back-to-back bold/underline runs left over by the per-tag passes.
    article_content = re.sub(r"\*\*\s?\*\*", "", article_content)
    article_content = re.sub(r"</u>\s?<u>", "", article_content)
    # A long run of spaces is treated as a paragraph break.
    article_content = re.sub(r"[ ]{10,}", " \n", article_content)
    return article_content.strip()
运行代码
点击查看代码
import re
from bs4 import BeautifulSoup
import requests
from common_data import publish_headers
from wx_html_to_md import transform
def url_to_md_main(
    url: str, scrap_img=False, scrap_link=False, scrap_table=False
) -> str:
    """Fetch a WeChat article by URL and convert it to markdown."""
    page_html = requests.get(url.strip(), headers=publish_headers).text
    return html_to_md_main(page_html, scrap_img, scrap_link, scrap_table)
def html_to_md_main(html_content: str, scrap_img, scrap_link, scrap_table) -> str:
    """Convert raw WeChat article HTML into markdown text.

    :param html_content: full page HTML
    :raises Exception: when html_content is empty
    :raises ValueError: when no recognizable article container is found
    """
    if not html_content:
        raise Exception("html content 为空")
    # NOTE(review): the entity names below were reconstructed — the published
    # copy of this code was itself HTML-escaped, which reduced pairs such as
    # ("&amp;", "&") to useless identity replacements. The surviving
    # "\\x26..." patterns (0x26 == '&') strongly imply HTML entities; confirm.
    replace_pairs = (
        ("\u202c", ""),  # stray POP DIRECTIONAL FORMATTING char
        ("&amp;", "&"),
        ("&#39;", "'"),
        ("&quot;", '"'),
        ("&gt;", ">"),
        ("&lt;", "<"),
        ("&yen;", "¥"),
        ("\\x0a", "\n"),  # literal \x0a sequences inside embedded JS
        ("\\x26lt;", "<"),
        ("\\x26gt;", ">"),
        ("\\x26quot;", '"'),
        ("\\x00", ""),
        ("\ufeff", ""),  # BOM
    )
    for old, new in replace_pairs:
        html_content = html_content.replace(old, new)
    # Strip decorative bullet characters common in wx articles.
    html_content = re.sub(r"[▼↓■●✦▇❖★👇]", "", html_content)
    soup = BeautifulSoup(markup=html_content, features="lxml")
    # wx pages come in two layouts: an image-carousel page and the usual
    # top-to-bottom page. #js_content exists on both, so it is checked last.
    data_bs = soup.select_one(".share_content_page")
    if data_bs:
        # Carousel layout: the data lives in inline JS; regex is the only way.
        article_img = re.findall(r"cdn_url: '(.*?)'", html_content)
        res = ""
        for img_url in article_img:
            # BUG FIX: was '"\n" % img_url' — "%" with no placeholder raises
            # TypeError; emit one markdown image per carousel picture.
            res += "![](%s)\n" % img_url
        article_content = re.findall(r'window.desc = "(.*?)"', html_content, flags=re.S)
        article_content = article_content or re.findall(
            r"var ContentNoEncode = window.a_value_which_never_exists \|\| \'(.*?)\';\n",
            html_content,
            flags=re.S,
        )
        article_content = article_content or [""]
        res += article_content[0]
    else:
        data_bs = soup.select_one("#js_content")
        if not data_bs:
            raise ValueError(soup.body.get_text(separator=",", strip=True))
        res = transform(data_bs, scrap_img, scrap_link, scrap_table)
    # Drop lines containing only list markers / digits / whitespace.
    res = re.sub(r"\n[\*\.0-9\s]*\n", "\n", res)
    return res
if __name__ == "__main__":
    # Interactive loop: convert URLs one by one, overwriting test.md each time.
    while True:
        target_url = input("请输入url:")
        markdown_text = url_to_md_main(target_url)
        with open("test.md", mode="w", encoding="utf-8") as fp:
            fp.write(markdown_text)
        print("\n结果写入到了 test.md ,请及时提取")
本文来自博客园,作者:喝茶看猴戏,转载请注明原文链接:https://www.cnblogs.com/zdwzdwzdw/p/18838279

浙公网安备 33010602011771号