Converting WeChat Official Account articles from HTML to Markdown

Note

WeChat's HTML is extremely messy: you can run into h tags inside a tags, p tags inside span tags, and all sorts of other combinations, so check the content of the converted .md file carefully.
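
For example, a fragment like the sketch below (made up purely for illustration; it only assumes BeautifulSoup with the lxml parser, which the code in this post already uses) nests an h2 inside an a and a p inside a span, and the converter still has to produce sensible text from it:

from bs4 import BeautifulSoup

# hypothetical fragment with the kind of nesting WeChat produces: h2 inside a, p inside span
messy = (
    '<div id="js_content">'
    '<a href="https://example.com"><h2>标题</h2></a>'
    '<span><p>正文段落</p></span>'
    '</div>'
)
soup = BeautifulSoup(messy, "lxml")
body = soup.select_one("#js_content")
# get_text() works regardless of how the parser arranges the odd nesting,
# which is why the converter below leans on get_text() rather than on a clean tag hierarchy
print(body.get_text(strip=True))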

Core code

File name: wx_html_to_md.py

import re

from bs4 import Tag, NavigableString


__is_scrap_link = None
__is_scrap_img = None
__is_scrap_table = None


def __process_space(pagr_text: str):
    if ")[" in pagr_text:
        # if the text already looks like converted Markdown link/image syntax, return it unescaped
        return pagr_text.strip()
    # swap brackets/parentheses for full-width ones so they are not mistaken for Markdown link syntax
    pagr_text = pagr_text.replace("[", "【").replace("]", "】")
    pagr_text = pagr_text.replace("(", "(").replace(")", ")")
    pagr_text = re.sub(r"([^$])([$])([^$])", r"\g<1>\\\g<2>\g<3>", pagr_text)
    pagr_text = re.sub(r"([^\\])([\{\}\_\`])", r"\g<1>\\\g<2>", pagr_text)
    pagr_text = pagr_text.replace(r"$*", r"\$\*")
    # cnblogs needs three backslashes to escape $, and {} has to be escaped twice

    pagr_text = pagr_text.replace(r"\$", r"\\\$")
    if r"$\{" in pagr_text:
        # cnblogs tables need this extra replacement
        pagr_text = pagr_text.replace(r"\}", r"\}&nbsp;")
    return pagr_text


def md_toc(body: Tag):
    """替换目录"""
    toc = body.select_one(".toc")
    if toc:
        toc.replace_with("[TOC]")


def md_u(body: Tag):
    """修改下划线为粗体"""

    selector_list = [
        "u",
        '[style*="text-decoration: underline;"]',
    ]

    for selector in selector_list:
        for u in body.select(selector):
            pagr_text = u.get_text(strip=True)
            if not pagr_text:
                _md_decompose(u)
                continue
            if len(pagr_text) > 50:
                continue
            pagr_text = __process_space(pagr_text)
            u.replace_with("<u>%s</u>" % pagr_text)


def md_h(body: Tag):
    """修改标题"""

    h = body.find_all(["h1", "h2"])  #  "h3", "h4", "h5", "h6" 这些认为是粗体
    for i in h:
        n = int(i.name[1]) + 1  # 先加一级,最多就是二级标题,不允许设置一级标题
        pagr_text = i.get_text(strip=True)
        if not pagr_text:
            _md_decompose(i)
            continue
        if len(pagr_text) > 50:
            continue
        pagr_text = __process_space(pagr_text)
        i.replace_with("#" * n + " " + pagr_text + "\n")


def md_table(body: Tag):
    """改变表格"""
    tbs = body.find_all("table")
    for table_item in tbs:
        if not __is_scrap_table:
            _md_decompose(table_item)
            continue

        desc = table_item.select_one("caption > span") or ""
        if desc:
            # centre the caption; the extra blank lines are required or the table below will not render
            desc = "\n\n <center> %s </center> \n\n" % desc.get_text()

        ths = table_item.select_one("thead > tr")  # header row
        trs = table_item.select("tbody > tr")  # body rows

        if not ths:  # malformed table with no thead: promote the first body row to a header
            ths, trs = trs[0], trs[1:]

        head = mid = "|"

        for th in ths.find_all(re.compile("th|td")):
            style = th.get("style") or "center"
            th_text = th.get_text().strip()
            if "left" in style:
                head += " " + th_text + "    |"
                mid += " :--- |"
            elif "right" in style:
                head += "    " + th_text + " |"
                mid += " ---: |"
            else:
                head += "  " + th_text + "   |"
                mid += " :--: |"

        body_text = desc + head + "\n" + mid + "\n"
        temp_text = ""
        for tr in trs:
            row = "|"
            for td in tr.find_all("td"):
                style = td.get("style") or "center"  # alignment from the cell's own style
                td_text = td.get_text().strip()
                td_text = __process_space(td_text)
                # the outer \n were stripped above, so any \n left over means a line break inside the cell
                td_text = td_text.replace("\n\n", " <br> ").replace("\n", " <br> ")
                if "left" in style:
                    row += " " + td_text + "    |"
                elif "right" in style:
                    row += "    " + td_text + " |"
                else:
                    row += "  " + td_text + "   |"
            if row != "|":
                temp_text += row + "\n"
        body_text += temp_text
        table_item.replace_with(body_text)


def md_pre(body: Tag):
    """Convert <pre> code blocks. Intentionally a no-op for now; see the note in transform()."""
    return


def md_code(body: Tag):
    """修改单独的代码"""
    codes = body.find_all("code")
    for i in codes:
        if not i.parent.name == "pre":
            i.replace_with(f"`{i.get_text()}`")


def md_img(body: Tag):
    """修改图片"""
    imgs = body.find_all("img")
    for i in imgs:
        if not i.decomposed:
            src = i.get("src") or i.get("data-src")
            if __is_scrap_img:
                i.replace_with(f"![]({src})")
            else:
                _md_decompose(i)


def md_a(body: Tag):
    """修改链接"""
    links = body.find_all("a")
    for a in links:
        if not a.decomposed:
            # get the link target with .get("href"); do NOT use a.href, it returns None (same goes for img.src above)
            href = a.get("href")
            if href and href[0] == "#":
                # in-page anchor (e.g. a CSDN table of contents), leave it unconverted
                continue

            if __is_scrap_link:
                a.replace_with(f"[{a.text}]({href})")
            else:
                _md_decompose(a)


def md_ol_ul(body: Tag):
    """修改列表"""
    li_list = body.find_all(["ul", "ol"])
    for li_obj in li_list:
        start = "*" if li_obj.name == "ul" else 1
        for li_item in li_obj.find_all("li"):
            pagr_text = li_item.get_text(strip=True)
            pagr_text = __process_space(pagr_text)
            if pagr_text.strip():
                if li_obj.name == "ol":
                    li_item.replace_with("%s. %s  \n" % (start, pagr_text))
                    start += 1
                else:
                    li_item.replace_with("%s %s  \n" % (start, pagr_text))


def md_em(body: Tag):
    "修改斜体"
    for item in body.find_all(["em"]):
        xing = "*"
        title_text = item.get_text().strip()
        if not title_text:
            _md_decompose(item)
            continue
        if not title_text.startswith(xing):
            title_text = "%s%s%s " % (xing, title_text, xing)
        item.replace_with(title_text)


def md_b(body: Tag):
    """修改斜体粗体"""
    black_font = []
    selectors = [
        "strong",
        "b",
        "h3",
        "h4",
        "h5",
        "h6",
        '[style*="font-weight: bold;"]',
        '[style*="font-weight:bold;"]',
        '[style*="font-family:黑体;"]',
        # '[style*="font-family:微软雅黑;"]', # 有的地方不是粗体
    ]
    unique_elements_id = set()  # 初始化空集合
    for selector in selectors:
        for item in body.select(selector):
            now_id = id(item)
            if now_id in unique_elements_id:
                continue
            unique_elements_id.add(now_id)
            black_font.append(item)

    for item in black_font:
        xing = "**" if item.name != "em" else "*"
        title_text = item.get_text().strip()
        if not title_text:
            _md_decompose(item)
            continue
        if not title_text.startswith(xing):
            # avoid doubling up markers that are already there
            title_text = "%s%s%s " % (xing, title_text, xing)
        item.replace_with(title_text)


def md_span(body: Tag):
    """修改spam"""
    tag_list = body.find_all("span")
    for tag in tag_list:
        if tag.find_all(["p", "section"]):
            # skip spans that still contain p/section tags; md_p handles those
            continue
        pagr_text = tag.get_text()

        pagr_text = __process_space(pagr_text)
        tag.replace_with(pagr_text)


def has_sub_tag(node, tag_name):
    """Check whether a node is a <tag_name> tag with no nested <tag_name> descendants (i.e. a leaf for that tag)."""

    if not isinstance(node, Tag) or node.name != tag_name:
        return False

    # the node only qualifies if none of its descendants is another tag with the same name
    for child in node.descendants:
        if child:  # skip falsy children (e.g. empty text nodes)
            if (
                not isinstance(child, NavigableString)
                and isinstance(child, Tag)
                and child.name == tag_name
            ):
                return False
    return True


def md_p(body: Tag):
    """修改段落, 要从最底层的修改入手,不然bs会从最上层提取text,子节点的格式无法体现"""
    tags_to_check = ["p", "section"]
    lowest_tags = []

    for section in body.find_all(tags_to_check):
        if not section:
            continue
        if not section.get_text().strip():
            _md_decompose(section)
            continue
        if not section.find_all(name=tags_to_check):
            # no nested p/section descendants, so this is a lowest-level paragraph node
            lowest_tags.append(section)
    # now convert each lowest-level node in lowest_tags to text
    for tag in lowest_tags:
        pagr_text = tag.get_text()

        pagr_text = __process_space(pagr_text)
        pagr_text = pagr_text.strip()
        if not pagr_text:
            _md_decompose(tag)
            continue
        pagr_text += "  \n" if len(pagr_text.replace("*", "")) >= 2 else ""
        tag.replace_with(pagr_text)


def _md_decompose(tag: Tag):
    "由叶及根的倒序删除空标签,如果是一脉单传的节点,会一直向上删除"
    current = tag
    current_parent = tag.parent
    while (
        current_parent
        and len(current_parent.contents) == 1
        and not current.text.strip()
        and current.name != "img"
        and not current.find_all("img")
    ):
        current = current_parent
        current_parent = current.parent
    current.decompose()  # remove the topmost empty tag found


def md_clear_tag(body: Tag):
    # walk every tag and delete the empty ones
    for tag in body.find_all(True):

        if tag.name == "br":
            tag.replace_with("\n")
            continue
        if (
            tag.name != "img"
            and not tag.find_all("img")
            and not tag.get_text(strip=True)
        ):  # the tag is empty: no text and no images
            _md_decompose(tag)
        elif tag.name == "noscript":
            _md_decompose(tag)


def transform(body: Tag, scrap_img, scrap_link, scrap_table) -> str:
    """整合所有修改,尽量不要改变顺序
    如果有包含顺序,那就需要由内而外
    """
    global __is_scrap_img
    __is_scrap_img = scrap_img
    global __is_scrap_link
    __is_scrap_link = scrap_link
    global __is_scrap_table
    __is_scrap_table = scrap_table

    if not isinstance(body, Tag):
        raise Exception("body must be a bs4 Tag")
    md_clear_tag(body)
    md_toc(body)
    md_a(body)
    md_img(body)
    md_u(body)
    md_h(body)  # headings must run before md_b, because some HTML also puts a bold style on h tags
    md_b(body)
    md_em(body)  # italics must run after md_b, because some HTML also puts an italic style on b tags
    md_code(body)

    md_table(body)
    # remove_div(body)
    md_pre(body)  # currently a no-op: WeChat's pre blocks have no obvious visual effect and are extremely messy
    md_ol_ul(body)

    md_span(body)

    # md_clear(body)
    md_p(body)
    # finally, merge consecutive bold markers
    arctile_content = body.get_text()
    arctile_content = re.sub(r"\*\*\s?\*\*", "", arctile_content)
    arctile_content = re.sub(r"</u>\s?<u>", "", arctile_content)
    # a long run of spaces is treated as a paragraph break
    arctile_content = re.sub(r"[  ]{10,}", "  \n", arctile_content)

    return arctile_content.strip()
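
For quick local testing, here is a minimal sketch of calling transform directly on a saved page (article.html is a placeholder file name; the real entry points are in the runner script below):

from bs4 import BeautifulSoup

from wx_html_to_md import transform

# article.html stands in for a locally saved WeChat article page
with open("article.html", encoding="utf-8") as fp:
    soup = BeautifulSoup(fp.read(), "lxml")

body = soup.select_one("#js_content")  # the same selector the runner script below uses
if body is not None:
    print(transform(body, scrap_img=True, scrap_link=True, scrap_table=True))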

Runner code

import re
from bs4 import BeautifulSoup
import requests


from common_data import publish_headers

from wx_html_to_md import transform


def url_to_md_main(
    url: str, scrap_img=False, scrap_link=False, scrap_table=False
) -> str:

    url = url.strip()
    html_content = requests.get(url, headers=publish_headers).text
    return html_to_md_main(html_content, scrap_img, scrap_link, scrap_table)


def html_to_md_main(html_content: str, scrap_img, scrap_link, scrap_table) -> str:

    if not html_content:
        raise Exception("html content 为空")
    replace_tuple = (
        "&#x202C;",
        "",
        "&amp;",
        "&",
        "&#39;",
        "'",
        "&quot;",
        '"',
        "&gt;",
        ">",
        "&lt;",
        "<",
        "&yen;",
        "¥",
        "\\x0a",
        "\n",
        "\\x26lt;",
        "<",
        "\\x26gt;",
        ">",
        "\\x26quot;",
        '"',
        "\\x00",
        "",
        "\ufeff",
        "",
    )
    for i in range(0, len(replace_tuple), 2):
        html_content = html_content.replace(replace_tuple[i], replace_tuple[i + 1])
    html_content = re.sub(r"[▼↓■●✦▇❖★👇]", "", html_content)
    soup = BeautifulSoup(markup=html_content, features="lxml")

    # WeChat pages come in two layouts: one with an image carousel on the left, and the usual top-to-bottom mobile layout. Note that #js_content exists in both, so it is checked last.
    data_bs = soup.select_one(".share_content_page")
    if data_bs:
        # in this layout the article data lives in inline JS, so it can only be extracted with regexes

        article_img = re.findall(r"cdn_url: '(.*?)'", html_content)
        res = ""
        for img_url in article_img:
            res += "![](%s)\n" % img_url
        article_content = re.findall(r'window.desc = "(.*?)"', html_content, flags=re.S)
        article_content = article_content or re.findall(
            r"var ContentNoEncode = window.a_value_which_never_exists \|\| \'(.*?)\';\n",
            html_content,
            flags=re.S,
        )
        article_content = article_content or [""]
        article_content = article_content[0]
        res += article_content

    else:
        data_bs = soup.select_one("#js_content")
        if not data_bs:
            exception_info = soup.body.get_text(separator=",", strip=True)
            raise ValueError(exception_info)

        res = transform(data_bs, scrap_img, scrap_link, scrap_table)
    res = re.sub(r"\n[\*\.0-9\s]*\n", "\n", res)
    return res


if __name__ == "__main__":
    while True:
        url = input("Enter a URL: ")
        res = url_to_md_main(url)
        with open("test.md", mode="w", encoding="utf-8") as fp:
            fp.write(res)
        print("\n结果写入到了 test.md ,请及时提取")
