Python 将html格式书签转为excel

1.导出html格式书签

2.对数据做处理

提取 <a></a> 标签

可以用 VS Code 的正则替换功能去掉 <a></a> 标签中的 ICON 属性：查找 ICON="[^"]*"，替换为空。（使用 [^"]* 而不是 .*，避免贪婪匹配越过属性的结束引号。）

3.安装python

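将 pip 源替换成清华镜像（例如：pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple），然后安装脚本依赖：pip install pandas beautifulsoup4 lxml openpyxl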

4.描述问题生成代码

import pandas as pd  
from bs4 import BeautifulSoup  
from datetime import datetime  
import re
  
# Ordered (substring, label) pairs; the first substring found in the URL wins,
# so the order here mirrors the priority of the classification.
_LINK_TYPES = (
    ("csdn", "CSDN"),
    ("jianshu", "简书"),
    ("cnblogs", "博客园"),
    ("zhihu", "知乎"),
    ("gitee", "gitee"),
    ("ruanyifeng", "阮一峰"),
    ("v2ex", "v2ex"),
    ("juejin", "掘金"),
    ("oschina", "开源中国"),
    ("douban", "豆瓣"),
    ("doc88", "道客巴巴"),
    ("pmcaff", "pmcaff"),
    ("github", "github"),
    ("bilibili", "bilibili"),
    ("weixin", "微信公众号"),
)
_DEFAULT_LINK_TYPE = "其他"

# Output column order; fixed so that an input without links still gets headers.
_COLUMNS = ['标题', '链接', '添加日期', "链接类型", "月份"]


def _classify_link(href):
    """Return the site label for *href*, or the default label if none matches."""
    if not href:
        # An <a> tag without an href attribute yields None; nothing to match.
        return _DEFAULT_LINK_TYPE
    for keyword, label in _LINK_TYPES:
        if keyword in href:
            return label
    return _DEFAULT_LINK_TYPE


def _format_add_date(add_date):
    """Convert an epoch-seconds value to ('YYYY-MM-DD HH:MM:SS', 'YYYY.MM').

    Returns a pair of empty strings when *add_date* is missing or cannot be
    interpreted as a timestamp.
    """
    try:
        dt_object = datetime.fromtimestamp(int(add_date))
    except (TypeError, ValueError, OverflowError, OSError):
        # TypeError: attribute absent (None); ValueError: not an integer;
        # OverflowError/OSError: timestamp outside the platform's range.
        return "", ""
    formatted_date = dt_object.strftime('%Y-%m-%d %H:%M:%S')
    year = formatted_date[:4]
    month = formatted_date[5:7]
    return formatted_date, f"{year}.{month}"


def parse_html_for_links(html_file, excel_file):
    """Extract every <a> tag from an HTML bookmark file into an Excel sheet.

    Each link becomes one row with its title, URL, add date (local time),
    a site label derived from the URL, and a 'YYYY.MM' month bucket.
    Links lacking an href are labelled with the default type; links lacking a
    usable add_date get empty date and month cells instead of raising.

    Args:
        html_file: Path of the UTF-8 encoded HTML file to read.
        excel_file: Path of the .xlsx file to write.
    """
    # Parse the HTML file with BeautifulSoup
    with open(html_file, 'r', encoding='utf-8') as file:
        html_content = file.read()
    soup = BeautifulSoup(html_content, 'lxml')  # 'html.parser' also works

    # Rows collected for the DataFrame
    data = []

    # Walk over all <a> tags
    for link in soup.find_all('a'):
        text = link.get_text(strip=True)
        href = link.get('href')
        # add_date is a custom attribute and may be absent (None)
        formatted_date, month_statistic = _format_add_date(link.get('add_date'))

        data.append({
            '标题': text,
            '链接': href,
            '添加日期': formatted_date,
            "链接类型": _classify_link(href),
            "月份": month_statistic,
        })

    # Build the DataFrame with explicit columns so headers are always written
    df = pd.DataFrame(data, columns=_COLUMNS)

    # Write the DataFrame to the Excel file
    df.to_excel(excel_file, index=False, engine='openpyxl')
  
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    # Replace these with your own HTML and Excel file paths
    html_file_path = 'd:\\favorites_2024_7_20.html'
    excel_file_path = 'd:\\favorites_202407201254.xlsx'
    parse_html_for_links(html_file_path, excel_file_path)

    print(f"数据已成功写入 {excel_file_path}")

5.结果展示

posted @ 2025-01-05 10:02 LHX2018 阅读(58) 评论(0) 收藏举报

刷新页面返回顶部