批量将指定文件夹及其子文件夹中的所有 .docx 文件转换为 .md (Markdown) 文件

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
import mammoth

"""
批量将指定文件夹及其子文件夹中的所有 .docx 文件转换为 .md (Markdown) 文件
- 修复版 v2.1: 修正 mammoth messages TypeError (Message对象转str)
- 支持递归 + 中文路径 + 复杂表格/图片/样式
- 依赖: mammoth (pip install mammoth)
- 输出: 原目录同名 .md (跳过已存在)
- 作者: AI Assistant
用法: python doc2md.py "C:\Users\admin\source\repos\xxx\doc"
"""

def convert_docx_to_md(input_path, output_path):
    """
    Convert a single .docx file to a UTF-8 encoded Markdown file.

    Args:
        input_path: Path of the .docx file; it is opened in binary mode and
            handed to ``mammoth.convert_to_markdown``.
        output_path: Path of the .md file to write. It is opened with mode
            ``"w"``, so any existing content is replaced.

    Returns:
        True once the Markdown text has been written.

    Raises:
        Exception: Any error raised while reading, converting or writing is
            re-raised as a plain ``Exception`` whose message starts with
            "转换失败: ". The original error is chained as ``__cause__``.
    """
    try:
        with open(input_path, "rb") as docx_file:
            result = mammoth.convert_to_markdown(docx_file)

        # Conversion messages are objects, not strings: pull out their
        # ``.message`` text before joining, otherwise str.join raises TypeError.
        if result.messages:
            warnings = [msg.message for msg in result.messages]
            print(f"⚠ 警告 ({os.path.basename(input_path)}): {'; '.join(warnings)}")

        # The output file is only opened after the conversion succeeded, so a
        # failed conversion does not leave an empty .md file behind.
        with open(output_path, "w", encoding="utf-8") as md_file:
            md_file.write(result.value)

        return True
    except Exception as e:
        # Chain explicitly so the root cause stays inspectable via __cause__.
        raise Exception(f"转换失败: {e}") from e

def recursive_convert(folder_path):
    """
    Walk ``folder_path`` recursively and convert every .docx file to Markdown.

    For each ``*.docx`` file (extension matched case-insensitively) a sibling
    file with the same base name and a ``.md`` extension is produced via
    ``convert_docx_to_md``. A .docx whose .md file already exists is skipped,
    and legacy ``*.doc`` files are reported and skipped. Files with any other
    extension are ignored silently.

    Progress lines and a final summary are printed to stdout. Conversions
    that raise are reported and counted in the "skipped" total, so one bad
    file does not abort the batch.

    Args:
        folder_path: Root directory to scan with ``os.walk``.

    Returns:
        None.
    """
    converted_count = 0
    skipped_count = 0

    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            file_lower = file.lower()
            input_file = os.path.join(root, file)

            # Legacy binary .doc files are not handled by the converter.
            # (".docx" does not end with ".doc", so the two checks are disjoint.)
            if file_lower.endswith('.doc'):
                print(f"⚠ 跳过 .doc: {input_file}")
                skipped_count += 1
                continue

            if not file_lower.endswith('.docx'):
                continue

            name, _ = os.path.splitext(file)
            output_file = os.path.join(root, name + '.md')

            # Never overwrite an existing Markdown file.
            if os.path.exists(output_file):
                print(f"⚠ 已存在MD，跳过: {output_file}")
                skipped_count += 1
                continue

            try:
                convert_docx_to_md(input_file, output_file)
            except Exception as e:
                # Batch boundary: report the failure and keep going.
                print(f"✗ 转换失败 {input_file}: {e}")
                skipped_count += 1
            else:
                print(f"✓ 转换成功: {input_file} → {output_file}")
                converted_count += 1

    print(f"\n🎉 完成! 成功: {converted_count}, 跳过: {skipped_count}")

def main(argv=None):
    """
    Command-line entry point: resolve the target folder and run the batch.

    The folder is taken from the first command-line argument when present;
    otherwise the user is prompted for it interactively.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name first).
            Defaults to ``sys.argv`` when None.

    Returns:
        Process exit code: 0 after the batch ran, 1 when the given path is
        not an existing directory.
    """
    args = sys.argv if argv is None else argv

    if len(args) > 1:
        folder_path = args[1]
    else:
        folder_path = input("请输入文件夹路径: ").strip().rstrip(os.sep)

    # isdir rather than exists: a path to a regular file would pass an
    # existence check, after which os.walk yields nothing and the run would
    # misleadingly report success with zero conversions.
    if not os.path.isdir(folder_path):
        print("❌ 文件夹不存在!")
        return 1

    print(f"开始递归转换文件夹: {folder_path}")
    recursive_convert(folder_path)
    return 0


if __name__ == "__main__":
    sys.exit(main())
posted @ 2025-12-02 13:35 ChasingDreams 阅读(9) 评论(0) 收藏举报
刷新页面返回顶部
低级码农

批量将指定文件夹及其子文件夹中的所有 .docx 文件转换为 .md (Markdown) 文件

公告