#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import mammoth
"""
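Batch-convert every .docx file in the given folder and its subfolders to .md (Markdown) files
- Fixed version v2.1: fixes the mammoth messages TypeError (Message objects converted to str)
- Supports recursion + Chinese paths + complex tables/images/styles
- Dependency: mammoth (pip install mammoth)
- Output: a same-named .md file in the original directory (existing files are skipped)
- Author: AI Assistant
Usage: python doc2md.py "C:\\Users\\admin\\source\\repos\\xxx\\doc"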
"""
def convert_docx_to_md(input_path, output_path):
    """Convert a single .docx file to a Markdown file.

    The document is read in binary mode and passed to
    ``mammoth.convert_to_markdown``; any messages attached to the result are
    printed as one warning line, and the converted text is written to
    *output_path* as UTF-8.

    Args:
        input_path: Path of the .docx file to read.
        output_path: Path of the .md file to create (overwritten if present).

    Returns:
        True when the Markdown file has been written.

    Raises:
        Exception: If reading, converting or writing fails. The original
            error is chained as ``__cause__`` and its text is included in
            the message.
    """
    # Tracks whether the output file was created/truncated, so a failed
    # write does not leave a partial .md behind.
    output_opened = False
    try:
        with open(input_path, "rb") as docx_file:
            result = mammoth.convert_to_markdown(docx_file)

        # Each entry in result.messages is an object; join only its text
        # (joining the objects themselves raises TypeError).
        if result.messages:
            warnings = [msg.message for msg in result.messages]
            print(f"⚠ 警告 ({os.path.basename(input_path)}): {'; '.join(warnings)}")

        # Write the Markdown as UTF-8 regardless of the platform default.
        with open(output_path, "w", encoding="utf-8") as md_file:
            output_opened = True
            md_file.write(result.value)
        return True
    except Exception as e:
        if output_opened:
            # Best-effort cleanup: the file was already truncated by
            # open(..., "w"), so its previous content is gone either way.
            try:
                os.remove(output_path)
            except OSError:
                pass
        raise Exception(f"转换失败: {str(e)}") from e
def recursive_convert(folder_path):
    """Walk *folder_path* recursively and convert every .docx file to .md.

    For each ``name.docx`` the output is ``name.md`` in the same directory.
    The extension check is case-insensitive. Per-file handling:

    * the ``.md`` already exists -> reported and counted as skipped;
    * conversion raises -> reported and counted as skipped (the walk
      continues with the next file);
    * ``.doc`` files -> reported and counted as skipped, never converted;
    * any other file -> ignored silently.

    Progress lines and a final summary are printed to stdout.

    Args:
        folder_path: Root directory to walk.
    """
    converted_count = 0
    skipped_count = 0

    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            file_lower = file.lower()
            input_file = os.path.join(root, file)

            # '.docx' does not end with '.doc', so this matches only .doc files.
            if file_lower.endswith('.doc'):
                print(f"⚠ 跳过 .doc: {input_file}")
                skipped_count += 1
                continue

            if not file_lower.endswith('.docx'):
                continue

            name, _ = os.path.splitext(file)
            output_file = os.path.join(root, name + '.md')

            # Never overwrite an existing Markdown file.
            if os.path.exists(output_file):
                print(f"⚠ 已存在MD,跳过: {output_file}")
                skipped_count += 1
                continue

            try:
                convert_docx_to_md(input_file, output_file)
                print(f"✓ 转换成功: {input_file} → {output_file}")
                converted_count += 1
            except Exception as e:
                # One bad document must not abort the whole batch.
                print(f"✗ 转换失败 {input_file}: {e}")
                skipped_count += 1

    print(f"\n🎉 完成! 成功: {converted_count}, 跳过: {skipped_count}")
if __name__ == "__main__":
    # Folder comes from the first command-line argument, or from an
    # interactive prompt when no argument is given.
    if len(sys.argv) > 1:
        folder_path = sys.argv[1]
    else:
        folder_path = input("请输入文件夹路径: ").strip().rstrip(os.sep)

    # isdir, not exists: a path to a regular file would pass an existence
    # check, and os.walk would then find nothing and report success.
    if not os.path.isdir(folder_path):
        print("❌ 文件夹不存在!")
        sys.exit(1)

    print(f"开始递归转换文件夹: {folder_path}")
    recursive_convert(folder_path)