PDF论文文字公式提取,翻译与对照代码(自用)

代码1:

import re

def process_markdown_file(input_file, output_file, short_equation_max_len=30):
    """Reflow Markdown text pasted from a PDF into one sentence per line.

    The file is processed in stages:

    1. Blank lines are dropped.
    2. Everything between a ``<!--DoNotTouchBelow-->`` line and the next
       ``<!--DoNotTouchAbove-->`` line (markers included) is kept verbatim.
    3. Lines matching ``#+ `` are kept as headings.
    4. Remaining consecutive lines are joined into one paragraph; a line
       ending in ``-`` is glued to the next line without the hyphen.
    5. ``$...$`` / ``$$...$$`` spans are split out of each paragraph.
    6. Short equations without ``=`` are put back inline; all others get
       their own line wrapped in ``$$``.
    7. Paragraph text is broken after ``<lowercase>. `` when followed by an
       uppercase letter.

    Args:
        input_file: Path of the UTF-8 Markdown file to read.
        output_file: Path the cleaned-up text is written to (UTF-8).
        short_equation_max_len: An equation whose body (dollar signs
            stripped) is shorter than this and contains no ``=`` stays
            inline. Defaults to 30.
    """
    fence_open = "<!--DoNotTouchBelow-->"
    fence_close = "<!--DoNotTouchAbove-->"

    # Step 1: load the file as [content, tag] pairs, skipping blank lines.
    lines = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            content = line.rstrip('\n\r')
            if content.strip():
                lines.append([content, None])

    # Step 2: collapse every fenced region into a single "Fixed" entry.
    # A new list is built so the scan position always matches the data;
    # the previous in-place delete kept a stale index and could skip the
    # opening marker of a region that closely followed another one.
    fenced = []
    i = 0
    while i < len(lines):
        if lines[i][0] != fence_open:
            fenced.append(lines[i])
            i += 1
            continue
        end = i
        while end < len(lines) and lines[end][0] != fence_close:
            end += 1
        if end == len(lines):
            # No closing marker: leave the remainder as ordinary lines.
            fenced.extend(lines[i:])
            break
        block_text = '\n'.join(entry[0] for entry in lines[i:end + 1])
        fenced.append([block_text, "Fixed"])
        i = end + 1
    lines = fenced

    # Step 3: tag heading lines.
    for entry in lines:
        if entry[1] is None and re.match(r'^#+ ', entry[0]):
            entry[1] = "Title"

    # Step 4: merge each run of untagged lines into one "Text" paragraph.
    merged = []
    i = 0
    while i < len(lines):
        if lines[i][1] is not None:
            merged.append(lines[i])
            i += 1
            continue
        pieces = []
        glue_to_previous = True  # nothing precedes the first fragment
        while i < len(lines) and lines[i][1] is None:
            part = lines[i][0]
            if not glue_to_previous:
                pieces.append(" ")
            # Record the hyphen BEFORE removing it, so the next fragment is
            # joined without a space ("exam-" + "ple" -> "example").
            glue_to_previous = part.endswith('-')
            if glue_to_previous:
                part = part[:-1]
            pieces.append(part)
            i += 1
        merged.append(["".join(pieces), "Text"])
    lines = merged

    # Step 5: split math spans out of paragraphs.
    equation_splitter = re.compile(r'(\$\$[^$]+\$\$|\$[^$]+\$)')
    equation_only = re.compile(r'\$\$[^$]+\$\$|\$[^$]+\$')
    new_lines = []
    for entry in lines:
        if entry[1] != "Text":
            new_lines.append(entry)
            continue
        for part in equation_splitter.split(entry[0]):
            if not part:
                continue
            if equation_only.fullmatch(part):
                new_lines.append([part, "Equation"])
            elif part.strip():
                new_lines.append([part, "Text"])
    lines = new_lines

    # Step 6: inline short equations, promote the rest to display math.
    new_lines = []
    i = 0
    while i < len(lines):
        content, tag = lines[i]
        if tag != "Equation":
            new_lines.append(lines[i])
            i += 1
            continue
        body = content.strip('$')
        if len(body) < short_equation_max_len and '=' not in body:
            # Attach to the preceding text, or start a new text entry.
            if not (new_lines and new_lines[-1][1] == "Text"):
                new_lines.append(["", "Text"])
            new_lines[-1][0] += content
            # Pull in the text that follows so the sentence stays whole.
            if i + 1 < len(lines) and lines[i + 1][1] == "Text":
                new_lines[-1][0] += lines[i + 1][0]
                i += 1  # that text entry has been consumed
        elif content.startswith('$$'):
            new_lines.append([content, "Equation"])
        else:
            new_lines.append(['$$' + body + '$$', "Equation"])
        i += 1
    lines = new_lines

    # Step 7: break text after "<lowercase>. " followed by an uppercase letter.
    for entry in lines:
        if entry[1] == "Text":
            entry[0] = re.sub(r'([a-z])\. ([A-Z])', r'\1.\n\2', entry[0])

    # Step 8: write all entries, one per line.
    output_content = '\n'.join(entry[0] for entry in lines)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(output_content)

# Usage example. Guarded so that importing this module does not read or
# write any files; the conversion only runs when executed as a script.
if __name__ == "__main__":
    input_file = "source.md"
    output_file = "processed.md"
    process_markdown_file(input_file, output_file)

功能:将从PDF复制得到、分行混乱的md文件整理成正确的格式:每句话一行;较短的公式嵌在行内,较长的公式单独成行;大型md表格等不希望被改动的内容,可用 `<!--DoNotTouchBelow-->` 与 `<!--DoNotTouchAbove-->` 两行注释包围,以原样保留.

代码2:

import re

def merge_bilingual_markdown(english_file, chinese_file, output_file):
    """
    Interleave an English Markdown file with its line-by-line Chinese
    translation and write the side-by-side result.

    Lines are paired by position. If the files differ in length a warning
    is printed and the shorter one is padded with blank lines.

    Args:
        english_file (str): Path of the English Markdown file.
        chinese_file (str): Path of the translated Chinese Markdown file.
        output_file (str): Path the merged file is written to.
    """
    with open(english_file, 'r', encoding='utf-8') as src:
        english_lines = src.readlines()
    with open(chinese_file, 'r', encoding='utf-8') as src:
        chinese_lines = src.readlines()

    # Pad the shorter file so both can be walked in lockstep.
    if len(english_lines) != len(chinese_lines):
        print(f"警告: 文件行数不同。英文文件 {len(english_lines)} 行，中文文件 {len(chinese_lines)} 行")
        target = max(len(english_lines), len(chinese_lines))
        english_lines.extend(['\n'] * (target - len(english_lines)))
        chinese_lines.extend(['\n'] * (target - len(chinese_lines)))

    merged_lines = []
    for raw_eng, raw_chn in zip(english_lines, chinese_lines):
        eng_line = raw_eng.rstrip('\n')
        chn_line = raw_chn.rstrip('\n')
        chn_blank = is_empty_line(chn_line)

        # Pairs that are blank on both sides produce no output.
        if is_empty_line(eng_line) and chn_blank:
            continue

        # Two headings collapse into a single combined heading line.
        if is_heading(eng_line) and is_heading(chn_line):
            merged_lines.append(merge_headings(eng_line, chn_line) + '\n')
            continue

        # Otherwise the English line always comes first; the translation
        # follows unless the line is pure LaTeX or the translation is blank.
        merged_lines.append(eng_line + '\n')
        if not is_latex_line(eng_line) and not chn_blank:
            merged_lines.append(chn_line + '\n')
        merged_lines.append('\n')  # blank separator line

    with open(output_file, 'w', encoding='utf-8') as dst:
        dst.writelines(merged_lines)

    print(f"对照文件已保存至: {output_file}")

def is_empty_line(line):
    """
    Tell whether a line has no visible content.

    Args:
        line (str): The line to inspect.

    Returns:
        bool: True if nothing remains after stripping whitespace.
    """
    return not line.strip()

def is_heading(line):
    """
    Tell whether a line is a Markdown heading.

    Surrounding whitespace is ignored; the remainder must start with one
    or more ``#`` characters followed by a space.

    Args:
        line (str): The line to inspect.

    Returns:
        bool: True if the line is a heading.
    """
    candidate = line.strip()
    heading_prefix = re.match(r'^#+ ', candidate)
    return heading_prefix is not None

def is_latex_line(line):
    """
    Tell whether a line consists solely of LaTeX and needs no translation.

    A line counts as pure LaTeX when, after stripping whitespace, it is:

    * a lone ``$$`` delimiter,
    * exactly one ``$$...$$`` or ``$...$`` span covering the whole line, or
    * a line that begins with a LaTeX command such as ``\\begin{...}``.

    A prose line that merely starts and ends with inline math, for example
    ``$x$ is smaller than $y$``, is NOT pure LaTeX; the earlier
    starts-with/ends-with test misclassified such lines, which made the
    caller drop their translation.

    Args:
        line (str): The line to inspect.

    Returns:
        bool: True if the line is pure LaTeX.
    """
    stripped_line = line.strip()

    # A bare "$$" is the delimiter line of a multi-line display block.
    if stripped_line == '$$':
        return True

    # One math span from start to end. Escaped characters (e.g. "\$") are
    # allowed inside; an unescaped "$" in the middle means there are several
    # spans with text between them, so the line is prose.
    single_span = r'\$\$(?:[^$\\]|\\.)*\$\$|\$(?:[^$\\]|\\.)+\$'
    if re.fullmatch(single_span, stripped_line):
        return True

    # Heuristic kept from the original: any line starting with a command
    # (\begin{...}, \end{...}, \alpha ...) is treated as LaTeX. This also
    # covers the former "\begin{" check, whose "\end{" suffix test could
    # never add a match.
    # NOTE(review): prose starting with e.g. \textbf{...} is also caught.
    return re.match(r'\\[a-zA-Z]+', stripped_line) is not None

def merge_headings(eng_heading, chn_heading):
    """
    Combine an English heading and its Chinese counterpart into one line.

    The English heading is kept as-is (including its ``#`` markers); the
    Chinese heading has its leading ``#``/space characters and surrounding
    whitespace removed and is appended after a single space.

    Args:
        eng_heading (str): The English heading line.
        chn_heading (str): The Chinese heading line.

    Returns:
        str: The combined heading line.
    """
    translated_title = chn_heading.lstrip('# ').strip()
    return ' '.join((eng_heading, translated_title))

# Usage example: merge the exported English file with its translation.
if __name__ == "__main__":
    english_file, chinese_file, output_file = "导出.md", "翻译.md", "对照.md"
    merge_bilingual_markdown(english_file, chinese_file, output_file)

功能:将翻译得到的中文文本与英文文本逐行对照,中文标题与英文标题放在同一行以便于梳理文章结构,纯公式行单开一行,不翻译

posted @ 2025-09-25 14:01 Isakovsky 阅读(23) 评论(0) 收藏举报

刷新页面返回顶部

Isakovsky

AfACMer,北京理工大学,网络空间安全学院,PhD在读 博客所有内容遵循CC0协议,但建议转载时附上原博客链接.

PDF论文文字公式提取,翻译与对照代码(自用)

公告

Isakovsky

AfACMer,北京理工大学,网络空间安全学院,PhD在读 博客所有内容遵循CC0协议,但建议转载时附上原博客链接.

PDF论文文字公式提取,翻译与对照代码(自用)

公告

AfACMer,北京理工大学,网络空间安全学院,PhD在读 博客所有内容遵循CC0协议,但建议转载时附上原博客链接.