使用 PDFBox 提取 PDF 中除水印外的有效文本内容

1. 依赖

        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.27</version>
        </dependency>

2. WatermarkFilter

package com.example.sound.utils;

import java.io.IOException;
import java.util.Locale;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;

/**
 * A {@link PDFTextStripper} that forwards a text position to the parent implementation only
 * when it is not classified as watermark text.
 *
 * <p>A text position is classified as watermark text when either
 * <ul>
 *   <li>the angle derived from its text matrix exceeds {@link #MAX_ROTATION_ANGLE} degrees in
 *       absolute value, or</li>
 *   <li>its font name contains {@code "watermark"} (compared case-insensitively).</li>
 * </ul>
 */
public class WatermarkFilter extends PDFTextStripper {

    /** Maximum tilt, in degrees, tolerated before text is classified as a watermark. */
    private static final float MAX_ROTATION_ANGLE = 20;

    /** Lower-case marker searched for inside the lower-cased font name. */
    private static final String WATERMARK_FONT_MARKER = "watermark";

    /**
     * Creates the filter with suppression of duplicate overlapping text enabled.
     *
     * @throws IOException if the parent constructor fails
     */
    public WatermarkFilter() throws IOException {
        // Explicit super call: always runs the parent's implementation, never an override.
        super.setSuppressDuplicateOverlappingText(true);
    }

    /**
     * Computes the angle of the given text position from its text matrix and passes the
     * position on to the parent only when it is not classified as watermark text.
     *
     * @param text the text position being processed
     */
    @Override
    protected void processTextPosition(TextPosition text) {
        Matrix matrix = text.getTextMatrix();

        // Angle in degrees, derived from the shear-Y and scale-Y components of the matrix.
        double angle = Math.toDegrees(Math.atan2(matrix.getShearY(), matrix.getScaleY()));

        if (!isWatermark(text, angle)) {
            super.processTextPosition(text);
        }
    }

    /**
     * Decides whether a text position should be dropped as watermark text.
     *
     * @param text  the text position to inspect
     * @param angle the angle of the text in degrees
     * @return {@code true} if the angle exceeds the allowed tilt or the font name carries the
     *         watermark marker; {@code false} otherwise, including when no font name is available
     */
    private boolean isWatermark(TextPosition text, double angle) {
        if (Math.abs(angle) > MAX_ROTATION_ANGLE) {
            return true;
        }
        // A missing font or font name must not abort extraction with a NullPointerException;
        // such text is simply not matched by the name rule.
        String fontName = text.getFont() == null ? null : text.getFont().getName();
        return fontName != null
                && fontName.toLowerCase(Locale.ROOT).contains(WATERMARK_FONT_MARKER);
    }
}

 

3. 调用

    /**
     * Extracts text from the PDF file at the given path using a {@code WatermarkFilter}.
     *
     * <p>The document is loaded with {@code MemoryUsageSetting.setupTempFileOnly()} and is
     * closed before this method returns.
     *
     * @param filePath path of the PDF file to read
     * @return the text produced by the stripper for the loaded document
     * @throws IOException if loading the document or extracting its text fails
     */
    private static String extractTextFromPdf(String filePath) throws IOException {
        File pdfFile = new File(filePath);
        MemoryUsageSetting memorySetting = MemoryUsageSetting.setupTempFileOnly();
        try (PDDocument pdf = PDDocument.load(pdfFile, memorySetting)) {
            // The stripper is created only after the document has loaded successfully.
            WatermarkFilter watermarkFreeStripper = new WatermarkFilter();
            String extracted = watermarkFreeStripper.getText(pdf);
            return extracted;
        }
    }

  

 

posted @ 2025-06-20 11:23  人间春风意  阅读(70)  评论(0)    收藏  举报