PDFUtils（解析 PDF 中的文本和图片；PDF 转 HTML；HTML 转 PDF）

引入pdfbox依赖

        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.19</version>
        </dependency>
package com.icil.swift;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;

import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.Base64;
import java.util.HashMap;
import java.util.Map;


@Slf4j
public class PDFParserUtils {
    private final static String TEXT = "text";
    private final static String IMAGE = "image";

    /**
     * 获取文本
     *
     * @param file file
     * @return text
     */
    public static String readPDF(File file) {
        return readPDF(file, false).getOrDefault(TEXT, "") + "";
    }

    /**
     * 获取文本
     *
     * @param inputStream file inputStream
     * @return
     */
    public static String readPDF(InputStream inputStream) {
        return readPDF(inputStream, false).getOrDefault(TEXT, "") + "";
    }

    /**
     * 获取文本 和图片
     *
     * @param file      file
     * @param isImgRead 是否读取Image
     * @return {"text":"...",  "imageMap":{'fileName':'byte[]'}}
     */
    public static Map<String, Object> readPDF(File file, Boolean isImgRead) {
        Map<String, Object> result = new HashMap<>();
        if (file == null || !file.exists()) {
            return result;
        }
        try {
            return readPDF(new FileInputStream(file), isImgRead);
        } catch (FileNotFoundException e) {
            log.error("parse pdf error : {}", e.getMessage());
        }
        return result;
    }

    /**
     * 读取 文本 和 图片
     *
     * @param inputStream file inputStream
     * @param isImgRead   是否读取 图片
     * @return {"text":"...",  "imageMap":{'fileName':'byte[]'}}
     */
    public static Map<String, Object> readPDF(InputStream inputStream, Boolean isImgRead) {
        Map<String, Object> result = new HashMap<>();
        if (inputStream == null) {
            return result;
        }
        //收集图片
        Map<String, byte[]> imageFileAndByteMap = new HashMap<>();
        //收集文本
        StringBuilder sb = new StringBuilder("");
        try (PDDocument doc = PDDocument.load(inputStream)) {
            PDFTextStripper textStripper = new PDFTextStripper();
            for (int i = 1; i <= doc.getNumberOfPages(); i++) {
                textStripper.setStartPage(i);
                textStripper.setEndPage(i);
                // 一次输出多个页时,按顺序输出
                textStripper.setSortByPosition(true);
                String s = textStripper.getText(doc);
                sb.append(s);
                //读取图片
                if (isImgRead) {
                    getImage(doc, i, imageFileAndByteMap);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            log.info("Parse PDF error {}", e.getMessage());
        }
        result.put(TEXT, sb.toString());
        result.put(IMAGE, imageFileAndByteMap);
        return result;
    }


    /**
     * 读取每一页中的屙图片  返回map fileName  byte[]
     *
     * @param doc       PDDocument
     * @param pageIndex 从 1 开始
     * @throws Exception exception
     */
    private static void getImage(PDDocument doc, int pageIndex, Map<String, byte[]> imageFileAndByteMap) throws Exception {
        PDPage page = doc.getPage(pageIndex - 1);
        PDResources resources = page.getResources();
        // 获取页中的对象
        Iterable<COSName> xobjects = resources.getXObjectNames();
        if (xobjects != null) {
            for (COSName cosName : xobjects) {
                String fileName = cosName.getName();
                if (resources.isImageXObject(cosName)) {
                    // 获取每页资源的图片
                    PDImageXObject ixt = (PDImageXObject) resources.getXObject(cosName);
                    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
                    ImageIO.write(ixt.getImage(), ixt.getSuffix(), outputStream);
                    imageFileAndByteMap.put(fileName + "." + ixt.getSuffix(), outputStream.toByteArray());
                }
            }
        }
    }


    /**
     * 读取文本内容和图片
     *
     * @param file 文件路徑
     */
    public static void readTextImage(File file) {
        if (file == null) {
            return;
        }
        try (PDDocument doc = PDDocument.load(file)) {
            PDFTextStripper textStripper = new PDFTextStripper();
            for (int i = 1; i <= doc.getNumberOfPages(); i++) {
                textStripper.setStartPage(i);
                textStripper.setEndPage(i);
                // 读取图片
                PDPage page = doc.getPage(i - 1);
                PDResources resources = page.getResources();
                // 获取页中的对象
                Iterable<COSName> xobjects = resources.getXObjectNames();
                if (xobjects != null) {
                    for (COSName cosName : xobjects) {
                        boolean isImageXObject = resources.isImageXObject(cosName);
                        if (isImageXObject) {
                            // 获取每页资源的图片
                            PDImageXObject ixt = (PDImageXObject) resources.getXObject(cosName);
                            File outputFile = new File("第 " + (i) + " 页" + cosName.getName() + "." + ixt.getSuffix());
                            ImageIO.write(ixt.getImage(), ixt.getSuffix(), outputFile);
                        }
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }


    /**
     * 读取指定区域
     *
     * @param file
     * @param x      指定的x坐标
     * @param y      指定的y坐标
     * @param width  矩形的宽度
     * @param height 矩形的高度
     * @return
     */
    public static String readRectangle(File file, int x, int y, int width, int height) {
        if (file == null) {
            return "";
        }
        PDDocument doc = null;
        try {
            doc = PDDocument.load(file);
            // y轴向下为正,x轴向右为正。
            PDFTextStripperByArea stripperByArea = new PDFTextStripperByArea();
            stripperByArea.setSortByPosition(true);
            // 划定区域
            Rectangle2D rect = new Rectangle(x, y, width, height);
            stripperByArea.addRegion("area", rect);
            PDPage page = doc.getPage(1);
            stripperByArea.extractRegions(page);
            // 获取区域的text
            String text = stripperByArea.getTextForRegion("area");
            text = text.trim();
            doc.close();
            return text;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (doc != null) {
                try {
                    doc.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return "";
    }


    /**
     * byte数组转换成16进制字符串
     *
     * @param src byte[]
     * @return hex text content
     */
    private static String bytesToHexString(byte[] src) {
        StringBuilder stringBuilder = new StringBuilder();
        if (src == null || src.length <= 0) {
            return null;
        }
        for (byte b : src) {
            int v = b & 0xFF;
            String hv = Integer.toHexString(v);
            if (hv.length() < 2) {
                stringBuilder.append(0);
            }
            stringBuilder.append(hv);
        }
        return stringBuilder.toString();
    }


    private static final String PRE_HTML_CODE = "<html><head><meta charset=\"UTF-8\"></head>" +
            "<body style=\"background-color:gray;\"><style>" +
            "img {background-color:#fff; text-align:center; " +
            "width:100%; max-width:100%;margin-top:6px;}</style>";
    private static final String SUF_HTML_CODE = "</body></html>";
    private static final String MID_HTML_CODE = "<img src=\"data:image/png;base64,";
    private static final String MIDD_HTML_CODE = "\">";

    /**
     * pdf转html
     */
    public static String pdfToHtml(InputStream inputStream) {
        StringBuilder sb = new StringBuilder();
        sb.append(PRE_HTML_CODE);
        try (PDDocument document = PDDocument.load(inputStream)) {
            int pages = document.getNumberOfPages();
            PDFRenderer renderer = new PDFRenderer(document);
            for (int i = 0; i < pages; i++) {
                sb.append(MID_HTML_CODE);
                try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
                    BufferedImage image = renderer.renderImage(i, 2.5f);
                    ImageIO.write(image, "png", outputStream);
                    sb.append(Base64.getEncoder().encodeToString(outputStream.toByteArray()));
                } catch (IOException e) {
                    log.error("Error converting PDF to HTML: {}", e.getMessage());
                }
                sb.append(MIDD_HTML_CODE);
            }
        } catch (IOException e) {
            log.error("Error loading PDF document: {}", e.getMessage());
        }
        sb.append(SUF_HTML_CODE);
        return sb.toString();
    }

    /** 常用文件的文件头如下:
     JPEG (jpg),文件头:FFD8FF
     PNG (png),文件头:89504E47
     GIF (gif),文件头:47494638
     TIFF (tif),文件头:49492A00
     Windows Bitmap (bmp),文件头:424D
     CAD (dwg),文件头:41433130
     Adobe Photoshop (psd),文件头:38425053
     Rich Text Format (rtf),文件头:7B5C727466
     XML (xml),文件头:3C3F786D6C
     HTML (html),文件头:68746D6C3E
     Email [thorough only] (eml),文件头:44656C69766572792D646174653A
     Outlook Express (dbx),文件头:CFAD12FEC5FD746F
     Outlook (pst),文件头:2142444E
     MS Word/Excel (xls.or.doc),文件头:D0CF11E0
     MS Access (mdb),文件头:5374616E64617264204A
     WordPerfect (wpd),文件头:FF575043
     Postscript. (eps.or.ps),文件头:252150532D41646F6265
     Adobe Acrobat (pdf),文件头:255044462D312E
     Quicken (qdf),文件头:AC9EBD8F
     Windows Password (pwl),文件头:E3828596
     ZIP Archive (zip),文件头:504B0304
     RAR Archive (rar),文件头:52617221
     Wave (wav),文件头:57415645
     AVI (avi),文件头:41564920
     Real Audio (ram),文件头:2E7261FD
     Real Media (rm),文件头:2E524D46
     MPEG (mpg),文件头:000001BA
     MPEG (mpg),文件头:000001B3
     Quicktime (mov),文件头:6D6F6F76
     Windows Media (asf),文件头:3026B2758E66CF11
     MIDI (mid),文件头:4D546864
     */
    /**
     * 根據io流前4個字節,判斷文件類型
     *
     * @param ioBytes
     * @return
     */
    private static String getFileType(byte[] ioBytes) throws Exception {
        if (ioBytes == null || ioBytes.length < 4) {
            log.error("非正常文件");
            throw new Exception("Abnormal image file.");
        }
        byte[] b = new byte[4];
        System.arraycopy(ioBytes, 0, b, 0, 4);
        String type = ("" + bytesToHexString(b)).toUpperCase();
        if (type.contains("25504446")) {
            return "PDF";
        } else if (type.contains("504B0304")) {
            return "ZIP";
        } else if (type.contains("52617221")) {
            return "RAR";
        }
        return "";
    }

    @SuppressWarnings("unchecked")
    public static void main(String[] args) {
        File file = new File("C:\\Users\\Sea\\Downloads\\seatest.pdf");
//        PDFParserUtils.readTextImage(file);
        Map<String, Object> stringObjectMap = PDFParserUtils.readPDF(file, true);
        Map<String, byte[]> imgs = (Map<String, byte[]>) stringObjectMap.get(PDFParserUtils.IMAGE);
        Object o = stringObjectMap.get(PDFParserUtils.TEXT);
        System.err.println(" text : " + o);
        imgs.forEach((filename, bt) -> {
            try {
                FileOutputStream fileOutputStream = new FileOutputStream(filename);
                IOUtils.write(bt, fileOutputStream);
            } catch (Exception e) {
                e.printStackTrace();
            }
        });
    }

}

Ref-link: https://www.cnblogs.com/lshan/p/17336353.html

posted @ 2024-02-26 16:49  lyu6  阅读(189)  评论(0)    收藏  举报