PDFUtils（解析 PDF 中的文本和图片；PDF 转 HTML；HTML 转 PDF）

引入 pdfbox 依赖（下面的代码还用到了 lombok 的 @Slf4j 和 commons-io 的 IOUtils，需要另行引入对应依赖）：

        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.19</version>
        </dependency>

package com.icil.swift;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;

import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.Base64;
import java.util.HashMap;
import java.util.Map;


@Slf4j
public class PDFParserUtils {
    private final static String TEXT = "text";
    private final static String IMAGE = "image";

    /**
     * 获取文本
     *
     * @param file file
     * @return text
     */
    public static String readPDF(File file) {
        return readPDF(file, false).getOrDefault(TEXT, "") + "";
    }

    /**
     * 获取文本
     *
     * @param inputStream file inputStream
     * @return
     */
    public static String readPDF(InputStream inputStream) {
        return readPDF(inputStream, false).getOrDefault(TEXT, "") + "";
    }

    /**
     * 获取文本 和图片
     *
     * @param file      file
     * @param isImgRead 是否读取Image
     * @return {"text":"...",  "imageMap":{'fileName':'byte[]'}}
     */
    public static Map<String, Object> readPDF(File file, Boolean isImgRead) {
        Map<String, Object> result = new HashMap<>();
        if (file == null || !file.exists()) {
            return result;
        }
        try {
            return readPDF(new FileInputStream(file), isImgRead);
        } catch (FileNotFoundException e) {
            log.error("parse pdf error : {}", e.getMessage());
        }
        return result;
    }

    /**
     * 读取 文本 和 图片
     *
     * @param inputStream file inputStream
     * @param isImgRead   是否读取 图片
     * @return {"text":"...",  "imageMap":{'fileName':'byte[]'}}
     */
    public static Map<String, Object> readPDF(InputStream inputStream, Boolean isImgRead) {
        Map<String, Object> result = new HashMap<>();
        if (inputStream == null) {
            return result;
        }
        //收集图片
        Map<String, byte[]> imageFileAndByteMap = new HashMap<>();
        //收集文本
        StringBuilder sb = new StringBuilder("");
        try (PDDocument doc = PDDocument.load(inputStream)) {
            PDFTextStripper textStripper = new PDFTextStripper();
            for (int i = 1; i <= doc.getNumberOfPages(); i++) {
                textStripper.setStartPage(i);
                textStripper.setEndPage(i);
                // 一次输出多个页时，按顺序输出
                textStripper.setSortByPosition(true);
                String s = textStripper.getText(doc);
                sb.append(s);
                //读取图片
                if (isImgRead) {
                    getImage(doc, i, imageFileAndByteMap);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            log.info("Parse PDF error {}", e.getMessage());
        }
        result.put(TEXT, sb.toString());
        result.put(IMAGE, imageFileAndByteMap);
        return result;
    }


    /**
     * 读取每一页中的屙图片  返回map fileName  byte[]
     *
     * @param doc       PDDocument
     * @param pageIndex 从 1 开始
     * @throws Exception exception
     */
    private static void getImage(PDDocument doc, int pageIndex, Map<String, byte[]> imageFileAndByteMap) throws Exception {
        PDPage page = doc.getPage(pageIndex - 1);
        PDResources resources = page.getResources();
        // 获取页中的对象
        Iterable<COSName> xobjects = resources.getXObjectNames();
        if (xobjects != null) {
            for (COSName cosName : xobjects) {
                String fileName = cosName.getName();
                if (resources.isImageXObject(cosName)) {
                    // 获取每页资源的图片
                    PDImageXObject ixt = (PDImageXObject) resources.getXObject(cosName);
                    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
                    ImageIO.write(ixt.getImage(), ixt.getSuffix(), outputStream);
                    imageFileAndByteMap.put(fileName + "." + ixt.getSuffix(), outputStream.toByteArray());
                }
            }
        }
    }


    /**
     * 读取文本内容和图片
     *
     * @param file 文件路徑
     */
    public static void readTextImage(File file) {
        if (file == null) {
            return;
        }
        try (PDDocument doc = PDDocument.load(file)) {
            PDFTextStripper textStripper = new PDFTextStripper();
            for (int i = 1; i <= doc.getNumberOfPages(); i++) {
                textStripper.setStartPage(i);
                textStripper.setEndPage(i);
                // 读取图片
                PDPage page = doc.getPage(i - 1);
                PDResources resources = page.getResources();
                // 获取页中的对象
                Iterable<COSName> xobjects = resources.getXObjectNames();
                if (xobjects != null) {
                    for (COSName cosName : xobjects) {
                        boolean isImageXObject = resources.isImageXObject(cosName);
                        if (isImageXObject) {
                            // 获取每页资源的图片
                            PDImageXObject ixt = (PDImageXObject) resources.getXObject(cosName);
                            File outputFile = new File("第 " + (i) + " 页" + cosName.getName() + "." + ixt.getSuffix());
                            ImageIO.write(ixt.getImage(), ixt.getSuffix(), outputFile);
                        }
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }


    /**
     * 读取指定区域
     *
     * @param file
     * @param x      指定的x坐标
     * @param y      指定的y坐标
     * @param width  矩形的宽度
     * @param height 矩形的高度
     * @return
     */
    public static String readRectangle(File file, int x, int y, int width, int height) {
        if (file == null) {
            return "";
        }
        PDDocument doc = null;
        try {
            doc = PDDocument.load(file);
            // y轴向下为正，x轴向右为正。
            PDFTextStripperByArea stripperByArea = new PDFTextStripperByArea();
            stripperByArea.setSortByPosition(true);
            // 划定区域
            Rectangle2D rect = new Rectangle(x, y, width, height);
            stripperByArea.addRegion("area", rect);
            PDPage page = doc.getPage(1);
            stripperByArea.extractRegions(page);
            // 获取区域的text
            String text = stripperByArea.getTextForRegion("area");
            text = text.trim();
            doc.close();
            return text;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (doc != null) {
                try {
                    doc.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return "";
    }


    /**
     * byte数组转换成16进制字符串
     *
     * @param src byte[]
     * @return hex text content
     */
    private static String bytesToHexString(byte[] src) {
        StringBuilder stringBuilder = new StringBuilder();
        if (src == null || src.length <= 0) {
            return null;
        }
        for (byte b : src) {
            int v = b & 0xFF;
            String hv = Integer.toHexString(v);
            if (hv.length() < 2) {
                stringBuilder.append(0);
            }
            stringBuilder.append(hv);
        }
        return stringBuilder.toString();
    }


    private static final String PRE_HTML_CODE = "<html><head><meta charset=\"UTF-8\"></head>" +
            "<body style=\"background-color:gray;\"><style>" +
            "img {background-color:#fff; text-align:center; " +
            "width:100%; max-width:100%;margin-top:6px;}</style>";
    private static final String SUF_HTML_CODE = "</body></html>";
    private static final String MID_HTML_CODE = "<img src=\"data:image/png;base64,";
    private static final String MIDD_HTML_CODE = "\">";

    /**
     * pdf转html
     */
    public static String pdfToHtml(InputStream inputStream) {
        StringBuilder sb = new StringBuilder();
        sb.append(PRE_HTML_CODE);
        try (PDDocument document = PDDocument.load(inputStream)) {
            int pages = document.getNumberOfPages();
            PDFRenderer renderer = new PDFRenderer(document);
            for (int i = 0; i < pages; i++) {
                sb.append(MID_HTML_CODE);
                try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
                    BufferedImage image = renderer.renderImage(i, 2.5f);
                    ImageIO.write(image, "png", outputStream);
                    sb.append(Base64.getEncoder().encodeToString(outputStream.toByteArray()));
                } catch (IOException e) {
                    log.error("Error converting PDF to HTML: {}", e.getMessage());
                }
                sb.append(MIDD_HTML_CODE);
            }
        } catch (IOException e) {
            log.error("Error loading PDF document: {}", e.getMessage());
        }
        sb.append(SUF_HTML_CODE);
        return sb.toString();
    }

    /** 常用文件的文件头如下：
     JPEG (jpg)，文件头：FFD8FF
     PNG (png)，文件头：89504E47
     GIF (gif)，文件头：47494638
     TIFF (tif)，文件头：49492A00
     Windows Bitmap (bmp)，文件头：424D
     CAD (dwg)，文件头：41433130
     Adobe Photoshop (psd)，文件头：38425053
     Rich Text Format (rtf)，文件头：7B5C727466
     XML (xml)，文件头：3C3F786D6C
     HTML (html)，文件头：68746D6C3E
     Email [thorough only] (eml)，文件头：44656C69766572792D646174653A
     Outlook Express (dbx)，文件头：CFAD12FEC5FD746F
     Outlook (pst)，文件头：2142444E
     MS Word/Excel (xls.or.doc)，文件头：D0CF11E0
     MS Access (mdb)，文件头：5374616E64617264204A
     WordPerfect (wpd)，文件头：FF575043
     Postscript. (eps.or.ps)，文件头：252150532D41646F6265
     Adobe Acrobat (pdf)，文件头：255044462D312E
     Quicken (qdf)，文件头：AC9EBD8F
     Windows Password (pwl)，文件头：E3828596
     ZIP Archive (zip)，文件头：504B0304
     RAR Archive (rar)，文件头：52617221
     Wave (wav)，文件头：57415645
     AVI (avi)，文件头：41564920
     Real Audio (ram)，文件头：2E7261FD
     Real Media (rm)，文件头：2E524D46
     MPEG (mpg)，文件头：000001BA
     MPEG (mpg)，文件头：000001B3
     Quicktime (mov)，文件头：6D6F6F76
     Windows Media (asf)，文件头：3026B2758E66CF11
     MIDI (mid)，文件头：4D546864
     */
    /**
     * 根據io流前4個字節，判斷文件類型
     *
     * @param ioBytes
     * @return
     */
    private static String getFileType(byte[] ioBytes) throws Exception {
        if (ioBytes == null || ioBytes.length < 4) {
            log.error("非正常文件");
            throw new Exception("Abnormal image file.");
        }
        byte[] b = new byte[4];
        System.arraycopy(ioBytes, 0, b, 0, 4);
        String type = ("" + bytesToHexString(b)).toUpperCase();
        if (type.contains("25504446")) {
            return "PDF";
        } else if (type.contains("504B0304")) {
            return "ZIP";
        } else if (type.contains("52617221")) {
            return "RAR";
        }
        return "";
    }

    @SuppressWarnings("unchecked")
    public static void main(String[] args) {
        File file = new File("C:\\Users\\Sea\\Downloads\\seatest.pdf");
//        PDFParserUtils.readTextImage(file);
        Map<String, Object> stringObjectMap = PDFParserUtils.readPDF(file, true);
        Map<String, byte[]> imgs = (Map<String, byte[]>) stringObjectMap.get(PDFParserUtils.IMAGE);
        Object o = stringObjectMap.get(PDFParserUtils.TEXT);
        System.err.println(" text : " + o);
        imgs.forEach((filename, bt) -> {
            try {
                FileOutputStream fileOutputStream = new FileOutputStream(filename);
                IOUtils.write(bt, fileOutputStream);
            } catch (Exception e) {
                e.printStackTrace();
            }
        });
    }

}

Ref-link: https://www.cnblogs.com/lshan/p/17336353.html

posted @ 2024-02-26 16:49 lyu6 阅读(189) 评论(0) 收藏举报

刷新页面返回顶部

lyu_blog

PDFUtils (解析PDF 中的文本和图片 PDF 转 HTML HTML 转 PDF)

公告

lyu_blog

PDFUtils (解析PDF 中的文本 和 图片 PDF 转 HTML HTML 转 PDF)

公告

PDFUtils (解析PDF 中的文本和图片 PDF 转 HTML HTML 转 PDF)