PDFUtils(解析 PDF 中的文本和图片、PDF 转 HTML、HTML 转 PDF)
引入 pdfbox 依赖:
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.19</version>
</dependency>
package com.icil.swift;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.Arrays;
import java.util.Base64;
import java.util.HashMap;
import java.util.Map;
@Slf4j
public class PDFParserUtils {
private final static String TEXT = "text";
private final static String IMAGE = "image";
/**
* 获取文本
*
* @param file file
* @return text
*/
public static String readPDF(File file) {
return readPDF(file, false).getOrDefault(TEXT, "") + "";
}
/**
* 获取文本
*
* @param inputStream file inputStream
* @return
*/
public static String readPDF(InputStream inputStream) {
return readPDF(inputStream, false).getOrDefault(TEXT, "") + "";
}
/**
* 获取文本 和图片
*
* @param file file
* @param isImgRead 是否读取Image
* @return {"text":"...", "imageMap":{'fileName':'byte[]'}}
*/
public static Map<String, Object> readPDF(File file, Boolean isImgRead) {
Map<String, Object> result = new HashMap<>();
if (file == null || !file.exists()) {
return result;
}
try {
return readPDF(new FileInputStream(file), isImgRead);
} catch (FileNotFoundException e) {
log.error("parse pdf error : {}", e.getMessage());
}
return result;
}
/**
* 读取 文本 和 图片
*
* @param inputStream file inputStream
* @param isImgRead 是否读取 图片
* @return {"text":"...", "imageMap":{'fileName':'byte[]'}}
*/
public static Map<String, Object> readPDF(InputStream inputStream, Boolean isImgRead) {
Map<String, Object> result = new HashMap<>();
if (inputStream == null) {
return result;
}
//收集图片
Map<String, byte[]> imageFileAndByteMap = new HashMap<>();
//收集文本
StringBuilder sb = new StringBuilder("");
try (PDDocument doc = PDDocument.load(inputStream)) {
PDFTextStripper textStripper = new PDFTextStripper();
for (int i = 1; i <= doc.getNumberOfPages(); i++) {
textStripper.setStartPage(i);
textStripper.setEndPage(i);
// 一次输出多个页时,按顺序输出
textStripper.setSortByPosition(true);
String s = textStripper.getText(doc);
sb.append(s);
//读取图片
if (isImgRead) {
getImage(doc, i, imageFileAndByteMap);
}
}
} catch (Exception e) {
e.printStackTrace();
log.info("Parse PDF error {}", e.getMessage());
}
result.put(TEXT, sb.toString());
result.put(IMAGE, imageFileAndByteMap);
return result;
}
/**
* 读取每一页中的屙图片 返回map fileName byte[]
*
* @param doc PDDocument
* @param pageIndex 从 1 开始
* @throws Exception exception
*/
private static void getImage(PDDocument doc, int pageIndex, Map<String, byte[]> imageFileAndByteMap) throws Exception {
PDPage page = doc.getPage(pageIndex - 1);
PDResources resources = page.getResources();
// 获取页中的对象
Iterable<COSName> xobjects = resources.getXObjectNames();
if (xobjects != null) {
for (COSName cosName : xobjects) {
String fileName = cosName.getName();
if (resources.isImageXObject(cosName)) {
// 获取每页资源的图片
PDImageXObject ixt = (PDImageXObject) resources.getXObject(cosName);
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
ImageIO.write(ixt.getImage(), ixt.getSuffix(), outputStream);
imageFileAndByteMap.put(fileName + "." + ixt.getSuffix(), outputStream.toByteArray());
}
}
}
}
/**
* 读取文本内容和图片
*
* @param file 文件路徑
*/
public static void readTextImage(File file) {
if (file == null) {
return;
}
try (PDDocument doc = PDDocument.load(file)) {
PDFTextStripper textStripper = new PDFTextStripper();
for (int i = 1; i <= doc.getNumberOfPages(); i++) {
textStripper.setStartPage(i);
textStripper.setEndPage(i);
// 读取图片
PDPage page = doc.getPage(i - 1);
PDResources resources = page.getResources();
// 获取页中的对象
Iterable<COSName> xobjects = resources.getXObjectNames();
if (xobjects != null) {
for (COSName cosName : xobjects) {
boolean isImageXObject = resources.isImageXObject(cosName);
if (isImageXObject) {
// 获取每页资源的图片
PDImageXObject ixt = (PDImageXObject) resources.getXObject(cosName);
File outputFile = new File("第 " + (i) + " 页" + cosName.getName() + "." + ixt.getSuffix());
ImageIO.write(ixt.getImage(), ixt.getSuffix(), outputFile);
}
}
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* 读取指定区域
*
* @param file
* @param x 指定的x坐标
* @param y 指定的y坐标
* @param width 矩形的宽度
* @param height 矩形的高度
* @return
*/
public static String readRectangle(File file, int x, int y, int width, int height) {
if (file == null) {
return "";
}
PDDocument doc = null;
try {
doc = PDDocument.load(file);
// y轴向下为正,x轴向右为正。
PDFTextStripperByArea stripperByArea = new PDFTextStripperByArea();
stripperByArea.setSortByPosition(true);
// 划定区域
Rectangle2D rect = new Rectangle(x, y, width, height);
stripperByArea.addRegion("area", rect);
PDPage page = doc.getPage(1);
stripperByArea.extractRegions(page);
// 获取区域的text
String text = stripperByArea.getTextForRegion("area");
text = text.trim();
doc.close();
return text;
} catch (IOException e) {
e.printStackTrace();
} finally {
if (doc != null) {
try {
doc.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
return "";
}
/**
* byte数组转换成16进制字符串
*
* @param src byte[]
* @return hex text content
*/
private static String bytesToHexString(byte[] src) {
StringBuilder stringBuilder = new StringBuilder();
if (src == null || src.length <= 0) {
return null;
}
for (byte b : src) {
int v = b & 0xFF;
String hv = Integer.toHexString(v);
if (hv.length() < 2) {
stringBuilder.append(0);
}
stringBuilder.append(hv);
}
return stringBuilder.toString();
}
private static final String PRE_HTML_CODE = "<html><head><meta charset=\"UTF-8\"></head>" +
"<body style=\"background-color:gray;\"><style>" +
"img {background-color:#fff; text-align:center; " +
"width:100%; max-width:100%;margin-top:6px;}</style>";
private static final String SUF_HTML_CODE = "</body></html>";
private static final String MID_HTML_CODE = "<img src=\"data:image/png;base64,";
private static final String MIDD_HTML_CODE = "\">";
/**
* pdf转html
*/
public static String pdfToHtml(InputStream inputStream) {
StringBuilder sb = new StringBuilder();
sb.append(PRE_HTML_CODE);
try (PDDocument document = PDDocument.load(inputStream)) {
int pages = document.getNumberOfPages();
PDFRenderer renderer = new PDFRenderer(document);
for (int i = 0; i < pages; i++) {
sb.append(MID_HTML_CODE);
try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
BufferedImage image = renderer.renderImage(i, 2.5f);
ImageIO.write(image, "png", outputStream);
sb.append(Base64.getEncoder().encodeToString(outputStream.toByteArray()));
} catch (IOException e) {
log.error("Error converting PDF to HTML: {}", e.getMessage());
}
sb.append(MIDD_HTML_CODE);
}
} catch (IOException e) {
log.error("Error loading PDF document: {}", e.getMessage());
}
sb.append(SUF_HTML_CODE);
return sb.toString();
}
/** 常用文件的文件头如下:
JPEG (jpg),文件头:FFD8FF
PNG (png),文件头:89504E47
GIF (gif),文件头:47494638
TIFF (tif),文件头:49492A00
Windows Bitmap (bmp),文件头:424D
CAD (dwg),文件头:41433130
Adobe Photoshop (psd),文件头:38425053
Rich Text Format (rtf),文件头:7B5C727466
XML (xml),文件头:3C3F786D6C
HTML (html),文件头:68746D6C3E
Email [thorough only] (eml),文件头:44656C69766572792D646174653A
Outlook Express (dbx),文件头:CFAD12FEC5FD746F
Outlook (pst),文件头:2142444E
MS Word/Excel (xls.or.doc),文件头:D0CF11E0
MS Access (mdb),文件头:5374616E64617264204A
WordPerfect (wpd),文件头:FF575043
Postscript. (eps.or.ps),文件头:252150532D41646F6265
Adobe Acrobat (pdf),文件头:255044462D312E
Quicken (qdf),文件头:AC9EBD8F
Windows Password (pwl),文件头:E3828596
ZIP Archive (zip),文件头:504B0304
RAR Archive (rar),文件头:52617221
Wave (wav),文件头:57415645
AVI (avi),文件头:41564920
Real Audio (ram),文件头:2E7261FD
Real Media (rm),文件头:2E524D46
MPEG (mpg),文件头:000001BA
MPEG (mpg),文件头:000001B3
Quicktime (mov),文件头:6D6F6F76
Windows Media (asf),文件头:3026B2758E66CF11
MIDI (mid),文件头:4D546864
*/
/**
* 根據io流前4個字節,判斷文件類型
*
* @param ioBytes
* @return
*/
private static String getFileType(byte[] ioBytes) throws Exception {
if (ioBytes == null || ioBytes.length < 4) {
log.error("非正常文件");
throw new Exception("Abnormal image file.");
}
byte[] b = new byte[4];
System.arraycopy(ioBytes, 0, b, 0, 4);
String type = ("" + bytesToHexString(b)).toUpperCase();
if (type.contains("25504446")) {
return "PDF";
} else if (type.contains("504B0304")) {
return "ZIP";
} else if (type.contains("52617221")) {
return "RAR";
}
return "";
}
@SuppressWarnings("unchecked")
public static void main(String[] args) {
File file = new File("C:\\Users\\Sea\\Downloads\\seatest.pdf");
// PDFParserUtils.readTextImage(file);
Map<String, Object> stringObjectMap = PDFParserUtils.readPDF(file, true);
Map<String, byte[]> imgs = (Map<String, byte[]>) stringObjectMap.get(PDFParserUtils.IMAGE);
Object o = stringObjectMap.get(PDFParserUtils.TEXT);
System.err.println(" text : " + o);
imgs.forEach((filename, bt) -> {
try {
FileOutputStream fileOutputStream = new FileOutputStream(filename);
IOUtils.write(bt, fileOutputStream);
} catch (Exception e) {
e.printStackTrace();
}
});
}
}

浙公网安备 33010602011771号