代码改变世界

使用 pdfbox 识别 PDF 表格并导出为 Excel

1、继承 PageDrawer 和 PDFRenderer获得文本框坐标

2、通过坐标获取文字

3、通过easyExcel生成表格

/**
 * A {@link PageDrawer} that, instead of painting filled paths, records the
 * origin of each filled path's bounding box.
 *
 * <p>Every recorded point is appended to the shared static
 * {@link #COORDINATE_LIST}; this class never clears that list.
 */
public class MyPageDrawer extends PageDrawer {

    /** Collected points, shared by all drawer instances. */
    static final List<Coordinate> COORDINATE_LIST = new ArrayList<>();

    /** Height of the page bounding box, used to flip the y axis. */
    double pageHeight;

    /** The page most recently handed to {@link #processPage(PDPage)}. */
    PDPage pdPage;

    /**
     * Creates a drawer for the page described by {@code parameters} and
     * caches the height of that page's bounding box.
     *
     * @param parameters drawer parameters supplied by the renderer
     * @throws IOException if the superclass constructor fails
     */
    MyPageDrawer(PageDrawerParameters parameters) throws IOException {
        super(parameters);
        pageHeight = parameters.getPage().getBBox().getHeight();
    }

    /**
     * Remembers the page being processed, then delegates to the superclass.
     *
     * @param aPage the page to process
     * @throws IOException if processing the page fails
     */
    @Override
    public void processPage(PDPage aPage) throws IOException {
        pdPage = aPage;
        super.processPage(aPage);
    }

    /**
     * Records the integer bounding-box origin of the current path and then
     * discards the path. The superclass implementation is not invoked, so
     * this drawer does not paint the fill itself.
     *
     * <p>The x value is stored as-is; the stored y value is
     * {@code (int) pageHeight - y}.
     *
     * @param windingRule ignored
     */
    @Override
    public void fillPath(int windingRule) {
        Shape outline = getLinePath().getBounds2D();
        int left = outline.getBounds().x;
        int top = outline.getBounds().y;
        COORDINATE_LIST.add(new Coordinate(left, (int) pageHeight - top));
        getLinePath().reset();
    }

}

/**
 * A {@link PDFRenderer} whose pages are drawn by {@link MyPageDrawer}.
 */
public class MyPDFRenderer extends PDFRenderer {

    /**
     * Creates a renderer for the given document.
     *
     * @param document the document to render
     */
    MyPDFRenderer(PDDocument document) {
        super(document);
    }

    /**
     * Supplies a {@link MyPageDrawer} in place of the default page drawer.
     *
     * @param parameters drawer parameters provided by the rendering pipeline
     * @return a new {@link MyPageDrawer} built from {@code parameters}
     * @throws IOException if the drawer cannot be constructed
     */
    @Override
    protected PageDrawer createPageDrawer(PageDrawerParameters parameters) throws IOException {
        PageDrawer drawer = new MyPageDrawer(parameters);
        return drawer;
    }
}



public class App {
public static void main(String[] args) throws Exception {
String fileName = "E:\\download\\test\\2020年12月北京工程造价信息.pdf"; //这里先手动把绝对路径的文件夹给补上。
readPDF(fileName);
}
/**
* 读PDF文件,使用了pdfbox开源项目
* @param fileName
*/
public static void readPDF(String fileName) {
File file = new File(fileName);
FileInputStream in = null;
try {
in = new FileInputStream(fileName);
// 新建一个PDF解析器对象
PDFParser parser = new PDFParser(new RandomAccessFile(file,"rw"));
// 对PDF文件进行解析
parser.parse();
// 获取解析后得到的PDF文档对象
PDDocument pdfdocument = parser.getPDDocument();
System.out.println("NumberOfPages:"+ pdfdocument.getNumberOfPages());

PDFRenderer renderer = new MyPDFRenderer(pdfdocument);

int pageNum=12;
BufferedImage image = renderer.renderImage(pageNum);
ImageIO.write(image, "PNG", new File("test.png"));

// System.out.println("SEG_LINETO_LIST...");
// MyPageDrawer.SEG_LINETO_LIST.stream().forEach(System.out::println);

String resultFileName = "simpleWrite" + System.currentTimeMillis() + ".xlsx";
EasyExcel.write(resultFileName).sheet().doWrite(judgeCoordinate(MyPageDrawer.COORDINATE_LIST, pdfdocument, pageNum));
} catch (Exception e) {
System.out.println("读取PDF文件" + file.getAbsolutePath() + "生失败!" + e);
e.printStackTrace();
} finally {
if (in != null) {
try {
in.close();
} catch (IOException e1) {
}
}
}
}
/**
* 去重排序
*
* @param coordinateList
* @param document
* @return
*/
private static List<List<String>> judgeCoordinate(List<Coordinate> coordinateList, PDDocument document,int pageNum) {
//去除pdf边界
coordinateList=coordinateList.stream().filter(coordinate -> !(coordinate.getX()<38||coordinate.getY()<70||coordinate.getY()>780||coordinate.getX()>558)).collect(Collectors.toList());
// 去重 按y,x排序 从左上角开始计算
coordinateList = coordinateList.stream().sorted(Comparator.comparing(Coordinate::getY).thenComparing(Coordinate::getX)).collect(Collectors.toList());
System.out.println("去重,排序后,分组前...");
coordinateList.stream().forEach(System.out::println);
// 去除相近元素
for(int a=0;a<coordinateList.size();a++){
Coordinate coordinateStart = coordinateList.get(a);
for (int j = a+1; j < coordinateList.size(); j++) {
Coordinate coordinateC = coordinateList.get(j);
if (Math.abs(coordinateStart.getY()-coordinateC.getY()) <=2) {
if(Math.abs(coordinateC.getX()-coordinateStart.getX())<=2){
coordinateList.remove(j);
j--;
}else {
int y=coordinateStart.getY()>coordinateC.getY()?coordinateC.getY():coordinateStart.getY();
coordinateC.setY(y);
}
}else {
break;
}
}
}
//需要重新排序
coordinateList=coordinateList.stream().sorted(Comparator.comparing(Coordinate::getY).thenComparing(Coordinate::getX))
.collect(Collectors.toList());

Map<Integer, List<Coordinate>> groupList = coordinateList.stream()
.collect(Collectors.groupingBy(Coordinate::getY));
Map<Integer, List<Coordinate>> result =new LinkedHashMap<>();
groupList.entrySet().stream().sorted(Map.Entry.<Integer, List<Coordinate>>comparingByKey())
.forEachOrdered(e -> result.put(e.getKey(), e.getValue()));
System.out.println("总行数:"+result.size());

List<List<Coordinate>> resultRow = result.values().stream()
.collect(Collectors.toList());
resultRow=resultRow.stream().filter(item-> (item.size()>1)).collect(Collectors.toList());
System.out.println("去重,排序,分组后...");

resultRow.stream().forEach(System.out::println);
List<List<String>> mapList = new ArrayList<>();
for (int k = 0; k < resultRow.size()-1; k++) {
Map<String,String> map = new HashMap<>();
List<String> listRow=new ArrayList<>();
boolean nullData=false;
for (int i = 0; i < resultRow.get(k).size()-1; i++) {
Coordinate coordinateStart=resultRow.get(k).get(i);
List<Coordinate> nextRow=resultRow.get(k+1);
if(nextRow.size()>i+1){
Coordinate coordinateEnd=nextRow.get(i+1);
int width=coordinateEnd.getX() - coordinateStart.getX();
int height=coordinateEnd.getY() - coordinateStart.getY();
//左上角 为原始点 向右 加宽向下加高
try {
String info = readRectangleInfo(coordinateStart.getX(), coordinateStart.getY(),
width,height, document,pageNum);
info = info.replaceAll("\r|\n", "");
map.put("column"+i,info);
if(info==null||info.length()==0){
nullData=true;
}else {
nullData=false;
listRow.add(info);
}
}catch (Exception e){
e.printStackTrace();
}
}
}
if(!nullData){
mapList.add(listRow);
}
}
Gson gson = new Gson();
String mapListString = gson.toJson(mapList);
System.out.println(mapListString);

return mapList;
}
private static String readRectangleInfo(int x, int y, int width, int height, PDDocument document
, int pageNum) throws Exception {
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
stripper.setSortByPosition(true);
Rectangle rect = new Rectangle(x, y, width, height);
stripper.addRegion("rect", rect);
PDPage firstPage = document.getPage(pageNum);
stripper.extractRegions(firstPage);
return stripper.getTextForRegion("rect");
}
}


<dependencies>

<!-- PDFBox: the PDF classes used by the code above (PDDocument, PDFRenderer, PageDrawer, PDFTextStripperByArea). -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.22</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/fontbox -->
<!-- NOTE(review): not referenced directly by the code shown; presumably a companion library of pdfbox - confirm it must be declared explicitly. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>2.0.22</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/jempbox -->
<!-- NOTE(review): no class from this artifact is referenced by the code shown, and its version line (1.8.x) differs from pdfbox 2.0.22 - confirm it is needed. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>jempbox</artifactId>
<version>1.8.16</version>
</dependency>
<!-- Gson: used to print the extracted rows as JSON. -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.0</version>
</dependency>

<!-- EasyExcel: used to write the extracted rows to the .xlsx file. -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>easyexcel</artifactId>
<version>2.2.7</version>
</dependency>

</dependencies>

posted on 2021-01-19 15:02  Captain林  阅读(346)  评论(0)  编辑  收藏  举报

导航