package org.pdffolder.pdffolder01;
import java.io.File;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.tika.utils.StringUtils;
/**
 * Extracts one common section from every PDF in a folder.
 *
 * <p>For each {@code *.pdf} file directly inside the folder, the text is extracted with PDFBox and
 * scanned line by line. The text between a line containing {@code 四、病理改变} and the next line
 * starting with {@code 五} is collected and printed to standard output as
 * {@code <name>!<section text>}, where {@code <name>} is the text preceding {@code 诊疗方案} on
 * the first matching line after a line starting with {@code 附件}.
 */
public class BeginArea {

    /** Folder scanned when no folder is passed on the command line. */
    private static final String DEFAULT_FOLDER =
            "C:\\Users\\kfeng5\\OneDrive - DXC Production\\Desktop\\年假工作";

    /** Line that starts with digits; group 1 is whatever follows the digits. */
    private static final Pattern LEADING_DIGITS = Pattern.compile("^\\d+(.*)");

    /** Title line; group 1 is the text in front of "诊疗方案". */
    private static final Pattern PLAN_NAME = Pattern.compile("(.*)诊疗方案");

    /** Header of the section whose body is extracted. */
    private static final Pattern SECTION_START = Pattern.compile("四、病理改变(.*)");

    /**
     * Scans a folder and prints the extracted section of each PDF found in it.
     *
     * @param args optional; {@code args[0]} is the folder to scan. When absent, the built-in
     *     default folder is used.
     * @throws Exception if a PDF cannot be loaded or its text cannot be extracted
     */
    public static void main(String[] args) throws Exception {
        String folderPath = (args != null && args.length > 0) ? args[0] : DEFAULT_FOLDER;
        File[] files = new File(folderPath).listFiles();
        if (files == null) {
            // listFiles() yields null when the path is not a directory or cannot be read.
            System.err.println("Cannot list files in folder: " + folderPath);
            return;
        }
        for (File file : files) {
            // Skip sub-directories (even ones named "*.pdf") and non-PDF files.
            if (!file.isFile() || !file.getName().endsWith(".pdf")) {
                continue;
            }
            String record = extractSection(readText(file).split("\n"));
            if (record != null) {
                System.out.println(record);
            }
        }
    }

    /**
     * Loads a PDF and returns its text; the document is closed even when extraction fails.
     *
     * @param file the PDF file to read
     * @return the text produced by {@link PDFTextStripper#getText}
     * @throws IOException if the file cannot be loaded or its text cannot be extracted
     */
    private static String readText(File file) throws IOException {
        try (PDDocument document = PDDocument.load(file)) {
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(document);
        }
    }

    /**
     * Walks the lines of one document and builds the output record for it.
     *
     * @param lines the document text split on {@code \n} (lines may still end with {@code \r})
     * @return {@code name + "!" + sectionText}, or {@code null} when no line starting with
     *     {@code 五} follows the section header. If the name was never found, the record starts
     *     with the literal {@code null}.
     */
    private static String extractSection(String[] lines) {
        boolean awaitingName = false;
        boolean inSection = false;
        String name = null;
        StringBuilder section = new StringBuilder();

        for (String line : lines) {
            // Skip the edition marker line.
            if (line.startsWith("(2023年版)")) {
                continue;
            }
            // Skip lines made up of digits only (presumably page numbers; confirm against the PDFs).
            Matcher digits = LEADING_DIGITS.matcher(line.trim());
            if (digits.find() && StringUtils.isBlank(digits.group(1))) {
                continue;
            }
            // A line starting with "附件" announces that the plan title follows.
            if (line.startsWith("附件")) {
                awaitingName = true;
                continue;
            }
            if (awaitingName) {
                Matcher title = PLAN_NAME.matcher(line);
                if (title.find()) {
                    name = title.group(1);
                    awaitingName = false;
                    continue;
                }
            }
            // The header line itself is not part of the collected text.
            if (SECTION_START.matcher(line.trim()).find()) {
                inSection = true;
                continue;
            }
            if (inSection) {
                // A line starting with "五" begins the next section and ends the extraction.
                if (line.startsWith("五")) {
                    return name + "!" + section;
                }
                // Lines are concatenated without separators; only carriage returns are dropped.
                section.append(line.replace("\r", ""));
            }
        }
        // NOTE(review): text collected from a section that runs to the end of the document is
        // discarded, matching the previous behaviour. Confirm whether it should be emitted.
        return null;
    }
}