将题库类型的pdf文件,转为对应的json文件
1. 相关实体类
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * Answer of a question.
 *
 * <p>Lombok generates the accessors, {@code equals}/{@code hashCode}/{@code toString}, a no-arg
 * constructor and an all-args constructor.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Answer {

    /** Answer value, e.g. the option letter captured from the question title ("B"). */
    private String type;
}
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * One selectable option of a choice question.
 *
 * <p>Field order matters: the Lombok all-args constructor takes {@code (type, content)}.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Option {

    /** Option label, e.g. "A". */
    private String type;

    /** Option text without its label. */
    private String content;
}
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

import java.util.List;

/**
 * A single question of the question bank.
 *
 * <p>Field order matters: the Lombok all-args constructor takes
 * {@code (title, options, answer, analysis, cIndex)}.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Question {

    /** Question text including its sequence prefix, with the answer blanked out as "()". */
    private String title;

    /** Selectable options; may be {@code null} until options have been parsed. */
    private List<Option> options;

    /** Correct answer; may be {@code null} when the title could not be parsed. */
    private Answer answer;

    /** Explanation of the answer. */
    private String analysis;

    /** Sequence prefix of the question as it appears in the source text, e.g. "1、". */
    private String cIndex;
}
2. 正则格式化工具类
package com.example.sound.utils;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.*;

/**
 * Regex-based helpers that turn the text of a question (title line and option line) into a
 * {@link Question}.
 */
public class QParserUtils {

    /**
     * Title layout: {@code <number><、 or .> <text> ( <answer> ) <text>}.
     * Groups: 1 = sequence prefix, 2 = text before the answer, 3 = answer character,
     * 4 = text after the answer. Both ASCII and full-width parentheses are accepted around the
     * answer; the trailing text must not contain ASCII parentheses.
     */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("(\\d+[、.])\\s*(.*?)\\s*[(（]\\s*(\\w)\\s*[)）]\\s*([^()]*)$");

    /**
     * Option layout: {@code <A-D><、 or .><content>}, where the content runs up to the next option
     * label or the end of the input.
     */
    private static final Pattern OPTION_PATTERN =
            Pattern.compile("([A-D])[、.](.+?)(?=[A-D][、.]|$)");

    /** A line that begins with a question number such as "12、" or "12.". */
    private static final Pattern LEADING_NUMBER_PATTERN = Pattern.compile("^\\d+[、.]");

    /**
     * Parses a question title that embeds its answer in parentheses, e.g. {@code 1、xxx(B)yyy}.
     *
     * <p>On a match the question receives the title with the answer blanked out ("()"), the
     * sequence prefix as {@code cIndex}, the answer and a placeholder analysis. When the input
     * does not match, a message is printed and the question is left untouched.
     *
     * @param input    title text, normally a single logical line; {@code null} is treated as a
     *                 non-matching input
     * @param question question to fill; a new instance is created when {@code null}
     * @return the filled (or untouched) question, never {@code null}
     */
    public static Question parseQuestionTitle(String input, Question question) {
        Question target = question != null ? question : new Question();

        Matcher matcher = input == null ? null : TITLE_PATTERN.matcher(input);
        if (matcher == null || !matcher.find()) {
            System.out.println("输入不符合预期格式。");
            return target;
        }

        String number = matcher.group(1);
        String contentBefore = matcher.group(2);
        String answer = matcher.group(3);
        String contentAfter = matcher.group(4);

        // Replace the embedded answer with an empty pair of parentheses.
        String modifiedContent = contentBefore + "()" + contentAfter;

        System.out.println("题目序号: " + number);
        System.out.println("正确答案: " + answer);
        System.out.println("修改后题目: " + number + modifiedContent);

        target.setTitle(number + modifiedContent);
        target.setCIndex(number);
        target.setAnswer(new Answer(answer));
        target.setAnalysis("解析过程略");
        return target;
    }

    /**
     * Parses option text such as {@code A、foo B、bar C、baz D、qux} and appends the options to the
     * question. Options already present on the question are kept, so the method may be called
     * repeatedly for option text that is split across several inputs.
     *
     * @param input    option text; {@code null} yields no new options
     * @param question question to fill; a new instance is created when {@code null}
     * @return the question with its option list set, never {@code null}
     */
    public static Question parseQuestionOptions(String input, Question question) {
        Question target = question != null ? question : new Question();

        List<Option> options = target.getOptions();
        if (options == null || options.isEmpty()) {
            options = new ArrayList<>();
        }

        if (input != null) {
            Matcher matcher = OPTION_PATTERN.matcher(input);
            while (matcher.find()) {
                String optionLetter = matcher.group(1);
                String optionContent = matcher.group(2).trim();
                System.out.println("选项: " + optionLetter + ", 内容: " + optionContent);
                options.add(new Option(optionLetter, optionContent));
            }
        }

        target.setOptions(options);
        return target;
    }

    /**
     * Tells whether the input begins with a question number followed by "、" or ".".
     *
     * @param input text to test; {@code null} returns {@code false}
     * @return {@code true} when the input starts with digits and a separator
     */
    public static boolean isStartByNumber(String input) {
        return input != null && LEADING_NUMBER_PATTERN.matcher(input).find();
    }
}
3. 去水印的工具类
import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.util.Matrix; import java.io.IOException; public class WatermarkFilter extends PDFTextStripper { private static final float MAX_ROTATION_ANGLE = 20; // 最大倾斜角度(度) public WatermarkFilter() throws IOException { super.setSuppressDuplicateOverlappingText(true); // 启用重复文本过滤 } @Override protected void processTextPosition(TextPosition text) { // 获取文本变换矩阵 Matrix matrix = text.getTextMatrix(); // 计算旋转角度 (通过矩阵解析) double angle = Math.toDegrees(Math.atan2(matrix.getShearY(), matrix.getScaleY())); // 水印判断条件 if (!isWatermark(text, angle)) { super.processTextPosition(text); // 非水印文本保留 } } private boolean isWatermark(TextPosition text, double angle) { // 条件:倾斜角度过大 if (Math.abs(angle) > MAX_ROTATION_ANGLE) { return true; } if (text.getFont().getName().toLowerCase().contains("watermark")) { return true; } return false; } }
4. 主方法
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ObjectNode; import org.apache.pdfbox.io.MemoryUsageSetting; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import static com.example.sound.utils.QParserUtils.isStartByNumber; public class TTPdfToQuestionJsonConverter { public static void main(String[] args) { try { String pdfFilePath = "D:\\11.pdf"; String jsonFilePath = "D:\\11.json"; convertPdfToJson(pdfFilePath, jsonFilePath); System.out.println("PDF转换JSON成功!"); } catch (IOException e) { e.printStackTrace(); } } /** * 将PDF文件转换为JSON格式 * @param pdfFilePath PDF文件路径 * @param jsonFilePath 输出JSON文件路径 * @throws IOException */ public static void convertPdfToJson(String pdfFilePath, String jsonFilePath) throws IOException { // 1. 读取PDF文本内容 String pdfText = extractTextFromPdf(pdfFilePath); // 2. 解析PDF文本为题目列表 List<Question> questions = parseQuestionsFromText(pdfText); // 3. 
转换为JSON格式并保存 saveAsJson(questions, jsonFilePath); } /** * 从PDF文件中提取文本 */ private static String extractTextFromPdf(String filePath) throws IOException { try (PDDocument document = PDDocument.load(new File(filePath), MemoryUsageSetting.setupTempFileOnly())) { WatermarkFilter stripper = new WatermarkFilter(); // 忽略水印 return stripper.getText(document); } } /** * 解析文本为题目列表 */ private static List<Question> parseQuestionsFromText(String text) { List<Question> questions = new ArrayList<>(); // 分割不同题目(假设题目之间有两个换行) String[] questionBlocks = text.split("\n"); Question question = new Question(); StringBuilder titleInput = new StringBuilder(); StringBuilder optionInput = new StringBuilder(); for (String block : questionBlocks) { if (block.trim().isEmpty() || block.trim().contains("单项选择")) continue; if (optionInput.length() > 0 && !isStartByNumber(block.trim())) { optionInput.append(block.trim()); } if (titleInput.length() > 0 && !block.trim().startsWith("A")) { titleInput.append(block.trim()); } if (question.getTitle() == null && block.trim().startsWith("A") && titleInput.length() > 0) { QParserUtils.parseQuestionTitle(titleInput.toString(), question); optionInput = new StringBuilder(); optionInput.append(block.trim()); } if (titleInput.length() == 0 && optionInput.length() == 0) { titleInput.append(block.trim()); } if (optionInput.length() > 0 && isStartByNumber(block.trim())) { QParserUtils.parseQuestionOptions(optionInput.toString(), question); optionInput = new StringBuilder(); titleInput = new StringBuilder(); } if (question.getAnswer() != null && question.getOptions() != null && !question.getTitle().isEmpty() && question.getOptions().size() > 3) { questions.add(question); question = new Question(); titleInput = new StringBuilder(); titleInput.append(block.trim()); } if (block.trim().contains("判断题")) break; } if (optionInput.length() > 0) { QParserUtils.parseQuestionOptions(optionInput.toString(), question); questions.add(question); } return questions; } /** * 解析选择题 */ 
private static Question parseChoiceQuestion(String block) { Question question = new Question(); // 提取标题 String title = block.substring(0, block.indexOf("A. ")).trim(); question.setTitle(title); // 提取选项 List<Option> options = new ArrayList<>(); String[] lines = block.split("\n"); for (String line : lines) { if (line.startsWith("A. ") || line.startsWith("B. ") || line.startsWith("C. ") || line.startsWith("D. ")) { String type = line.substring(0, 2).replace(".", ""); String content = line.substring(3).trim(); options.add(new Option(type, content)); } } question.setOptions(options); // 简单示例:答案和解析默认值 question.setAnswer(new Answer("A")); question.setAnalysis("解析过程略"); return question; } /** * 解析判断题 */ private static Question parseTrueFalseQuestion(String block) { Question question = new Question(); // 提取标题和答案 String[] parts = block.split("("); if (parts.length == 2) { String title = parts[0].trim(); String answerStr = parts[1].replace(")", "").trim(); question.setTitle(title); // 答案解析 boolean isCorrect = answerStr.equalsIgnoreCase("对") || answerStr.equalsIgnoreCase("正确"); question.setAnswer(new Answer(isCorrect ? 
"对" : "错")); // 默认解析 question.setAnalysis("解析过程略"); } return question; } /** * 保存为JSON文件 */ private static void saveAsJson(List<Question> questions, String filePath) throws IOException { ObjectMapper mapper = new ObjectMapper(); ObjectNode rootNode = mapper.createObjectNode(); rootNode.put("name", "题库名称"); ArrayNode dataNode = mapper.createArrayNode(); for (Question question : questions) { ObjectNode questionNode = mapper.createObjectNode(); questionNode.put("title", question.getTitle()); if (question.getOptions() != null && !question.getOptions().isEmpty()) { ArrayNode optionsNode = mapper.createArrayNode(); for (Option option : question.getOptions()) { ObjectNode optionNode = mapper.createObjectNode(); optionNode.put("type", option.getType()); optionNode.put("content", option.getContent()); optionsNode.add(optionNode); } questionNode.set("options", optionsNode); } ObjectNode answerNode = mapper.createObjectNode(); answerNode.put("type", question.getAnswer().getType()); questionNode.set("answer", answerNode); questionNode.put("analysis", question.getAnalysis()); questionNode.put("cIndex", question.getCIndex()); dataNode.add(questionNode); } rootNode.set("data", dataNode); mapper.writerWithDefaultPrettyPrinter().writeValue(new File(filePath), rootNode); } }
pdf中,选择题的示例为:
1、题目题目题目(B)题目题目。
A、测试1 B、测试2 C、测试3 D、测试4
2、题目题目题目(B)题目题目11。
A、测试11 B、测试22 C、测试33 D、测试44
使用 PDFBox 解析后,每行结尾会加上\n,所以代码中按\n拆分逐行处理,并针对标题换行和选项换行的情况做了处理
生成的json示例:
{ "name" : "题库名称", "data" : [ { "title" : "1、题目题目题目()题目题目。", "options" : [ { "type" : "A", "content" : "测试1" }, { "type" : "B", "content" : "测试2" }, { "type" : "C", "content" : "测试3" }, { "type" : "D", "content" : "测试4" } ], "answer" : { "type" : "B" }, "analysis" : "解析过程略", "cIndex" : "1、" }] }