将题库类型的pdf文件,转为对应的json文件
1. 相关实体类
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Holds the correct answer of a single question.
 * The Lombok annotations supply the accessors and both the no-args and all-args constructors.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Answer {
// Answer marker. The parsers in this file store either the single character found inside
// the parentheses of the question title (e.g. "B") or "对"/"错" for true/false questions.
private String type;
}
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * One selectable option of a choice question.
 * The Lombok annotations supply the accessors and both the no-args and all-args constructors.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Option {
// Option label; the parsers in this file only produce the letters A-D here.
private String type;
// Text of the option with the label and its separator removed.
private String content;
}
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.List;
/**
 * A parsed question as it is later written to the JSON output.
 * The Lombok annotations supply the accessors and both the no-args and all-args constructors.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Question {
// Question text. The title parser stores the sequence prefix plus the text, with the
// answer inside the parentheses removed (leaving "()").
private String title;
// Options of the question; stays null until an options line has been parsed.
private List<Option> options;
// Correct answer; stays null when the title did not match the expected format.
private Answer answer;
// Explanation text; the parsers in this file always store the placeholder "解析过程略".
private String analysis;
// Sequence prefix of the question exactly as matched in the title (e.g. "1、").
private String cIndex;
}
2. 正则格式化工具类
package com.example.sound.utils;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.*;
/**
 * Regex helpers that extract question titles, answers and options from text lines
 * of a question-bank PDF.
 *
 * <p>All patterns are compiled once and shared; {@link Pattern} instances are immutable.
 */
public class QParserUtils {

    /**
     * Title line: sequence prefix (digits followed by "、" or "."), text before the answer,
     * a single word character inside ASCII parentheses (the answer), and trailing text
     * that contains no further parentheses.
     */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("(\\d+[、.])\\s*(.*?)\\s*\\(\\s*(\\w)\\s*\\)\\s*([^()]*)$");

    /**
     * One option: a letter A-D, a "、" or "." separator, then the content up to the next
     * option marker or the end of the input.
     */
    private static final Pattern OPTION_PATTERN =
            Pattern.compile("([A-D])[\u3001\\.](.+?)(?=[A-D][\u3001\\.]|$)");

    /** Digits followed by "、" or "." at the very start of the input. */
    private static final Pattern LEADING_NUMBER_PATTERN = Pattern.compile("^\\d+[、.]");

    /**
     * Parses a title line such as {@code 1、text(B)text} and fills the given question.
     *
     * <p>On a match the title (with the answer blanked out as "()"), the sequence prefix,
     * the answer and a placeholder analysis are stored on {@code question}. When the input
     * does not match, the question is left untouched and a notice is printed.
     *
     * @param input    the complete title text (already joined if it spanned several lines)
     * @param question the question to fill; must not be null when the input matches
     * @return the same {@code question} instance
     */
    public static Question parseQuestionTitle(String input, Question question) {
        Matcher matcher = TITLE_PATTERN.matcher(input);
        if (matcher.find()) {
            String number = matcher.group(1);        // sequence prefix, e.g. "1、"
            String contentBefore = matcher.group(2); // text before the answer
            String answer = matcher.group(3);        // the answer character
            String contentAfter = matcher.group(4);  // text after the answer
            // Blank out the answer so the stored title does not reveal it.
            String modifiedContent = contentBefore + "()" + contentAfter;
            System.out.println("题目序号: " + number);
            System.out.println("正确答案: " + answer);
            System.out.println("修改后题目: " + number + modifiedContent);
            question.setTitle(number + modifiedContent);
            question.setCIndex(number);
            question.setAnswer(new Answer(answer));
            question.setAnalysis("解析过程略");
        } else {
            System.out.println("输入不符合预期格式。");
        }
        return question;
    }

    /**
     * Parses every {@code A、...}/{@code B. ...} style option found in the input and appends
     * them to the question's option list.
     *
     * <p>Existing non-empty options are kept and extended; otherwise a fresh list is used.
     *
     * @param input    text containing one or more options
     * @param question the question to fill; when null a new {@link Question} is created
     *                 (previously a null question caused a NullPointerException)
     * @return the question that received the options
     */
    public static Question parseQuestionOptions(String input, Question question) {
        Question target = (question != null) ? question : new Question();

        List<Option> options = target.getOptions();
        if (options == null || options.isEmpty()) {
            options = new ArrayList<>();
        }

        Matcher matcher = OPTION_PATTERN.matcher(input);
        while (matcher.find()) {
            String optionLetter = matcher.group(1);
            String optionContent = matcher.group(2).trim();
            Option option = new Option();
            option.setType(optionLetter);
            option.setContent(optionContent);
            System.out.println("选项: " + optionLetter + ", 内容: " + optionContent);
            options.add(option);
        }
        target.setOptions(options);
        return target;
    }

    /**
     * Tells whether the input starts with a question sequence prefix.
     *
     * @param input the text to test
     * @return true when the input begins with digits followed by "、" or "."
     */
    public static boolean isStartByNumber(String input) {
        return LEADING_NUMBER_PATTERN.matcher(input).find();
    }
}
3. 去水印的工具类
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
import java.io.IOException;
/**
 * A {@link PDFTextStripper} that drops text positions which look like watermarks,
 * i.e. text that is strongly rotated or drawn with a font whose name contains "watermark".
 */
public class WatermarkFilter extends PDFTextStripper {

    /** Maximum rotation, in degrees, that is still treated as regular text. */
    private static final float MAX_ROTATION_ANGLE = 20;

    /**
     * Creates the filter with duplicate overlapping text suppression switched on.
     *
     * @throws IOException if the underlying stripper cannot be initialised
     */
    public WatermarkFilter() throws IOException {
        super.setSuppressDuplicateOverlappingText(true);
    }

    /**
     * Forwards the text position to the regular stripper unless it is classified as a watermark.
     *
     * @param text the text position currently being processed
     */
    @Override
    protected void processTextPosition(TextPosition text) {
        Matrix matrix = text.getTextMatrix();
        // Rotation angle derived from the text matrix's shearY and scaleY components.
        double angle = Math.toDegrees(Math.atan2(matrix.getShearY(), matrix.getScaleY()));
        if (!isWatermark(text, angle)) {
            super.processTextPosition(text);
        }
    }

    /**
     * Decides whether a text position should be discarded as a watermark.
     *
     * @param text  the text position to inspect
     * @param angle the rotation of the text in degrees
     * @return true when the rotation exceeds {@link #MAX_ROTATION_ANGLE} or the font name
     *         contains "watermark" (case-insensitive)
     */
    private boolean isWatermark(TextPosition text, double angle) {
        if (Math.abs(angle) > MAX_ROTATION_ANGLE) {
            return true;
        }
        // The font or its name may be missing; treat that as "not a watermark font"
        // instead of failing the whole extraction with a NullPointerException.
        if (text.getFont() == null) {
            return false;
        }
        String fontName = text.getFont().getName();
        return fontName != null && fontName.toLowerCase().contains("watermark");
    }
}
4. 主方法
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import static com.example.sound.utils.QParserUtils.isStartByNumber;
/**
 * Converts a question-bank PDF into a JSON file.
 *
 * <p>The PDF text is extracted with {@link WatermarkFilter}, split into lines, assembled into
 * {@link Question} objects by a small line-driven state machine and finally written as JSON.
 */
public class TTPdfToQuestionJsonConverter {

    /**
     * Command-line entry point.
     *
     * @param args optional: {@code args[0]} is the PDF path and {@code args[1]} the JSON output
     *             path; when omitted the defaults {@code D:\11.pdf} and {@code D:\11.json} are used
     */
    public static void main(String[] args) {
        String pdfFilePath = (args != null && args.length > 0) ? args[0] : "D:\\11.pdf";
        String jsonFilePath = (args != null && args.length > 1) ? args[1] : "D:\\11.json";
        try {
            convertPdfToJson(pdfFilePath, jsonFilePath);
            System.out.println("PDF转换JSON成功!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts a PDF file into a JSON file.
     *
     * @param pdfFilePath  path of the PDF file to read
     * @param jsonFilePath path of the JSON file to write
     * @throws IOException if the PDF cannot be read or the JSON cannot be written
     */
    public static void convertPdfToJson(String pdfFilePath, String jsonFilePath) throws IOException {
        // 1. Read the text content of the PDF.
        String pdfText = extractTextFromPdf(pdfFilePath);
        // 2. Parse the text into a list of questions.
        List<Question> questions = parseQuestionsFromText(pdfText);
        // 3. Serialise the questions and save them.
        saveAsJson(questions, jsonFilePath);
    }

    /**
     * Extracts the text of a PDF file, skipping text classified as watermark.
     *
     * @param filePath path of the PDF file
     * @return the extracted text
     * @throws IOException if the file cannot be loaded or processed
     */
    private static String extractTextFromPdf(String filePath) throws IOException {
        try (PDDocument document = PDDocument.load(new File(filePath), MemoryUsageSetting.setupTempFileOnly())) {
            WatermarkFilter stripper = new WatermarkFilter();
            return stripper.getText(document);
        }
    }

    /**
     * Parses the extracted text into choice questions.
     *
     * <p>The text is processed line by line (split on "\n"). Lines are accumulated into a
     * title buffer until a line starting with "A" arrives, then into an option buffer until
     * the next line starting with a sequence prefix arrives. Blank lines and lines containing
     * "单项选择" are skipped; parsing stops at the first line containing "判断题".
     *
     * @param text the text extracted from the PDF
     * @return the parsed questions in document order
     */
    private static List<Question> parseQuestionsFromText(String text) {
        List<Question> questions = new ArrayList<>();
        // One entry per text line; titles and options that wrap over several lines are
        // re-joined by the buffers below.
        String[] lines = text.split("\n");
        Question question = new Question();
        StringBuilder titleInput = new StringBuilder();
        StringBuilder optionInput = new StringBuilder();
        for (String rawLine : lines) {
            String line = rawLine.trim();
            if (line.isEmpty() || line.contains("单项选择")) {
                continue;
            }
            boolean startsWithNumber = isStartByNumber(line);
            boolean startsWithOptionA = line.startsWith("A");
            boolean reachedTrueFalseSection = line.contains("判断题");

            // Continuation of a wrapped options line. The true/false section header must not
            // be glued onto the last option of the preceding question.
            if (optionInput.length() > 0 && !startsWithNumber && !reachedTrueFalseSection) {
                optionInput.append(line);
            }
            // Continuation of a wrapped title line.
            if (titleInput.length() > 0 && !startsWithOptionA) {
                titleInput.append(line);
            }
            // First options line: the title is complete, so parse it and start collecting options.
            if (question.getTitle() == null && startsWithOptionA && titleInput.length() > 0) {
                QParserUtils.parseQuestionTitle(titleInput.toString(), question);
                optionInput = new StringBuilder();
                optionInput.append(line);
            }
            // Nothing buffered yet: this line starts a new title.
            if (titleInput.length() == 0 && optionInput.length() == 0) {
                titleInput.append(line);
            }
            // A new sequence prefix ends the options of the current question.
            if (optionInput.length() > 0 && startsWithNumber) {
                QParserUtils.parseQuestionOptions(optionInput.toString(), question);
                optionInput = new StringBuilder();
                titleInput = new StringBuilder();
            }
            // Question complete (answer plus at least four options): store it and start the
            // next one, whose title begins with the current line.
            if (question.getAnswer() != null && question.getOptions() != null
                    && !question.getTitle().isEmpty() && question.getOptions().size() > 3) {
                questions.add(question);
                question = new Question();
                titleInput = new StringBuilder();
                titleInput.append(line);
            }
            if (reachedTrueFalseSection) {
                break;
            }
        }
        // The last question has no following numbered line to trigger option parsing.
        if (optionInput.length() > 0) {
            QParserUtils.parseQuestionOptions(optionInput.toString(), question);
            questions.add(question);
        }
        return questions;
    }

    /**
     * Parses a multi-line choice question block whose options are on separate lines
     * starting with "A. ", "B. ", "C. " or "D. ".
     *
     * <p>The answer is a fixed placeholder ("A") and the analysis a placeholder text.
     *
     * @param block the text block of one question
     * @return the parsed question
     */
    private static Question parseChoiceQuestion(String block) {
        Question question = new Question();
        // Everything before the first option is the title; without any option marker the
        // whole block is the title (previously this case threw StringIndexOutOfBoundsException).
        int firstOptionIndex = block.indexOf("A. ");
        String title = (firstOptionIndex >= 0 ? block.substring(0, firstOptionIndex) : block).trim();
        question.setTitle(title);

        List<Option> options = new ArrayList<>();
        for (String line : block.split("\n")) {
            if (line.startsWith("A. ") || line.startsWith("B. ") || line.startsWith("C. ") || line.startsWith("D. ")) {
                String type = line.substring(0, 2).replace(".", "");
                String content = line.substring(3).trim();
                options.add(new Option(type, content));
            }
        }
        question.setOptions(options);
        // Simple example: default answer and analysis.
        question.setAnswer(new Answer("A"));
        question.setAnalysis("解析过程略");
        return question;
    }

    /**
     * Parses a true/false question of the form {@code title(answer)}.
     *
     * <p>The question is only filled when the block contains exactly one "(".
     *
     * @param block the text of one question
     * @return the parsed question, or an empty question when the format does not match
     */
    private static Question parseTrueFalseQuestion(String block) {
        Question question = new Question();
        // String.split takes a regex, so the parenthesis has to be escaped; an unescaped "("
        // is an invalid pattern and throws PatternSyntaxException.
        String[] parts = block.split("\\(");
        if (parts.length == 2) {
            String title = parts[0].trim();
            String answerStr = parts[1].replace(")", "").trim();
            question.setTitle(title);
            boolean isCorrect = answerStr.equalsIgnoreCase("对") || answerStr.equalsIgnoreCase("正确");
            question.setAnswer(new Answer(isCorrect ? "对" : "错"));
            question.setAnalysis("解析过程略");
        }
        return question;
    }

    /**
     * Writes the questions as pretty-printed JSON.
     *
     * <p>The root object has a "name" field and a "data" array with one object per question.
     * The "options" field is omitted for questions without options; a question without a
     * parsed answer gets a JSON null as answer type.
     *
     * @param questions the questions to write
     * @param filePath  path of the JSON file to write
     * @throws IOException if the file cannot be written
     */
    private static void saveAsJson(List<Question> questions, String filePath) throws IOException {
        ObjectMapper mapper = new ObjectMapper();
        ObjectNode rootNode = mapper.createObjectNode();
        rootNode.put("name", "题库名称");
        ArrayNode dataNode = mapper.createArrayNode();
        for (Question question : questions) {
            ObjectNode questionNode = mapper.createObjectNode();
            questionNode.put("title", question.getTitle());
            if (question.getOptions() != null && !question.getOptions().isEmpty()) {
                ArrayNode optionsNode = mapper.createArrayNode();
                for (Option option : question.getOptions()) {
                    ObjectNode optionNode = mapper.createObjectNode();
                    optionNode.put("type", option.getType());
                    optionNode.put("content", option.getContent());
                    optionsNode.add(optionNode);
                }
                questionNode.set("options", optionsNode);
            }
            // The trailing question may have been added without a parsed title/answer;
            // write a null type instead of failing the whole export.
            Answer answer = question.getAnswer();
            ObjectNode answerNode = mapper.createObjectNode();
            answerNode.put("type", answer == null ? null : answer.getType());
            questionNode.set("answer", answerNode);
            questionNode.put("analysis", question.getAnalysis());
            questionNode.put("cIndex", question.getCIndex());
            dataNode.add(questionNode);
        }
        rootNode.set("data", dataNode);
        mapper.writerWithDefaultPrettyPrinter().writeValue(new File(filePath), rootNode);
    }
}
pdf中,选择题的示例为:
1、题目题目题目(B)题目题目。
A、测试1 B、测试2 C、测试3 D、测试4
2、题目题目题目(B)题目题目11。
A、测试11 B、测试22 C、测试33 D、测试44
使用pdfbox解析后,每行结尾会加上\n,所以代码中按\n分割处理,并针对标题换行和选项换行的情况做了处理
生成的json示例:
{
"name" : "题库名称",
"data" : [ {
"title" : "1、题目题目题目()题目题目。",
"options" : [ {
"type" : "A",
"content" : "测试1"
}, {
"type" : "B",
"content" : "测试2"
}, {
"type" : "C",
"content" : "测试3"
}, {
"type" : "D",
"content" : "测试4"
} ],
"answer" : {
"type" : "B"
},
"analysis" : "解析过程略",
"cIndex" : "1、"
}]
}
作者:人间春风意
扫描左侧的二维码可以赞赏

本作品采用署名-非商业性使用-禁止演绎 4.0 国际 进行许可。

浙公网安备 33010602011771号