document.write("");

将题库类型的pdf文件,转为对应的json文件

 

1. 相关实体类

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * Holds the answer of a single question.
 *
 * <p>Lombok generates the accessors, {@code equals}/{@code hashCode}/{@code toString},
 * a no-argument constructor and a one-argument constructor taking {@code type}.
 */
@NoArgsConstructor
@AllArgsConstructor
@Data
public class Answer {

    /** The answer value as text, for example the letter of the correct option. */
    private String type;
}
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * One selectable option of a choice question.
 *
 * <p>Lombok generates the accessors, a no-argument constructor and an
 * all-arguments constructor whose parameters follow the field order
 * ({@code type}, then {@code content}).
 */
@NoArgsConstructor
@AllArgsConstructor
@Data
public class Option {

    /** The option label, for example {@code "A"}. */
    private String type;

    /** The text shown for this option. */
    private String content;
}
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

import java.util.List;

/**
 * A single question of the question bank, as it is written to the JSON output.
 *
 * <p>Lombok generates the accessors, a no-argument constructor and an
 * all-arguments constructor whose parameters follow the field order below.
 */
@NoArgsConstructor
@AllArgsConstructor
@Data
public class Question {

    /** Question text, including its leading number. */
    private String title;

    /** Selectable options; may be {@code null} for questions without options. */
    private List<Option> options;

    /** The correct answer. */
    private Answer answer;

    /** Explanation of the answer. */
    private String analysis;

    /** The question's number prefix as it appears in the source text, for example {@code "1、"}. */
    private String cIndex;
}

2. 正则格式化工具类

package com.example.sound.utils;


import java.util.ArrayList;
import java.util.List;
import java.util.regex.*;

/**
 * Regex-based helpers that turn the raw text of a choice question into a {@link Question}.
 *
 * <p>All patterns are compiled once and shared; {@link Pattern} instances are immutable
 * and safe to reuse across calls.
 */
public class QParserUtils {

    /**
     * Matches a question title of the form {@code <number><、 or .><text>(<answer>)<text>}.
     * Groups: 1 = number with its separator, 2 = text before the answer,
     * 3 = the single answer character, 4 = text after the answer.
     */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("(\\d+[、.])\\s*(.*?)\\s*\\(\\s*(\\w)\\s*\\)\\s*([^()]*)$");

    /**
     * Matches one option: a letter A-D, a separator (ideographic comma or dot), then the
     * option text up to the next option marker or the end of the input.
     */
    private static final Pattern OPTION_PATTERN =
            Pattern.compile("([A-D])[\u3001\\.](.+?)(?=[A-D][\u3001\\.]|$)");

    /** Matches a line that begins with a question number followed by its separator. */
    private static final Pattern LEADING_NUMBER_PATTERN = Pattern.compile("^\\d+[、.]");

    /**
     * Parses a question title line and stores title, index, answer and a placeholder
     * analysis on {@code question}.
     *
     * <p>The answer letter found between parentheses is removed from the stored title and
     * replaced with an empty {@code ()}. If the input does not match the expected format,
     * a message is printed and {@code question} is returned unchanged.
     *
     * @param input    the title text, possibly assembled from several PDF lines
     * @param question the question to populate
     * @return the same {@code question} instance
     */
    public static Question parseQuestionTitle(String input, Question question) {
        Matcher matcher = TITLE_PATTERN.matcher(input);
        if (!matcher.find()) {
            System.out.println("输入不符合预期格式。");
            return question;
        }

        String number = matcher.group(1);        // question number, including separator
        String contentBefore = matcher.group(2); // text before the answer
        String answer = matcher.group(3);        // correct answer (A/B/C/D)
        String contentAfter = matcher.group(4);  // text after the answer

        // Rebuild the title with the answer blanked out.
        String modifiedContent = contentBefore + "()" + contentAfter;

        System.out.println("题目序号: " + number);
        System.out.println("正确答案: " + answer);
        System.out.println("修改后题目: " + number + modifiedContent);

        question.setTitle(number + modifiedContent);
        question.setCIndex(number);
        question.setAnswer(new Answer(answer));
        question.setAnalysis("解析过程略");
        return question;
    }

    /**
     * Parses all options found in {@code input} and appends them to the question's options.
     *
     * <p>Options already present on the question are kept and the new ones are added after
     * them. A {@code null} question is tolerated: a fresh {@link Question} is created and
     * returned instead of failing with a {@code NullPointerException}.
     *
     * @param input    the option text, for example {@code "A、x B、y C、z D、w"}
     * @param question the question to populate, or {@code null}
     * @return the populated question (a new instance when {@code question} was {@code null})
     */
    public static Question parseQuestionOptions(String input, Question question) {
        Question target = (question != null) ? question : new Question();

        // Reuse the existing list only when it already holds options.
        List<Option> options = target.getOptions();
        if (options == null || options.isEmpty()) {
            options = new ArrayList<>();
        }

        Matcher matcher = OPTION_PATTERN.matcher(input);
        while (matcher.find()) {
            String optionLetter = matcher.group(1);          // option label
            String optionContent = matcher.group(2).trim();  // option text without surrounding spaces
            Option option = new Option();
            option.setType(optionLetter);
            option.setContent(optionContent);
            System.out.println("选项: " + optionLetter + ", 内容: " + optionContent);
            options.add(option);
        }
        target.setOptions(options);
        return target;
    }

    /**
     * Tells whether {@code input} begins with a question number such as {@code "12、"} or
     * {@code "12."}.
     *
     * @param input the text to test
     * @return {@code true} if the text starts with digits followed by the separator
     */
    public static boolean isStartByNumber(String input) {
        return LEADING_NUMBER_PATTERN.matcher(input).find();
    }
}

3. 去水印的工具类

import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
import java.io.IOException;

/**
 * A {@link PDFTextStripper} that drops text which looks like a watermark.
 *
 * <p>Text is treated as a watermark when it is rotated by more than
 * {@link #MAX_ROTATION_ANGLE} degrees or when its font name contains "watermark".
 */
public class WatermarkFilter extends PDFTextStripper {

    /** Maximum rotation, in degrees, that ordinary text may have. */
    private static final float MAX_ROTATION_ANGLE = 20;

    /**
     * Creates the filter with suppression of duplicate overlapping text enabled.
     *
     * @throws IOException propagated from the {@link PDFTextStripper} constructor
     */
    public WatermarkFilter() throws IOException {
        super.setSuppressDuplicateOverlappingText(true);
    }

    /**
     * Forwards the text position to the superclass unless it is classified as a watermark.
     *
     * @param text the text position being processed
     */
    @Override
    protected void processTextPosition(TextPosition text) {
        // Derive the rotation angle from the text matrix.
        Matrix matrix = text.getTextMatrix();
        double angle = Math.toDegrees(Math.atan2(matrix.getShearY(), matrix.getScaleY()));

        if (!isWatermark(text, angle)) {
            super.processTextPosition(text); // keep non-watermark text
        }
    }

    /**
     * Decides whether a text position belongs to a watermark.
     *
     * @param text  the text position to inspect
     * @param angle the rotation of the text in degrees
     * @return {@code true} if the text is rotated too far or uses a font named like a watermark
     */
    private boolean isWatermark(TextPosition text, double angle) {
        if (Math.abs(angle) > MAX_ROTATION_ANGLE) {
            return true;
        }
        // Guard against a missing font or font name so that one unnamed font
        // cannot abort the whole extraction with a NullPointerException.
        String fontName = (text.getFont() == null) ? null : text.getFont().getName();
        return fontName != null && fontName.toLowerCase().contains("watermark");
    }
}

4. 主方法

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static com.example.sound.utils.QParserUtils.isStartByNumber;

/**
 * Converts a question-bank PDF into a JSON file.
 *
 * <p>The PDF text is extracted with {@link WatermarkFilter}, split into lines, assembled
 * into {@link Question} objects and written out with Jackson.
 */
public class TTPdfToQuestionJsonConverter {

    /** Input PDF used when no command-line argument is supplied. */
    private static final String DEFAULT_PDF_PATH = "D:\\11.pdf";

    /** Output JSON used when no second command-line argument is supplied. */
    private static final String DEFAULT_JSON_PATH = "D:\\11.json";

    /**
     * Command-line entry point.
     *
     * @param args optional: {@code args[0]} is the PDF path, {@code args[1]} the JSON path;
     *             missing values fall back to the built-in defaults
     */
    public static void main(String[] args) {
        String pdfFilePath = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        String jsonFilePath = (args != null && args.length > 1) ? args[1] : DEFAULT_JSON_PATH;
        try {
            convertPdfToJson(pdfFilePath, jsonFilePath);
            System.out.println("PDF转换JSON成功!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts a PDF file to JSON.
     *
     * @param pdfFilePath  path of the PDF file to read
     * @param jsonFilePath path of the JSON file to write
     * @throws IOException if the PDF cannot be read or the JSON cannot be written
     */
    public static void convertPdfToJson(String pdfFilePath, String jsonFilePath) throws IOException {
        // 1. Read the text of the PDF.
        String pdfText = extractTextFromPdf(pdfFilePath);

        // 2. Turn the text into a list of questions.
        List<Question> questions = parseQuestionsFromText(pdfText);

        // 3. Serialize the questions and save them.
        saveAsJson(questions, jsonFilePath);
    }

    /**
     * Extracts the text of a PDF file, skipping text classified as a watermark.
     *
     * @param filePath path of the PDF file
     * @return the extracted text
     * @throws IOException if the file cannot be loaded or parsed
     */
    private static String extractTextFromPdf(String filePath) throws IOException {
        try (PDDocument document = PDDocument.load(new File(filePath), MemoryUsageSetting.setupTempFileOnly())) {
            WatermarkFilter stripper = new WatermarkFilter(); // ignores watermarks
            return stripper.getText(document);
        }
    }

    /**
     * Parses the extracted text into questions.
     *
     * <p>The text is processed line by line. Lines are accumulated into a title buffer until
     * a line starting with "A" (the options) is seen, then into an option buffer until the
     * next line starting with a question number is seen. Parsing stops at the first line
     * containing "判断题"; lines that are blank or contain "单项选择" are skipped.
     *
     * @param text the text extracted from the PDF
     * @return the parsed questions, in document order
     */
    private static List<Question> parseQuestionsFromText(String text) {
        List<Question> questions = new ArrayList<>();

        // Work line by line; titles and options that wrap across lines are re-joined below.
        String[] lines = text.split("\n");
        Question question = new Question();
        StringBuilder titleInput = new StringBuilder();
        StringBuilder optionInput = new StringBuilder();
        for (String rawLine : lines) {
            String line = rawLine.trim();
            if (line.isEmpty() || line.contains("单项选择")) {
                continue;
            }
            // Stop BEFORE accumulating the line: otherwise the section heading would be
            // appended to the text of the last option of the preceding question.
            if (line.contains("判断题")) {
                break;
            }
            boolean startsWithNumber = isStartByNumber(line);
            boolean startsWithA = line.startsWith("A");

            // Continuation of the options (wrapped option line).
            if (optionInput.length() > 0 && !startsWithNumber) {
                optionInput.append(line);
            }
            // Continuation of the title (wrapped title line).
            if (titleInput.length() > 0 && !startsWithA) {
                titleInput.append(line);
            }
            // First option line: the title is complete, so parse it and start the options.
            if (question.getTitle() == null && startsWithA && titleInput.length() > 0) {
                QParserUtils.parseQuestionTitle(titleInput.toString(), question);
                optionInput = new StringBuilder();
                optionInput.append(line);
            }
            // Very first line of a question when nothing has been buffered yet.
            if (titleInput.length() == 0 && optionInput.length() == 0) {
                titleInput.append(line);
            }
            // A new numbered line ends the options of the current question.
            if (optionInput.length() > 0 && startsWithNumber) {
                QParserUtils.parseQuestionOptions(optionInput.toString(), question);
                optionInput = new StringBuilder();
                titleInput = new StringBuilder();
            }
            // The current question is complete: store it and begin the next one with this line.
            if (question.getAnswer() != null && question.getOptions() != null && !question.getTitle().isEmpty() && question.getOptions().size() > 3) {
                questions.add(question);
                question = new Question();
                titleInput = new StringBuilder();
                titleInput.append(line);
            }
        }
        // Flush the last question, whose options are not followed by another numbered line.
        if (optionInput.length() > 0) {
            QParserUtils.parseQuestionOptions(optionInput.toString(), question);
            questions.add(question);
        }
        return questions;
    }


    /**
     * Parses a choice question from a block of text whose options each start a line with
     * {@code "A. "}, {@code "B. "}, {@code "C. "} or {@code "D. "}.
     *
     * <p>The answer is always set to "A" and the analysis to a placeholder.
     *
     * @param block the text of one question
     * @return the parsed question
     */
    private static Question parseChoiceQuestion(String block) {
        Question question = new Question();

        // Title is everything before the first option; if there is no option marker,
        // the whole block is the title (instead of failing on a negative index).
        int optionsStart = block.indexOf("A. ");
        String title = (optionsStart >= 0 ? block.substring(0, optionsStart) : block).trim();
        question.setTitle(title);

        // Collect the options.
        List<Option> options = new ArrayList<>();
        String[] lines = block.split("\n");
        for (String line : lines) {
            if (line.startsWith("A. ") || line.startsWith("B. ") || line.startsWith("C. ") || line.startsWith("D. ")) {
                String type = line.substring(0, 2).replace(".", "");
                String content = line.substring(3).trim();
                options.add(new Option(type, content));
            }
        }
        question.setOptions(options);

        // Simple example: default answer and analysis.
        question.setAnswer(new Answer("A"));
        question.setAnalysis("解析过程略");

        return question;
    }

    /**
     * Parses a true/false question of the form {@code <title>(<answer>)}.
     *
     * <p>If the block does not contain exactly one opening parenthesis, an empty question
     * is returned.
     *
     * @param block the text of one question
     * @return the parsed question
     */
    private static Question parseTrueFalseQuestion(String block) {
        Question question = new Question();

        // String.split takes a regex, so the parenthesis must be escaped;
        // an unescaped "(" is an invalid pattern and throws PatternSyntaxException.
        String[] parts = block.split("\\(");
        if (parts.length == 2) {
            String title = parts[0].trim();
            String answerStr = parts[1].replace(")", "").trim();
            question.setTitle(title);

            // Normalize the answer to one of the two canonical values.
            boolean isCorrect = answerStr.equalsIgnoreCase("对") || answerStr.equalsIgnoreCase("正确");
            question.setAnswer(new Answer(isCorrect ? "对" : "错"));

            // Default analysis.
            question.setAnalysis("解析过程略");
        }

        return question;
    }

    /**
     * Writes the questions to a pretty-printed JSON file.
     *
     * <p>The root object has a {@code name} and a {@code data} array with one object per
     * question. The {@code options} array is omitted for questions without options.
     *
     * @param questions the questions to write
     * @param filePath  path of the JSON file to write
     * @throws IOException if the file cannot be written
     */
    private static void saveAsJson(List<Question> questions, String filePath) throws IOException {
        ObjectMapper mapper = new ObjectMapper();
        ObjectNode rootNode = mapper.createObjectNode();
        rootNode.put("name", "题库名称");

        ArrayNode dataNode = mapper.createArrayNode();
        for (Question question : questions) {
            ObjectNode questionNode = mapper.createObjectNode();
            questionNode.put("title", question.getTitle());

            List<Option> options = question.getOptions();
            if (options != null && !options.isEmpty()) {
                ArrayNode optionsNode = mapper.createArrayNode();
                for (Option option : options) {
                    ObjectNode optionNode = mapper.createObjectNode();
                    optionNode.put("type", option.getType());
                    optionNode.put("content", option.getContent());
                    optionsNode.add(optionNode);
                }
                questionNode.set("options", optionsNode);
            }

            // A question whose title could not be parsed has no answer; write a null
            // type for it rather than failing the whole export.
            Answer answer = question.getAnswer();
            ObjectNode answerNode = mapper.createObjectNode();
            answerNode.put("type", answer != null ? answer.getType() : null);
            questionNode.set("answer", answerNode);

            questionNode.put("analysis", question.getAnalysis());
            questionNode.put("cIndex", question.getCIndex());
            dataNode.add(questionNode);
        }

        rootNode.set("data", dataNode);

        mapper.writerWithDefaultPrettyPrinter().writeValue(new File(filePath), rootNode);
    }

}

pdf中,选择题的示例为:

1、题目题目题目(B)题目题目。
A、测试1 B、测试2 C、测试3 D、测试4

2、题目题目题目(B)题目题目11。
A、测试11 B、测试22 C、测试33 D、测试44

使用 PDFBox 解析后,每行结尾会加上 \n,所以代码中按 \n 分割文本,并针对标题换行和选项换行分别做了处理

生成的json示例:

{
  "name" : "题库名称",
  "data" : [ {
    "title" : "1、题目题目题目()题目题目。",
    "options" : [ {
      "type" : "A",
      "content" : "测试1"
    }, {
      "type" : "B",
      "content" : "测试2"
    }, {
      "type" : "C",
      "content" : "测试3"
    }, {
      "type" : "D",
      "content" : "测试4"
    } ],
    "answer" : {
      "type" : "B"
    },
    "analysis" : "解析过程略",
    "cIndex" : "1、"
  }]
}

  

 

posted @ 2025-06-20 11:29  人间春风意  阅读(74)  评论(0)    收藏  举报