团队冲刺第九天

今天上课展示了第一阶段冲刺的成果:目前可以读取传输的图片上的表格,并输出到 Web 页面;但是保存到本地生成 Excel 表格的功能还有些问题,需要继续解决。

观看了其他组的展示后,我看到了他们的优点,比如页面美观、简洁,功能丰富;也认识到了自己的不足,还有许多地方需要改进。

 

package com.example.demo;

import com.sun.org.slf4j.internal.Logger;
import com.sun.org.slf4j.internal.LoggerFactory;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import org.opencv.core.*;
import org.opencv.core.Rect;


import org.opencv.highgui.HighGui;
import org.opencv.imgcodecs.Imgcodecs;
import org.opencv.imgproc.Imgproc;
import org.opencv.objdetect.Objdetect;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

/**
 * Locates the largest table-like region in an image with OpenCV, runs Tesseract OCR on
 * that region, and writes the recognized text to a file.
 *
 * <p>Every recoverable failure (unreadable image, no table found, image conversion or OCR
 * failure) is logged and makes {@link #scanTable()} return without writing any output.
 *
 * <p>NOTE(review): the logger comes from {@code com.sun.org.slf4j.internal}, which is a
 * JDK-internal package; consider moving to a public logging API.
 */
public class TableScanner {
    private static final Logger logger = LoggerFactory.getLogger(TableScanner.class);

    /** Directory containing the Tesseract trained-data files. */
    private static final String TESSERACT_DATA_PATH = "/usr/share/tesseract-ocr/4.00/tessdata";

    /** Minimum width and height (exclusive, in pixels) for a contour to count as a table. */
    private static final int MIN_TABLE_SIDE = 50;

    /** Path of the image file to scan. */
    private final String imagePath;
    /** Path of the file the recognized text is written to. */
    private final String tablePath;

    /**
     * @param imagePath path of the image file to scan
     * @param tablePath path of the file that receives the recognized text
     */
    public TableScanner(String imagePath, String tablePath) {
        this.imagePath = imagePath;
        this.tablePath = tablePath;
    }

    /**
     * Reads the image, detects the table region, OCRs it, and writes the text to
     * {@code tablePath}.
     *
     * <p>The output is the raw text returned by Tesseract, encoded as UTF-8; no cell or
     * column structure is reconstructed here.
     *
     * @throws IOException if writing the output file fails
     */
    public void scanTable() throws IOException {
        // Repeated calls with the same library name are ignored by the JVM.
        System.loadLibrary(Core.NATIVE_LIBRARY_NAME);

        Mat image = Imgcodecs.imread(imagePath);
        Mat table = null;
        try {
            // imread does not throw on a missing or undecodable file; it returns an empty Mat.
            if (image.empty()) {
                logger.error("Failed to read image: {}", imagePath);
                return;
            }

            Rect tableRect = findTableRegion(image);
            if (tableRect == null) {
                logger.warn("Failed to detect table in image: {}", imagePath);
                return;
            }

            // Crop the original (colour) image to the detected table region.
            table = new Mat(image, tableRect);

            BufferedImage bufferedImage;
            try {
                bufferedImage = toBufferedImage(table);
            } catch (IOException e) {
                logger.error("Failed to convert Mat to BufferedImage", e);
                return;
            }
            // ImageIO.read returns null when no registered reader can decode the stream.
            if (bufferedImage == null) {
                logger.error("Failed to convert Mat to BufferedImage: no decoder for image {}", imagePath);
                return;
            }

            ITesseract tesseract = new Tesseract();
            tesseract.setDatapath(TESSERACT_DATA_PATH);
            tesseract.setLanguage("eng"); // English language data
            String text;
            try {
                text = tesseract.doOCR(bufferedImage);
            } catch (TesseractException e) {
                logger.error("Failed to recognize table in image: {}", imagePath, e);
                return;
            }

            // Explicit charset so the output does not depend on the platform default.
            Files.write(Paths.get(tablePath), text.getBytes(StandardCharsets.UTF_8));
        } finally {
            // Mats hold native memory that the garbage collector does not track.
            if (table != null) {
                table.release();
            }
            image.release();
        }
    }

    /**
     * Converts the image to an inverted, Otsu-thresholded binary image and searches it for
     * the table region.
     *
     * @return the bounding rectangle of the table, or {@code null} if none was found
     */
    private Rect findTableRegion(Mat image) {
        Mat gray = new Mat();
        Mat binary = new Mat();
        try {
            Imgproc.cvtColor(image, gray, Imgproc.COLOR_BGR2GRAY);
            Imgproc.threshold(gray, binary, 0, 255, Imgproc.THRESH_BINARY_INV | Imgproc.THRESH_OTSU);
            return detectTable(binary);
        } finally {
            gray.release();
            binary.release();
        }
    }

    /**
     * Encodes a Mat as JPEG in memory and decodes it into a {@link BufferedImage}.
     *
     * <p>NOTE(review): JPEG is lossy; a lossless format such as PNG may give better OCR
     * results — confirm before changing.
     *
     * @return the decoded image, or {@code null} if ImageIO could not decode the bytes
     * @throws IOException if encoding fails or the encoded bytes cannot be read
     */
    private static BufferedImage toBufferedImage(Mat mat) throws IOException {
        MatOfByte encoded = new MatOfByte();
        try {
            if (!Imgcodecs.imencode(".jpg", mat, encoded)) {
                throw new IOException("OpenCV could not encode the table region as JPEG");
            }
            try (InputStream in = new ByteArrayInputStream(encoded.toArray())) {
                return ImageIO.read(in);
            }
        } finally {
            encoded.release();
        }
    }

    /**
     * Detects the table region in a binary image.
     *
     * <p>Finds the external contours and returns the bounding rectangle with the largest
     * area among those accepted by {@link #isTable(Rect)}.
     *
     * @param binary binary image to search
     * @return the largest qualifying bounding rectangle, or {@code null} if there is none
     */
    private Rect detectTable(Mat binary) {
        List<MatOfPoint> contours = new ArrayList<>();
        Mat hierarchy = new Mat();
        try {
            Imgproc.findContours(binary, contours, hierarchy, Imgproc.RETR_EXTERNAL, Imgproc.CHAIN_APPROX_SIMPLE);

            Rect maxRect = null;
            double maxArea = 0;
            for (MatOfPoint contour : contours) {
                Rect rect = Imgproc.boundingRect(contour);
                // Widen before multiplying so very large rectangles cannot overflow int.
                double area = (double) rect.width * rect.height;
                if (area > maxArea && isTable(rect)) {
                    maxRect = rect;
                    maxArea = area;
                }
            }
            return maxRect;
        } finally {
            hierarchy.release();
            for (MatOfPoint contour : contours) {
                contour.release();
            }
        }
    }

    /**
     * Decides whether a rectangle is large enough to be treated as a table: both sides
     * must exceed {@link #MIN_TABLE_SIDE} pixels.
     */
    private boolean isTable(Rect rect) {
        return rect.width > MIN_TABLE_SIDE && rect.height > MIN_TABLE_SIDE;
    }

}

  

posted @ 2023-04-24 20:54  lcz111  阅读(30)  评论(0)    收藏  举报