Java 解析 Word 中嵌入的 Excel

一、思路

1. 入口→分发：extractFromWord 是总入口，核心是 “格式分发”，将.doc 和.docx 分流到不同处理逻辑；

2. .doc 核心：绕开路径解析，用 “逐层遍历 + 兜底读取” 确保文件能读到，再交给extractFromOLE解析；

3. .docx 核心：直接遍历 PackagePart，利用 POI 对 OOXML 的原生支持，快速识别 Excel 附件；

4. 解析核心：extractFromOLE 是格式兼容关键，区分.xls/.xlsx 用不同 POI 模块，避免解析失败；

5. 稳定性保障：多层过滤（过小文件 / 特殊字符）+ 异常捕获（单个文件失败不中断）+ 格式适配，确保程序稳定运行。

二、核心依赖：

<!-- Apache POI 处理 Word 和 Excel -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.4</version>
</dependency>
<!-- Apache POI scratchpad：提供 HWPF 等旧版二进制格式（.doc）的支持；OLE2 容器（POIFS）本身位于 poi 核心包中 -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>5.2.4</version>
</dependency>
<!-- Apache Tika 识别文件类型（辅助提取附件） -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>2.9.1</version>
</dependency>

三、源码

import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.tika.Tika;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.regex.Pattern;

/**
 * Extracts Excel workbooks embedded in Word documents.
 *
 * <p>{@code .docx} files are handled through the OOXML package parts; {@code .doc} files are
 * handled by walking the {@code ObjectPool} storage of the OLE2 container. Every extracted
 * workbook is returned as the raw bytes of a standalone {@code .xls} or {@code .xlsx} file.
 *
 * <p>Failures on a single embedded object are logged to {@code System.out} and skipped so that
 * one broken attachment does not abort the whole extraction.
 */
public class WordExcelExtractor {
    private static final Tika TIKA = new Tika();
    /** Matches everything outside printable ASCII; OLE2 entry names often carry control chars. */
    private static final Pattern NON_PRINTABLE_CHAR_PATTERN = Pattern.compile("[^\\x20-\\x7E]");
    /** Streams smaller than this many bytes are skipped without being parsed. */
    private static final int MIN_EXCEL_SIZE = 100;
    /** First four bytes of an OLE2 compound document (legacy .xls, .doc, ...). */
    private static final byte[] OLE2_MAGIC = {(byte) 0xD0, (byte) 0xCF, (byte) 0x11, (byte) 0xE0};
    /** First four bytes of a ZIP local file header (.xlsx, .docx, ...). */
    private static final byte[] ZIP_MAGIC = {(byte) 0x50, (byte) 0x4B, (byte) 0x03, (byte) 0x04};
    /** Stream names that mark an OLE2 storage as a BIFF workbook. */
    private static final String[] WORKBOOK_STREAM_NAMES = {"Workbook", "WORKBOOK", "WorkBook", "Book", "BOOK"};
    /** Stream names under which an OLE wrapper stores its embedded payload; checked in order. */
    private static final String[] WRAPPED_PAYLOAD_NAMES = {"Package", "Contents"};

    /**
     * Extracts every embedded Excel workbook from a Word file.
     *
     * @param wordFile a readable {@code .doc} or {@code .docx} file
     * @return the bytes of each embedded workbook, in discovery order; empty if none was found
     * @throws IllegalArgumentException if the file extension is neither {@code .doc} nor {@code .docx}
     * @throws IOException if the file is missing or unreadable, or the container cannot be parsed
     */
    public static List<byte[]> extractFromWord(File wordFile) throws IOException {
        Objects.requireNonNull(wordFile, "wordFile");
        validateFile(wordFile);
        // Locale.ROOT keeps the extension check independent of the default locale.
        String fileName = wordFile.getName().toLowerCase(Locale.ROOT);
        boolean isDocx = fileName.endsWith(".docx");
        // Decide the format before the try block so that an IllegalArgumentException raised by
        // the parsers is not mistaken for "unsupported format".
        if (!isDocx && !fileName.endsWith(".doc")) {
            throw new IllegalArgumentException("不支持的格式！仅支持 .doc/.docx");
        }

        List<byte[]> excelDataList = new ArrayList<>();
        try {
            if (isDocx) {
                extractFromDocx(wordFile, excelDataList);
            } else {
                extractFromDocDirect(wordFile, excelDataList);
            }
        } catch (Exception e) {
            throw new IOException("提取 Excel 附件失败：" + e.getMessage(), e);
        }
        return excelDataList;
    }

    /**
     * Walks {@code ObjectPool/<object>/<stream>} of a {@code .doc} file and collects workbooks.
     *
     * <p>An object storage that directly contains a workbook stream is an embedded {@code .xls}
     * and is rebuilt into a standalone OLE2 file; otherwise each stream of the storage is
     * inspected individually.
     */
    private static void extractFromDocDirect(File docFile, List<byte[]> excelDataList) throws IOException {
        try (FileInputStream fis = new FileInputStream(docFile);
             POIFSFileSystem poifs = new POIFSFileSystem(fis)) {
            DirectoryEntry objectPool = getObjectPoolDirectory(poifs.getRoot());
            if (objectPool == null) {
                System.out.println("无 ObjectPool 目录，无嵌入式附件");
                return;
            }

            for (Entry child : objectPool) {
                String dirName = filterSpecialChars(child.getName());
                if (dirName.isEmpty() || !(child instanceof DirectoryEntry)) {
                    continue;
                }
                DirectoryEntry objectDir = (DirectoryEntry) child;
                System.out.println("找到 OBJECTPOOL 子目录：" + dirName);

                if (isWorkbookStorage(objectDir)) {
                    // The raw "Workbook" stream alone is not a valid file; the whole storage
                    // has to be copied into its own compound document.
                    extractWorkbookStorage(objectDir, dirName, excelDataList);
                    continue;
                }
                for (Entry entry : objectDir) {
                    extractFromObjectStream(entry, dirName, excelDataList);
                }
            }
        }
    }

    /** Copies an embedded workbook storage into a standalone OLE2 file and adds it to the result. */
    private static void extractWorkbookStorage(DirectoryEntry storage, String dirName, List<byte[]> excelDataList) {
        try (POIFSFileSystem standalone = new POIFSFileSystem();
             ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            EntryUtils.copyNodes(storage, standalone.getRoot());
            standalone.writeFilesystem(out);
            excelDataList.add(out.toByteArray());
            System.out.println("✅ 成功提取 Excel：" + dirName);
        } catch (Exception e) {
            System.out.println("❌ 提取 Excel 失败：" + dirName + " → " + e.getMessage());
        }
    }

    /** Reads one stream of an embedded object and adds it to the result if it holds a workbook. */
    private static void extractFromObjectStream(Entry entry, String dirName, List<byte[]> excelDataList) {
        String fileName = filterSpecialChars(entry.getName());
        if (fileName.isEmpty() || !(entry instanceof DocumentEntry)) {
            return;
        }
        DocumentEntry docEntry = (DocumentEntry) entry;
        String displayPath = dirName + "/" + fileName;
        long fileSize = docEntry.getSize();
        System.out.println("找到文件：" + displayPath + "（大小：" + fileSize + " 字节）");

        if (fileSize < MIN_EXCEL_SIZE) {
            System.out.println("⚠️ 跳过过小文件（非 Excel）：" + displayPath);
            return;
        }

        try {
            // Read straight from the entry: POIFSFileSystem.createDocumentInputStream(String)
            // only resolves names in the root directory, so a nested path never matches.
            byte[] oleData = readDocumentEntryDirect(docEntry);
            if (oleData != null && oleData.length >= MIN_EXCEL_SIZE) {
                extractAndAddExcel(oleData, displayPath, excelDataList);
            } else {
                System.out.println("❌ 兜底读取失败（数据无效）：" + displayPath);
            }
        } catch (Exception e) {
            System.out.println("❌ 处理文件失败，跳过：" + displayPath + " → " + e.getMessage());
        }
    }

    /** Runs {@link #extractFromOLE(byte[])} on the data and records or logs the outcome. */
    private static void extractAndAddExcel(byte[] oleData, String filePath, List<byte[]> excelDataList) {
        try {
            byte[] excelData = extractFromOLE(oleData);
            if (excelData != null) {
                excelDataList.add(excelData);
                System.out.println("✅ 成功提取 Excel：" + filePath);
            } else {
                String fileHeader = getFileHeader(oleData);
                System.out.println("❌ 非 Excel 文件（文件头：" + fileHeader + "）：" + filePath);
            }
        } catch (Exception e) {
            System.out.println("❌ 提取 Excel 失败：" + filePath + " → " + e.getMessage());
        }
    }

    /**
     * Reads the complete content of a document entry.
     *
     * @return the bytes, or {@code null} if the entry is empty or cannot be read
     */
    private static byte[] readDocumentEntryDirect(DocumentEntry docEntry) {
        try (InputStream is = new DocumentInputStream(docEntry)) {
            // readAllBytes loops until EOF, unlike a single read(byte[]) call.
            byte[] data = is.readAllBytes();
            return data.length > 0 ? data : null;
        } catch (IOException e) {
            System.out.println("⚠️  直接读取字节失败：" + e.getMessage());
            return null;
        }
    }

    /**
     * Returns the workbook bytes contained in {@code oleData}, or {@code null} if it holds none.
     *
     * <ul>
     *   <li>ZIP data is accepted only if it opens as an {@link XSSFWorkbook}.</li>
     *   <li>An OLE2 container with a {@code Package} or {@code Contents} stream is treated as a
     *       wrapper and its payload is validated recursively.</li>
     *   <li>Any other OLE2 container is accepted only if it has a workbook stream.</li>
     *   <li>If the OLE2 container cannot be parsed, Tika's type detection is the last resort.</li>
     * </ul>
     */
    private static byte[] extractFromOLE(byte[] oleData) {
        if (isXlsxFile(oleData)) {
            return isValidXlsx(oleData) ? oleData : null;
        }
        if (!isXlsFile(oleData)) {
            return null;
        }

        try (POIFSFileSystem poifs = new POIFSFileSystem(new ByteArrayInputStream(oleData))) {
            DirectoryEntry root = poifs.getRoot();
            for (String payloadName : WRAPPED_PAYLOAD_NAMES) {
                if (root.hasEntry(payloadName)) {
                    try (InputStream is = poifs.createDocumentInputStream(payloadName)) {
                        // The payload is strictly smaller than its container, so this terminates.
                        return extractFromOLE(is.readAllBytes());
                    }
                }
            }
            // Not a wrapper: only a container with a workbook stream is an .xls file.
            return isWorkbookStorage(root) ? oleData : null;
        } catch (NotOLE2FileException e) {
            System.out.println("⚠️  非 OLE2 格式文件：" + e.getMessage());
        } catch (IOException e) {
            System.out.println("⚠️  解析 .xls 文件失败：" + e.getMessage());
        }

        // Reached only when the container could not be parsed.
        String fileType = TIKA.detect(oleData, "");
        if (fileType != null && (fileType.contains("excel") || fileType.contains("spreadsheet"))) {
            return oleData;
        }
        return null;
    }

    /** Returns whether the ZIP data opens as a spreadsheet (and not some other OOXML file). */
    private static boolean isValidXlsx(byte[] data) {
        try (OPCPackage opcPackage = OPCPackage.open(new ByteArrayInputStream(data));
             XSSFWorkbook workbook = new XSSFWorkbook(opcPackage)) {
            return true;
        } catch (Exception e) {
            System.out.println("⚠️  无效的 .xlsx 文件：" + e.getMessage());
            return false;
        }
    }

    /** Returns whether the storage directly contains a BIFF workbook stream. */
    private static boolean isWorkbookStorage(DirectoryEntry storage) {
        for (String streamName : WORKBOOK_STREAM_NAMES) {
            if (storage.hasEntry(streamName)) {
                return true;
            }
        }
        return false;
    }

    /** Returns whether the data starts with the OLE2 signature (container of a legacy .xls). */
    private static boolean isXlsFile(byte[] data) {
        return startsWith(data, OLE2_MAGIC);
    }

    /** Returns whether the data starts with the ZIP signature (container of an .xlsx). */
    private static boolean isXlsxFile(byte[] data) {
        return startsWith(data, ZIP_MAGIC);
    }

    /** Null-safe prefix comparison. */
    private static boolean startsWith(byte[] data, byte[] prefix) {
        if (data == null || data.length < prefix.length) {
            return false;
        }
        for (int i = 0; i < prefix.length; i++) {
            if (data[i] != prefix[i]) {
                return false;
            }
        }
        return true;
    }

    // ---------------------- Utilities ----------------------

    /** Returns the object pool storage under either of its two spellings, or {@code null}. */
    private static DirectoryEntry getObjectPoolDirectory(DirectoryEntry root) throws IOException {
        for (String name : new String[] {"ObjectPool", "OBJECTPOOL"}) {
            if (root.hasEntry(name)) {
                Entry entry = root.getEntry(name);
                if (entry instanceof DirectoryEntry) {
                    return (DirectoryEntry) entry;
                }
            }
        }
        return null;
    }

    /** Strips everything except printable ASCII; {@code null} becomes the empty string. */
    private static String filterSpecialChars(String name) {
        return name == null ? "" : NON_PRINTABLE_CHAR_PATTERN.matcher(name).replaceAll("");
    }

    /** Returns whether the OOXML part content type denotes an Excel workbook; null-safe. */
    private static boolean isExcelContentType(String contentType) {
        return "application/vnd.ms-excel".equals(contentType)
                || "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".equals(contentType)
                || "application/vnd.ms-excel.sheet.macroEnabled.12".equals(contentType);
    }

    /** Returns whether the data starts with one of the two container signatures Excel uses. */
    private static boolean isExcelFile(byte[] data) {
        return isXlsFile(data) || isXlsxFile(data);
    }

    /** Formats the first four bytes as upper-case hex for diagnostics. */
    private static String getFileHeader(byte[] data) {
        if (data == null || data.length < 4) {
            return "不足4字节";
        }
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < 4; i++) {
            sb.append(String.format("%02X ", data[i]));
        }
        return sb.toString().trim();
    }

    /** Fails fast with a descriptive exception if the file is missing, not a file, or unreadable. */
    private static void validateFile(File file) throws IOException {
        if (!file.exists()) {
            throw new FileNotFoundException("文件不存在：" + file.getAbsolutePath());
        }
        if (!file.isFile()) {
            throw new IOException("路径不是文件：" + file.getAbsolutePath());
        }
        if (!file.canRead()) {
            throw new IOException("文件不可读：" + file.getAbsolutePath());
        }
    }

    /**
     * Collects workbooks from the embedded parts of a {@code .docx} file.
     *
     * <p>Parts declared with an Excel content type are taken as-is. Every other part (typically
     * an {@code oleObject} binary) goes through {@link #extractFromOLE(byte[])}, so that embedded
     * Word or PowerPoint files, which share the same ZIP/OLE2 signatures, are not returned.
     */
    private static void extractFromDocx(File docxFile, List<byte[]> excelDataList) throws IOException {
        // READ access: the default READ_WRITE mode may save the package back to the source file
        // when the document is closed.
        try (XWPFDocument doc = new XWPFDocument(OPCPackage.open(docxFile, PackageAccess.READ))) {
            for (PackagePart part : doc.getAllEmbeddedParts()) {
                String contentType = part.getContentType();
                try (InputStream is = part.getInputStream()) {
                    byte[] data = is.readAllBytes();
                    if (data.length < MIN_EXCEL_SIZE) {
                        continue;
                    }
                    byte[] excelData = isExcelContentType(contentType) ? data : extractFromOLE(data);
                    if (excelData != null) {
                        excelDataList.add(excelData);
                        System.out.println("✅ 提取 .docx 中的 Excel 附件");
                    }
                } catch (Exception e) {
                    System.out.println("❌ 处理 .docx 附件失败：" + e.getMessage());
                }
            }
        } catch (Exception e) {
            throw new IOException("解析 .docx 文件失败：" + e.getMessage(), e);
        }
    }
}

posted @ 2025-11-05 10:01 ChMao 阅读(51) 评论(0) 收藏举报

刷新页面返回顶部

ChMao

java解析word中的excel

公告