Java 解析 Word 文档中嵌入的 Excel 附件
一、思路

1. 入口→分发:extractFromWord 是总入口,核心是 “格式分发”,将.doc 和.docx 分流到不同处理逻辑;
2. .doc 核心:绕开路径解析,用 “逐层遍历 + 兜底读取” 确保文件能读到,再交给extractFromOLE解析;
3. .docx 核心:直接遍历 PackagePart,利用 POI 对 OOXML 的原生支持,快速识别 Excel 附件;
4. 解析核心:extractFromOLE 是格式兼容关键,区分.xls/.xlsx 用不同 POI 模块,避免解析失败;
5. 稳定性保障:多层过滤(过小文件 / 特殊字符)+ 异常捕获(单个文件失败不中断)+ 格式适配,确保程序稳定运行。
二、核心依赖:
<!-- Apache POI 处理 Word 和 Excel --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>5.2.4</version> </dependency> <!-- Apache POI 处理 OLE 对象(嵌入式附件) --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>5.2.4</version> </dependency> <!-- Apache Tika 识别文件类型(辅助提取附件) --> <dependency> <groupId>org.apache.tika</groupId> <artifactId>tika-core</artifactId> <version>2.9.1</version> </dependency>
三、源码
import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.poifs.filesystem.*; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.tika.Tika; import java.io.*; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; public class WordExcelExtractor { private static final Tika tika = new Tika(); private static final Pattern NON_PRINTABLE_CHAR_PATTERN = Pattern.compile("[^\\x20-\\x7E]"); private static final int MIN_EXCEL_SIZE = 100; public static List<byte[]> extractFromWord(File wordFile) throws IOException { validateFile(wordFile); List<byte[]> excelDataList = new ArrayList<>(); String fileName = wordFile.getName().toLowerCase(); try { if (fileName.endsWith(".docx")) { extractFromDocx(wordFile, excelDataList); } else if (fileName.endsWith(".doc")) { extractFromDocDirect(wordFile, excelDataList); } else { throw new IllegalArgumentException("不支持的格式!仅支持 .doc/.docx"); } } catch (IllegalArgumentException e) { throw e; } catch (Exception e) { throw new IOException("提取 Excel 附件失败:" + e.getMessage(), e); } return excelDataList; } private static void extractFromDocDirect(File docFile, List<byte[]> excelDataList) throws IOException { try (FileInputStream fis = new FileInputStream(docFile); POIFSFileSystem poifs = new POIFSFileSystem(fis)) { DirectoryEntry rootEntry = poifs.getRoot(); DirectoryEntry objectPool = getObjectPoolDirectory(rootEntry); if (objectPool == null) { System.out.println("无 ObjectPool 目录,无嵌入式附件"); return; } for (Entry entry1 : objectPool) { String dir1Name = filterSpecialChars(entry1.getName()); if (dir1Name.isEmpty() || !(entry1 instanceof DirectoryEntry)) { continue; } DirectoryEntry subDir = (DirectoryEntry) entry1; System.out.println("找到 OBJECTPOOL 子目录:" + dir1Name); for (Entry entry2 : subDir) { String fileName = filterSpecialChars(entry2.getName()); if (fileName.isEmpty() || !(entry2 instanceof 
DocumentEntry)) { continue; } DocumentEntry docEntry = (DocumentEntry) entry2; long fileSize = docEntry.getSize(); System.out.println("找到文件:" + dir1Name + "/" + fileName + "(大小:" + fileSize + " 字节)"); if (fileSize < MIN_EXCEL_SIZE) { System.out.println("⚠️ 跳过过小文件(非 Excel):" + dir1Name + "/" + fileName); continue; } try { // 主方案:路径读取 try (InputStream is = poifs.createDocumentInputStream(getEntryFullPath(docEntry))) { byte[] oleData = is.readAllBytes(); extractAndAddExcel(oleData, dir1Name + "/" + fileName, excelDataList); } catch (Exception e) { System.out.println("⚠️ 路径读取失败,尝试直接读取文件字节"); byte[] oleData = readDocumentEntryDirect(docEntry); if (oleData != null && oleData.length >= MIN_EXCEL_SIZE) { extractAndAddExcel(oleData, dir1Name + "/" + fileName, excelDataList); } else { System.out.println("❌ 兜底读取失败(数据无效):" + dir1Name + "/" + fileName); } } } catch (Exception e) { System.out.println("❌ 处理文件失败,跳过:" + dir1Name + "/" + fileName + " → " + e.getMessage()); } } } } } private static void extractAndAddExcel(byte[] oleData, String filePath, List<byte[]> excelDataList) { try { byte[] excelData = extractFromOLE(oleData); if (excelData != null) { excelDataList.add(excelData); System.out.println("✅ 成功提取 Excel:" + filePath); } else { String fileHeader = getFileHeader(oleData); System.out.println("❌ 非 Excel 文件(文件头:" + fileHeader + "):" + filePath); } } catch (Exception e) { System.out.println("❌ 提取 Excel 失败:" + filePath + " → " + e.getMessage()); } } private static byte[] readDocumentEntryDirect(DocumentEntry docEntry) { try (InputStream is = new DocumentInputStream(docEntry)) { byte[] data = new byte[(int) docEntry.getSize()]; int readLen = is.read(data); return readLen > 0 ? data : null; } catch (IOException e) { System.out.println("⚠️ 直接读取字节失败:" + e.getMessage()); return null; } } /** * 核心修复:区分 .xls 和 .xlsx 格式,适配对应的解析模块 */ private static byte[] extractFromOLE(byte[] oleData) { // 1. 快速过滤非 Excel 文件 if (!isExcelFile(oleData)) { return null; } // 2. 
判断是 .xls(OLE2)还是 .xlsx(OOXML) boolean isXls = isXlsFile(oleData); boolean isXlsx = isXlsxFile(oleData); // 3. 处理 .xlsx 格式(OOXML) if (isXlsx) { try (ByteArrayInputStream bais = new ByteArrayInputStream(oleData)) { // 验证是否为有效 .xlsx(用 OOXML 专用的 OPCPackage) try (OPCPackage opcPackage = OPCPackage.open(bais)) { // 可选:进一步验证是否为 Excel 工作表(避免其他 OOXML 文件) try (XSSFWorkbook workbook = new XSSFWorkbook(opcPackage)) { // 能打开工作簿,说明是有效 .xlsx return oleData; } } } catch (Exception e) { System.out.println("⚠️ 无效的 .xlsx 文件:" + e.getMessage()); return null; } } // 4. 处理 .xls 格式(OLE2) if (isXls) { ByteArrayInputStream bais = null; POIFSFileSystem poifs = null; try { bais = new ByteArrayInputStream(oleData); poifs = new POIFSFileSystem(bais); DirectoryEntry root = poifs.getRoot(); if (root.hasEntry("Package")) { try (InputStream is = poifs.createDocumentInputStream("Package")) { byte[] data = is.readAllBytes(); return isExcelFile(data) ? data : null; } } else if (root.hasEntry("Contents")) { try (InputStream is = poifs.createDocumentInputStream("Contents")) { byte[] data = is.readAllBytes(); return isExcelFile(data) ? data : null; } } // 直接是 .xls 文件,无需额外解析 return oleData; } catch (NotOLE2FileException e) { System.out.println("⚠️ 非 OLE2 格式文件:" + e.getMessage()); } catch (IOException e) { System.out.println("⚠️ 解析 .xls 文件失败:" + e.getMessage()); } finally { if (poifs != null) { try { poifs.close(); } catch (IOException e) {} } if (bais != null) { try { bais.close(); } catch (IOException e) {} } } } // 5. 
Tika 辅助验证 String fileType = tika.detect(oleData, ""); if (fileType.contains("excel") || fileType.contains("spreadsheet")) { return oleData; } return null; } /** * 单独判断是否为 .xls 文件(OLE2 格式) */ private static boolean isXlsFile(byte[] data) { if (data.length < 4) return false; byte b1 = data[0], b2 = data[1], b3 = data[2], b4 = data[3]; return (b1 == (byte) 0xD0 && b2 == (byte) 0xCF && b3 == (byte) 0x11 && b4 == (byte) 0xE0); } /** * 单独判断是否为 .xlsx 文件(OOXML 格式) */ private static boolean isXlsxFile(byte[] data) { if (data.length < 4) return false; byte b1 = data[0], b2 = data[1], b3 = data[2], b4 = data[3]; return (b1 == (byte) 0x50 && b2 == (byte) 0x4B && b3 == (byte) 0x03 && b4 == (byte) 0x04); } // ---------------------- 工具方法 ---------------------- private static DirectoryEntry getObjectPoolDirectory(DirectoryEntry root) throws IOException { if (root.hasEntry("ObjectPool")) { return (DirectoryEntry) root.getEntry("ObjectPool"); } else if (root.hasEntry("OBJECTPOOL")) { return (DirectoryEntry) root.getEntry("OBJECTPOOL"); } return null; } private static String filterSpecialChars(String name) { return name == null ? 
"" : NON_PRINTABLE_CHAR_PATTERN.matcher(name).replaceAll(""); } private static boolean isExcelContentType(String contentType) { return contentType.equals("application/vnd.ms-excel") || contentType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet") || contentType.equals("application/vnd.ms-excel.sheet.macroEnabled.12"); } private static boolean isExcelFile(byte[] data) { return isXlsFile(data) || isXlsxFile(data); } private static String getFileHeader(byte[] data) { if (data.length < 4) return "不足4字节"; StringBuilder sb = new StringBuilder(); for (int i = 0; i < 4; i++) { sb.append(String.format("%02X ", data[i])); } return sb.toString().trim(); } private static void validateFile(File file) throws IOException { if (!file.exists()) throw new FileNotFoundException("文件不存在:" + file.getAbsolutePath()); if (!file.isFile()) throw new IOException("路径不是文件:" + file.getAbsolutePath()); if (!file.canRead()) throw new IOException("文件不可读:" + file.getAbsolutePath()); } private static void extractFromDocx(File docxFile, List<byte[]> excelDataList) throws IOException { try (XWPFDocument doc = new XWPFDocument(OPCPackage.open(docxFile))) { for (PackagePart part : doc.getAllEmbeddedParts()) { String contentType = part.getContentType(); try (InputStream is = part.getInputStream()) { byte[] data = is.readAllBytes(); if (data.length < MIN_EXCEL_SIZE) continue; if (isExcelContentType(contentType) || isExcelFile(data)) { excelDataList.add(data); System.out.println("✅ 提取 .docx 中的 Excel 附件"); } else if (contentType.contains("oleObject")) { byte[] excelData = extractFromOLE(data); if (excelData != null) { excelDataList.add(excelData); } } } catch (Exception e) { System.out.println("❌ 处理 .docx 附件失败:" + e.getMessage()); } } } catch (Exception e) { throw new IOException("解析 .docx 文件失败:" + e.getMessage(), e); } } /** * 获取 Entry 的绝对路径(用于主方案路径读取,即使兜底方案常用,也需保留避免报红) */ private static String getEntryFullPath(Entry entry) { List<String> pathParts = new ArrayList<>(); Entry 
current = entry; while (current != null) { String name = current.getName(); // 过滤根目录和无效名称 if (name != null && !name.isEmpty() && !"Root Entry".equals(name)) { pathParts.add(name); } current = current.getParent(); } // 反转路径部分,得到正确的绝对路径 StringBuilder path = new StringBuilder(); for (int i = pathParts.size() - 1; i >= 0; i--) { if (path.length() > 0) { path.append("/"); } path.append(pathParts.get(i)); } return path.toString(); } }

浙公网安备 33010602011771号