JAVA读取word(doc)(docx)标题和内容----POI

Posted on 2021-11-09 09:36  牛奶甜了点  阅读(8340)  评论(0)  编辑  收藏  举报

java 实现poi方式读取word文件内容

1、下载poi的jar包

    下载地址:https://www.apache.org/dyn/closer.lua/poi/release/bin/poi-bin-3.17-20170915.tar.gz

下载解压后用到的jar包

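maven(注意:上面的下载链接是 POI 3.17,而下面的依赖是 4.1.2;下文示例导入的 org.apache.poi.POIXMLDocument 和 org.apache.poi.POIXMLTextExtractor 是 3.x 的包路径,使用 4.x 时应分别改为 org.apache.poi.ooxml.POIXMLDocument 和 org.apache.poi.ooxml.extractor.POIXMLTextExtractor):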

<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi</artifactId>
      <version>4.1.2</version>
    </dependency>
    <dependency>
      <groupId>cn.hutool</groupId>
      <artifactId>hutool-all</artifactId>
      <version>5.5.7</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi-ooxml</artifactId>
      <version>4.1.2</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml-schemas -->
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi-ooxml-schemas</artifactId>
      <version>4.1.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>ooxml-schemas</artifactId>
      <version>1.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad -->
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi-scratchpad</artifactId>
      <version>4.1.2</version>
    </dependency>

 

一、读取word全部内容(这个不区分doc和docx)

 

 1 package com.wordcom;
 2  
 3 import java.io.File;
 4 import java.io.FileInputStream;
 5 import java.io.InputStream;
 6 import org.apache.poi.POIXMLDocument;
 7 import org.apache.poi.POIXMLTextExtractor;
 8 import org.apache.poi.hwpf.extractor.WordExtractor;
 9 import org.apache.poi.openxml4j.opc.OPCPackage;
10 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
11 /**
12  * @Author:hp
13  * @Description:
14  * @Date:2021年11月4日14:58:11
15  * @Modified by:读取word所有内容
16  **/
17 public class DocUtil {
18     public static void main(String[] args)  {
19         String filePath = "C:\\Users\\hp\\Desktop\\新建文件夹 (2)\\忻州地调中心站11楼机房更换通信电源三措一案.docx";
20         String content = readWord(filePath);
21         System.out.println(content);
22     }
23 
24     public static String readWord(String path) {
25         String buffer = "";
26         try {
27             if (path.endsWith(".doc")) {
28                 InputStream is = new FileInputStream(new File(path));
29                 WordExtractor ex = new WordExtractor(is);
30                 buffer = ex.getText();
31                 ex.close();
32             } else if (path.endsWith("docx")) {
33                 OPCPackage opcPackage = POIXMLDocument.openPackage(path);
34                 POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
35                 buffer = extractor.getText();
36                 extractor.close();
37             } else {
38                 System.out.println("此文件不是word文件!");
39             }
40 
41         } catch (Exception e) {
42             e.printStackTrace();
43         }
44 
45         return buffer;
46     }
47 }

二、获取word各级标题(doc格式)

注意:Word 文档中的标题必须事先应用了标题样式,否则无法读取出来

 

 

 1 package com.wordcom;
 2 import org.apache.poi.hwpf.HWPFDocument;
 3 import org.apache.poi.hwpf.model.StyleDescription;
 4 import org.apache.poi.hwpf.model.StyleSheet;
 5 import org.apache.poi.hwpf.usermodel.Paragraph;
 6 import org.apache.poi.hwpf.usermodel.ParagraphProperties;
 7 import org.apache.poi.hwpf.usermodel.Range;
 8 import java.io.*;
 9 
10 /**
11  * @author hp
12  *获取doc文档的标题
13  */
14 public class WordTitle {
15     public static void main(String[] args) throws Exception {
16     
17         String filePath = "C:\\Users\\hp\\Desktop\\新建文件夹 (2)\\正文查找.doc";
18         printWord(filePath);
19         
20     }
21     public static void printWord(String filePath) throws IOException {
22     
23         InputStream is = new FileInputStream(filePath);
24         
25         HWPFDocument doc = new HWPFDocument(is);
26 
27         Range r = doc.getRange();// 文档范围    
28         
29         for (int i = 0; i < r.numParagraphs(); i++) {
30     
31             Paragraph p = r.getParagraph(i);// 获取段落
32             int numStyles = doc.getStyleSheet().numStyles();    
33     
34             int styleIndex = p.getStyleIndex();
35                     
36             if (numStyles > styleIndex) {
37     
38                 StyleSheet style_sheet = doc.getStyleSheet();
39     
40                 StyleDescription style = style_sheet.getStyleDescription(styleIndex);
41                 ParagraphProperties style1 = style_sheet.getParagraphStyle(styleIndex);
42                
43                 String styleName = style.getName();// 获取每个段落样式名称
44                 //System.out.println(style_sheet);
45                 //System.out.println(styleName);
46                 // 获取自己理想样式的段落文本信息                 
47                 //String styleLoving = "标题";
48                 String text = p.text();// 段落文本
49                 //if (styleName != null && styleName.contains(styleLoving)) {
50                 if (styleName.equals("标题")) {
51                     
52                     System.out.println(text);
53                     }
54                 }
55             }
56         doc.close();
57         }
58 }

三、按段落读取word(doc)(docx)

可以按照自己的需求提取特定的内容

doc

 1 package com.wordcom;
 2 import org.apache.poi.hwpf.HWPFDocument;
 3 import org.apache.poi.hwpf.model.StyleDescription;
 4 import org.apache.poi.hwpf.model.StyleSheet;
 5 import org.apache.poi.hwpf.usermodel.Paragraph;
 6 import org.apache.poi.hwpf.usermodel.ParagraphProperties;
 7 import org.apache.poi.hwpf.usermodel.Range;
 8 import java.io.*;
 9 
10 /**
11  * 
12  * @author hp
13  *获取doc文档的标题
14  */
15 public class WordTitledoc {
16     public static void main(String[] args) throws Exception {
17     
18         String filePath = "C:\\Users\\hp\\Desktop\\新建文件夹 (2)\\一案 .doc";
19         
20         printWord(filePath);
21         
22     }
23         
24     public static void printWord(String filePath) throws IOException {
25     
26         InputStream is = new FileInputStream(filePath);
27         
28         HWPFDocument doc = new HWPFDocument(is);
29 
30         Range r = doc.getRange();// 文档范围    
31     
32         for (int i = 0; i < r.numParagraphs(); i++) {
33     
34             Paragraph p = r.getParagraph(i);// 获取段落
35             int numStyles = doc.getStyleSheet().numStyles();    
36     
37             int styleIndex = p.getStyleIndex();
38                     
39             if (numStyles > styleIndex) {
40     
41                 StyleSheet style_sheet = doc.getStyleSheet();
42     
43                 StyleDescription style = style_sheet.getStyleDescription(styleIndex);
44                 ParagraphProperties style1 = style_sheet.getParagraphStyle(styleIndex);
45                
46                 String styleName = style.getName();// 获取每个段落样式名称
47                 //System.out.println(style_sheet);
48                 //System.out.println(styleName);
49                 // 获取自己理想样式的段落文本信息                 
50                 //String styleLoving = "标题";
51                 String text = p.text();// 段落文本
52                 //if (styleName != null && styleName.contains(styleLoving)) {
53                 if (text.contains(".") || text.contains("、")) {
54                     //String text = p.text();// 段落文本
55                     if (!text.contains(",") && !text.contains(";") && !text.contains("。") && !text.contains("") && !text.contains("20")) {
56                     System.out.println(text);
57                     }
58                     }
59                 }
60             }
61         doc.close();    
62     }
63 }

docx

package com.wordcom;


import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Prints heading-like paragraphs of a .docx document, selected by a text heuristic.
 *
 * @author hp
 */
public class WordTitledocx {

    /** Demo entry point: runs {@link #printWord(String)} on a hard-coded sample document. */
    public static void main(String[] args) throws Exception {
        String filePath = "C:\\Users\\hp\\Desktop\\新建文件夹 (2)\\忻州地调中心站11楼机房更换通信电源三措一案.docx";
        printWord(filePath);
    }

    /**
     * Prints to standard output every paragraph accepted by
     * {@link #looksLikeHeading(String)}.
     *
     * @param filePath file system path of the .docx file
     * @throws IOException if the file cannot be opened or read
     */
    public static void printWord(String filePath) throws IOException {
        // try-with-resources closes both the stream and the document, also on failure.
        try (InputStream is = new FileInputStream(filePath);
             XWPFDocument doc = new XWPFDocument(is)) {

            List<XWPFParagraph> paragraphs = doc.getParagraphs();
            for (XWPFParagraph paragraph : paragraphs) {
                String text = paragraph.getParagraphText();
                if (looksLikeHeading(text)) {
                    System.out.println(text);
                }
            }
        }
    }

    /**
     * Heuristic filter: the text must contain "." or "、" and must not contain
     * ",", ";", "。" or "20".
     *
     * <p>The original condition also required {@code !text.contains("")}, which is false
     * for every string and therefore suppressed all output. That clause is dropped here.
     * NOTE(review): the empty literal probably held a character that was lost when the
     * code was pasted — restore it if the intended character is known.
     */
    private static boolean looksLikeHeading(String text) {
        boolean hasNumberingMark = text.contains(".") || text.contains("、");
        boolean hasExcludedText = text.contains(",")
                || text.contains(";")
                || text.contains("。")
                || text.contains("20");
        return hasNumberingMark && !hasExcludedText;
    }
}