Apache Tika是一个用于文件类型检测和文件内容提取的工具库,其中PDF解析器(PDFParser)可以读取PDF文件的元数据和正文内容。
所用jar包:
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.20</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.20</version>
</dependency>
public static void main(String[] args) { File file =new File("D:\\101.pdf"); BodyContentHandler handler=new BodyContentHandler(); //元数据对象 Metadata metadata=new Metadata(); FileInputStream inputStream=new FileInputStream(file); ParseContext parseContext=new ParseContext(); // PDFParser pdfParser=new PDFParser(); pdfParser.parse(inputStream, handler, metadata, parseContext); System.out.println("文件属性信息:"); for(String name: metadata.names()){ System.out.println(name+":"+metadata.get(name)); } System.out.println("pdf文件内容:"); System.out.println(handler.toString()); }
浙公网安备 33010602011771号