pdf解析
@Override public DocumentMetaData parsePdf(String url) { log.info("[{}]pdf文档解析开始", url); String filePath = null; DocumentMetaData documentMetaData = null; try { filePath = httpDownload.download(url); String dirPrefix = SpringContextUtils.getBean(ChanceinCollection.class).getReportChartPath(); documentMetaData = pyParseFeign.pyPdfParse(dirPrefix, filePath); List<MetaChart> metaChartList = documentMetaData.getMetaCharts(); List<File> fileList = Lists.newArrayList(); if (CollectionUtils.isNotEmpty(metaChartList)) { for (MetaChart metaChart : metaChartList) { // 解析图片中的文字 File file = new File(metaChart.getFilePath()); fileList.add(file); InputStream inputStream = new FileInputStream(file); String jsonStr = OcrUtils.imageAnalyticalStream(inputStream); String data = OcrUtils.getText(jsonStr); metaChart.setContent(data); } } documentMetaData.setMetaCharts(metaChartList); // 删除调用pdf解析接口后生成的临时文件 for (File file : fileList) { boolean delete = file.delete(); if (Boolean.FALSE == delete) { log.error("删除临时生成的图片失败,图片临时路径:[{}]", file.getAbsolutePath()); } } } catch (Exception e) { log.error("pdf文档解析失败", e); } finally { if (StringUtils.isNotBlank(filePath)) { boolean result = new File(filePath).delete(); if (result) { log.debug("删除本地冗余数据临时文档:[{}]", filePath); } else { log.error("删除本地冗余数据临时文档失败:[{}]", filePath); } } } return documentMetaData; }
/** * 文件描述 * * @ProductName: HundsunHEP * @ProjectName: cloud-search-preprocess-center * @Package: com.chancein.cloud.search.preprocess.documentprocess.downlaod * @Description: note * @Author: 陈柯名 * @Date: 2022/4/1 14:54 * @UpdateUser: 陈柯名 * @UpdateDate: 2022/4/1 14:54 * @UpdateRemark: The modified content * @Version: 1.0 * <p> * Copyright© 2022 Hundsun Technologies Inc.All Rights Reserved **/ @Slf4j @Component public class HttpDownload implements Download { @Override public String download(String url) throws Exception { return httpDownload(url); } public String httpDownload(String url) throws Exception { HttpClient client = HttpClients.createDefault(); HttpGet httpget = new HttpGet(url); HttpResponse response = client.execute(httpget); HttpEntity entity = response.getEntity(); if (StringUtils.isBlank(url)) { return null; } log.info("下载地址:" + url); ChanceinCollection chancein = SpringContextUtils.getBean(ChanceinCollection.class); String tempFileDir = chancein.getTempFilePath(); FileOutputStream fos = null; String tmpFilePath = null; try (InputStream is = entity.getContent()) { Header[] headers = response.getHeaders("Content-Type"); // url中不带文件后缀 才从响应头中获取文件类型 String suffix = FileUtils.getNetworkFileExt(url); if (StringUtils.isEmpty(suffix)) { suffix = getFileType(headers); } tmpFilePath = tempFileDir + "/" + StringUtils.getUUID() + "." 
+ suffix; fos = new FileOutputStream(tmpFilePath); //通过bis读取文件内容,写入到bos byte[] buffer = new byte[1024]; int len; while ((len = is.read(buffer)) != -1) { fos.write(buffer, 0, len); } fos.flush(); } catch (Exception e) { // log.error("下载文件失败:" + url); throw e; } finally { if (fos != null) { fos.close(); } } return tmpFilePath; } /** * 获取文件类型 * @param headers * @return */ private String getFileType(Header[] headers) { String fileType = null; if (headers != null && headers.length > 0) { try{ String contentType = headers[0].getValue(); switch (contentType){ case "application/msword": fileType = "doc"; break; case "application/vnd.ms-excel": case "application/x-xls": fileType = "xls"; break; case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": fileType = "xlsx"; break; case "application/vnd.ms-powerpoint": case "application/x-ppt": fileType = "ppt"; break; case "application/pdf": fileType = "pdf"; break; case "text/plain": fileType = "txt"; break; default: break; } } catch (Exception e) { log.error("根据链接获取文件格式失败,异常{}",e.getMessage()); } } return fileType; } }
/**
 * Calls the python PDF parsing service with the local file path and the chart output
 * directory, and maps the JSON response to a {@link DocumentMetaData}.
 *
 * @param dirPrefix directory where the service writes extracted chart images
 * @param filePath  local path of the PDF to parse
 * @return parsed metadata, or {@code null} when either argument is blank or the
 *         response body cannot be mapped
 */
public DocumentMetaData pyPdfParse(String dirPrefix, String filePath) {
    // guard clause: both arguments are mandatory
    if (StringUtils.isBlank(dirPrefix) || StringUtils.isBlank(filePath)) {
        return null;
    }
    // strip stray whitespace that may have crept into the configured url
    String serviceUrl = this.pdfParseUrl.replace(" ", "");
    Map<String, String> query = new HashMap<>();
    query.put("dirPrefix", dirPrefix);
    query.put("filePath", filePath);
    log.info("当前需要调用的python服务地址为:[{}],请求参数为:[{}]", serviceUrl, query);
    String responseBody = HttpUtils.doGet(serviceUrl, query);
    return JSONObject.parseObject(responseBody, DocumentMetaData.class);
}
/** * 带参数的get请求 */ public static String doGet(String url, Map<String, String> params) { try { // 禁用请求重试 CloseableHttpClient httpClient = HttpClientBuilder.create().disableAutomaticRetries().build(); // 设置超时时间 RequestConfig requestConfig = RequestConfig.custom() //一、连接目标服务器超时时间:ConnectionTimeout-->指的是连接一个url的连接等待时间 .setConnectTimeout(5000) //二、读取目标服务器数据超时时间:SocketTimeout-->指的是连接上一个url,获取response的返回等待时间 .setSocketTimeout(5 * 60 * 1000) //三、从连接池获取连接的超时时间:ConnectionRequestTimeout-->如果连接池里连接都被用了,且超过设定时间,就会报错connectionrequesttimeout,会抛出超时异常 .setConnectionRequestTimeout(3 * 60 * 1000).build(); URIBuilder builder = new URIBuilder(url); if (params != null && !params.isEmpty()) { for (Map.Entry<String, String> entry : params.entrySet()) { builder.addParameter(entry.getKey(), entry.getValue()); } } HttpGet client = new HttpGet(builder.build()); client.setConfig(requestConfig); HttpResponse response = httpClient.execute(client); /**请求发送成功,并得到响应**/ if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { /**读取服务器返回过来的json字符串数据**/ String strResult = EntityUtils.toString(response.getEntity()); return strResult; } } catch (IOException | URISyntaxException e) { log.error("调用服务:[{}]出现异常:[{}]", url, e); } return null; }
python 对自然语言和文本识别的处理比较好。我们会把文件在服务器上的具体地址和临时文件存放地址发给 python 的文件解析服务,由它解析出业务数据(标题、分析师列表、语种、作者、概要等),以及其中的元数据(文件唯一标识、PDF 中包含的图片、页码等)。

浙公网安备 33010602011771号