PDF parsing

The method below downloads the PDF to a local temp file, sends it to a Python parsing service, runs OCR over the chart images that service extracts, and cleans up the temporary files afterwards.

    @Override
    public DocumentMetaData parsePdf(String url) {
        log.info("[{}] PDF parsing started", url);
        String filePath = null;
        DocumentMetaData documentMetaData = null;
        try {
            filePath = httpDownload.download(url);
            String dirPrefix = SpringContextUtils.getBean(ChanceinCollection.class).getReportChartPath();
            documentMetaData = pyParseFeign.pyPdfParse(dirPrefix, filePath);
            List<MetaChart> metaChartList = documentMetaData.getMetaCharts();
            List<File> fileList = Lists.newArrayList();
            if (CollectionUtils.isNotEmpty(metaChartList)) {
                for (MetaChart metaChart : metaChartList) {
                    // OCR the text contained in each extracted chart image
                    File file = new File(metaChart.getFilePath());
                    fileList.add(file);
                    // try-with-resources so the stream is closed even if OCR fails
                    try (InputStream inputStream = new FileInputStream(file)) {
                        String jsonStr = OcrUtils.imageAnalyticalStream(inputStream);
                        String data = OcrUtils.getText(jsonStr);
                        metaChart.setContent(data);
                    }
                }
            }
            documentMetaData.setMetaCharts(metaChartList);
            // delete the temporary chart images produced by the PDF parsing service
            for (File file : fileList) {
                boolean deleted = file.delete();
                if (!deleted) {
                    log.error("Failed to delete temporary chart image, path: [{}]", file.getAbsolutePath());
                }
            }
        } catch (Exception e) {
            log.error("PDF parsing failed", e);
        } finally {
            // always remove the locally downloaded copy of the document
            if (StringUtils.isNotBlank(filePath)) {
                boolean result = new File(filePath).delete();
                if (result) {
                    log.debug("Deleted local temporary document: [{}]", filePath);
                } else {
                    log.error("Failed to delete local temporary document: [{}]", filePath);
                }
            }
        }
        return documentMetaData;
    }
/**
 * File description
 *
 * @ProductName: HundsunHEP
 * @ProjectName: cloud-search-preprocess-center
 * @Package: com.chancein.cloud.search.preprocess.documentprocess.downlaod
 * @Description: note
 * @Author: 陈柯名
 * @Date: 2022/4/1 14:54
 * @UpdateUser: 陈柯名
 * @UpdateDate: 2022/4/1 14:54
 * @UpdateRemark: The modified content
 * @Version: 1.0
 * <p>
 * Copyright© 2022 Hundsun Technologies Inc.All Rights Reserved
 **/
@Slf4j
@Component
public class HttpDownload implements Download {

    @Override
    public String download(String url) throws Exception {
        return httpDownload(url);
    }

    public String httpDownload(String url) throws Exception {
        // validate the url before issuing the request
        if (StringUtils.isBlank(url)) {
            return null;
        }
        log.info("Download url: " + url);

        ChanceinCollection chancein = SpringContextUtils.getBean(ChanceinCollection.class);
        String tempFileDir = chancein.getTempFilePath();

        String tmpFilePath = null;
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            HttpGet httpget = new HttpGet(url);
            HttpResponse response = client.execute(httpget);
            HttpEntity entity = response.getEntity();

            Header[] headers = response.getHeaders("Content-Type");
            // fall back to the Content-Type response header only when the url carries no file extension
            String suffix = FileUtils.getNetworkFileExt(url);
            if (StringUtils.isEmpty(suffix)) {
                suffix = getFileType(headers);
            }
            tmpFilePath = tempFileDir + "/" + StringUtils.getUUID() + "." + suffix;

            // copy the response body into the temporary file
            try (InputStream is = entity.getContent();
                 FileOutputStream fos = new FileOutputStream(tmpFilePath)) {
                byte[] buffer = new byte[1024];
                int len;
                while ((len = is.read(buffer)) != -1) {
                    fos.write(buffer, 0, len);
                }
                fos.flush();
            }
        } catch (Exception e) {
            log.error("Failed to download file: " + url);
            throw e;
        }
        return tmpFilePath;
    }

    /**
     * Resolve the file extension from the response Content-Type header
     * @param headers Content-Type headers of the download response
     * @return file extension such as "pdf", or null if the type is not recognised
     */
    private String getFileType(Header[] headers) {
        String fileType = null;
        if (headers != null && headers.length > 0) {
            try {
                String contentType = headers[0].getValue();
                switch (contentType) {
                    case "application/msword":
                        fileType = "doc";
                        break;
                    case "application/vnd.ms-excel":
                    case "application/x-xls":
                        fileType = "xls";
                        break;
                    case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
                        fileType = "xlsx";
                        break;
                    case "application/vnd.ms-powerpoint":
                    case "application/x-ppt":
                        fileType = "ppt";
                        break;
                    case "application/pdf":
                        fileType = "pdf";
                        break;
                    case "text/plain":
                        fileType = "txt";
                        break;
                    default:
                        break;
                }

            } catch (Exception e) {
                log.error("根据链接获取文件格式失败,异常{}",e.getMessage());
            }
        }
        return fileType;
    }

}
    public DocumentMetaData pyPdfParse(String dirPrefix, String filePath) {
        if (StringUtils.isBlank(filePath) || StringUtils.isBlank(dirPrefix)) {
            return null;
        }
        Map<String, String> params = new HashMap<>();
        params.put("dirPrefix", dirPrefix);
        params.put("filePath", filePath);
        // strip stray whitespace from the configured python service url
        String url = this.pdfParseUrl.replace(" ", "");
        log.info("Calling python parsing service at [{}] with params [{}]", url, params);
        String content = HttpUtils.doGet(url, params);
        return JSONObject.parseObject(content, DocumentMetaData.class);
    }
    /**
     * GET request with query parameters
     */
    public static String doGet(String url, Map<String, String> params) {
        // disable automatic request retries; try-with-resources closes the client
        try (CloseableHttpClient httpClient = HttpClientBuilder.create().disableAutomaticRetries().build()) {

            // timeout settings
            RequestConfig requestConfig = RequestConfig.custom()
                    // ConnectTimeout: time allowed to establish the connection to the target server
                    .setConnectTimeout(5000)
                    // SocketTimeout: time allowed to wait for the response once the connection is established
                    .setSocketTimeout(5 * 60 * 1000)
                    // ConnectionRequestTimeout: time allowed to obtain a connection from the pool before failing
                    .setConnectionRequestTimeout(3 * 60 * 1000).build();

            URIBuilder builder = new URIBuilder(url);
            if (params != null && !params.isEmpty()) {
                for (Map.Entry<String, String> entry : params.entrySet()) {
                    builder.addParameter(entry.getKey(), entry.getValue());
                }
            }
            HttpGet httpGet = new HttpGet(builder.build());
            httpGet.setConfig(requestConfig);
            HttpResponse response = httpClient.execute(httpGet);

            // request succeeded: read the json string returned by the server
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                return EntityUtils.toString(response.getEntity());
            }
        } catch (IOException | URISyntaxException e) {
            log.error("Failed to call service [{}]", url, e);
        }

        return null;
    }

Python handles natural-language processing and text recognition well, so we send the file's location on the server and a directory for temporary output to the Python parsing service. It extracts the business fields such as the title, the list of analysts, the language, the author and the summary, as well as the document metadata: a unique file identifier, the images embedded in the PDF, their page numbers, and so on.
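The Python service replies with JSON that pyPdfParse above deserializes into DocumentMetaData. Below is a minimal sketch of what that structure might look like, inferred from the fields mentioned here; apart from metaCharts, filePath and content (which the Java code above actually calls), the field names are assumptions.

// DocumentMetaData.java (sketch; field names other than metaCharts are assumed)
import java.util.List;

public class DocumentMetaData {
    private String title;               // assumed field: report title
    private List<String> analysts;      // assumed field: analyst list
    private String language;            // assumed field: document language
    private String author;              // assumed field: author
    private String summary;             // assumed field: summary
    private String fileId;              // assumed field: unique file identifier
    private List<MetaChart> metaCharts; // chart images extracted from the pdf, consumed by parsePdf above

    public List<MetaChart> getMetaCharts() { return metaCharts; }
    public void setMetaCharts(List<MetaChart> metaCharts) { this.metaCharts = metaCharts; }
    // remaining getters and setters omitted
}

// MetaChart.java (sketch)
public class MetaChart {
    private String filePath;  // temporary path of the extracted image on the server
    private Integer pageNum;  // assumed field: page the image came from
    private String content;   // OCR text filled in by parsePdf

    public String getFilePath() { return filePath; }
    public void setContent(String content) { this.content = content; }
    // remaining getters and setters omitted
}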
