










/**
 * Utility for converting Word documents to HTML.
 *
 * @author ismallboy
 * @date 2020/2/19
 */
public class WordToHtmlUtil {

    private UploadFileUtil uploadFileUtil;

     * 将word2003转换为html文件
     * @param input
     * @param bucket
     * @throws IOException
     * @throws TransformerException
     * @throws ParserConfigurationException
    public String Word2003ToHtml(InputStream input, String bucket, String directory, String visitPoint)
            throws IOException, TransformerException, ParserConfigurationException {
        HWPFDocument wordDocument = new HWPFDocument(input);
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches,
                                      float heightInches) {
                String fileName = AliOssUtil.generateImageFileName() + suggestedName.substring(suggestedName.lastIndexOf("."));
                return uploadFileUtil.uploadFile(content, bucket, directory, fileName, visitPoint);
        // 解析word文档
        Document htmlDocument = wordToHtmlConverter.getDocument();
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        OutputStream outStream = new BufferedOutputStream(baos);
        DOMSource domSource = new DOMSource(htmlDocument);
        StreamResult streamResult = new StreamResult(outStream);
        TransformerFactory factory = TransformerFactory.newInstance();
        Transformer serializer = factory.newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(domSource, streamResult);
        String content = baos.toString();
        return content;

     * 2007版本word转换成html
     * @param input
     * @param bucket
     * @param directory
     * @param visitPoint
     * @return
     * @throws IOException
    public String Word2007ToHtml(InputStream input, String bucket, String directory, String visitPoint)
            throws IOException {
        XWPFDocument document = new XWPFDocument(input);
        // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)
        XHTMLOptions options = XHTMLOptions.create();
        Map<String, String> imgMap = new HashMap<>();
        options.setExtractor(new IImageExtractor() {
            public void extract(String imagePath, byte[] imageData) throws IOException {
                String fileName = AliOssUtil.generateImageFileName() + imagePath.substring(imagePath.lastIndexOf("."));
                String imgUrl = uploadFileUtil.uploadFile(imageData, bucket, directory, fileName, visitPoint);
                imgMap.put(imagePath, imgUrl);
        // html中图片的路径 相对路径
        options.URIResolver(new IURIResolver() {
            public String resolve(String uri) {
                return imgMap.get(uri);
        // 3) 将 XWPFDocument转换成XHTML
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        XHTMLConverter.getInstance().convert(document, baos, options);
        String content = baos.toString();
        return content;



    public String uploadSourceNews(MultipartFile file)  {
        String fileName = file.getOriginalFilename();
        String suffixName = fileName.substring(fileName.lastIndexOf("."));
        if (!".doc".equals(suffixName) && !".docx".equals(suffixName)) {
            throw new UploadFileFormatException();
        DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMM");
        String dateDir = formatter.format(;
        String directory = imageDir + "/" + dateDir + "/";
        String content = null;
        try {
            InputStream inputStream = file.getInputStream();
            if ("doc".equals(suffixName)) {
                content = wordToHtmlUtil.Word2003ToHtml(inputStream, imageBucket, directory, Constants.HTTPS_PREFIX + imageVisitHost);
            } else {
                content = wordToHtmlUtil.Word2007ToHtml(inputStream, imageBucket, directory, Constants.HTTPS_PREFIX + imageVisitHost);
        } catch (Exception ex) {
            logger.error("word to html exception, detail:", ex);
            return null;
        return content;




docx 是微软开发的基于 xml 的文字处理文件。docx 文件与 doc 文件不同, 因为 docx 文件将数据存储在单独的压缩文件和文件夹中。早期版本的 microsoft office (早于 office 2007) 不支持 docx 文件, 因为 docx 是基于 xml 的, 早期版本将 doc 文件另存为单个二进制文件。


DOCX is an XML-based word processing file format developed by Microsoft. DOCX files differ from DOC files in that they store data in separate compressed files and folders. Earlier versions of Microsoft Office (before Office 2007) do not support DOCX files, because DOCX is XML-based, whereas those earlier versions save a DOC file as a single binary file.












posted @ 2020-03-27 22:21  ismallboy  阅读(6229)  评论(6编辑  收藏  举报