<经验>使用Jsoup处理HTML的一个工具类

package cn.com.wind.utils;

import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.*;
import org.jsoup.select.Elements;

import java.io.*;
import java.util.*;
import java.util.regex.Pattern;

/**
 * @Author qymeng
 * @Date 2022/4/6
 * @Description
 */
@Slf4j
public class HtmlUtil {
    private final static List<String> controlTagList = Arrays.asList("th", "tr", "li", "p", "h1", "h2", "h3", "h4", "h5", "h6");

    public static void main(String[] args) {
        File file = new File("D:\\qymeng\\SVN\\SVN-reverse\\dev\\src\\dataOriFile\\2020\\03\\03\\0400\\{316DE765-5CC5-11EA-A156-26D8B346C975}.html");
        List<String> list = parseHtmlFile(file);
//        String html="<p style=\"text-align:justify;margin-bottom:0pt;margin-top:1pt;text-indent:5.24%;font-size:8pt;font-family:Times New Roman;font-weight:normal;font-style:normal;text-transform:none;font-variant: normal;\">Indicate by check mark if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.&nbsp;&nbsp;&nbsp;&nbsp;<ix:nonnumeric id=\"F_000009\" name=\"dei:EntityWellKnownSeasonedIssuer\" contextref=\"C_0001674168_20190101_20191231\">Yes</ix:nonnumeric>&nbsp;&nbsp;<span style=\"font-family:MS Mincho;\">☒</span>&nbsp;&nbsp;&nbsp;No&nbsp;&nbsp;<span style=\"font-family:MS Mincho;\">☐</span>&nbsp;&nbsp;&nbsp;</p>";
//        List<String> list = parseHtmlString(html);
        System.out.println(list.size());
        for (String str : list) {
            System.out.println(str);
        }
    }

    /**
     * 解析html文件
     * @param file
     * @return
     */
    public static List<String> parseHtmlFile(File file) {
        List<String> resultList = new ArrayList<>();
        Document doc = null;
        try {
            doc = Jsoup.parse(file, "utf-8");
        } catch (IOException e) {
            e.printStackTrace();
        }
        assert doc != null;
        List<String> list = deepTravalTag(doc.body());
        if (list.size() != 0) {
            resultList.addAll(list);
        }
        removeSpaceFromList(resultList);
        return resultList;
    }

    /**
     * 解析byte数组
     * @param fileContent
     * @return
     */
    public static List<String> parseHtmlBytes(byte[] fileContent){
        InputStream inputStream = new ByteArrayInputStream(fileContent);
        List<String> resultList = new ArrayList<>();
        Document doc = null;
        try {
            doc = Jsoup.parse(inputStream, "utf-8", "");
        } catch (IOException e) {
            e.printStackTrace();
        }
        List<String> list = deepTravalTag(doc.body());
        if (list.size() != 0) {
            resultList.addAll(list);
        }
        removeSpaceFromList(resultList);
        return resultList;
    }

    /**
     * 解析html字符串
     * @param html
     * @return
     */
    public static List<String> parseHtmlString(String html) {
        List<String> resultList = new ArrayList<>();
        Document doc = Jsoup.parse(html, "utf-8");
        List<String> list = deepTravalTag(doc.body());
        if (list.size() != 0) {
            resultList.addAll(list);
        }
        removeSpaceFromList(resultList);
        return resultList;
    }

    /**
     * 深度遍历找文本(递归)
     *
     * @param element
     * @return
     */
    public static String getElementValue(Element element) {
        StringBuilder res = new StringBuilder();
        if (element.childrenSize() == 0) {
            String childrenStr = (element.text().matches("\\s*")) ? "" : element.text().replaceAll("\u200B", "");
            if (!childrenStr.matches("\\s*")) {
                return childrenStr;
            }
        } else {
            for (Node node : element.childNodes()) {
                res.append(getNodeValue(node, !element.tagName().toLowerCase(Locale.ROOT).equals("tr")));
            }
        }

        return res.toString();
    }

    /**
     * 深度遍历结点(递归)
     *
     * @param node
     * @return
     */
    public static String getNodeValue(Node node, boolean needBr) {
        StringBuilder result = new StringBuilder();
        if (node.childNodes().size() == 0) {
            String text = "";
            if (node.nodeName().equals("#text")) {
                text = ((TextNode) node).text();
            } else if (node.nodeName().equals("br") && needBr) {
                text = "\r\n";
            } else {
                text = ((Element) node).text();
            }
            String childrenStr = (text.matches("\\s*")) ? "" : text.replaceAll("\u200B", "");
            if (text.equals("\r\n")) {
                result.append("\r\n") ;
            }
            if (!childrenStr.matches("\\s*")) {
                return childrenStr;
            }
        } else {
            for (int i = 0; i < node.childNodes().size(); i++) {
                if (node.nodeName().toLowerCase(Locale.ROOT).equals("br") && needBr) {
                    if (!result.toString().matches("\\s*")) {
                        result.append("\r\n");
                    }
                } else {
                    result.append(getNodeValue(node.childNode(i), needBr));
                }
            }
        }
        return result.toString();
    }


    /**
     * 深度遍历找标签
     *
     * @param element
     * @return
     */
    public static List<String> deepTravalTag(Element element) {
        List<String> resultList = new ArrayList<>();
        if (element.childrenSize() == 0) {
            resultList.add(element.text());
            return resultList;
        }
        if (element.attr("style") != null && element.attr("style").contains("display:none")) {
            return resultList;
        }

        int lastIndex = -1;
        for (int i = 0; i < element.childrenSize(); i++) {
            Element child = element.children().get(i);

            if (child.text() == null && child.text().matches("\\s*") && !child.nodeName().equals("br")) {
                log.info(child.nodeName() + "中文本为空");
                continue;
            }
            if (child.attr("style") != null && child.attr("style").contains("display:none")) {
                continue;
            }

            //判断标签的子标签包不包含分段标签
            boolean needFind = needFindChildren(child);
            //自己不是分段标签,并且所有子标签也不是
            if (!controlTagList.contains(child.nodeName()) && !needFind) {
                //直接获得文本
                String eleStr = getElementValue(child);
                if (eleStr.matches("\\s*")) {
                    continue;
                }
                if (lastIndex != -1) {
                    String s = resultList.get(lastIndex) + eleStr;
                    resultList.set(lastIndex, s);
                } else {
                    resultList.add(eleStr);
                    lastIndex = resultList.size() - 1;
                }
            } else if (controlTagList.contains(child.nodeName()) && !needFind) { //自己是分段标签,子标签不是
                lastIndex = -1;
                String line = getElementValue(child);
                resultList.add(line);
            } else {
                lastIndex = -1;
                List<String> list = deepTravalTag(child);
                resultList.addAll(list);
            }

        }
        return resultList;
    }

    public static boolean needFindChildren(Element element) {
        if (element.nodeName().equals("tr") || element.nodeName().equals("li")) {
            return false;
        }

        boolean contains = false;
        for (String tag : controlTagList) {
            Elements children = element.children();
            for (Element ele : children) {
                if (ele.getElementsByTag(tag).size() != 0) {
                    contains = true;
                    break;
                }
            }

        }
        return contains;
    }

    /**
     * 去空格
     *
     * @param list
     */
    public static void removeSpaceFromList(List<String> list) {
        list.removeIf(s -> s == null || s.isEmpty());
    }

}

 

posted @ 2022-04-22 13:19  Mikey-  阅读(166)  评论(0)    收藏  举报