package cn.com.wind.utils;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.*;
import org.jsoup.select.Elements;
import java.io.*;
import java.util.*;
import java.util.regex.Pattern;
/**
* @Author qymeng
* @Date 2022/4/6
* @Description
*/
@Slf4j
public class HtmlUtil {
private final static List<String> controlTagList = Arrays.asList("th", "tr", "li", "p", "h1", "h2", "h3", "h4", "h5", "h6");
public static void main(String[] args) {
File file = new File("D:\\qymeng\\SVN\\SVN-reverse\\dev\\src\\dataOriFile\\2020\\03\\03\\0400\\{316DE765-5CC5-11EA-A156-26D8B346C975}.html");
List<String> list = parseHtmlFile(file);
// String html="<p style=\"text-align:justify;margin-bottom:0pt;margin-top:1pt;text-indent:5.24%;font-size:8pt;font-family:Times New Roman;font-weight:normal;font-style:normal;text-transform:none;font-variant: normal;\">Indicate by check mark if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act. <ix:nonnumeric id=\"F_000009\" name=\"dei:EntityWellKnownSeasonedIssuer\" contextref=\"C_0001674168_20190101_20191231\">Yes</ix:nonnumeric> <span style=\"font-family:MS Mincho;\">☒</span> No <span style=\"font-family:MS Mincho;\">☐</span> </p>";
// List<String> list = parseHtmlString(html);
System.out.println(list.size());
for (String str : list) {
System.out.println(str);
}
}
/**
* 解析html文件
* @param file
* @return
*/
public static List<String> parseHtmlFile(File file) {
List<String> resultList = new ArrayList<>();
Document doc = null;
try {
doc = Jsoup.parse(file, "utf-8");
} catch (IOException e) {
e.printStackTrace();
}
assert doc != null;
List<String> list = deepTravalTag(doc.body());
if (list.size() != 0) {
resultList.addAll(list);
}
removeSpaceFromList(resultList);
return resultList;
}
/**
* 解析byte数组
* @param fileContent
* @return
*/
public static List<String> parseHtmlBytes(byte[] fileContent){
InputStream inputStream = new ByteArrayInputStream(fileContent);
List<String> resultList = new ArrayList<>();
Document doc = null;
try {
doc = Jsoup.parse(inputStream, "utf-8", "");
} catch (IOException e) {
e.printStackTrace();
}
List<String> list = deepTravalTag(doc.body());
if (list.size() != 0) {
resultList.addAll(list);
}
removeSpaceFromList(resultList);
return resultList;
}
/**
* 解析html字符串
* @param html
* @return
*/
public static List<String> parseHtmlString(String html) {
List<String> resultList = new ArrayList<>();
Document doc = Jsoup.parse(html, "utf-8");
List<String> list = deepTravalTag(doc.body());
if (list.size() != 0) {
resultList.addAll(list);
}
removeSpaceFromList(resultList);
return resultList;
}
/**
* 深度遍历找文本(递归)
*
* @param element
* @return
*/
public static String getElementValue(Element element) {
StringBuilder res = new StringBuilder();
if (element.childrenSize() == 0) {
String childrenStr = (element.text().matches("\\s*")) ? "" : element.text().replaceAll("\u200B", "");
if (!childrenStr.matches("\\s*")) {
return childrenStr;
}
} else {
for (Node node : element.childNodes()) {
res.append(getNodeValue(node, !element.tagName().toLowerCase(Locale.ROOT).equals("tr")));
}
}
return res.toString();
}
/**
* 深度遍历结点(递归)
*
* @param node
* @return
*/
public static String getNodeValue(Node node, boolean needBr) {
StringBuilder result = new StringBuilder();
if (node.childNodes().size() == 0) {
String text = "";
if (node.nodeName().equals("#text")) {
text = ((TextNode) node).text();
} else if (node.nodeName().equals("br") && needBr) {
text = "\r\n";
} else {
text = ((Element) node).text();
}
String childrenStr = (text.matches("\\s*")) ? "" : text.replaceAll("\u200B", "");
if (text.equals("\r\n")) {
result.append("\r\n") ;
}
if (!childrenStr.matches("\\s*")) {
return childrenStr;
}
} else {
for (int i = 0; i < node.childNodes().size(); i++) {
if (node.nodeName().toLowerCase(Locale.ROOT).equals("br") && needBr) {
if (!result.toString().matches("\\s*")) {
result.append("\r\n");
}
} else {
result.append(getNodeValue(node.childNode(i), needBr));
}
}
}
return result.toString();
}
/**
* 深度遍历找标签
*
* @param element
* @return
*/
public static List<String> deepTravalTag(Element element) {
List<String> resultList = new ArrayList<>();
if (element.childrenSize() == 0) {
resultList.add(element.text());
return resultList;
}
if (element.attr("style") != null && element.attr("style").contains("display:none")) {
return resultList;
}
int lastIndex = -1;
for (int i = 0; i < element.childrenSize(); i++) {
Element child = element.children().get(i);
if (child.text() == null && child.text().matches("\\s*") && !child.nodeName().equals("br")) {
log.info(child.nodeName() + "中文本为空");
continue;
}
if (child.attr("style") != null && child.attr("style").contains("display:none")) {
continue;
}
//判断标签的子标签包不包含分段标签
boolean needFind = needFindChildren(child);
//自己不是分段标签,并且所有子标签也不是
if (!controlTagList.contains(child.nodeName()) && !needFind) {
//直接获得文本
String eleStr = getElementValue(child);
if (eleStr.matches("\\s*")) {
continue;
}
if (lastIndex != -1) {
String s = resultList.get(lastIndex) + eleStr;
resultList.set(lastIndex, s);
} else {
resultList.add(eleStr);
lastIndex = resultList.size() - 1;
}
} else if (controlTagList.contains(child.nodeName()) && !needFind) { //自己是分段标签,子标签不是
lastIndex = -1;
String line = getElementValue(child);
resultList.add(line);
} else {
lastIndex = -1;
List<String> list = deepTravalTag(child);
resultList.addAll(list);
}
}
return resultList;
}
public static boolean needFindChildren(Element element) {
if (element.nodeName().equals("tr") || element.nodeName().equals("li")) {
return false;
}
boolean contains = false;
for (String tag : controlTagList) {
Elements children = element.children();
for (Element ele : children) {
if (ele.getElementsByTag(tag).size() != 0) {
contains = true;
break;
}
}
}
return contains;
}
/**
* 去空格
*
* @param list
*/
public static void removeSpaceFromList(List<String> list) {
list.removeIf(s -> s == null || s.isEmpty());
}
}