基于boilerpipe算法抓取文章类网页中的文章文本
<dependency> <groupId>de.l3s.boilerpipe</groupId> <artifactId>boilerpipe</artifactId> <!--<version>1.2.0</version>--> <version>1.1.0</version> </dependency> <dependency> <groupId>xerces</groupId> <artifactId>xercesImpl</artifactId> <version>2.9.1</version> </dependency> <dependency> <groupId>net.sourceforge.nekohtml</groupId> <artifactId>nekohtml</artifactId> <version>1.9.13</version> </dependency>
/**
 * Extracts the article text from the HTML of an article-style web page
 * using boilerpipe's {@code CANOLA_EXTRACTOR}.
 *
 * <p>The HTML string is encoded as UTF-8 and handed to the SAX parser with
 * the same encoding declared, so the result does not depend on the JVM's
 * default charset.
 *
 * @param html the raw HTML of the page
 * @return the content of the processed document; {@code html} itself when
 *         {@code StringUtils.isEmpty(html)} is true; or {@code null} when
 *         parsing or extraction throws (the stack trace is printed)
 */
public static String getNewsContent(String html) {
    // Nothing to extract from: hand the input back unchanged.
    if (StringUtils.isEmpty(html)) {
        return html;
    }

    String content = null;
    // The bytes must be produced with the same charset that is declared on the
    // InputSource below; the no-arg getBytes() would use the platform default
    // and corrupt non-ASCII text on JVMs whose default is not UTF-8.
    try (InputStream is = new ByteArrayInputStream(
            html.getBytes(java.nio.charset.StandardCharsets.UTF_8))) {
        InputSource inputSource = new InputSource(is);
        inputSource.setEncoding("UTF-8"); // must match the charset used to encode the bytes above

        TextDocument textDocument = new BoilerpipeSAXInput(inputSource).getTextDocument();
        BoilerpipeExtractor extractor = CommonExtractors.CANOLA_EXTRACTOR;
        // process() updates the document in place; getContent() then returns
        // the text of the processed document.
        extractor.process(textDocument);
        content = textDocument.getContent();
    } catch (Exception e) {
        // Best-effort extraction: report the failure and fall through to return null.
        e.printStackTrace();
    }
    return content;
}
不积跬步无以至千里

浙公网安备 33010602011771号