JAVA利用jsoup爬取小说网站内容

jar包

 

 



package
Jsouop; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import javax.print.DocFlavor; import javax.print.attribute.standard.PDLOverrideSupported; import java.io.*; import java.net.URL; import java.util.regex.Matcher; import java.util.regex.Pattern; public class Jsoupdemo2 { public static void main(String[] args) throws IOException { //1.导入jar包 //2.获取DOCUMENT对象 //2.1获取路径 String paths = Jsoupdemo2.class.getClassLoader().getResource("student.xml").getPath(); //2.2解析XML对象。加载进内存 ///Document document = Jsoup.parse(new File(paths), "utf-8"); //下一章的href String Shref=null; String urltow=readFileContent("d:\\yd.txt"); System.out.println(urltow); //URL打开一个连接 //随便打开一个章节 URL url=new URL("http://www.uidzhx.com/du/27/27614/"+urltow+""); //获取指定div元素 String sts=Jsoup.parse(url, 100000).select("#content1").toString(); String sc=getChinese(sts); //转换为数组 char[] c=sc.toCharArray(); for(int i=0;i<=c.length-1;i++) { System.out.print(c[i]); //每100个换一行 if(i%50==0){ System.out.println(); } } // System.out.print(ststow);//输出 //获取全部A标签元素 Elements a = Jsoup.parse(url, 100000).getElementsByTag("a"); for (Element element : a) { if(element.text().contains("下一章")){ Shref=element.attr("href");//取得下一章的href } } System.out.println(Shref); //创建文本记录下一章节href FileWriter fileWriter=new FileWriter("d:\\yd.txt"); fileWriter.write(Shref); fileWriter.flush(); fileWriter.close(); } public static String getChinese(String paramValue) {//正则方法 String regex = "([\u4e00-\u9fa5,。]+)"; String str = ""; Matcher matcher = Pattern.compile(regex).matcher(paramValue); while (matcher.find()) { str+= matcher.group(0); if(str.length()%200==0){ System.out.println(); } } return str; } //读取TXT文本内容 public static String readFileContent(String fileName) { File file = new File(fileName); BufferedReader reader = null; StringBuffer sbf = new StringBuffer(); try { reader = new BufferedReader(new FileReader(file)); String tempStr; while ((tempStr = reader.readLine()) != null) { sbf.append(tempStr); } reader.close(); return sbf.toString(); } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e1) { e1.printStackTrace(); } } } return sbf.toString(); } }

 

posted on 2020-12-31 10:55  风华流沙~  阅读(446)  评论(0)    收藏  举报

导航