jsoup解析知网
1 import java.io.IOException; 2 import java.util.ArrayList; 3 import java.util.List; 4 5 import org.jsoup.Connection; 6 import org.jsoup.Connection.Method; 7 import org.jsoup.Jsoup; 8 import org.jsoup.nodes.Document; 9 import org.jsoup.nodes.Element; 10 import org.jsoup.select.Elements; 11 12 public class Kns50 { 13 14 public static void main(String[] args) throws IOException { 15 16 String url = "http://www.cfed.cnki.net/kns50/scdbsearch/cdbbrief.aspx?curpage=1&RecordsPerPage=10&QueryID=45&ID=&turnpage=1&advancedvalue1=&advancedfield1=&secondorderby=&searchInResult=&tpagemode=U&Fields=题名%7c作者%7c关键词%7c摘要%7c全文%7c单位%7c来源%7c主题&KuaKuID=45"; 17 18 List<String> articleInitUrls = new ArrayList<String>();//存放解析的链接 19 20 Connection conn = Jsoup.connect(url); 21 conn.cookie("ASP.NET_SessionId", "i00lvg55szyk2m55bszn12af"); 22 conn.method(Method.GET); 23 conn.followRedirects(false); 24 Document doc = conn.get(); 25 26 Elements links = doc.select("a[href]"); 27 28 for (Element link : links) { 29 30 boolean cd=link.attr("abs:href").contains("detail.aspx");//找出只含有detail.aspk的链接 31 boolean cf=articleInitUrls.contains(link.attr("abs:href"));//剔除重复的链接 32 if (cd) { 33 if (!cf) { 34 articleInitUrls.add(link.attr("abs:href")); 35 } 36 37 } 38 } 39 40 for(String articleInitUrl : articleInitUrls){ 41 getContentByJsoup(articleInitUrl); 42 System.out.println("**********"); 43 } 44 45 46 } 47 48 public static void getContentByJsoup(String url){ 49 //解析整个网页 50 String content=""; 51 try { 52 Document doc=Jsoup.connect(url) 53 .data("jquery", "java") 54 .userAgent("Mozilla") 55 .cookie("auth", "token") 56 .timeout(50000) 57 .get(); 58 59 content=doc.toString(); 60 } catch (IOException e) { 61 e.printStackTrace(); 62 } 63 64 Document doc=Jsoup.parse(content); 65 66 System.out.println("链接:"+url); 67 68 String title=doc.select("span.datatitle").get(1).text(); 69 System.out.println("标题:"+title); 70 71 String author=doc.select("td").text().split("【作者】")[1].split("【")[0]; 72 
System.out.println("作者:"+author); 73 74 String summary=doc.select("td").text().split("【中文摘要】")[1].split("【")[0]; 75 System.out.println("中文摘要:"+summary); 76 } 77 78 }
代码还有需要完善的地方:cookie 目前是写死的,这部分等研究好之后再改。

浙公网安备 33010602011771号