jsoup解析知网

 1 import java.io.IOException;
 2 import java.util.ArrayList;
 3 import java.util.List;
 4 
 5 import org.jsoup.Connection;
 6 import org.jsoup.Connection.Method;
 7 import org.jsoup.Jsoup;
 8 import org.jsoup.nodes.Document;
 9 import org.jsoup.nodes.Element;
10 import org.jsoup.select.Elements;
11 
12 public class Kns50 {
13 
14 public static void main(String[] args) throws IOException {
15 
16 String url = "http://www.cfed.cnki.net/kns50/scdbsearch/cdbbrief.aspx?curpage=1&RecordsPerPage=10&QueryID=45&ID=&turnpage=1&advancedvalue1=&advancedfield1=&secondorderby=&searchInResult=&tpagemode=U&Fields=题名%7c作者%7c关键词%7c摘要%7c全文%7c单位%7c来源%7c主题&KuaKuID=45";
17 
18 List<String> articleInitUrls = new ArrayList<String>();//存放解析的链接
19 
20 Connection conn = Jsoup.connect(url);
21 conn.cookie("ASP.NET_SessionId", "i00lvg55szyk2m55bszn12af");
22 conn.method(Method.GET);
23 conn.followRedirects(false);
24 Document doc = conn.get();
25 
26 Elements links = doc.select("a[href]");
27 
28 for (Element link : links) {
29 
30 boolean cd=link.attr("abs:href").contains("detail.aspx");//找出只含有detail.aspk的链接
31 boolean cf=articleInitUrls.contains(link.attr("abs:href"));//剔除重复的链接
32 if (cd) {
33 if (!cf) {
34 articleInitUrls.add(link.attr("abs:href")); 
35 }
36 
37 }
38 }
39 
40 for(String articleInitUrl : articleInitUrls){
41 getContentByJsoup(articleInitUrl); 
42 System.out.println("**********");
43 }
44 
45 
46 }
47 
48 public static void getContentByJsoup(String url){ 
49 //解析整个网页
50 String content=""; 
51 try { 
52 Document doc=Jsoup.connect(url) 
53 .data("jquery", "java") 
54 .userAgent("Mozilla") 
55 .cookie("auth", "token") 
56 .timeout(50000) 
57 .get();
58 
59 content=doc.toString();
60 } catch (IOException e) { 
61 e.printStackTrace(); 
62 }
63 
64 Document doc=Jsoup.parse(content); 
65 
66 System.out.println("链接:"+url); 
67 
68 String title=doc.select("span.datatitle").get(1).text();
69 System.out.println("标题:"+title); 
70 
71 String author=doc.select("td").text().split("【作者】")[1].split("【")[0];
72 System.out.println("作者:"+author); 
73 
74 String summary=doc.select("td").text().split("【中文摘要】")[1].split("【")[0];
75 System.out.println("中文摘要:"+summary); 
76 } 
77 
78 }

代码还有需要完善的地方：cookie 是写死的，这部分等研究好再改。

posted @ 2013-11-29 17:23  蓦然回首的包子  阅读(315)  评论(0)    收藏  举报