1 import java.io.IOException;
2 import java.util.ArrayList;
3 import java.util.HashMap;
4 import java.util.List;
5 import java.util.Map;
6
7 import org.jsoup.Connection;
8 import org.jsoup.Connection.Method;
9 import org.jsoup.Connection.Response;
10 import org.jsoup.Jsoup;
11 import org.jsoup.nodes.Document;
12 import org.jsoup.nodes.Element;
13 import org.jsoup.select.Elements;
14
15 /**
16 * grid2008的代码,解析了优先出版
17 *
18 */
19
20 public class GetCkTest20082 {
21 //定义cat标记 注意大写。你懂的
22 static String cat="CJFQ";
23 static String sKuakuID = "";
24 public static void main(String[] args) {
25
26 //列表url
27
28 Map<String,String> cookies = getCookie(cat);
29
30 if(cookies.size() >0){
31 //列表
32 String listUrl = "http://epub.cnki.net/grid2008/brief/brief.aspx?pagename=asp.brief_result_aspx&dbprefix=scdb&skuakuid="+sKuakuID+
33 "&loadgroup=1&prio=true&stab=normal&turnpage=1&recordsperpage=20&queryid="+sKuakuID+"&id=&curpage=3";
34
35 //文章最初链接
36 List<String> articleInitUrls = new ArrayList<String>();
37 Connection conn = Jsoup.connect(listUrl);
38 conn.method(Method.GET);
39 conn.followRedirects(false);
40 conn.timeout(5000);
41 conn.cookies(cookies);
42 try {
43 Document doc = conn.get();
44 Elements links = doc.select("a[target=NewBriefDetail]");
45 if(links.size() <= 0){
46 System.out.println("没有更多文章。");
47 }else{
48 for(Element link : links){
49 articleInitUrls.add(link.attr("abs:href"));
50 //System.out.println(articleInitUrls);
51 }
52 }
53
54 } catch (IOException e) {
55 System.out.println("链接超时了。。");
56 }
57
58 if(articleInitUrls.size()<=0){
59 System.out.println("没有文章!");
60 }
61 for(String articleInitUrl : articleInitUrls){
62
63 Connection conn2 = Jsoup.connect(articleInitUrl);
64
65 conn2.header("Referer", listUrl);
66 conn2.cookies(cookies);
67 conn2.followRedirects(false);
68 try {
69 Document doc = conn2.get();
70 Elements links = doc.select("h2 > a[href]");
71 //System.out.println(links);
72 if(links.size()<=0){
73 System.out.println("最初链接为:【"+articleInitUrl+"】的文章获取实际链接失败!");
74 }else{
75
76 String url1 = links.toString();
77 //System.out.println(url1.substring(url1.indexOf("detail%252f")+11,url1.indexOf(".html")));
78 String url2=url1.substring(url1.indexOf("detail%252f")+11,url1.indexOf(".html"));
79
80 //String articleUrl = links.get(0).attr("href");
81
82 String articleUrl="www.cnki.net/kcms/detail/"+url2+".html";
83 System.out.println(articleUrl);
84 //getArticle(articleUrl);
85 }
86 } catch (IOException e) {
87 System.out.println("最初链接为:【"+articleInitUrl+"】的文章链接超时!");
88 }
89 }
90 }
91
92
93 }
94
95 public static Map<String,String> getCookie(String cat) {
96
97 String listUrl = "http://epub.cnki.net/grid2008/brief/Result.aspx";
98 //检索
99
100 String searchHander = "http://epub.cnki.net/grid2008/request/search.aspx?PageName=ASP.brief_result_aspx&DBViewType=FullText";
101 Connection conn = Jsoup.connect(listUrl);
102 conn.method(Method.GET);
103 conn.followRedirects(false);
104 conn.timeout(5000);
105 try {
106 Document doc = conn.get();
107 String db_opt = doc.select("input#db_opt").attr("value");
108 String db_prefix = doc.select("input#db_prefix").attr("value");
109 String db_configfile = doc.select("input#db_configfile").attr("value");
110
111 String searchHanderUrl = searchHander+"&DbCatalog="+db_opt+"&DbPrefix="+db_prefix+"&ConfigFile="+db_configfile;
112 Connection conn2 = Jsoup.connect(searchHanderUrl);
113 conn2.method(Method.GET);
114 conn2.followRedirects(false);
115 conn2.timeout(5000);
116 Response response;
117 Document doc2 = conn2.get();
118 String responseContent = doc2.select("body").text();
119 if(responseContent.indexOf("sKuakuID") !=-1){
120 System.out.println(responseContent.substring(responseContent.indexOf("sKuakuID")+9));
121 sKuakuID = responseContent.substring(responseContent.indexOf("sKuakuID")+9);
122 }
123 response = conn2.response();
124 return response.cookies();
125 } catch (IOException e) {
126 System.out.println("获取cookies的链接超时了。你懂的!");
127 return new HashMap<String,String>();
128 }
129
130 }
131
132 public static void getArticle(String articleUrl) {
133 Connection conn = Jsoup.connect(articleUrl);
134 conn.method(Method.GET);
135 conn.followRedirects(false);
136 conn.timeout(5000);
137 try {
138 Document doc = conn.get();
139 //这里只打印标题了。
140 Elements links = doc.select("span#chTitle");
141 System.out.println("文章标题:"+links.get(0).text()+"——链接:【"+articleUrl+"】");
142 } catch (IOException e) {
143 System.out.println("链接文章:【"+articleUrl+"】超时了。");
144 }
145
146 }
147
148 }