解析了grid2008的代码

  1 import java.io.IOException;
  2 import java.util.ArrayList;
  3 import java.util.HashMap;
  4 import java.util.List;
  5 import java.util.Map;
  6 
  7 import org.jsoup.Connection;
  8 import org.jsoup.Connection.Method;
  9 import org.jsoup.Connection.Response;
 10 import org.jsoup.Jsoup;
 11 import org.jsoup.nodes.Document;
 12 import org.jsoup.nodes.Element;
 13 import org.jsoup.select.Elements;
 14 
 15 /**
 16  * grid2008的代码,解析了优先出版
 17  * 
 18  */
 19 
 20 public class GetCkTest20082 {
 21     //定义cat标记 注意大写。你懂的
 22     static String  cat="CJFQ";
 23     static String sKuakuID = "";
 24     public static void main(String[] args) {
 25         
 26         //列表url
 27         
 28         Map<String,String> cookies = getCookie(cat);
 29 
 30         if(cookies.size() >0){
 31             //列表
 32             String listUrl = "http://epub.cnki.net/grid2008/brief/brief.aspx?pagename=asp.brief_result_aspx&dbprefix=scdb&skuakuid="+sKuakuID+
 33                     "&loadgroup=1&prio=true&stab=normal&turnpage=1&recordsperpage=20&queryid="+sKuakuID+"&id=&curpage=3";
 34             
 35             //文章最初链接
 36             List<String> articleInitUrls = new ArrayList<String>();
 37             Connection conn = Jsoup.connect(listUrl);
 38             conn.method(Method.GET);
 39             conn.followRedirects(false);
 40             conn.timeout(5000);
 41             conn.cookies(cookies);
 42             try {
 43                 Document doc = conn.get();
 44                 Elements links = doc.select("a[target=NewBriefDetail]");
 45                 if(links.size() <=  0){
 46                     System.out.println("没有更多文章。");
 47                 }else{
 48                     for(Element link : links){
 49                         articleInitUrls.add(link.attr("abs:href"));
 50                         //System.out.println(articleInitUrls);
 51                     }
 52                 }
 53                 
 54             } catch (IOException e) {
 55                 System.out.println("链接超时了。。");
 56             }
 57             
 58             if(articleInitUrls.size()<=0){
 59                 System.out.println("没有文章!");
 60             }
 61             for(String articleInitUrl : articleInitUrls){
 62                 
 63                 Connection conn2 = Jsoup.connect(articleInitUrl);
 64                 
 65                 conn2.header("Referer", listUrl);
 66                 conn2.cookies(cookies);
 67                 conn2.followRedirects(false);
 68                  try {
 69                     Document doc = conn2.get();
 70                     Elements links = doc.select("h2 > a[href]");
 71                     //System.out.println(links);
 72                     if(links.size()<=0){
 73                         System.out.println("最初链接为:【"+articleInitUrl+"】的文章获取实际链接失败!");
 74                     }else{
 75 
 76                         String url1 = links.toString();
 77                         //System.out.println(url1.substring(url1.indexOf("detail%252f")+11,url1.indexOf(".html")));
 78                         String url2=url1.substring(url1.indexOf("detail%252f")+11,url1.indexOf(".html"));
 79                         
 80                         //String articleUrl = links.get(0).attr("href");
 81                         
 82                         String articleUrl="www.cnki.net/kcms/detail/"+url2+".html";
 83                         System.out.println(articleUrl);
 84                         //getArticle(articleUrl);
 85                     }
 86                 } catch (IOException e) {
 87                     System.out.println("最初链接为:【"+articleInitUrl+"】的文章链接超时!");
 88                 }
 89             }
 90         }
 91         
 92 
 93     }
 94     
 95     public static Map<String,String> getCookie(String cat) {
 96         
 97         String listUrl = "http://epub.cnki.net/grid2008/brief/Result.aspx";
 98         //检索
 99         
100         String searchHander = "http://epub.cnki.net/grid2008/request/search.aspx?PageName=ASP.brief_result_aspx&DBViewType=FullText";
101         Connection conn = Jsoup.connect(listUrl);
102         conn.method(Method.GET);
103         conn.followRedirects(false);
104         conn.timeout(5000);
105         try {
106             Document doc = conn.get();
107             String db_opt = doc.select("input#db_opt").attr("value");    
108             String db_prefix = doc.select("input#db_prefix").attr("value");    
109             String db_configfile = doc.select("input#db_configfile").attr("value");    
110             
111             String searchHanderUrl = searchHander+"&DbCatalog="+db_opt+"&DbPrefix="+db_prefix+"&ConfigFile="+db_configfile;
112             Connection conn2 = Jsoup.connect(searchHanderUrl);
113             conn2.method(Method.GET);
114             conn2.followRedirects(false);
115             conn2.timeout(5000);
116             Response response;
117             Document doc2 = conn2.get();
118             String responseContent = doc2.select("body").text();
119             if(responseContent.indexOf("sKuakuID") !=-1){
120                 System.out.println(responseContent.substring(responseContent.indexOf("sKuakuID")+9));
121                 sKuakuID = responseContent.substring(responseContent.indexOf("sKuakuID")+9);
122             }
123             response = conn2.response();
124             return response.cookies(); 
125         } catch (IOException e) {
126             System.out.println("获取cookies的链接超时了。你懂的!");
127             return new HashMap<String,String>();
128         }
129         
130     }
131     
132     public static void getArticle(String articleUrl) {
133         Connection conn = Jsoup.connect(articleUrl);
134         conn.method(Method.GET);
135         conn.followRedirects(false);
136         conn.timeout(5000);
137         try {
138             Document doc = conn.get();
139             //这里只打印标题了。
140             Elements links = doc.select("span#chTitle");
141             System.out.println("文章标题:"+links.get(0).text()+"——链接:【"+articleUrl+"】");
142         } catch (IOException e) {
143             System.out.println("链接文章:【"+articleUrl+"】超时了。");
144         }
145         
146     }
147     
148 }

 

posted @ 2013-11-29 17:26  蓦然回首的包子  阅读(299)  评论(0)    收藏  举报