在使用 HttpClient 进行网页抓取的时候,难免会遇到目标页面使用 Cookies 的情况,好在 HttpClient 能够轻松应对这种情形。在 HttpClient 中,无论是在请求中带入 Cookies,还是在请求完成后获取 Cookies,都可以通过 CookieStore 对象来完成。
具体的做法是:将 CookieStore 实例加入一个 HttpContext 实例中,然后将该上下文实例带入 HTTP 请求的执行过程。这样,HttpClient 便能使用其中的 Cookies 信息,并将服务器返回的 Cookies 存入其中。具体的范例代码如下所示:
package cn.ysh.studio.crawler.httpclient;import java.util.List;import org.apache.http.HttpEntity;import org.apache.http.HttpResponse;import org.apache.http.client.CookieStore;import org.apache.http.client.HttpClient;import org.apache.http.client.methods.HttpGet;import org.apache.http.client.protocol.ClientContext;import org.apache.http.cookie.Cookie;import org.apache.http.impl.client.BasicCookieStore;import org.apache.http.impl.client.DefaultHttpClient;import org.apache.http.protocol.HttpContext;import org.apache.http.protocol.BasicHttpContext;import org.apache.http.util.EntityUtils;/** * 演示如何使用一个本地HTTP上下文填充自定义属性(获取Cookies) * * @author Shenghany * @date 2013-5-19 */publicclassClientCustomContext{publicfinalstaticvoid main(String[] args)throwsException{HttpClient httpclient =newDefaultHttpClient();try{// 创建一个CookieStore的本地实例CookieStore cookieStore =newBasicCookieStore();// 创建一个本地HttpContext实例HttpContext localContext =newBasicHttpContext();// 将自定义的CookieStore实例绑定到http上下文对象中 localContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore);//创建Get请求HttpGet httpget =newHttpGet("http://www.google.com");System.out.println("executing request "+ httpget.getURI());// 将本地Http 上下文对象作为参数带入Get请求执行过程HttpResponse response = httpclient.execute(httpget, localContext);//获得响应实体HttpEntity entity = response.getEntity();System.out.println("----------------------------------------");System.out.println(response.getStatusLine());if(entity !=null){System.out.println("Response content length: "+ entity.getContentLength());}//获取所有Cookies List<Cookie> cookies = cookieStore.getCookies();for(int i =0; i < cookies.size(); i++){System.out.println("Local cookie: "+ cookies.get(i));}// 销毁响应实体EntityUtils.consume(entity);System.out.println("----------------------------------------");}finally{// 当不再需要HttpClient实例时,关闭连接管理器以确保释放所有占用的系统资源 httpclient.getConnectionManager().shutdown();}}}