3089589

  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

最近工作中需要做一个爬虫去抓取指定页面的一些内容,准备使用 HTMLParser 来解析页面结构,顺便看了一下 HttpClient 4,可以将它们配合使用。

HTMLParser 有三种模式:filter 模式、visitor 模式、bean 模式。

/**
 * Visitor-mode example: loads http://www.baidu.com and prints the value of the
 * {@code content} attribute of every tag whose name is {@code META}.
 *
 * <p>META tags that carry no {@code content} value (for example
 * {@code <meta charset="...">}) are skipped instead of causing a
 * {@link NullPointerException}.
 *
 * @throws Exception if the parser fails to load or walk the page
 */
public static void parserByVisitor() throws Exception {
    Parser parser = new Parser();
    parser.setURL("http://www.baidu.com");
    NodeVisitor visitor = new NodeVisitor() {
        @Override
        public void visitTag(Tag tag) {
            // Literal first so a null tag name cannot throw.
            if ("META".equals(tag.getTagName())) {
                // getAttribute returns null when the attribute is absent, unlike
                // getAttributeEx(...).getValue() which dereferences a null Attribute.
                String content = tag.getAttribute("content");
                if (content != null) {
                    System.out.println(content);
                }
            }
        }
    };
    parser.visitAllNodesWith(visitor);
}

publicstatic List<URL> parserByFilter(URL url) throws ParserException, MalformedURLException{  
        List<URL> result =new ArrayList<URL>();  
        Parser parser =new Parser();  
        parser.setURL(url.toString());  
//设置编码  
        parser.setEncoding("GBK");  
        NodeFilter filter =new NodeClassFilter(LinkTag.class);//过滤器  
        NodeList list = parser.extractAllNodesThatMatch(filter);//过滤所有链接  
//遍历结果  
for(int i=0;i<list.size();i++){  
            LinkTag tag = (LinkTag) list.elementAt(i);  
if(tag.getLink().startsWith("http://")){  
                URL link =new URL(tag.getLink());  
                result.add(link);  
            }  
            System.out.println(tag.getLink());  
        }  
return result;  

    }

/**
 * Demo entry point: runs the filter-mode example against http://www.baidu.com.
 *
 * @param args unused
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void main(String[] args) throws Exception {
    // Switch to the visitor-mode example by calling parserByVisitor() instead.
    // parserByVisitor();
    parserByFilter(new URL("http://www.baidu.com"));
}

httpclient4封装类

/**
 * Small helper around a single shared HttpClient 4 instance for issuing GET
 * and form-encoded POST requests and reading the response body as text.
 *
 * <p>Response bodies are always decoded as GBK. Both request methods return
 * {@code null} instead of throwing when the request fails or the status code
 * is not 200.
 */
public class HttpClientUtils {
    private static final Logger logger = LoggerFactory.getLogger(HttpClientUtils.class);

    /** Number of attempts {@link #get(String)} makes before giving up. */
    private static final int MAX_GET_ATTEMPTS = 3;

    /** Charset used to decode every response body. */
    private static final String RESPONSE_CHARSET = "GBK";

    private static final HttpClient httpclient;

    static {
        HttpParams params = new BasicHttpParams();
        HttpProtocolParams.setVersion(params, HttpVersion.HTTP_1_1);
        // Connection timeout: 5 seconds.
        HttpConnectionParams.setConnectionTimeout(params, 5 * 1000);

        ThreadSafeClientConnManager manager = new ThreadSafeClientConnManager();
        // Maximum total number of connections held by the manager.
        manager.setMaxTotal(50);
        httpclient = new DefaultHttpClient(manager, params);
    }

    /**
     * Issues a GET request, trying up to {@value #MAX_GET_ATTEMPTS} times.
     *
     * <p>An attempt succeeds when the response status is 200 and a body is
     * present. Any other status, a missing body, or an exception leads to the
     * next attempt. After the last failed attempt the request is aborted.
     *
     * @param url the URL to fetch
     * @return the response body decoded as GBK, or {@code null} if every
     *         attempt failed
     */
    public static String get(String url) {
        HttpGet httpget = new HttpGet(url);
        httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7 360EE");
        httpget.setHeader("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3");
        httpget.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
        for (int i = 0; i < MAX_GET_ATTEMPTS; i++) {
            HttpEntity httpEntity = null;
            try {
                HttpResponse response = httpclient.execute(httpget);
                httpEntity = response.getEntity();
                if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK && httpEntity != null) {
                    return EntityUtils.toString(httpEntity, RESPONSE_CHARSET);
                }
            } catch (Exception e) {
                logger.error("第{}次请求失败,开始重连", i + 1);
                logger.error(e.getMessage(), e);
            } finally {
                // Always drain the entity so the connection goes back to the pool.
                consumeQuietly(httpEntity, url);
            }
        }
        httpget.abort(); // give up: abort the request
        return null;
    }

    /**
     * Issues a single form-encoded POST request. The form is encoded as UTF-8.
     *
     * @param url the URL to post to
     * @param map form field names and values; {@code null} is treated as an
     *            empty form
     * @return the response body decoded as GBK, or {@code null} if the request
     *         failed or the status code was not 200
     */
    public String post(String url, Map<String, String> map) {
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();
        if (map != null) {
            for (Map.Entry<String, String> entry : map.entrySet()) {
                nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
        }
        HttpEntity httpEntity = null;
        try {
            HttpPost httppost = new HttpPost(url);
            httppost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8));
            HttpResponse response = httpclient.execute(httppost);
            httpEntity = response.getEntity();

            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK && httpEntity != null) {
                return EntityUtils.toString(httpEntity, RESPONSE_CHARSET);
            }
        } catch (Exception e) {
            logger.error("POST request to " + url + " failed", e);
        } finally {
            // Previously the entity was consumed only on the 200 path, so any other
            // status (or a read failure) left the pooled connection checked out.
            consumeQuietly(httpEntity, url);
        }
        return null;
    }

    /** Drains {@code entity} if present, logging rather than propagating I/O errors. */
    private static void consumeQuietly(HttpEntity entity, String url) {
        if (entity == null) {
            return;
        }
        try {
            EntityUtils.consume(entity);
        } catch (IOException e) {
            logger.warn("Failed to release response entity for " + url, e);
        }
    }
}
 

 

 

posted on 2013-03-26 09:25  liangge0218  阅读(234)  评论(0)    收藏  举报