I recently needed to write a crawler at work to scrape some content from specific pages. I decided to use HtmlParser to parse the page structure and, while I was at it, also took a look at HttpClient 4; the two can be used together nicely.
HtmlParser can be used in three styles: the filter style, the visitor style, and the bean style.
public static void parserByVisitor() throws Exception {
    Parser parser = new Parser();
    parser.setURL("http://www.baidu.com");
    // Visitor style: walk every node in the page and react only to the tags we care about
    NodeVisitor visitor = new NodeVisitor() {
        @Override
        public void visitTag(Tag tag) {
            if (tag.getTagName().equals("META")) {
                System.out.println(tag.getAttributeEx("content").getValue());
            }
        }
    };
    parser.visitAllNodesWith(visitor);
}
public static List<URL> parserByFilter(URL url) throws ParserException, MalformedURLException {
    List<URL> result = new ArrayList<URL>();
    Parser parser = new Parser();
    parser.setURL(url.toString());
    // set the encoding
    parser.setEncoding("GBK");
    NodeFilter filter = new NodeClassFilter(LinkTag.class); // filter that keeps only link tags
    NodeList list = parser.extractAllNodesThatMatch(filter); // extract all links
    // walk the results
    for (int i = 0; i < list.size(); i++) {
        LinkTag tag = (LinkTag) list.elementAt(i);
        if (tag.getLink().startsWith("http://")) {
            URL link = new URL(tag.getLink());
            result.add(link);
        }
        System.out.println(tag.getLink());
    }
    return result;
}
public static void main(String[] args) throws Exception {
    // parserByVisitor();
    parserByFilter(new URL("http://www.baidu.com"));
}
The HttpClient 4 wrapper class:
public class HttpClientUtils {
    private static Logger logger = LoggerFactory.getLogger(HttpClientUtils.class);
    private static HttpClient httpclient;
    static {
        HttpParams params = new BasicHttpParams();
        HttpProtocolParams.setVersion(params, HttpVersion.HTTP_1_1);
        // connection timeout
        HttpConnectionParams.setConnectionTimeout(params, 5 * 1000);
        ThreadSafeClientConnManager manager = new ThreadSafeClientConnManager();
        // maximum number of concurrent connections
        manager.setMaxTotal(50);
        httpclient = new DefaultHttpClient(manager, params);
    }
    /**
     * GET request, retried up to 3 times
     *
     * @param url
     * @return
     */
    public static String get(String url) {
        HttpGet httpget = new HttpGet(url);
        httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7 360EE");
        httpget.setHeader("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3");
        // httpget.setHeader("Accept-Encoding", "gzip,deflate,sdch");
        httpget.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
        String html = null;
        for (int i = 0; i < 3; i++) {
            HttpEntity httpEntity = null;
            try {
                HttpResponse response = httpclient.execute(httpget);
                httpEntity = response.getEntity();
                if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK && httpEntity != null) {
                    html = EntityUtils.toString(httpEntity, "GBK");
                    return html;
                }
            } catch (Exception e) {
                logger.error("Attempt {} failed, retrying", i + 1);
                logger.error(e.getMessage(), e);
            } finally {
                try {
                    // fully consume the entity so the connection is returned to the pool
                    EntityUtils.consume(httpEntity);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        httpget.abort(); // release the connection after all attempts have failed
        return html;
    }
    /**
     * POST request
     *
     * @return
     */
    public static String post(String url, Map<String, String> map) {
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();
        for (String key : map.keySet()) {
            NameValuePair nvp = new BasicNameValuePair(key, map.get(key));
            nvps.add(nvp);
        }
        String html = null;
        try {
            HttpPost httppost = new HttpPost(url);
            httppost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8));
            HttpResponse response = httpclient.execute(httppost);
            HttpEntity httpEntity = response.getEntity();
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK && httpEntity != null) {
                html = EntityUtils.toString(httpEntity, "GBK");
                EntityUtils.consume(httpEntity);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return html;
    }
}
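To tie the two libraries together as described at the top, a minimal sketch might look like the following: the page is downloaded through the HttpClient wrapper, and the returned HTML string is handed to HtmlParser via Parser.createParser instead of letting the Parser fetch the URL itself (the URL and "GBK" encoding are just carried over from the examples above):

// Sketch: fetch the page with HttpClientUtils, then parse the in-memory HTML with HtmlParser.
String html = HttpClientUtils.get("http://www.baidu.com");
if (html != null) {
    Parser parser = Parser.createParser(html, "GBK"); // parse the already-downloaded markup
    NodeList links = parser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    for (int i = 0; i < links.size(); i++) {
        System.out.println(((LinkTag) links.elementAt(i)).getLink());
    }
}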