1 package com.raycloud.util;
2
3 import java.io.IOException;
4 import java.util.ArrayList;
5 import java.util.List;
6 import java.util.regex.Matcher;
7 import java.util.regex.Pattern;
8
9 import org.apache.commons.httpclient.HttpClient;
10 import org.apache.commons.httpclient.HttpStatus;
11 import org.apache.commons.httpclient.methods.GetMethod;
12 import org.apache.log4j.Logger;
13
14 /**
15 * <ul>
16 * <li>Function:
17 * <ul>
18 * <li>TODO</li>
19 * </ul>
20 * </li>
21 * <li>CopyRight
22 * <ul></ul>
23 * </li>
24 * <li>author: <a href="http://blog.csdn.net/wgzhl2008">wgzhl2008</a></li>
25 * <li>E-mail: <a href="mailto:wgzhl2008@gmail.com">wgzhl2008@gmail.com</a>
26 * <li>Version:1.0</li>
27 * <li>Date:2012-3-17 下午01:36:26</li>
28 * </ul>
29 */
30 public class FetchShopUtil {
31 private static final Logger logger = Logger.getLogger(FetchShopUtil.class);
32 /**
33 * @param url
34 * @return
35 * @see 依据网址获取页面内容
36 * @author wgzhl2008
37 * @version 1.0
38 * @date 2012-3-17 下午02:04:19
39 */
40 public static String fetchHtmlFromWebPage(String url){
41 GetMethod getMethod =new GetMethod(url);
42 HttpClient client = new HttpClient();
43 int status = 0;
44 try{
45 status = client.executeMethod(getMethod);
46 if(status!=HttpStatus.SC_OK){
47 //如果不成功,休息3s后再进行一次抓取
48 Thread.sleep(1000*3);
49 status = client.executeMethod(getMethod);
50 }
51 }catch (Exception e) {
52 getMethod.releaseConnection();
53 logger.error("抓取网页内容出错"+e.getMessage(),e);
54 }
55 String sResponse="";
56 if(status==HttpStatus.SC_OK){
57 try {
58 sResponse=getMethod.getResponseBodyAsString();
59 } catch (IOException e) {
60 getMethod.releaseConnection();
61 logger.error("抓取网页内容出错"+e.getMessage(),e);
62 }
63 }else{
64 logger.info("抓取网页内容失败");
65 }
66 getMethod.releaseConnection();
67
68 return sResponse;
69 }
70
71 /**
72 * @param html
73 * @return {@link List}
74 * @see 得到店铺名字
75 * @author wgzhl2008
76 * @version 1.0
77 * @date 2012-3-17 下午02:09:01
78 */
79 public static List<String> fecthShopName(String html){
80 List<String> resultItem = new ArrayList<String>();
81 Pattern p1 = Pattern
82 .compile("<p(\\s*)class=(\\s*)\"nick\"(\\s*)>(.*?)</p>");
83 Matcher m1 = p1.matcher(html);
84 String itemContent[] = null;
85 while (m1.find()) {
86 itemContent = m1.group().split(">");
87 for(String s:itemContent){
88 int index=s.indexOf("<");
89 if(index!=0){
90 resultItem.add(s.substring(0,index).trim());
91 }
92 }
93 }
94 return resultItem;
95 }
96 }