爬虫——博客实例

/**
 * Describes a single query: the target url, the request method, the request
 * parameters, and how the returned HTML should be filtered first.
 */
public class Rule
{
    /** Request method: GET. */
    public final static int GET=0;
    /** Request method: POST. */
    public final static int POST=1;

    /** {@code resultTagName} is a CSS class name. */
    public final static int CLASS=0;
    /** {@code resultTagName} is an element id. */
    public final static int ID=1;
    /** {@code resultTagName} is a selector expression. */
    public final static int SELECTION=2;

    /** Target link. */
    private String url;
    /** Parameter names. */
    private String[] params;
    /** Parameter values, paired with {@link #params} by index. */
    private String[] values;
    /**
     * Name used for the first round of filtering on the returned HTML.
     * How it is interpreted depends on {@link #type}, so set the type as well.
     */
    private String resultTagName;
    /** Kind of {@link #resultTagName}: CLASS / ID / SELECTION. Defaults to ID. */
    private int type=ID;
    /** Request method: GET / POST. Defaults to GET. */
    private int requestMethod=GET;

    /** Creates an empty rule with the default type (ID) and request method (GET). */
    public Rule()
    {
    }

    /**
     * Creates a fully specified rule.
     *
     * @param url           target link
     * @param params        parameter names (may be null)
     * @param values        parameter values, paired with {@code params} by index
     * @param resultTagName name used to filter the returned HTML
     * @param type          one of {@link #CLASS}, {@link #ID}, {@link #SELECTION}
     * @param requestMethod one of {@link #GET}, {@link #POST}
     */
    public Rule(String url,String[] params,String[] values,String resultTagName,int type,int requestMethod)
    {
        super();
        this.url=url;
        // The parameter used to be named "param", which turned this line into a self-assignment.
        this.params=params;
        this.values=values;
        this.resultTagName=resultTagName;
        this.type=type;
        this.requestMethod=requestMethod;
    }

    /** @return the target link */
    public String getUrl()
    {
        return url;
    }

    /** @param url the target link */
    public void setUrl(String url)
    {
        this.url=url;
    }

    /** @return the parameter names, or null if none were set */
    public String[] getParams()
    {
        return params;
    }

    /** @param params the parameter names */
    public void setParams(String[] params)
    {
        this.params=params;
    }

    /** @return the parameter values, or null if none were set */
    public String[] getValues()
    {
        return values;
    }

    /** @param values the parameter values */
    public void setValues(String[] values)
    {
        this.values=values;
    }

    /** @return the name used to filter the returned HTML */
    public String getResultTagName()
    {
        return resultTagName;
    }

    /** @param resultTagName the name used to filter the returned HTML */
    public void setResultTagName(String resultTagName)
    {
        this.resultTagName=resultTagName;
    }

    /** @return the kind of {@code resultTagName}: CLASS, ID or SELECTION */
    public int getType()
    {
        return type;
    }

    /** @param type one of {@link #CLASS}, {@link #ID}, {@link #SELECTION} */
    public void setType(int type)
    {
        this.type=type;
    }

    /** @return the request method: GET or POST */
    public int getRequestMethod()
    {
        return requestMethod;
    }

    /** @param requestMethod one of {@link #GET}, {@link #POST} */
    public void setRequestMethod(int requestMethod)
    {
        this.requestMethod=requestMethod;
    }
}

 

 

/**
 * Data object holding the information collected for one link.
 */
public class LinkTypeData {
    private int id;
    /** Address of the link. */
    private String linkHref;
    /** Title (text) of the link. */
    private String linkText;
    /** Summary. */
    private String summary;
    /** Content. */
    private String content;

    /** @return the id */
    public int getId()
    {
        return id;
    }

    /** @param id the id */
    public void setId(int id)
    {
        this.id=id;
    }

    /** @return the address of the link */
    public String getLinkHref()
    {
        return linkHref;
    }

    /** @param linkHref the address of the link */
    public void setLinkHref(String linkHref)
    {
        this.linkHref=linkHref;
    }

    /** @return the title (text) of the link */
    public String getLinkText()
    {
        return linkText;
    }

    /** @param linkText the title (text) of the link */
    public void setLinkText(String linkText)
    {
        this.linkText=linkText;
    }

    /** @return the summary */
    public String getSummary()
    {
        return summary;
    }

    /** @param summary the summary */
    public void setSummary(String summary)
    {
        this.summary=summary;
    }

    /** @return the content */
    public String getContent()
    {
        return content;
    }

    /** @param content the content */
    public void setContent(String content)
    {
        this.content=content;
    }

}

 

 

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

//核心的查询类
public class ExtractService {
public static List<LinkTypeData> extract(Rule rule)//<>是泛型,里面指定了这个集合中存放的是什么数据
{
//对rule必要检验
validateRule(rule);
List<LinkTypeData> datas=new ArrayList<LinkTypeData>();
LinkTypeData data=null;
try
{
//解析rule
String url=rule.getUrl();
String[] params=rule.getParams();
String[] values=rule.getValues();
String resultTagName=rule.getResultTagName();
int type=rule.getType;
int requestType=rule.getRequestMethod();
connection conn=Jsoup.connect(url);//Jsoup.connect解析url网站地址
//设置查询参数
if(params!=null)
{
for(int i=0;i<params.length;i++)
{
conn.data(params[i],values[i]);
}
}
//设置请求类型
Document doc=null;
switch (requestType)
{
case Rule.GET:
doc=conn.timeout(100000).get();
break;
case Rule.Post:
doc=conn.timeout(100000).post();
break;
}
//处理返回数据
Elements results=new Elements();
switch(type)
{
case Rule.CLASS:
results=doc.getElementsByClass(resultTagName);
break;
case Rule.ID:
Element result=doc.getElementById(resultTagName);
results.add(result);
break;
case Rule.SELECTION:
results=doc.select(resultTagName)
break;
default;
//当resultTagName为空时默认去body标签
if(TextUtil.isEmpty(resultTagName))
{
results=doc.getElementsByTag("body");
}
}
for(Element result:results)
{
Elements links=result.getElementsByTag("a");

for(Element link:links)
{
//必要的筛选
String linkHref=link.attr("href");
String linkText=link.text();
data=new LinkTypeData();
data.setLinkHref(linkHref);
data.setLinkText(linkText);
datas.add(data);
}
}
}catch(IOException e)
{
e.printStackTrace();
}
return datas;
}

//传入参数必要检验
private static void validateRule(Rule rule)
{
String url=rule.getUrl();
if(TextUtil.isEmpty(url))
{
throw new RuleException("url不能为空!");
}
if(!url.startsWith("http://"))
{
throw new RuleException("url格式不正确");
}
if(rule.getParams()!=null&&rule.getValues()!=null)
{
if(rule.getParams().length!=rule.getvalues().length)
{
throw new RuleException("参数键值对个数不匹配");
}

}
}
}

posted @ 2015-04-15 21:57  sunshinewxz  阅读(142)  评论(0编辑  收藏  举报