package Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Small regex-based page scraper: downloads a page and pulls out its title,
 * links, scripts, style blocks and {@code <h1>} text.
 *
 * <p>All extraction is done with regular expressions, not an HTML parser, so
 * results on malformed or unusual markup are best-effort.
 */
public class Test {

    /** Charset used when the server does not declare a usable one. */
    private static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8;

    /** Extracts the {@code charset=...} parameter of a Content-Type header value. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /** Any markup tag, including tags that span several lines. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "<title\\b[^>]*>.*?</title\\s*>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * An anchor element with an href. Groups 2/3/4 hold the double-quoted,
     * single-quoted or unquoted href value; group 5 holds the link content.
     * The {@code \s} after {@code <a} keeps tags such as {@code <area>} from matching.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\s[^>]*href\\s*=\\s*(\"([^\"]*)\"|'([^']*)'|([^\\s>]*))[^>]*>(.*?)</a\\s*>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    private static final Pattern SCRIPT_PATTERN = Pattern.compile(
            "<script\\b.*?</script\\s*>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    private static final Pattern STYLE_PATTERN = Pattern.compile(
            "<style\\b.*?</style\\s*>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    private static final Pattern H1_PATTERN = Pattern.compile(
            "<h1\\b.*?</h1\\s*>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Reads the whole content of one web page.
     *
     * <p>The text is decoded with the charset declared in the response's
     * Content-Type header, falling back to UTF-8 when none is declared or the
     * declared name is not supported. Lines are concatenated without their
     * line terminators.
     *
     * @param htmlurl the URL of the page to read
     * @return the page content with line terminators removed
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL
     * @throws IOException if the page cannot be opened or read
     */
    public String getOneHtml(String htmlurl) throws IOException {
        final URL url;
        try {
            url = new URL(htmlurl);
        } catch (MalformedURLException me) {
            System.out.println("您输入的URL格式有问题!请检查后再次输入!");
            throw me;
        }

        URLConnection conn = url.openConnection();
        Charset charset = charsetFromContentType(conn.getContentType());

        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the stream even when readLine() fails.
        try (BufferedReader in =
                new BufferedReader(new InputStreamReader(conn.getInputStream(), charset))) {
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        }
        return sb.toString();
    }

    /** Picks the charset named in a Content-Type value, or UTF-8 if absent/unsupported. */
    private static Charset charsetFromContentType(String contentType) {
        if (contentType != null) {
            Matcher m = CHARSET_PATTERN.matcher(contentType);
            if (m.find()) {
                try {
                    return Charset.forName(m.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Illegal or unsupported charset name: use the default below.
                }
            }
        }
        return DEFAULT_CHARSET;
    }

    /** Collects every full match of {@code pattern} in {@code s}, in document order. */
    private static List<String> findAll(Pattern pattern, String s) {
        List<String> matches = new ArrayList<>();
        Matcher ma = pattern.matcher(s);
        while (ma.find()) {
            matches.add(ma.group());
        }
        return matches;
    }

    /**
     * Gets the page title.
     *
     * <p>Matching ignores case, allows attributes on the tag and allows the
     * title to span several lines. If the page has more than one
     * {@code <title>} element their texts are concatenated.
     *
     * @param s the HTML text
     * @return the title text with tags removed, or an empty string if there is none
     */
    public String getTitle(String s) {
        StringBuilder title = new StringBuilder();
        for (String match : findAll(TITLE_PATTERN, s)) {
            title.append(match);
        }
        return outTag(title.toString());
    }

    /**
     * Gets the links of the page.
     *
     * @param s the HTML text
     * @return every {@code <a ... href=...>...</a>} element, tags included
     */
    public List<String> getLink(String s) {
        return findAll(LINK_PATTERN, s);
    }

    /**
     * Gets the script code of the page.
     *
     * @param s the HTML text
     * @return every {@code <script>...</script>} element, tags included
     */
    public List<String> getScript(String s) {
        return findAll(SCRIPT_PATTERN, s);
    }

    /**
     * Gets the CSS of the page.
     *
     * @param s the HTML text
     * @return every {@code <style>...</style>} element, tags included
     */
    public List<String> getCSS(String s) {
        return findAll(STYLE_PATTERN, s);
    }

    /**
     * Removes all tags from a piece of markup.
     *
     * @param s the markup
     * @return {@code s} with every {@code <...>} run removed, including tags
     *         that are split across lines
     */
    public String outTag(String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Gets the title and heading content of a JD group-buy page.
     *
     * <p>Failure to fetch the page is reported on standard error and treated
     * as an empty page, so the returned map always has both keys.
     *
     * @param s the URL of the page
     * @return a map with key {@code "title"} (the page title) and key
     *         {@code "no"} (the text of all {@code <h1>} elements, tags removed)
     */
    public HashMap<String,String> getFrom360buy(String s) {
        HashMap<String,String> hm = new HashMap<>();
        String html = "";
        System.out.println("------------开始读取网页("+s+")---------");
        try {
            html = getOneHtml(s);
        } catch (IOException | RuntimeException e) {
            // Best effort: keep going with an empty page, but say why.
            System.err.println("Failed to read " + s + ": " + e);
        }
        System.out.println("--------------读取网页("+s+")结束----------");
        System.out.println("--------------分析("+s+")结果如下----------");

        // getTitle already strips tags.
        String title = getTitle(html);

        StringBuilder headings = new StringBuilder();
        for (String h1 : findAll(H1_PATTERN, html)) {
            headings.append(h1);
        }
        String temp = headings.toString().replaceAll("<p><em>.*?</em></p>", "");

        hm.put("title", title);
        hm.put("no", outTag(temp));
        return hm;
    }

    /**
     * Tests a group of pages: reads URLs from standard input, one per line,
     * until a line containing {@code go} (or end of input), then prints the
     * title and heading content of each.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> urls = new ArrayList<>();
        System.out.println("输入URL,一行一个,输入结束后输入go程序开始运行");
        BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        try {
            String line;
            // readLine() returns null at end of input; treat that like "go".
            while ((line = br.readLine()) != null) {
                line = line.trim();
                if (line.equals("go")) {
                    break;
                }
                if (!line.isEmpty()) {
                    urls.add(line);
                }
            }
        } catch (IOException e) {
            System.err.println("Failed to read URLs from standard input: " + e);
        }

        Test t = new Test();
        for (String url : urls) {
            HashMap<String, String> hm = t.getFrom360buy(url);
            System.out.println(url);
            System.out.println("标题: " + hm.get("title"));
            System.out.println("内容: " + hm.get("no"));
        }
    }
}
由于被分配了爬虫任务,我在网上查了些资料,参照别人的示例写了上面的代码,实测是可以用的!
试了几个页面:有 title 的页面能够读出标题,没有 title 的页面自然也就读不出。能否得到自己想要的内容,主要还是看正则表达式写得是否正确!
用京东的团购页面做了实验( http://tuan.360buy.com/beijing-0-0-1-0-0-index.html ),代码读取的是团购产品的内容,能够得到结果!
还有一个问题:从一个有 title 的页面取得 title 后,在控制台输出的是乱码。这可能是网页的实际编码(例如 GBK)与读取时使用的编码不一致造成的,目前还没有解决,再继续尝试下!
浙公网安备 33010602011771号