import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
*
* @author 园长 & doie.net
*
*/
public class Test {
/**
* 发送请求
* @param host
*/
public static void sendPostRequest(String host) {
URL url;
try {
url = new URL(host);// 发送请求的路径
URLConnection conn = url.openConnection();
HttpURLConnection httpUrlConnection = (HttpURLConnection) conn;
httpUrlConnection.setRequestProperty("User-Agent",
"Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
String temp = null;
BufferedReader br = new BufferedReader(new InputStreamReader(conn
.getInputStream()));
while ((temp = br.readLine()) != null) {
// getSourceCode(temp);// 源代码正则
List<String> ls = getSearchURL(temp);
for (int i = 0; i < ls.size(); i++) {
System.out.println(ls.get(i));
}
}
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* 爬URL
* @param sourceCode
* @return
*/
public static List<String> getSearchURL(String sourceCode){
List<String> ls=new ArrayList<String>();
List<String> rs=new ArrayList<String>();
Pattern p1 = Pattern.compile("(http|ftp|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,@?^=%&:/~\\+#]*[\\w\\-\\@?^=%&/~\\+#])?");// 抓网址
Matcher getUrl = p1.matcher(sourceCode);
while (getUrl.find()) {
ls.add(getSuffix(getUrl.group().trim()));
}
for (int i = 0; i < ls.size(); i++) {
if (!rs.contains(ls.toArray()[i])&&ls.toArray()[i]!=null) {
rs.add(ls.get(i));
}
}
return rs;
}
/**
* 正则匹配
* @param sourceCode
* @return
*/
public static Map<String, Object> getSourceCode(String sourceCode) {
Pattern pattern = Pattern.compile("[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+");// 抓邮箱
Pattern pattern2 = Pattern.compile("(http|ftp|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,@?^=%&:/~\\+#]*[\\w\\-\\@?^=%&/~\\+#])?");// 抓网址
Matcher matcher = pattern.matcher(sourceCode);
Map<String, String> emailMap = new HashMap<String, String>();
Matcher matcher2 = pattern2.matcher(sourceCode);
Map<String, Object> httpMap = new HashMap<String, Object>();
List<String> ls=new ArrayList<String>();
List<String> rs=new ArrayList<String>();
while (matcher2.find()) {
ls.add(getSuffix(matcher2.group().trim()));
httpMap.put(matcher2.group().trim(), matcher2.group().trim());
}
for (int i = 0; i < ls.size(); i++) {
if (!rs.contains(ls.toArray()[i])&&ls.toArray()[i]!=null) {
rs.add(ls.get(i));
}
}
for (int i = 0; i < rs.size(); i++) {
// System.out.println(rs.get(i));
}
while (matcher.find()) {
emailMap.put(matcher.group(), matcher.group());
}
Iterator<String> iterator = emailMap.values().iterator();
while (iterator.hasNext()) {
String str = iterator.next();
System.out.println("str:"+str);
}
return httpMap;
}
/**
* 获得网址所在的匹配区域部分
* @param content
* @param strAreaBegin
* @param strAreaEnd
* @return
*/
public static String getArea(String content,String strAreaBegin, String strAreaEnd) {
int a1 = 0, a2 = 0;
a1 = content.indexOf(strAreaBegin)+strAreaBegin.length();
a2 = content.indexOf(strAreaEnd);
return content.substring(a1,a2);
}
/**
* 获取URL后缀
* @param url
* @return
*/
public static String getSuffix(String url){
String suffix[]={"exe","css","js","zip","rar","mid","tar","gif","jpeg","jpg","bmp","avi","mp3","swf","rm","3gp","mp4","wma","wav","rmvb","ram","key","png","psd","pdf","doc","mdb","xls","ppt","docx","pptx","wps","iso","wmv","img","flv","fla","swf"};
String s=url.substring(url.lastIndexOf(".")+1, url.length());
for (int i = 0; i < suffix.length; i++) {
if (suffix[i].equalsIgnoreCase(s)) {
url=null;
}
}
return url;
}
public static void main(String[] args) {
sendPostRequest("http://www.baidu.com/s?wd=%E8%A6%81%E7%9A%84%E7%95%99%E9%82%AE%E7%AE%B1");
}
}