正则表达式抓取文件内容中的http链接地址

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts HTTP links or e-mail addresses from text using regular expressions.
 *
 * <p>Every match is written to an output file on its own line, terminated by CR LF.
 * Input is decoded with the platform default charset. I/O failures are reported with
 * {@code printStackTrace()} and are not propagated to the caller.
 */
public class HtmlAddressCatch {

    /** Matches links such as {@code http://a.b.c} or {@code https://a.b.c} (three dot-separated word groups). */
    private static final Pattern HTTP_LINK_PATTERN = Pattern.compile("https?://\\w+\\.\\w+\\.\\w+");

    /** Matches simple e-mail addresses of the form {@code user@host.tld}. */
    private static final Pattern EMAIL_PATTERN = Pattern.compile("\\w+@\\w+\\.\\w+");

    /** File that {@link #getWebTextContent(String)} writes its matches to. */
    private static final String WEB_OUTPUT_FILE = "D:\\text.txt";

    /**
     * Demo entry point: downloads a fixed web page and stores the links found in it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String webaddress = "https://www.zhihu.com/people/Akira_Dunn";
        HtmlAddressCatch.getWebTextContent(webaddress);
        // Alternative usage: extract e-mail addresses from a local file instead.
        /*String localaddress = "D:\\test\\test.html";
        String targetaddress = "D:\\test\\http.txt";
        HtmlAddressCatch.getLocalTextContent(localaddress , targetaddress);*/
    }

    /**
     * Downloads the page at the given address and writes every HTTP link found in it
     * to {@code D:\text.txt}, one per line.
     *
     * <p>The output file is opened only after the response stream has been obtained,
     * so a failed request does not truncate a previous result.
     *
     * @param webaddress URL of the page to scan; must use the http or https protocol
     */
    public static void getWebTextContent(String webaddress) {
        HttpURLConnection con = null;
        try {
            URL url = new URL(webaddress);
            con = (HttpURLConnection) url.openConnection();
            // BufferedReader over InputStreamReader: bytes -> chars, read line by line.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(con.getInputStream()));
                 FileOutputStream file = new FileOutputStream(WEB_OUTPUT_FILE)) {
                writeMatches(reader, HTTP_LINK_PATTERN, file);
            }
        } catch (IOException e) {
            // Also covers MalformedURLException and FileNotFoundException (both are IOExceptions).
            e.printStackTrace();
        } finally {
            if (con != null) {
                con.disconnect();
            }
        }
    }

    /**
     * Scans a local file for e-mail addresses and writes each one to the target file,
     * one per line. The target file is created or overwritten.
     *
     * @param localaddress  path of the file to scan
     * @param targetaddress path of the file that receives the matches
     */
    public static void getLocalTextContent(String localaddress, String targetaddress) {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(localaddress)));
             FileOutputStream writer = new FileOutputStream(targetaddress)) {
            writeMatches(reader, EMAIL_PATTERN, writer);
        } catch (IOException e) {
            // Also covers FileNotFoundException for either path.
            e.printStackTrace();
        }
    }

    /**
     * Writes every match of {@code pattern} found in the reader's text to {@code out}.
     *
     * <p>The text is processed line by line. Neither pattern can match a line terminator,
     * so no match can span two lines and nothing is lost at line boundaries.
     */
    private static void writeMatches(BufferedReader reader, Pattern pattern, FileOutputStream out)
            throws IOException {
        String line;
        while ((line = reader.readLine()) != null) {
            Matcher m = pattern.matcher(line);
            while (m.find()) {
                out.write((m.group() + "\r\n").getBytes());
            }
        }
    }
}

posted @ 2016-09-09 00:58  akiradunn  阅读(6872)  评论(0编辑  收藏  举报