正则表达式抓取文件内容中的http链接地址


import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
//正则表达式抓取网页数据

/**
 * Extracts addresses from text using regular expressions.
 *
 * <p>{@link #getWebTextContent(String)} downloads a page and writes every http/https link it
 * finds to a fixed output file. {@link #getLocalTextContent(String, String)} scans a local file
 * and writes every e-mail address it finds to a target file. In both cases each match is written
 * on its own line, terminated by CRLF.
 */
public class HtmlAddressCatch {

  /** Matches http/https links of the shape {@code scheme://word.word.word}. */
  private static final Pattern HTTP_LINK_PATTERN =
      Pattern.compile("https?://\\w+\\.\\w+\\.\\w+");

  /** Matches simple e-mail addresses of the shape {@code word@word.word}. */
  private static final Pattern EMAIL_PATTERN = Pattern.compile("\\w+@\\w+\\.\\w+");

  /** File that receives the links extracted by {@link #getWebTextContent(String)}. */
  private static final String WEB_OUTPUT_FILE = "D:\\text.txt";

  /**
   * Demo entry point: extracts the links of a sample page.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    String webaddress = "https://www.zhihu.com/people/Akira_Dunn";
    HtmlAddressCatch.getWebTextContent(webaddress);
    // Local-file variant, e.g.:
    //   HtmlAddressCatch.getLocalTextContent("D:\\test\\test.html", "D:\\test\\http.txt");
  }

  /**
   * Downloads the page at {@code webaddress} and writes every http/https link found in it to
   * {@code D:\text.txt}, one link per line.
   *
   * <p>I/O failures (malformed URL, unreachable host, unwritable output file) are reported on
   * standard error via {@code printStackTrace()} and are not propagated to the caller.
   *
   * @param webaddress the http/https URL of the page to scan
   */
  public static void getWebTextContent(String webaddress) {
    HttpURLConnection con = null;
    try {
      URL url = new URL(webaddress);
      con = (HttpURLConnection) url.openConnection();

      // The reader is opened first so that a failed download does not truncate an
      // already existing output file.
      try (BufferedReader reader =
              new BufferedReader(
                  new InputStreamReader(con.getInputStream(), StandardCharsets.UTF_8));
          FileOutputStream out = new FileOutputStream(WEB_OUTPUT_FILE)) {
        writeMatches(reader, out, HTTP_LINK_PATTERN);
      }
    } catch (IOException e) {
      // Also covers MalformedURLException and FileNotFoundException (both IOExceptions).
      e.printStackTrace();
    } finally {
      if (con != null) {
        con.disconnect();
      }
    }
  }

  /**
   * Scans the local file {@code localaddress} and writes every e-mail address found in it to
   * {@code targetaddress}, one address per line.
   *
   * <p>The file is read line by line rather than in fixed-size byte chunks, so an address can
   * never be cut in half at a chunk boundary. I/O failures are reported on standard error via
   * {@code printStackTrace()} and are not propagated to the caller.
   *
   * @param localaddress path of the file to scan
   * @param targetaddress path of the file that receives the matches (created or overwritten)
   */
  public static void getLocalTextContent(String localaddress, String targetaddress) {
    try (BufferedReader reader =
            new BufferedReader(
                new InputStreamReader(new FileInputStream(localaddress), StandardCharsets.UTF_8));
        FileOutputStream out = new FileOutputStream(targetaddress)) {
      writeMatches(reader, out, EMAIL_PATTERN);
    } catch (IOException e) {
      // Also covers FileNotFoundException (an IOException).
      e.printStackTrace();
    }
  }

  /**
   * Reads {@code reader} line by line and writes every match of {@code pattern} to {@code out},
   * each followed by CRLF.
   */
  private static void writeMatches(BufferedReader reader, FileOutputStream out, Pattern pattern)
      throws IOException {
    String line;
    // readLine() returns null at end of input; no separate read() call is made, so no
    // character of the input is skipped.
    while ((line = reader.readLine()) != null) {
      Matcher matcher = pattern.matcher(line);
      while (matcher.find()) {
        out.write((matcher.group() + "\r\n").getBytes(StandardCharsets.UTF_8));
      }
    }
  }
}
posted @ 2016-09-09 00:58 akiradunn 阅读(6873) 评论(0) 编辑收藏举报
会员力量，点亮园子希望
刷新页面返回顶部
akiradunn

正则表达式抓取文件内容中的http链接地址

公告