Java 抓取网页或文件中的邮箱地址

一、从文件中抓取邮箱地址

 
package reg;


import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Reads a local file, extracts everything that looks like an e-mail address
 * and prints each match on its own line.
 */
public class TestEmail {

    /** File scanned by {@link #main(String[])}. */
    private static final String INPUT_FILE = "D:/1.htm";

    /**
     * E-mail matcher, compiled once instead of on every call.
     * Local part and domain accept word characters, '.' and '-'; the final
     * label is limited to 2-5 word characters, so longer top-level domains
     * are matched only up to their fifth character.
     */
    private static final Pattern EMAIL_PATTERN =
            Pattern.compile("[\\w\\.-]*\\w+@[\\w\\.-]*\\w+\\.\\w{2,5}");

    /**
     * Loads {@link #INPUT_FILE} and prints every e-mail address found in it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(INPUT_FILE));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                // readLine() strips the line terminator; put one back so that an
                // address ending a line cannot fuse with the text starting the next.
                sb.append(line).append('\n');
            }
            for (String email : getEmail(sb.toString())) {
                System.out.println(email);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (br != null) {
                    br.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Finds all e-mail addresses in the given text.
     *
     * @param str text to scan; {@code null} is treated as empty text
     * @return the matches in order of appearance (duplicates kept); never {@code null}
     */
    public static List<String> getEmail(String str) {
        List<String> es = new ArrayList<String>();
        if (str == null) {
            return es;
        }
        Matcher m = EMAIL_PATTERN.matcher(str);
        while (m.find()) {
            es.add(m.group());
        }
        return es;
    }
}

 

 

 

 
二、从网页中抓取邮箱地址

 

package reg;

import java.io.BufferedReader;  
import java.io.InputStreamReader;  
import java.util.regex.Matcher;  
import java.util.regex.Pattern;  
  
/**
 * Downloads a page line by line and prints every e-mail address found in it.
 */
public class Testemail01
{
   /**
    * E-mail matcher, compiled once instead of once per input line.
    * Accepts word characters, '.' and '-' on both sides of the '@'.
    */
   private static final Pattern EMAIL_PATTERN =
       Pattern.compile("[\\w[.-]]+@[\\w[.-]]+\\.[\\w]+");

   /**
    * Opens the given URL, scans each line for e-mail addresses and prints
    * the matches to standard output.
    *
    * @param domain URL of the page to scan, including the protocol
    * @return an empty string on success; otherwise the {@code toString()} of
    *         the exception that stopped the scan (also printed to standard error)
    */
   public static String getWebCon(String domain)
   {
    System.out.println("开始抓取邮件地址..("+domain+")");
    StringBuilder sb=new StringBuilder();
    BufferedReader in=null;
    try
    {
     java.net.URL url=new java.net.URL(domain);
     // NOTE(review): the stream is decoded with the platform default charset.
     in=new BufferedReader(new InputStreamReader(url.openStream()));
     String line;
     while((line=in.readLine())!=null)
     {
      parse(line);
     }
    }
    catch(Exception e)
    {
     sb.append(e.toString());
     System.err.println(e);
    }
    finally
    {
     // Close on every path; the original skipped close() when reading failed.
     if(in!=null)
     {
      try
      {
       in.close();
      }
      catch(java.io.IOException e)
      {
       sb.append(e.toString());
       System.err.println(e);
      }
     }
    }
    return sb.toString();
   }

   /**
    * Scans a sample page and prints the e-mail addresses it contains.
    *
    * @param args ignored
    */
   public static void main(String[] args)
   {
    Testemail01.getWebCon("http://tieba.baidu.com/p/2366935784");
   }

   /**
    * Prints every e-mail address found in one line of text.
    *
    * @param line a single line of page content
    */
   private static void parse(String line)
   {
    Matcher m=EMAIL_PATTERN.matcher(line);
    while(m.find())
    {
     System.out.println(m.group());
    }
   }
}

 

 

 

 

 

posted on 2017-03-27 16:40  Honey_Badger  阅读(388)  评论(0)  编辑  收藏  举报

导航

github