一个 Java 版本的简单邮箱小爬虫
//趁着有空回头复习了一把正则表达式
/*
以下代码以百度某个贴吧的 URL 作为源,实现了读取 EmailAddress 并写入文件保存起来的两个功能,如果要爬取其它信息,可以改写正则实现相应功能
要点看引入的包可知:
1.应用到 IO 读写缓冲字符流
2.应用到正则表达式
3.URL 对象获取网页信息
4.util 包的集合框架 ArrayList
*/
import java.io.*;
import java.util.regex.*;
import java.net.*;
import java.util.*;

/**
 * Minimal e-mail crawler: downloads one page, extracts every token that looks
 * like an e-mail address, prints the matches and stores them in a text file.
 */
class Spider {

    /** Address-like token: word characters, '@', then a dotted host name. */
    private static final Pattern EMAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Crawls a fixed Baidu Tieba thread, prints each address found and writes
     * the list to a hard-coded file on the desktop.
     *
     * @param args unused
     * @throws Exception if the page cannot be read or the file cannot be written
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://tieba.baidu.com/p/2314539885");
        ArrayList<String> emailList = getEmailByURL(url);
        for (String emailAddress : emailList) {
            System.out.println(emailAddress);
        }
        String qualifiedName = "c://users//ghc//desktop//test//emailAddress.txt";
        writeEmailToFile(qualifiedName, emailList);
    }

    /**
     * Reads the resource behind {@code url} line by line and collects every
     * e-mail-like match, in order of appearance (duplicates are kept).
     * The stream is decoded with the platform default charset.
     *
     * @param url location of the page to scan
     * @return the matched addresses; empty if none were found
     * @throws Exception if the connection cannot be opened or read
     */
    public static ArrayList<String> getEmailByURL(URL url) throws Exception {
        ArrayList<String> emailList = new ArrayList<String>();
        URLConnection urlconn = url.openConnection();
        BufferedReader reader = new BufferedReader(new InputStreamReader(urlconn.getInputStream()));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher m = EMAIL_PATTERN.matcher(line);
                // A single line may contain several addresses.
                while (m.find()) {
                    emailList.add(m.group());
                }
            }
        } finally {
            // Always release the connection's stream, even when reading fails.
            reader.close();
        }
        return emailList;
    }

    /**
     * Writes every address on its own line to the given file, replacing any
     * previous content.
     *
     * @param qualifiedName path of the file to create or overwrite
     * @param emailList addresses to write, in iteration order
     * @throws Exception if the file cannot be opened or written
     */
    public static void writeEmailToFile(String qualifiedName, ArrayList<String> emailList) throws Exception {
        BufferedWriter writer = new BufferedWriter(new FileWriter(qualifiedName));
        try {
            for (String email : emailList) {
                writer.write(email);
                writer.newLine();
            }
        } finally {
            // close() flushes the buffer and releases the file handle.
            writer.close();
        }
    }
}
/*
 * Shared imports for the practice snippets below. Java requires imports to
 * precede all type declarations, so they are gathered here; copy them along
 * when lifting a single class into its own file.
 */
import java.io.*;
import java.util.*;
import java.util.Scanner;
import java.util.regex.*;
import java.net.*;

/* Reading keyboard input: two approaches are demonstrated. */
class MyTest {
    public static void main(String[] args) throws Exception {
        // Approach 1: Scanner, reads and echoes a single line.
        Scanner scanner = new Scanner(System.in);
        String inputStr = scanner.nextLine();
        System.out.println(inputStr);

        // Approach 2: BufferedReader, echoes every line until end of input.
        BufferedReader bufreader = new BufferedReader(new InputStreamReader(System.in));
        String line = null;
        while ((line = bufreader.readLine()) != null) {
            System.out.println(line);
        }
    }
}

/* Collapsing stuttered (repeated) characters. */
class AbrreviateDemo {
    public static void main(String[] args) {
        String str = "II...LLL...ove..ee.....you!";

        // Step 1: drop every run of dots.
        String regex = "\\.+";
        String replaceStr = "";
        str = retriveStr(str, regex, replaceStr);

        // Step 2: squeeze each run of one repeated character down to a single one.
        regex = "(.)\\1+";
        replaceStr = "$1";
        str = retriveStr(str, regex, replaceStr);

        System.out.println(str);
    }

    /**
     * Replaces every match of {@code regex} in {@code str}.
     *
     * @param str input text
     * @param regex pattern to search for
     * @param replaceStr replacement; may reference groups such as {@code $1}
     * @return the text after replacement
     */
    public static String retriveStr(String str, String regex, String replaceStr) {
        return str.replaceAll(regex, replaceStr);
    }
}

/* Sorting a jumble of IP addresses. */
class SortIP {
    public static void main(String[] args) {
        String IP = "192.168.0.5 2.2.3.4 127.0.0.1";
        printAfterSort(IP);
    }

    /**
     * Prints the zero-padded form of the input, then the space-separated
     * addresses in ascending order, one per line.
     *
     * @param str IPv4 addresses separated by one or more spaces
     */
    public static void printAfterSort(String str) {
        // Prefix every number with "00" so each has at least three digits ...
        String regex = "(0*\\d+)";
        str = str.replaceAll(regex, "00$1");
        // ... then keep only the last three digits. With fixed-width segments
        // a plain string sort orders the addresses numerically.
        regex = "0*(\\d{3})";
        str = str.replaceAll(regex, "$1");
        System.out.println(str);

        regex = " +";
        String[] strArray = str.split(regex);
        Arrays.sort(strArray);
        for (int i = 0; i < strArray.length; i++) {
            // Strip the padding again before printing.
            System.out.println(strArray[i].replaceAll("0*(\\d+)", "$1"));
        }
    }
}

/* E-mail address validation. */
class checkMailDemo {
    public static void main(String[] args) {
        String str = "liyu@gchchina.com.cn";
        System.out.println("result: " + checkMail(str));
    }

    /**
     * Checks whether the whole string looks like an e-mail address: word
     * characters, '@', word characters, then one to three dotted suffixes.
     *
     * @param str candidate address
     * @return {@code true} if the entire string matches
     */
    public static boolean checkMail(String str) {
        // A stricter alternative would be
        // [a-zA-Z0-9_]+[@][a-zA-Z0-9]+(\.[a-zA-Z]+){1,3}
        String regex = "\\w+@\\w+(\\.\\w+){1,3}";
        return str.matches(regex);
    }
}

/* Extracting mobile phone numbers from a noisy string. */
class RegexDemo {
    public static void main(String[] args) {
        String str = "1afasdf13874057617weojfjlj";
        // [358] without commas: inside a character class a comma is a literal.
        String regex = "[1-9][358]\\d{9}";
        retriveStr(str, regex);
    }

    /**
     * Prints every substring of {@code str} that matches {@code regex}, one
     * per line.
     *
     * @param str text to search
     * @param regex pattern to look for
     */
    public static void retriveStr(String str, String regex) {
        Pattern p = Pattern.compile(regex);
        Matcher m = p.matcher(str);
        while (m.find()) {
            String tempstr = m.group();
            System.out.println(tempstr);
        }
    }
}

/* Reading standard input and echoing it to the console in upper case. */
class UpercaseSystemIn {
    public static void main(String[] args) throws IOException {
        InputStream in = System.in;
        doUpcaseReadIn(in);
    }

    /**
     * Echoes each line of {@code in} in upper case. The JVM exits after a line
     * equal to "exit" (ignoring case) has been echoed.
     *
     * @param in stream to read lines from
     * @throws IOException if reading fails
     */
    public static void doUpcaseReadIn(InputStream in) throws IOException {
        BufferedReader bufr = new BufferedReader(new InputStreamReader(in));
        String str = null;
        while ((str = bufr.readLine()) != null) {
            System.out.println(str.toUpperCase());
            if (str.equalsIgnoreCase("exit")) {
                System.exit(0);
            }
        }
    }
}

/* Reading e-mail addresses from a Tieba thread and printing them; note the regex. */
class SpiderTest {
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://tieba.baidu.com/p/2314539885");
        getEmailAddressFromURL(url);
    }

    /**
     * Prints every e-mail-like token found in the resource behind {@code url},
     * one per line.
     *
     * @param url location of the page to scan
     * @throws Exception if the connection cannot be opened or read
     */
    public static void getEmailAddressFromURL(URL url) throws Exception {
        URLConnection urlconn = url.openConnection();
        BufferedReader bufreader = new BufferedReader(new InputStreamReader(urlconn.getInputStream()));
        try {
            String regex = "\\w+@\\w+(\\.\\w+)+";
            Pattern p = Pattern.compile(regex);
            String line = null;
            while ((line = bufreader.readLine()) != null) {
                Matcher m = p.matcher(line);
                while (m.find()) {
                    System.out.println(m.group());
                }
            }
        } finally {
            // Release the connection's stream even when reading fails.
            bufreader.close();
        }
    }
}

/* Copying a local binary file. */
class CopyImg {
    /**
     * Copies psb.jpg to psb_copy.jpg in the working directory; I/O errors are
     * reported on stderr.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        BufferedInputStream bufinps = null;
        BufferedOutputStream bufotps = null;
        try {
            bufinps = new BufferedInputStream(new FileInputStream("psb.jpg"));
            bufotps = new BufferedOutputStream(new FileOutputStream("psb_copy.jpg"));
            byte[] buf = new byte[8192];
            int len;
            while ((len = bufinps.read(buf)) != -1) {
                // Write only the bytes actually read in this round.
                bufotps.write(buf, 0, len);
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            if (bufinps != null) {
                try {
                    bufinps.close();
                } catch (IOException ioe) {
                    ioe.printStackTrace();
                }
            }
            if (bufotps != null) {
                try {
                    bufotps.close();
                } catch (IOException ioe) {
                    ioe.printStackTrace();
                }
            }
        }
    }
}

/* Crawling e-mail addresses from a web page and saving them to a local file. */
class Spider {

    /** Address-like token: word characters, '@', then a dotted host name. */
    private static final Pattern EMAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    public static void main(String[] args) throws Exception {
        URL url = new URL("http://tieba.baidu.com/p/2314539885");
        ArrayList<String> emailList = getEmailByURL(url);
        for (String emailAddress : emailList) {
            System.out.println(emailAddress);
        }
        String qualifiedName = "c://users//ghc//desktop//test//emailAddress.txt";
        writeEmailToFile(qualifiedName, emailList);
    }

    /**
     * Reads the resource behind {@code url} line by line and collects every
     * e-mail-like match in order of appearance (duplicates are kept).
     *
     * @param url location of the page to scan
     * @return the matched addresses; empty if none were found
     * @throws Exception if the connection cannot be opened or read
     */
    public static ArrayList<String> getEmailByURL(URL url) throws Exception {
        ArrayList<String> emailList = new ArrayList<String>();
        URLConnection urlconn = url.openConnection();
        BufferedReader bufreader = new BufferedReader(new InputStreamReader(urlconn.getInputStream()));
        try {
            String line;
            while ((line = bufreader.readLine()) != null) {
                Matcher m = EMAIL_PATTERN.matcher(line);
                while (m.find()) {
                    emailList.add(m.group());
                }
            }
        } finally {
            bufreader.close();
        }
        return emailList;
    }

    /**
     * Writes every address on its own line to the given file, replacing any
     * previous content.
     *
     * @param qualifiedName path of the file to create or overwrite
     * @param emailList addresses to write, in iteration order
     * @throws Exception if the file cannot be opened or written
     */
    public static void writeEmailToFile(String qualifiedName, ArrayList<String> emailList) throws Exception {
        BufferedWriter bufwriter = new BufferedWriter(new FileWriter(qualifiedName));
        try {
            for (String email : emailList) {
                bufwriter.write(email);
                bufwriter.newLine();
            }
        } finally {
            // close() flushes the buffer and releases the file handle.
            bufwriter.close();
        }
    }
}

/*
 * Crawling image URLs from a web page and downloading them to a local folder.
 * The basic flow works, but the regular expressions need tuning per site.
 */
class ImgSpider {

    /** Whole {@code <img ... src=... >} tag on a single line. */
    private static final Pattern IMG_TAG_PATTERN = Pattern.compile("<img.*src=(.*?)[^>]*?>");

    /** Absolute image URL of the form http://host/images/name.ext, e.g. http://www.jb51.net/images/logo.gif */
    private static final Pattern IMG_URL_PATTERN =
            Pattern.compile("http://(\\w+\\.)+[a-z]+/images/(\\w+\\.)+[a-z]{3}");

    public static void main(String[] args) {
        saveImgFromURL("http://image.baidu.com/", "c:/users/ghc/desktop/test/");
        System.gc();
    }

    /**
     * Downloads the resource at {@code line} into the file {@code path}.
     *
     * @param line URL of the resource to download
     * @param path destination file; any '<' characters are removed from it
     * @return {@code true} if the resource was copied and both streams closed
     *         cleanly, {@code false} if an I/O error occurred (reported on stderr)
     */
    public static boolean downLoadImg(String line, String path) {
        boolean flag = true;
        InputStream source = null;
        OutputStream target = null;
        path = path.replace("<", "");
        try {
            source = new BufferedInputStream(new URL(line).openConnection().getInputStream());
            target = new BufferedOutputStream(new FileOutputStream(path));
            byte[] buf = new byte[1024];
            int len;
            while ((len = source.read(buf)) != -1) {
                // Only the first len bytes are valid; the rest of buf is stale.
                target.write(buf, 0, len);
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            flag = false;
        } finally {
            if (target != null) {
                try {
                    target.close();
                } catch (IOException ioe) {
                    // A failed close can mean buffered bytes were never written.
                    ioe.printStackTrace();
                    flag = false;
                }
            }
            if (source != null) {
                try {
                    source.close();
                } catch (IOException ioe) {
                    ioe.printStackTrace();
                }
            }
        }
        return flag;
    }

    /**
     * Scans the page at {@code urlStr} for image tags, prints each tag, and
     * downloads every matching image URL into {@code folder}, named after the
     * last path segment of its URL.
     *
     * @param urlStr address of the page to scan
     * @param folder destination directory, including the trailing separator
     * @return {@code false} if the page URL is malformed or the page cannot be
     *         read, {@code true} otherwise (individual download failures are
     *         reported on stderr but do not change the result)
     */
    public static boolean saveImgFromURL(String urlStr, String folder) {
        boolean flag = true;
        BufferedReader bufr = null;
        try {
            URL url = new URL(urlStr);
            URLConnection urlconn = url.openConnection();
            bufr = new BufferedReader(new InputStreamReader(urlconn.getInputStream()));

            List<String> imgList = new ArrayList<String>();
            String line;
            while ((line = bufr.readLine()) != null) {
                Matcher tagMatcher = IMG_TAG_PATTERN.matcher(line);
                while (tagMatcher.find()) {
                    System.out.println(tagMatcher.group());
                    imgList.add(tagMatcher.group());
                }
            }

            for (String imgTag : imgList) {
                Matcher urlMatcher = IMG_URL_PATTERN.matcher(imgTag);
                while (urlMatcher.find()) {
                    String imgUrl = urlMatcher.group();
                    // Name the local file after the URL's last path segment and
                    // leave folder itself untouched between images.
                    String fileName = imgUrl.substring(imgUrl.lastIndexOf('/') + 1);
                    downLoadImg(imgUrl, folder + fileName);
                }
            }
        } catch (MalformedURLException mfe) {
            mfe.printStackTrace();
            flag = false;
        } catch (IOException ioe) {
            ioe.printStackTrace();
            flag = false;
        } finally {
            if (bufr != null) {
                try {
                    bufr.close();
                } catch (IOException ie) {
                    ie.printStackTrace();
                }
            }
        }
        return flag;
    }
}

/* Small regex exercises. */
class Demo {
    public static void main(String[] args) {
        String qq = "1212345";
        boolean checkResult = checkQQ(qq);
        System.out.println(checkResult ? qq + " is right" : qq + " is wrong!!!");

        String telnumber = "15974097817";
        checkResult = checkTel(telnumber);
        System.out.println(checkResult ? telnumber + " is right" : telnumber + " is wrong!!!");

        String path = "c:\\users\\frank\\abqqcdkkkefghhijkkkkkl.txt";
        // Splitting on stuttered characters relies on a capture group: \1 refers
        // back to group 1 and + means one or more repetitions, so both "qq" and
        // "kkk" act as separators.
        String regex = "(.)\\1+";
        printAfterSplit(path, regex);
        System.out.println("=================");
        regex = "\\.";
        printAfterSplit(path, regex);

        String str = "abcddeffffg";
        regex = "(.)\\1{3,}";
        String replaceStr = "$1";
        printAfterReplaceStr(str, regex, replaceStr);
    }

    // The next two methods validate a whole string against a regex.

    /**
     * Checks for an 11-digit mobile number starting with 13, 15 or 18.
     *
     * @param telnumber candidate number
     * @return {@code true} if the entire string matches
     */
    public static boolean checkTel(String telnumber) {
        // [358] without commas: inside a character class a comma is a literal
        // and would let "1," pass as a valid prefix.
        String regex = "[1][358]\\d{9}";
        return telnumber.matches(regex);
    }

    /**
     * Checks for a QQ number: 5 to 15 digits with no leading zero. This is the
     * regex equivalent of checking the length, the first character and each
     * digit by hand.
     *
     * @param qq candidate number
     * @return {@code true} if the entire string matches
     */
    public static boolean checkQQ(String qq) {
        // Same as [1-9][0-9]{4,14}.
        String regex = "[1-9]\\d{4,14}";
        return qq.matches(regex);
    }

    /**
     * Splits {@code path} around matches of {@code regex} and prints each
     * piece on its own line.
     *
     * @param path text to split
     * @param regex separator pattern
     */
    public static void printAfterSplit(String path, String regex) {
        String[] ary = path.split(regex);
        for (String s : ary) {
            System.out.println(s);
        }
    }

    /**
     * Prints {@code str} with every match of {@code regex} replaced.
     *
     * @param str input text
     * @param regex pattern to search for
     * @param replaceStr replacement; may reference groups such as {@code $1}
     */
    public static void printAfterReplaceStr(String str, String regex, String replaceStr) {
        String resultStr = str.replaceAll(regex, replaceStr);
        System.out.println(resultStr);
    }
}
如果有来生,一个人去远行,看不同的风景,感受生命的活力。。。