竞争无处不在,青春永不言败!专业撸代码,副业修bug

Talk is cheap, show me the code!



一个java版本的简单邮箱小爬虫

//趁着有空回头复习了一把正则表达式
/*
  以下代码以百度某个贴吧的 URL 作为源,实现了读取 EmailAddress 并写入文件保存起来的两个功能,如果要爬取其它信息,可以改写正则实现相应功能
  要点看引入的包可知:
     1.应用到 IO 读写缓冲字符流
     2.应用到正则表达式
     3.URL 对象获取网页信息
     4.util 包的集合框架 ArrayList
*/
import java.io.*;
import java.util.regex.*;
import java.net.*;
import java.util.*;

/**
 * Minimal e-mail crawler: downloads a page, extracts everything that looks
 * like an e-mail address and stores the result in a text file.
 */
class Spider{
	/** Loose e-mail shape: word characters, '@', then a dotted host name. */
	private static final Pattern EMAIL_PATTERN=Pattern.compile("\\w+@\\w+(\\.\\w+)+");

	/** Crawls a fixed forum thread, prints the addresses and saves them to disk. */
	public static void main(String [] args) throws Exception{
		URL url=new URL("http://tieba.baidu.com/p/2314539885");
		ArrayList<String> emailList=getEmailByURL(url);
		for(String emailAddress:emailList){
			System.out.println(emailAddress);
		}
		String qualifiedName="c://users//ghc//desktop//test//emailAddress.txt";
		writeEmailToFile(qualifiedName,emailList);
	}

	/**
	 * Reads the content behind {@code url} line by line and collects every
	 * substring matching the e-mail pattern, in order of appearance
	 * (duplicates are kept).
	 *
	 * @param url location to read from
	 * @return the matched addresses; empty when nothing matched
	 * @throws Exception if the connection cannot be opened or read
	 */
	public static ArrayList<String> getEmailByURL(URL url) throws Exception{
		ArrayList<String> emailList=new ArrayList<String>();
		BufferedReader bufreader=new BufferedReader(new InputStreamReader(url.openConnection().getInputStream()));
		try{
			String line;
			while((line=bufreader.readLine())!=null){
				Matcher m=EMAIL_PATTERN.matcher(line);
				while(m.find()){
					emailList.add(m.group());
				}
			}
		}
		finally{
			// always release the connection's stream, even when reading fails
			bufreader.close();
		}
		return emailList;
	}

	/**
	 * Writes each address on its own line to the file {@code qualifiedName},
	 * replacing any previous content.
	 *
	 * @param qualifiedName path of the file to write
	 * @param emailList addresses to store
	 * @throws Exception if the file cannot be opened or written
	 */
	public static void writeEmailToFile(String qualifiedName,ArrayList<String> emailList) throws Exception{
		BufferedWriter bufwriter=new BufferedWriter(new FileWriter(qualifiedName));
		try{
			for(String emailAddress:emailList){
				bufwriter.write(emailAddress);
				bufwriter.newLine();
			}
		}
		finally{
			// close() flushes the buffer and releases the file handle
			bufwriter.close();
		}
	}
}


/* Two ways of reading keyboard input */
import java.io.*;
import java.util.*;
import java.util.Scanner;
class MyTest{
	/**
	 * Echoes keyboard input using two different reader APIs: first one line
	 * through a Scanner, then every further line through a BufferedReader
	 * until the input ends.
	 */
	public static void main(String [] args) throws Exception{
		// Approach 1: Scanner - read a single line and echo it
		Scanner keyboard=new Scanner(System.in);
		System.out.println(keyboard.nextLine());

		// Approach 2: BufferedReader over InputStreamReader - echo line by line
		// NOTE(review): the Scanner above may already have buffered more than one
		// line of System.in, so this reader might not see all remaining input - confirm
		BufferedReader stdin=new BufferedReader(new InputStreamReader(System.in));
		for(String text=stdin.readLine();text!=null;text=stdin.readLine()){
			System.out.println(text);
		}
	}
}
/* 把叠词 简化 */
class  AbrreviateDemo{
	/**
	 * Cleans up a stuttered sentence in two passes: first every run of dots is
	 * removed, then every run of a repeated character is collapsed to a single
	 * occurrence. The result is printed.
	 */
	public static void main(String [] args){
		// each row is {regex, replacement}; rows are applied in order
		String [][] steps={
			{"\\.+",""},
			{"(.)\\1+","$1"}
		};
		String text="II...LLL...ove..ee.....you!";
		for(String [] step:steps){
			text=retriveStr(text,step[0],step[1]);
		}
		System.out.println(text);
	}

	/**
	 * Replaces every match of {@code regex} in {@code str} with {@code replaceStr}.
	 *
	 * @param str text to process
	 * @param regex regular expression to look for
	 * @param replaceStr replacement; may reference groups such as {@code $1}
	 * @return the text after replacement
	 */
	public static String retriveStr(String str,String regex,String replaceStr){
		String replaced=str.replaceAll(regex,replaceStr);
		return replaced;
	}
}

/* 将一堆杂乱的 IP 地址进行排序 */
import java.util.*;
class SortIP{
	public static void main(String [] args){
		printAfterSort("192.168.0.5 2.2.3.4 127.0.0.1");
	}

	/**
	 * Sorts space-separated dotted addresses and prints them one per line.
	 *
	 * Every number is first prefixed with two zeros and then cut back to its
	 * last three digits, so all segments have the same width and plain string
	 * ordering can be used. The padded text is printed once before sorting;
	 * the padding zeros are stripped again when each sorted address is printed.
	 *
	 * @param str addresses separated by one or more spaces
	 */
	public static void printAfterSort(String str){
		String padded=str.replaceAll("(0*\\d+)","00$1").replaceAll("0*(\\d{3})","$1");
		System.out.println(padded);

		String [] addresses=padded.split(" +");
		Arrays.sort(addresses);

		for(String address:addresses){
			// drop the leading zeros that were added for sorting
			System.out.println(address.replaceAll("0*(\\d+)","$1"));
		}
	}
}

/* 邮箱地址校验 */
class checkMailDemo{
	/**
	 * Accepted shape: word characters, '@', word characters, then one to three
	 * dot-separated suffixes. A stricter alternative that only allows letters
	 * in the suffixes is
	 * {@code [a-zA-Z0-9_]+[@][a-zA-Z0-9]+(\.[a-zA-Z]+){1,3}}.
	 */
	private static final String MAIL_REGEX="\\w+@\\w+(\\.\\w+){1,3}";

	public static void main(String [] args){
		String str="liyu@gchchina.com.cn";
		System.out.println("result: "+checkMail(str));
	}

	/**
	 * Checks whether the whole string has the shape of an e-mail address.
	 *
	 * @param str candidate address; may be {@code null}
	 * @return {@code true} if {@code str} matches the pattern completely,
	 *         {@code false} otherwise (including for {@code null})
	 */
	public static boolean checkMail(String str){
		if(str==null){
			return false;
		}
		return str.matches(MAIL_REGEX);
	}
}
/* 从一堆杂乱的字符串中获取需要的手机号码 */
import java.util.regex.*;
class RegexDemo{
	/**
	 * Eleven-digit mobile number: a leading 1, then 3, 5 or 8, then nine digits.
	 * The second class is written without commas because inside [...] a comma
	 * is an ordinary character and would itself be accepted.
	 */
	static final String MOBILE_REGEX="1[358]\\d{9}";

	/** Extracts the mobile number hidden in a noisy sample string. */
	public static void main(String [] args){
		String str="1afasdf13874057617weojfjlj";
		retriveStr(str,MOBILE_REGEX);
	}

	/**
	 * Prints every substring of {@code str} that matches {@code regex},
	 * one per line, in order of appearance.
	 *
	 * @param str text to search
	 * @param regex regular expression describing the wanted substrings
	 */
	public static void retriveStr(String str,String regex){
		Pattern p=Pattern.compile(regex);
		Matcher m=p.matcher(str);
		while(m.find()){
			System.out.println(m.group());
		}
	}
}

/* 读取键盘标准输入流并大写方式打印到控制台 */
import java.io.*;
import java.util.*;
class UpercaseSystemIn{
	public static void main(String [] args) throws IOException{
		doUpcaseReadIn(System.in);
	}

	/**
	 * Reads {@code in} line by line and prints each line in upper case.
	 * After printing a line equal to "exit" (ignoring case) the JVM is
	 * terminated with status 0; otherwise the method returns at end of input.
	 *
	 * @param in stream to read text lines from
	 * @throws IOException if reading fails
	 */
	public static void doUpcaseReadIn(InputStream in) throws IOException{
		BufferedReader lines=new BufferedReader(new InputStreamReader(in));
		for(String text=lines.readLine();text!=null;text=lines.readLine()){
			System.out.println(text.toUpperCase());
			if("exit".equalsIgnoreCase(text)){
				System.exit(0);
			}
		}
	}
}

/* 读取某个贴吧邮箱地址并打印到控制台 注意这里的正则*/
import java.net.*;
import java.io.*;
import java.util.*;
import java.util.regex.*;
class SpiderTest{
	/** Prints the e-mail addresses found in a fixed forum thread. */
	public static void main(String [] args) throws Exception{
		URL url=new URL("http://tieba.baidu.com/p/2314539885");
		getEmailAddressFromURL(url);
	}

	/**
	 * Reads the content behind {@code url} line by line and prints every
	 * substring that looks like an e-mail address (word characters, '@',
	 * dotted host name), one per line.
	 *
	 * @param url location to read from
	 * @throws Exception if the connection cannot be opened or read
	 */
	public static void getEmailAddressFromURL(URL url) throws Exception{
		Pattern p=Pattern.compile("\\w+@\\w+(\\.\\w+)+");
		BufferedReader bufreader=new BufferedReader(new InputStreamReader(url.openConnection().getInputStream()));
		try{
			String line;
			while((line=bufreader.readLine())!=null){
				Matcher m=p.matcher(line);
				while(m.find()){
					System.out.println(m.group());
				}
			}
		}
		finally{
			// always release the connection's stream, even when reading fails
			bufreader.close();
		}
	}
}

/* 实现本地二进制文件拷贝 */

import java.io.*;
class CopyImg{
	/**
	 * Copies the binary file "psb.jpg" to "psb_copy.jpg" in the working
	 * directory using buffered streams. I/O errors are reported by printing
	 * the stack trace; both streams are closed afterwards.
	 */
	public static void main(String [] args){
		BufferedInputStream source=null;
		BufferedOutputStream target=null;
		try{
			source=new BufferedInputStream(new FileInputStream("psb.jpg"));
			target=new BufferedOutputStream(new FileOutputStream("psb_copy.jpg"));
			byte [] chunk=new byte[8192];
			// only the bytes actually read in this round are written out
			for(int count=source.read(chunk);count>0;count=source.read(chunk)){
				target.write(chunk,0,count);
			}
		}
		catch(IOException ioe){
			ioe.printStackTrace();
		}
		finally{
			closeAndReport(source);
			closeAndReport(target);
		}
	}

	/** Closes {@code stream} if it was opened, printing any failure. */
	private static void closeAndReport(Closeable stream){
		if(stream==null){
			return;
		}
		try{
			stream.close();
		}
		catch(IOException ioe){
			ioe.printStackTrace();
		}
	}
}

/* 从某个网页爬取符合规则的邮箱地址并保存到本地磁盘路径下 */

import java.io.*;
import java.util.regex.*;
import java.net.*;
import java.util.*;
class Spider{
	/** Loose e-mail shape: word characters, '@', then a dotted host name. */
	private static final Pattern EMAIL_PATTERN=Pattern.compile("\\w+@\\w+(\\.\\w+)+");

	/** Crawls a fixed forum thread, prints the addresses and saves them to disk. */
	public static void main(String [] args) throws Exception{
		URL url=new URL("http://tieba.baidu.com/p/2314539885");
		ArrayList<String> emailList=getEmailByURL(url);
		for(String emailAddress:emailList){
			System.out.println(emailAddress);
		}

		String qualifiedName="c://users//ghc//desktop//test//emailAddress.txt";

		writeEmailToFile(qualifiedName,emailList);
	}

	/**
	 * Reads the content behind {@code url} line by line and collects every
	 * substring matching the e-mail pattern, in order of appearance
	 * (duplicates are kept).
	 *
	 * @param url location to read from
	 * @return the matched addresses; empty when nothing matched
	 * @throws Exception if the connection cannot be opened or read
	 */
	public static ArrayList<String> getEmailByURL(URL url) throws Exception{
		ArrayList<String> emailList=new ArrayList<String>();
		BufferedReader bufreader=new BufferedReader(new InputStreamReader(url.openConnection().getInputStream()));
		try{
			String line;
			while((line=bufreader.readLine())!=null){
				Matcher m=EMAIL_PATTERN.matcher(line);
				while(m.find()){
					emailList.add(m.group());
				}
			}
		}
		finally{
			// always release the connection's stream, even when reading fails
			bufreader.close();
		}
		return emailList;
	}

	/**
	 * Writes each address on its own line to the file {@code qualifiedName},
	 * replacing any previous content.
	 *
	 * @param qualifiedName path of the file to write
	 * @param emailList addresses to store
	 * @throws Exception if the file cannot be opened or written
	 */
	public static void writeEmailToFile(String qualifiedName,ArrayList<String> emailList) throws Exception{
		BufferedWriter bufwriter=new BufferedWriter(new FileWriter(qualifiedName));
		try{
			for(String emailAddress:emailList){
				bufwriter.write(emailAddress);
				bufwriter.newLine();
			}
		}
		finally{
			// close() flushes the buffer and releases the file handle
			bufwriter.close();
		}
	}
}

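/* Crawl image URLs from a web page and download the images to a local folder. The basic flow works, but the regular expressions need to be tuned for the target site. */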

import java.net.*;
import java.io.*;
import java.util.regex.*;
import java.util.*;
class ImgSpider{
	/** Matches an img tag that carries a src attribute. */
	private static final Pattern IMG_TAG=Pattern.compile("<img.*src=(.*?)[^>]*?>");

	/** Matches image addresses of the form http://host/images/name.ext, e.g. http://www.jb51.net/images/logo.gif */
	private static final Pattern IMG_URL=Pattern.compile("http://(\\w+\\.)+[a-z]+/images/(\\w+\\.)+[a-z]{3}");

	public static void main(String [] args){
		saveImgFromURL("http://image.baidu.com/","c:/users/ghc/desktop/test/");
	}

	/**
	 * Downloads the resource at {@code line} and stores it in the file {@code path}.
	 *
	 * @param line address of the resource to download
	 * @param path target file; any '&lt;' characters are removed from it
	 * @return {@code true} if the whole resource was written, {@code false}
	 *         if an I/O error occurred (the stack trace is printed)
	 */
	public static boolean downLoadImg(String line,String path){
		boolean flag=true;
		BufferedInputStream bufinpts=null;
		BufferedOutputStream bufopts=null;
		// kept from the original implementation: drop '<' from the target path
		path=path.replace("<","");
		try{
			bufinpts=new BufferedInputStream((new URL(line)).openConnection().getInputStream());
			bufopts=new BufferedOutputStream(new FileOutputStream(path));

			byte [] buf=new byte[1024];
			int len;
			while((len=bufinpts.read(buf))!=-1){
				// write only the bytes read in this round, not the whole buffer
				bufopts.write(buf,0,len);
			}
			// flush here so that a failing final write is reported through flag
			bufopts.flush();
		}
		catch(IOException ioe){
			ioe.printStackTrace();
			flag=false;
		}
		finally{
			closeQuietly(bufopts);
			closeQuietly(bufinpts);
		}
		return flag;
	}

	/**
	 * Reads the page at {@code urlStr}, prints and collects every img tag,
	 * and downloads each image address found in those tags into {@code folder}.
	 * The local file name is the last path segment of the image address.
	 *
	 * @param urlStr address of the page to scan
	 * @param folder target directory prefix, expected to end with a separator
	 * @return {@code true} if the page was read and every download succeeded,
	 *         {@code false} if the address is malformed, the page cannot be
	 *         read, or any download fails
	 */
	public static boolean saveImgFromURL(String urlStr,String folder){
		boolean flag=true;
		BufferedReader bufr=null;
		try{
			URL url=new URL(urlStr);
			bufr=new BufferedReader(new InputStreamReader(url.openConnection().getInputStream()));

			ArrayList<String> imgList=new ArrayList<String>();
			String line;
			while((line=bufr.readLine())!=null){
				Matcher tagMatcher=IMG_TAG.matcher(line);
				while(tagMatcher.find()){
					System.out.println(tagMatcher.group());
					imgList.add(tagMatcher.group());
				}
			}

			for(String imgTag:imgList){
				Matcher urlMatcher=IMG_URL.matcher(imgTag);
				while(urlMatcher.find()){
					String imgUrl=urlMatcher.group();
					// build the target from the unchanged folder for every image
					String fileName=imgUrl.substring(imgUrl.lastIndexOf('/')+1);
					if(!downLoadImg(imgUrl,folder+fileName)){
						flag=false;
					}
				}
			}
		}
		catch(MalformedURLException mfe){
			mfe.printStackTrace();
			flag=false;
		}
		catch(IOException ioe){
			ioe.printStackTrace();
			flag=false;
		}
		finally{
			closeQuietly(bufr);
		}

		return flag;
	}

	/** Closes {@code resource} if it was opened, printing any failure. */
	private static void closeQuietly(Closeable resource){
		if(resource==null){
			return;
		}
		try{
			resource.close();
		}
		catch(IOException ioe){
			ioe.printStackTrace();
		}
	}
}







/*正则 小练习 */

class Demo{
	/** Runs the small regex exercises: validation, splitting and replacing. */
	public static void main(String [] args){
		String qq="1212345";
		boolean checkResult=checkQQ(qq);
		System.out.println(checkResult ? qq+" is right": qq+" is wrong!!!");

		String telnumber="15974097817";
		checkResult=checkTel(telnumber);
		System.out.println(checkResult ? telnumber+" is right": telnumber+" is wrong!!!");

		String path="c:\\users\\frank\\abqqcdkkkefghhijkkkkkl.txt";

		// Split on repeated characters: (.) captures one character, \1 refers
		// back to it and + requires it once or more, so "qq" and "kkk" both
		// act as separators.
		String regex="(.)\\1+";

		printAfterSplit(path,regex);

		System.out.println("=================");

		regex="\\.";

		printAfterSplit(path,regex);

		// collapse runs of four or more identical characters to one
		String str="abcddeffffg";
		regex="(.)\\1{3,}";
		String replaceStr="$1";
		printAfterReplaceStr(str,regex,replaceStr);
	}

	// ---- validation by regex match ----

	/**
	 * Checks an eleven-digit mobile number: a leading 1, then 3, 5 or 8,
	 * then nine digits.
	 *
	 * @param telnumber candidate number
	 * @return {@code true} if the whole string has that shape
	 */
	public static boolean checkTel(String telnumber){
		// no commas inside the class: in [...] a comma is an ordinary character
		String regex="1[358]\\d{9}";
		return telnumber.matches(regex);
	}

	/**
	 * Checks a QQ number: 5 to 15 digits, the first one not being 0.
	 *
	 * @param qq candidate number
	 * @return {@code true} if the whole string has that shape
	 */
	public static boolean checkQQ(String qq){
		// equivalent spelling: [1-9][0-9]{4,14}
		String regex="[1-9]\\d{4,14}";
		return qq.matches(regex);
	}

	// ---- splitting by regex ----

	/**
	 * Splits {@code path} around matches of {@code regex} and prints each
	 * piece on its own line.
	 *
	 * @param path text to split
	 * @param regex separator pattern
	 */
	public static void printAfterSplit(String path,String regex){
		String [] ary=path.split(regex);
		for(String s:ary){
			System.out.println(s);
		}
	}

	// ---- replacing by regex ----

	/**
	 * Replaces every match of {@code regex} in {@code str} with
	 * {@code replaceStr} and prints the result.
	 *
	 * @param str text to process
	 * @param regex pattern to look for
	 * @param replaceStr replacement; may reference groups such as {@code $1}
	 */
	public static void printAfterReplaceStr(String str,String regex,String replaceStr){
		String resultStr=str.replaceAll(regex,replaceStr);
		System.out.println(resultStr);
	}
}

  

  

posted @ 2017-04-02 21:56  云雾散人  阅读(589)  评论(0编辑  收藏  举报

Your attitude, not your aptitude, will determine your altitude!

如果有来生,一个人去远行,看不同的风景,感受生命的活力!