不归家的夜

导航

Java 抓取网页图片

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads the images referenced by a web page.
 *
 * <p>
 * The page is fetched as text, image URLs are pulled out of it with regular
 * expressions, and every URL is then saved to a local directory. Three
 * download strategies are available: single-threaded, one thread per batch,
 * and one thread per batch with a timeout on every request.
 *
 * <p>
 * Only {@code http:} URLs are recognised by the extraction patterns.
 *
 * @author ITWANG
 */
public class CatchImage
{

	// Page that main() scans for images.
	private static final String URL = "http://www.4493.com/";
	// Charset used to decode the fetched page.
	private static final String ECODING = "UTF-8";
	// Matches an <img ...> tag that carries a src attribute.
	private static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";
	// Matches an http: URL followed by its terminator (quote, '>' or whitespace).
	private static final String IMGSRC_REG = "http:\"?(.*?)(\"|>|\\s+)";
	// Matches a quoted string that starts with http and ends with a picture extension.
	private static final String IMGDSRC_REG = "[\"\'](http.+\\.(jpg|JPG|png|PNG|gif|GIF))[\"\']";
	// Picture extensions recognised by getpicsrc().
	private static final String[] picstuffix = { "jpg", "JPG", "gif", "GIF", "png", "PNG" };

	// The patterns are compiled once instead of on every call.
	private static final Pattern IMG_TAG_PATTERN = Pattern.compile(IMGURL_REG);
	private static final Pattern IMG_SRC_PATTERN = Pattern.compile(IMGSRC_REG);
	private static final Pattern QUOTED_IMG_PATTERN = Pattern.compile(IMGDSRC_REG);

	// Size of the buffer used when copying streams.
	private static final int BUFFER_SIZE = 1024;

	// URLs collected by getpicsrc().
	private static List<String> pList = new ArrayList<>();

	/**
	 * Fetches the configured page, extracts the image URLs from its
	 * {@code <img>} tags and downloads them one after another.
	 *
	 * @param args
	 *            ignored
	 * @throws Exception
	 *             if the page cannot be fetched or decoded
	 */
	public static void main(String[] args) throws Exception
	{
		CatchImage cm = new CatchImage();
		// Fetch the HTML text of the page.
		String HTML = cm.getHTML(URL);
		// Collect the <img> tags.
		List<String> imgUrl = cm.getImageUrl(HTML);
		// Collect the src URLs found inside those tags.
		List<String> imgSrc = cm.getImageSrc(imgUrl);
		// Download the images.
		cm.Download(imgSrc, "E:\\Imagesave" + saveDiff());

		// Alternative flows:
		// cm.getImageSrc(HTML);
		// cm.ThreadDownload(imgSrc, "E:\\Imagesave" + saveDiff(), 6);
		// cm.TOThreadDownload(pList, "E:\\Imagesave" + saveDiff(), 6, 6000);
	}

	/**
	 * Fetches the content of a URL and decodes it as {@link #ECODING} text.
	 *
	 * @param url
	 *            address of the page to fetch
	 * @return the decoded page content
	 * @throws Exception
	 *             if the URL is malformed or the content cannot be read
	 */
	private String getHTML(String url) throws Exception
	{
		URLConnection connection = new URL(url).openConnection();
		// All bytes are gathered before decoding: decoding each buffer on its
		// own would include stale bytes from earlier reads and could split a
		// multi-byte character across two buffers.
		ByteArrayOutputStream raw = new ByteArrayOutputStream();
		try (InputStream in = connection.getInputStream())
		{
			copy(in, raw);
		}
		return raw.toString(ECODING);
	}

	/**
	 * Finds every {@code <img>} tag in an HTML text.
	 *
	 * @param HTML
	 *            HTML text to scan
	 * @return the matched tags, in order of appearance
	 */
	private List<String> getImageUrl(String HTML)
	{
		List<String> listImgUrl = new ArrayList<String>();
		Matcher matcher = IMG_TAG_PATTERN.matcher(HTML);
		while (matcher.find())
		{
			listImgUrl.add(matcher.group());
		}
		return listImgUrl;
	}

	/**
	 * Extracts the {@code http:} URLs contained in a list of {@code <img>}
	 * tags.
	 *
	 * @param listImageUrl
	 *            tags as returned by {@link #getImageUrl(String)}
	 * @return the URLs, without their terminating quote, '&gt;' or whitespace
	 */
	private List<String> getImageSrc(List<String> listImageUrl)
	{
		List<String> listImgSrc = new ArrayList<String>();
		for (String image : listImageUrl)
		{
			Matcher matcher = IMG_SRC_PATTERN.matcher(image);
			while (matcher.find())
			{
				// Group 2 is the terminator. Cutting where it starts removes
				// the whole terminator even when it is a run of several
				// whitespace characters.
				listImgSrc.add(image.substring(matcher.start(), matcher.start(2)));
			}
		}
		return listImgSrc;
	}

	/**
	 * Scans an HTML text for quoted picture URLs and stores what
	 * {@link #getpicsrc(String)} extracts from each match.
	 *
	 * @param html
	 *            HTML text to scan
	 */
	private void getImageSrc(String html)
	{
		Matcher m = QUOTED_IMG_PATTERN.matcher(html);
		while (m.find())
		{
			getpicsrc(m.group(1));
		}
	}

	/**
	 * Splits a string on {@code "http:"} and stores, for every part that
	 * contains a known picture extension, the URL cut right after that
	 * extension.
	 *
	 * @param src
	 *            text that may contain one or more picture URLs; {@code null}
	 *            is ignored
	 */
	public void getpicsrc(String src)
	{
		if (src == null || !src.contains("http:"))
		{
			return;
		}
		for (String part : src.split("http:"))
		{
			if (isBlank(part))
			{
				continue;
			}
			for (String suffix : picstuffix)
			{
				// Search for the dot together with the extension so that a
				// path segment such as "/jpgs/" is not mistaken for the end
				// of the URL.
				String dotted = "." + suffix;
				int cut = part.indexOf(dotted);
				if (cut >= 0)
				{
					pList.add("http:" + part.substring(0, cut) + dotted);
				}
			}
		}
	}

	/**
	 * Removes duplicate entries while keeping the first occurrence of every
	 * element in its original position.
	 *
	 * @param result
	 *            list that may contain duplicates
	 * @return a new list without duplicates
	 */
	public static List<String> RemoveRepeated(List<String> result)
	{
		return new ArrayList<String>(new LinkedHashSet<String>(result));
	}

	/**
	 * Tells whether a character sequence is {@code null}, empty or made of
	 * whitespace only.
	 *
	 * @param cs
	 *            sequence to check, may be {@code null}
	 * @return {@code true} if there is no non-whitespace character
	 */
	public static boolean isBlank(CharSequence cs)
	{
		if (cs == null)
		{
			return true;
		}
		for (int i = 0; i < cs.length(); i++)
		{
			if (!Character.isWhitespace(cs.charAt(i)))
			{
				return false;
			}
		}
		return true;
	}

	/**
	 * Downloads the images one after another. Every image keeps the last
	 * path segment of its URL as file name. A failed download is reported
	 * and does not stop the remaining ones.
	 *
	 * @param listImgSrc
	 *            URLs to download
	 * @param savedir
	 *            target directory, created if missing
	 */
	private void Download(List<String> listImgSrc, String savedir)
	{
		for (String url : listImgSrc)
		{
			try
			{
				String imageName = url.substring(url.lastIndexOf("/") + 1);
				saveUrlToFile(url, new File(savedir, imageName));
				System.out.println("*^_^*");
			} catch (IOException | RuntimeException e)
			{
				System.out.println("-_-!");
				System.err.println("Failed to download " + url + ": " + e);
			}
		}
	}

	/**
	 * Downloads the images with one thread per batch of {@code tnum} URLs.
	 * The method returns as soon as the threads are started.
	 *
	 * @param listImgSrc
	 *            URLs to download
	 * @param savedir
	 *            target directory, created if missing
	 * @param tnum
	 *            number of URLs handled by each thread, must be positive
	 */
	private void ThreadDownload(List<String> listImgSrc, String savedir, int tnum)
	{
		requirePositiveBatch(tnum);
		for (int i = 0; i < listImgSrc.size(); i += tnum)
		{
			new DThread(savedir, tnum, listImgSrc, i).start();
		}
	}

	/**
	 * Thread that downloads one batch of images under random file names.
	 *
	 * @author ITWANG
	 */
	class DThread extends Thread
	{

		private String savedir = null;
		private int tnum;
		private List<String> listImgSrc;
		private int bunm;

		/**
		 * @param savedir
		 *            target directory
		 * @param tnum
		 *            maximum number of URLs in the batch
		 * @param listImgSrc
		 *            list holding the URLs
		 * @param bnum
		 *            index of the first URL of the batch
		 */
		public DThread(String savedir, int tnum, List<String> listImgSrc, int bnum)
		{
			this.savedir = savedir;
			this.tnum = tnum;
			this.listImgSrc = listImgSrc;
			this.bunm = bnum;
		}

		@Override
		public void run()
		{
			// The last batch may be shorter than tnum.
			int end = Math.min(bunm + tnum, listImgSrc.size());
			for (int i = bunm; i < end; i++)
			{
				String url = listImgSrc.get(i);
				try
				{
					String imageName = randomFileName(url);
					System.out.println(savedir + imageName);
					saveUrlToFile(url, new File(savedir, imageName));
					System.out.println("*^_^*");
				} catch (IOException | RuntimeException e)
				{
					System.out.println("-_-!");
					System.err.println("Failed to download " + url + ": " + e);
				}
			}
		}
	}

	/**
	 * Downloads the images with one thread per batch of {@code tnum} URLs,
	 * applying a timeout to every request. The method returns as soon as the
	 * threads are started.
	 *
	 * @param listImgSrc
	 *            URLs to download
	 * @param savedir
	 *            target directory, created if missing
	 * @param tnum
	 *            number of URLs handled by each thread, must be positive
	 * @param timeout
	 *            timeout in milliseconds
	 */
	private void TOThreadDownload(List<String> listImgSrc, String savedir, int tnum, int timeout)
	{
		requirePositiveBatch(tnum);
		for (int i = 0; i < listImgSrc.size(); i += tnum)
		{
			new TODThread(savedir, tnum, listImgSrc, i, timeout).start();
		}
	}

	/**
	 * Thread that downloads one batch of images through
	 * {@link CatchImage#getPic(String, String, String, int)}.
	 *
	 * @author ITWANG
	 */
	class TODThread extends Thread
	{
		private String savedir = null;
		private int tnum;
		private List<String> listImgSrc;
		private int bunm;
		private int timeout = 3000;

		/**
		 * @param savedir
		 *            target directory
		 * @param tnum
		 *            maximum number of URLs in the batch
		 * @param listImgSrc
		 *            list holding the URLs
		 * @param bnum
		 *            index of the first URL of the batch
		 * @param timeout
		 *            timeout in milliseconds
		 */
		public TODThread(String savedir, int tnum, List<String> listImgSrc, int bnum, int timeout)
		{
			this.savedir = savedir;
			this.tnum = tnum;
			this.listImgSrc = listImgSrc;
			this.bunm = bnum;
			this.timeout = timeout;
		}

		@Override
		public void run()
		{
			// The last batch may be shorter than tnum.
			int end = Math.min(bunm + tnum, listImgSrc.size());
			for (int i = bunm; i < end; i++)
			{
				String url = listImgSrc.get(i);
				try
				{
					if (getPic(url, savedir, randomFileName(url), timeout))
					{
						System.out.println("*^_^*");
					} else
					{
						System.out.println("-_-!");
					}
				} catch (Exception e)
				{
					System.out.println("下载异常");
					System.err.println("Failed to download " + url + ": " + e);
				}
			}

		}
	}

	/**
	 * Downloads a picture with an HTTP GET request.
	 *
	 * @param purl
	 *            URL of the picture
	 * @param folder
	 *            target directory, created if missing
	 * @param filename
	 *            name of the file to write inside {@code folder}
	 * @param timeout
	 *            connect and read timeout in milliseconds
	 * @return {@code true} if the server answered 200 and the file was
	 *         written, {@code false} for any other response code
	 * @throws Exception
	 *             if the URL is malformed, the request fails or the file
	 *             cannot be written
	 */
	public boolean getPic(String purl, String folder, String filename, int timeout) throws Exception
	{
		HttpURLConnection conn = (HttpURLConnection) new URL(purl).openConnection();
		try
		{
			conn.setConnectTimeout(timeout);
			// Without a read timeout a server that stops sending data would
			// block the thread indefinitely.
			conn.setReadTimeout(timeout);
			conn.setRequestMethod("GET");
			// Output is not enabled: a GET request has no body to send.
			conn.setDoInput(true);
			if (conn.getResponseCode() != HttpURLConnection.HTTP_OK)
			{
				return false;
			}
			File sf = new File(folder);
			if (!sf.exists())
			{
				sf.mkdirs();
			}
			try (InputStream is = conn.getInputStream();
					OutputStream os = new FileOutputStream(new File(sf, filename)))
			{
				copy(is, os);
			}
			return true;
		} finally
		{
			conn.disconnect();
		}
	}

	/**
	 * Builds a directory name from the current date and time.
	 *
	 * @return the time stamp formatted as {@code yyyy-MM-dd-HH-mm-ss} with a
	 *         backslash before and after it
	 */
	public static String saveDiff()
	{
		SimpleDateFormat formate = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss");
		return "\\" + formate.format(System.currentTimeMillis()) + "\\";
	}

	/** Copies everything from {@code in} to {@code out}; closes neither stream. */
	private static void copy(InputStream in, OutputStream out) throws IOException
	{
		byte[] buf = new byte[BUFFER_SIZE];
		int length;
		while ((length = in.read(buf)) != -1)
		{
			out.write(buf, 0, length);
		}
	}

	/** Saves the content of {@code url} to {@code target}, creating the parent directory if needed. */
	private static void saveUrlToFile(String url, File target) throws IOException
	{
		File parent = target.getParentFile();
		// mkdirs() returns false when another thread created the directory in
		// the meantime, hence the second isDirectory() check.
		if (parent != null && !parent.isDirectory() && !parent.mkdirs() && !parent.isDirectory())
		{
			throw new IOException("Cannot create directory " + parent);
		}
		try (InputStream in = new URL(url).openStream();
				FileOutputStream out = new FileOutputStream(target))
		{
			copy(in, out);
		}
	}

	/** Returns a random UUID followed by the extension of the last path segment of {@code url}, if it has one. */
	private static String randomFileName(String url)
	{
		String lastSegment = url.substring(url.lastIndexOf('/') + 1);
		int dot = lastSegment.lastIndexOf('.');
		String extension = dot >= 0 ? lastSegment.substring(dot) : "";
		return UUID.randomUUID().toString() + extension;
	}

	/** Rejects batch sizes that would keep the batching loops from advancing. */
	private static void requirePositiveBatch(int tnum)
	{
		if (tnum <= 0)
		{
			throw new IllegalArgumentException("tnum must be positive: " + tnum);
		}
	}

}

  

posted on 2014-04-29 17:24  不归家的夜  阅读(428)  评论(0)    收藏  举报