旧瓶新酒-获取网络资源即爬取下载页面内容(图片、html、css、js等)

这个java获取网络资源以前也写过不少
最近用到又重新写了一个,apache.commons.io中的例子就非常好,但是无法对请求进行详细设置
于是大部分照搬,局部替换以设置请求头
如需更加复杂的设置,可以考虑使用同为 Apache 的 HttpComponents


```java
package boot.example;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Demonstrates several ways to fetch a network resource (image, html, css, js, ...)
 * and either store it in a local file or print it as text:
 * <ul>
 *   <li>{@code apacheCommonsIoDownload} - one call to commons-io {@code FileUtils};</li>
 *   <li>{@code pureJavaNetDownload} - {@code java.net} / {@code java.io} only;</li>
 *   <li>{@code mixedDownload} - hand-built request plus commons-io stream helpers;</li>
 *   <li>{@code getContentAsString} - reads the resource into memory and prints it.</li>
 * </ul>
 *
 * Created by wq on 2017/6/6.
 */
public class Download {

    /** User-Agent header sent with the hand-built requests. */
    private static final String USER_AGENT = "Mozilla/4.0";

    /** Connect timeout and read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Size in bytes of the buffer used by the manual copy loop. */
    private static final int BUFFER_SIZE = 1024 * 4;

    /**
     * Runs each download variant once. Every call has its own try/catch so that a
     * failure in one variant does not prevent the remaining ones from running.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Download download = new Download();
        String url = "http://img1.3lian.com/2015/w7/85/d/21.jpg";
        // The backslash must be escaped: an unescaped "\1" inside a string literal is an
        // octal escape (the control character U+0001), not a path separator followed by "1".
        String path1 = "E:\\1.jpg";
        String path2 = "E:\\2.jpg";
        String path3 = "E:\\3.jpg";
        String url2 = "http://www.baidu.com";
        try {
            download.apacheCommonsIoDownload(url, path1);
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            download.pureJavaNetDownload(url, path2);
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            download.mixedDownload(url, path3);
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            download.getContentAsString(url2);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Convenience overload of {@link #apacheCommonsIoDownload(String, File)}.
     *
     * @param urlstr address of the resource to download
     * @param path   path of the destination file
     * @throws Exception if the URL is malformed or the copy fails
     */
    private void apacheCommonsIoDownload(String urlstr, String path) throws Exception {
        apacheCommonsIoDownload(urlstr, new File(path));
    }

    /**
     * Downloads the resource by delegating to {@code FileUtils.copyURLToFile}.
     * No request headers or timeouts are configured here.
     *
     * @param urlstr address of the resource to download
     * @param file   destination file
     * @throws Exception if the URL is malformed or the copy fails
     */
    private void apacheCommonsIoDownload(String urlstr, File file) throws Exception {
        FileUtils.copyURLToFile(new URL(urlstr), file);
    }

    /**
     * Convenience overload of {@link #pureJavaNetDownload(String, File)}.
     *
     * @param urlstr address of the resource to download
     * @param path   path of the destination file
     * @throws Exception if the request or the copy fails
     */
    private void pureJavaNetDownload(String urlstr, String path) throws Exception {
        pureJavaNetDownload(urlstr, new File(path));
    }

    /**
     * Downloads the resource using only {@code java.net} and {@code java.io}
     * (no third-party dependency).
     *
     * @param urlstr address of the resource to download
     * @param file   destination file
     * @throws Exception if the request or the copy fails
     */
    private void pureJavaNetDownload(String urlstr, File file) throws Exception {
        // When no request configuration is needed, "new URL(urlstr).openStream()" is enough.
        InputStream is = openGetConnection(urlstr).getInputStream();
        try {
            FileOutputStream fos = new FileOutputStream(file);
            try {
                // Same loop as the stream copy performed by commons-io IOUtils.copy.
                byte[] buffer = new byte[BUFFER_SIZE];
                int n;
                while (-1 != (n = is.read(buffer))) {
                    fos.write(buffer, 0, n);
                }
            } finally {
                closeQuietly(fos);
            }
        } finally {
            closeQuietly(is);
        }
    }

    /**
     * Convenience overload of {@link #mixedDownload(String, File)}.
     *
     * @param urlstr address of the resource to download
     * @param path   path of the destination file
     * @throws Exception if the request or the copy fails
     */
    private void mixedDownload(String urlstr, String path) throws Exception {
        mixedDownload(urlstr, new File(path));
    }

    /**
     * Recommended variant: the request is built by hand so that headers and timeouts
     * can be configured, while commons-io {@code IOUtils} / {@code FileUtils} keep the
     * stream handling short.
     *
     * @param urlstr address of the resource to download
     * @param file   destination file
     * @throws Exception if the request or the copy fails
     */
    private void mixedDownload(String urlstr, File file) throws Exception {
        InputStream is = openGetConnection(urlstr).getInputStream();
        try {
            FileOutputStream output = FileUtils.openOutputStream(file);
            try {
                IOUtils.copy(is, output);
            } finally {
                IOUtils.closeQuietly(output);
            }
        } finally {
            IOUtils.closeQuietly(is);
        }
    }

    /**
     * Reads the whole resource into memory and prints it to standard output.
     * The bytes are decoded as UTF-8 instead of the platform default charset, so the
     * result does not depend on the machine the program runs on; a page served in a
     * different encoding will still be decoded incorrectly.
     *
     * @param urlstr address of the resource to read
     * @throws Exception if the URL is malformed or reading fails
     */
    private void getContentAsString(String urlstr) throws Exception {
        URL url = new URL(urlstr);
        InputStream is = url.openStream();
        try {
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            IOUtils.copy(is, bos);
            System.out.println(bos.toString("UTF-8"));
        } finally {
            // Release the connection's stream even when the copy fails.
            IOUtils.closeQuietly(is);
        }
    }

    /**
     * Opens a connected GET request with the User-Agent header and both timeouts set.
     * Further request settings, if needed, belong here before {@code connect()}.
     *
     * @param urlstr address to connect to; its connection is cast to
     *               {@code HttpURLConnection}
     * @return the connected {@code HttpURLConnection}
     * @throws IOException if the URL is malformed or the connection cannot be established
     */
    private HttpURLConnection openGetConnection(String urlstr) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(urlstr).openConnection();
        connection.setRequestMethod("GET");
        // Some sites block programmatic clients; sending a User-Agent header avoids a 403.
        connection.setRequestProperty("User-Agent", USER_AGENT);
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);
        connection.connect();
        return connection;
    }

    /**
     * Closes the given resource, deliberately ignoring any {@code IOException}
     * (best-effort cleanup, as in the original code).
     *
     * @param closeable resource to close; {@code null} is accepted and ignored
     */
    private static void closeQuietly(Closeable closeable) {
        if (closeable == null) {
            return;
        }
        try {
            closeable.close();
        } catch (IOException ignored) {
            // A failure to close is intentionally not reported.
        }
    }
}

posted on 2017-06-09 09:46  幽魂步  阅读(437)  评论(0)    收藏  举报

导航