爬取豆瓣动画电影排行榜的海报

1、需要爬取的页面展示

 

 2、代码

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.edge.EdgeDriver;

/**
 * Downloads the poster images listed on Douban's "animation" movie explore page.
 *
 * <p>The program drives an Edge browser through Selenium: it logs in via
 * {@link Crawler}, loads each result page, parses the rendered HTML with Jsoup and
 * saves every poster image into {@link #SAVE_DIR}, using the image's {@code alt}
 * text plus ".jpg" as the file name.
 */
public class SpiderPlaybill {
    /** Directory the posters are written to. */
    private static final String SAVE_DIR = "D:\\image\\";

    /** Number of movies requested per result page. */
    private static final int PAGE_SIZE = 20;

    /** Offset of the last result page to fetch. */
    private static final int LAST_PAGE_START = 20;

    public static void main(String[] args) throws Exception {
        System.setProperty("webdriver.edge.driver", DriverCommon.getDriverName(DriverCommon.getOSType()));
        EdgeDriver driver = new EdgeDriver();
        try {
            // Log in to Douban first.
            Crawler crawler = new Crawler(driver);
            crawler.start();

            for (int page = 0; page <= LAST_PAGE_START; page += PAGE_SIZE) {
                // The offset must follow the loop variable; a fixed page_start=0 would
                // fetch the first page on every iteration.
                String url = "https://movie.douban.com/explore#!type=movie&tag=动画&sort=recommend&page_limit="
                        + PAGE_SIZE + "&page_start=" + page;
                driver.get(url);
                try {
                    // Give the page a moment to render before reading its source.
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    // Keep the interrupt visible to the caller and stop crawling.
                    Thread.currentThread().interrupt();
                    break;
                }

                // Walk the HTML layer by layer: list container -> item links -> poster image.
                Document doc = Jsoup.parse(driver.getPageSource());
                Elements items = doc.select("div[class=list-wp]").select("a[class=item]");

                for (Element item : items) {
                    Element img = item.select("img").first();
                    if (img == null) {
                        // Item without a poster image: nothing to download.
                        continue;
                    }
                    String videoImg = img.attr("src");
                    if (videoImg.isEmpty()) {
                        continue;
                    }
                    String video_name = img.attr("alt") + ".jpg";
                    downloadFileFromUrl(videoImg, video_name, SAVE_DIR);
                    System.out.println("video_name" + video_name);
                }
            }
        } finally {
            // Always end the browser session, even when the crawl fails half-way.
            driver.quit();
        }
    }

    /**
     * Downloads a file over HTTP and stores it in the given directory.
     *
     * @param fileUrl  URL of the file to download
     * @param fileName name of the file to create; characters that are not allowed in
     *                 Windows file names are replaced with {@code _}
     * @param savePath target directory; created if it does not exist
     * @throws Exception if the URL is malformed, the download fails or the file
     *                   cannot be written
     */
    private static void downloadFileFromUrl(String fileUrl, String fileName, String savePath) throws Exception {
        // Open the connection.
        URL url = new URL(fileUrl);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setConnectTimeout(3 * 1000);
        // Without a read timeout a stalled server would block the crawl forever.
        connection.setReadTimeout(10 * 1000);
        // Set the request header.
        connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36");

        // The input stream is opened before the file is created, so a failed request
        // does not leave an empty file behind.
        try (InputStream in = connection.getInputStream()) {
            File saveDir = new File(savePath);
            if (!saveDir.exists() && !saveDir.mkdirs() && !saveDir.isDirectory()) {
                throw new java.io.IOException("Cannot create directory: " + saveDir);
            }
            File file = new File(saveDir, sanitizeFileName(fileName));

            try (OutputStream out = new FileOutputStream(file)) {
                byte[] buffer = new byte[8192];
                int len;
                while ((len = in.read(buffer)) != -1) {
                    out.write(buffer, 0, len);
                }
            }
        } finally {
            connection.disconnect();
        }
    }

    /** Replaces characters that are illegal in Windows file names with an underscore. */
    private static String sanitizeFileName(String fileName) {
        return fileName.replaceAll("[\\\\/:*?\"<>|]", "_");
    }
}

  

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

/**
 * Helpers for choosing the browser-driver executable that matches the current
 * operating system.
 */
public class DriverCommon {
    /**
     * Detects the operating-system type from the {@code os.name} system property.
     *
     * <p>For anything that is neither macOS nor Windows, {@code getconf LONG_BIT} is
     * run to tell 32-bit from 64-bit.
     *
     * @return {@code "mac"}, {@code "win"}, {@code "linux64"} or {@code "linux32"};
     *         {@code "linux64"} when the bitness cannot be determined
     */
    public static String getOSType() {
        String osName = System.getProperty("os.name");
        if (osName.contains("Mac")) {
            return "mac";
        }
        if (osName.contains("Win")) {
            return "win";
        }
        try {
            Process process = new ProcessBuilder("getconf", "LONG_BIT").start();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()))) {
                String line = reader.readLine();
                // A null line means the command printed nothing; fall back to the default.
                if (line != null && !line.contains("64")) {
                    return "linux32";
                }
                return "linux64";
            }
        } catch (IOException e) {
            e.printStackTrace();
            return "linux64"; // default to 64-bit Linux
        }
    }

    /**
     * Returns the browser-driver path or file name for the given OS type.
     *
     * <p>Both the spellings produced by {@link #getOSType()} ({@code "linux32"},
     * {@code "linux64"}) and the underscore variants ({@code "linux_32"},
     * {@code "linux_64"}) are accepted. Any unrecognised value maps to the 64-bit
     * Linux driver.
     *
     * @param os OS type as returned by {@link #getOSType()}; may be {@code null}
     * @return the driver path or file name, or {@code null} when {@code os} is {@code null}
     * @throws IOException declared for callers; the current implementation does not throw it
     */
    public static String getDriverName(String os) throws IOException {
        if (os == null) {
            return null;
        }
        switch (os) {
        case "win":
            return "C:/myworkspace/spiderMovie/msedgedriver.exe";
        case "mac":
            return "chromedriver_mac";
        case "linux32":
        case "linux_32":
            return "chromedriver_linux32";
        case "linux64":
        case "linux_64":
        default:
            return "chromedriver_linux64";
        }
    }
}

  

import org.openqa.selenium.By;
import org.openqa.selenium.edge.EdgeDriver;

/**
 * Logs in to Douban through a Selenium-driven Edge browser so that later page
 * loads in the same browser session are authenticated.
 */
public class Crawler {
    // Login URL of the site.
    private String baseUrl = "https://accounts.douban.com/passport/login?redir=https%3A%2F%2Fwww.douban.com%2Fgroup%2F";
    private EdgeDriver edgeDriver;

    /** Creates a crawler without a driver; {@link #start()} cannot be used on it. */
    public Crawler() {
    }

    /**
     * Creates a crawler that logs in using the given browser driver.
     *
     * @param driver the Edge driver to operate
     */
    public Crawler(EdgeDriver driver) {
        this.edgeDriver = driver;
    }

    /**
     * Opens the login page, switches to password login, fills in the credentials
     * and submits the form.
     *
     * @throws IllegalStateException if this crawler was created without a driver
     */
    public void start() {
        if (edgeDriver == null) {
            throw new IllegalStateException("Crawler has no EdgeDriver; use Crawler(EdgeDriver)");
        }
        // Open the login page.
        edgeDriver.get(baseUrl);
        // Switch to the password-login tab first; the password field only exists after that.
        edgeDriver.findElement(By.cssSelector("ul.tab-start > li:nth-child(2)")).click();
        // Enter the credentials (the values below are placeholders).
        edgeDriver.findElement(By.id("username")).sendKeys("139000000"); // phone number
        edgeDriver.findElement(By.id("password")).sendKeys("*********");
        pause(1000);
        edgeDriver.findElement(By.cssSelector("div.account-form-field-submit  > a")).click();
        // Give the login request a moment to complete.
        pause(1000);
    }

    /** Sleeps for the given time, restoring the interrupt flag if interrupted. */
    private static void pause(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}

  3、代码所需要的 jar 包  

链接:https://pan.baidu.com/s/1Cm_36caoq_UEFwYYxTR2qA
提取码:g6j8
复制这段内容后打开百度网盘手机App,操作更方便哦

 

4、爬取结果

 

posted @ 2020-10-18 15:26  一右四分之一  阅读(150)  评论(0)  编辑  收藏  举报