爬取豆瓣动画电影排行榜的海报
1、需要爬取的页面展示
2、代码
import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; import java.io.OutputStream; import java.net.HttpURLConnection; import java.net.URL; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.openqa.selenium.edge.EdgeDriver; public class SpiderPlaybill { public static void main(String[] args) throws Exception { System.setProperty("webdriver.edge.driver", DriverCommon.getDriverName(DriverCommon.getOSType())); EdgeDriver driver = new EdgeDriver(); Crawler crawler = new Crawler(driver); crawler.start(); //先登陆豆瓣 for (int page = 0; page <= 20; page += 20) { String url = "https://movie.douban.com/explore#!type=movie&tag=动画&sort=recommend&page_limit=20&page_start=0"; driver.get(url); try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } Document doc = Jsoup.parse(driver.getPageSource()); //逐层分析html Elements a = doc.select("div[class=list-wp]"); Elements b= a.select("a[class=item]"); for(Element element : b){ Element first = element.select("img").first(); String video_name = first.attr("alt")+".jpg"; String videoImg = first.attr("src"); downloadFileFromUrl(videoImg, video_name, "D:\\image\\"); System.out.println("video_name"+video_name); } } driver.close(); } /** * 下载文件 * @param fileUrl * @param fileName * @param savePath * @throws Exception */ private static void downloadFileFromUrl(String fileUrl, String fileName, String savePath) throws Exception { //获取连接 URL url = new URL(fileUrl); HttpURLConnection connection = (HttpURLConnection) url.openConnection(); connection.setConnectTimeout(3 * 1000); //设置请求头 connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36"); //获取输入流 InputStream in = connection.getInputStream(); File saveDir = new File(savePath); if (!saveDir.exists()) { saveDir.mkdirs(); } File file = new File(savePath + fileName); 
OutputStream out = new FileOutputStream(file); byte[] bytes = new byte[1024]; int len = 0; while ((len = in.read(bytes)) != -1) { out.write(bytes, 0, len); } out.close(); in.close(); } }
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

/**
 * Helpers for picking the browser driver binary that matches the host operating system.
 */
public class DriverCommon {

    /**
     * Detects the operating system type.
     *
     * <p>Reads the {@code os.name} system property. For anything that is neither macOS nor
     * Windows, {@code getconf LONG_BIT} is run to tell 32-bit from 64-bit; if that command
     * fails or prints nothing, 64-bit is assumed.
     *
     * @return {@code "mac"}, {@code "win"}, {@code "linux32"} or {@code "linux64"}
     */
    public static String getOSType() {
        String osName = System.getProperty("os.name");
        if (osName.contains("Mac")) {
            return "mac";
        }
        if (osName.contains("Win")) {
            return "win";
        }
        try {
            Process process = new ProcessBuilder("getconf", "LONG_BIT").start();
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
                String line = reader.readLine();
                // A null line means the command produced no output; fall through to the default.
                if (line != null && !line.contains("64")) {
                    return "linux32";
                }
                return "linux64";
            }
        } catch (IOException e) {
            System.err.println("getconf LONG_BIT failed, assuming 64-bit Linux: " + e);
            return "linux64"; // default to Linux 64-bit
        }
    }

    /**
     * Returns the browser driver path/name for an operating system type.
     *
     * <p>Accepts the values produced by {@link #getOSType()} ({@code "linux32"},
     * {@code "linux64"}) as well as the underscore spellings {@code "linux_32"} and
     * {@code "linux_64"}. Unknown values map to the 64-bit Linux driver.
     *
     * @param os operating system type, may be {@code null}
     * @return the driver path/name, or {@code null} when {@code os} is {@code null}
     * @throws IOException declared for compatibility with existing callers
     */
    public static String getDriverName(String os) throws IOException {
        if (os == null) {
            return null;
        }
        switch (os) {
            case "win":
                return "C:/myworkspace/spiderMovie/msedgedriver.exe";
            case "mac":
                return "chromedriver_mac";
            case "linux32":
            case "linux_32":
                return "chromedriver_linux32";
            case "linux64":
            case "linux_64":
            default:
                return "chromedriver_linux64";
        }
    }
}
import org.openqa.selenium.By; import org.openqa.selenium.edge.EdgeDriver; public class Crawler { // 网站的登陆链接 private String baseUrl = "https://accounts.douban.com/passport/login?redir=https%3A%2F%2Fwww.douban.com%2Fgroup%2F"; private EdgeDriver edgeDriver; public Crawler() { } public Crawler(EdgeDriver driver) { super(); this.edgeDriver = driver; } //登陆 豆瓣 public void start() { // 登入网站 edgeDriver.get(baseUrl); // 先点击 密码 登陆,使password 存在 才行 edgeDriver.findElement(By.cssSelector("ul.tab-start > li:nth-child(2)")).click(); // 输入密码 edgeDriver.findElement(By.id("username")).sendKeys("139000000");// 手机号 edgeDriver.findElement(By.id("password")).sendKeys("*********"); try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } edgeDriver.findElement(By.cssSelector("div.account-form-field-submit > a")).click(); try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } } }
3、代码所需要的 jar 包
链接:https://pan.baidu.com/s/1Cm_36caoq_UEFwYYxTR2qA
提取码:g6j8
复制这段内容后打开百度网盘手机App,操作更方便哦
4、爬取结果