爬取拉勾网职位等信息(Java)
工具:火狐浏览器,selenium IDE(3版本往上),Eclipse,selenium-java.jar(需导入),selenium-server-standalone-3.141.5.jar(需导入),poi-bin-5.2.2
1 package one; 2 3 import java.io.File; 4 import java.io.FileOutputStream; 5 import java.util.List; 6 7 import org.apache.poi.hssf.usermodel.HSSFRow; 8 import org.apache.poi.hssf.usermodel.HSSFSheet; 9 import org.apache.poi.hssf.usermodel.HSSFWorkbook; 10 import org.apache.poi.ss.usermodel.Workbook; 11 import org.openqa.selenium.By; 12 import org.openqa.selenium.WebDriver; 13 import org.openqa.selenium.WebDriver.Navigation; 14 import org.openqa.selenium.WebElement; 15 import org.openqa.selenium.firefox.FirefoxDriver; 16 17 public class search_lagouwang { 18 19 public static void main(String[] args) throws InterruptedException { 20 21 String s = "",job="",company="",money="",experience="",whole=""; 22 int i=1; 23 int j=1; //i表示表格第几行,j表示网页第几页 24 25 //创建工作表,需要导出到文件 26 HSSFWorkbook workbook = new HSSFWorkbook(); 27 HSSFSheet sheet = workbook.createSheet(); 28 HSSFRow row = sheet.createRow(0); 29 row.createCell(0).setCellValue("职位"); 30 row.createCell(1).setCellValue("公司"); 31 row.createCell(2).setCellValue("薪资"); 32 row.createCell(3).setCellValue("工作经验"); 33 34 System.setProperty ( "webdriver.firefox.bin" , "E:\\Mozilla Firefox\\firefox.exe" ); //需导入一堆.jar 35 System.setProperty("webdriver.gecko.driver", "E:\\Mozilla Firefox\\geckodriver.exe"); 36 //selenium3中没有火狐启动驱动,需要重新下载geckodriver.exe 37 WebDriver driver = new FirefoxDriver(); 38 Navigation navigation = driver.navigate(); 39 navigation.to("https://www.lagou.com/"); 40 Thread.sleep(3000); 41 42 driver.findElement(By.id("cboxClose")).click(); //点击弹出上面的X按钮 43 Thread.sleep(3000); //需要等待几秒,不然时间太快输入框获取不到 44 driver.findElement(By.id("search_input")).clear(); 45 driver.findElement(By.id("search_input")).sendKeys("java"); 46 driver.findElement(By.id("search_button")).click(); 47 Thread.sleep(2000); 48 //获取下一页按钮 49 WebElement next = driver.findElement(By.className("lg-pagination-next")); 50 while(next != null && next.isEnabled()==true) { //按钮存在且可点击 51 //搜索过程 52 53 List<WebElement> all = driver.findElements(By.className("item-top__1Z3Zo")); //是findElements,多个元素集合 54 55 for(WebElement a : all) { 56 //获得职位 57 job = a.findElement(By.className("p-top__1F7CL")) 58 .findElement(By.tagName("a")).getText(); 59 //获得公司 60 company = a.findElement(By.className("company-name__2-SjF")) 61 .findElement(By.tagName("a")).getText(); 62 //获得薪资、经验 63 money = a.findElement(By.className("p-bom__JlNur")) 64 .findElement(By.tagName("span")).getText(); 65 whole = a.findElement(By.className("p-bom__JlNur")).getText(); 66 experience = whole.substring(money.length(), whole.length()); 67 //输出到工作表 68 row = sheet.createRow(i); 69 row.createCell(0).setCellValue(job); 70 row.createCell(1).setCellValue(company); 71 row.createCell(2).setCellValue(money); 72 row.createCell(3).setCellValue(experience); 73 //System.out.println("公司: "+company+"职位:"+job+"薪资: "+money+"经验: "+experience); 74 i++; 75 76 } 77 //点击下一页 78 next.click(); 79 Thread.sleep(3000); 80 j++; //由于搜索过多页会弹出登录框,只搜素10页,j控制页数 81 if(j<=10) { 82 next = driver.findElement(By.className("lg-pagination-next")); 83 }else { 84 break; 85 } 86 } 87 88 89 try { 90 //创建输出流,将工作表写入到文件lagouwang.xls 91 File file = new File("E:/lagouwang_1.xls"); 92 file.createNewFile(); 93 FileOutputStream ot = new FileOutputStream(file); 94 workbook.write(ot); 95 }catch(Exception e) { 96 e.printStackTrace(); 97 } 98 99 } 100 101 }

浙公网安备 33010602011771号