爬取拉勾网职位等信息(Java)

工具:火狐浏览器,selenium IDE(3版本往上),Eclipse,selenium-java.jar(需导入),selenium-server-standalone-3.141.5.jar(需导入),poi-bin-5.2.2

  1 package one;
  2 
  3 import java.io.File;
  4 import java.io.FileOutputStream;
  5 import java.util.List;
  6 
  7 import org.apache.poi.hssf.usermodel.HSSFRow;
  8 import org.apache.poi.hssf.usermodel.HSSFSheet;
  9 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 10 import org.apache.poi.ss.usermodel.Workbook;
 11 import org.openqa.selenium.By;
 12 import org.openqa.selenium.WebDriver;
 13 import org.openqa.selenium.WebDriver.Navigation;
 14 import org.openqa.selenium.WebElement;
 15 import org.openqa.selenium.firefox.FirefoxDriver;
 16 
 17 public class search_lagouwang {
 18 
 19     public static void main(String[] args) throws InterruptedException {
 20         
 21         String s = "",job="",company="",money="",experience="",whole="";
 22         int i=1;
 23         int j=1;   //i表示表格第几行,j表示网页第几页
 24         
 25         //创建工作表,需要导出到文件
 26         HSSFWorkbook workbook = new HSSFWorkbook();
 27         HSSFSheet sheet = workbook.createSheet();     
 28         HSSFRow row = sheet.createRow(0);
 29         row.createCell(0).setCellValue("职位");
 30         row.createCell(1).setCellValue("公司");
 31         row.createCell(2).setCellValue("薪资");
 32         row.createCell(3).setCellValue("工作经验");
 33         
 34         System.setProperty ( "webdriver.firefox.bin" , "E:\\Mozilla Firefox\\firefox.exe" );  //需导入一堆.jar
 35         System.setProperty("webdriver.gecko.driver", "E:\\Mozilla Firefox\\geckodriver.exe");
 36         //selenium3中没有火狐启动驱动,需要重新下载geckodriver.exe
 37         WebDriver driver = new FirefoxDriver();
 38         Navigation navigation = driver.navigate();
 39         navigation.to("https://www.lagou.com/");
 40         Thread.sleep(3000);
 41         
 42         driver.findElement(By.id("cboxClose")).click();  //点击弹出上面的X按钮
 43         Thread.sleep(3000);   //需要等待几秒,不然时间太快输入框获取不到
 44         driver.findElement(By.id("search_input")).clear();
 45         driver.findElement(By.id("search_input")).sendKeys("java");
 46         driver.findElement(By.id("search_button")).click();
 47         Thread.sleep(2000);
 48         //获取下一页按钮
 49         WebElement next = driver.findElement(By.className("lg-pagination-next"));   
 50         while(next != null && next.isEnabled()==true) {  //按钮存在且可点击
 51             //搜索过程
 52             
 53             List<WebElement> all = driver.findElements(By.className("item-top__1Z3Zo")); //是findElements,多个元素集合
 54             
 55             for(WebElement a : all) {
 56                 //获得职位
 57                 job = a.findElement(By.className("p-top__1F7CL"))
 58                         .findElement(By.tagName("a")).getText();
 59                 //获得公司
 60                 company = a.findElement(By.className("company-name__2-SjF"))
 61                         .findElement(By.tagName("a")).getText();
 62                 //获得薪资、经验
 63                 money = a.findElement(By.className("p-bom__JlNur"))
 64                         .findElement(By.tagName("span")).getText();
 65                 whole = a.findElement(By.className("p-bom__JlNur")).getText();
 66                 experience = whole.substring(money.length(), whole.length());
 67                 //输出到工作表
 68                 row = sheet.createRow(i);
 69                 row.createCell(0).setCellValue(job);
 70                 row.createCell(1).setCellValue(company);
 71                 row.createCell(2).setCellValue(money);
 72                 row.createCell(3).setCellValue(experience);
 73                 //System.out.println("公司: "+company+"职位:"+job+"薪资: "+money+"经验: "+experience);
 74                 i++;
 75                 
 76             }
 77             //点击下一页
 78             next.click();
 79             Thread.sleep(3000);
 80             j++;   //由于搜索过多页会弹出登录框,只搜素10页,j控制页数
 81             if(j<=10) {
 82                 next = driver.findElement(By.className("lg-pagination-next"));
 83             }else {
 84                 break;
 85             }            
 86         }
 87         
 88         
 89         try {
 90             //创建输出流,将工作表写入到文件lagouwang.xls
 91             File file = new File("E:/lagouwang_1.xls");
 92             file.createNewFile();
 93             FileOutputStream ot = new FileOutputStream(file);
 94             workbook.write(ot);
 95         }catch(Exception e) {
 96             e.printStackTrace();
 97         }
 98             
 99     }
100     
101 }

 

posted @ 2022-05-17 17:25  能吃八碗饭  阅读(173)  评论(0)    收藏  举报