Jsoup爬取职位信息
待爬取的牛客网的实习信息
https://www.nowcoder.com/job/center

首先在Eclipse新建一个maven项目
1、在maven文件中加入以下的代码
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.wu</groupId>
    <artifactId>TopEssay</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <dependencies>
        <!-- jsoup: the HTML fetching/parsing library imported by the crawler class -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
    </dependencies>
</project>
2、提取所需要的信息
这里需要编写CSS选择器规则,手写有点麻烦,我们可以利用浏览器自带的开发者工具,帮助我们快速选中所需要的元素

比如我们这里的标题,通过这种方法得到的选择器为 body > div.nk-container > div.nk-main.clearfix > div.nk-content > div > div.module-body > ul > li:nth-child(1) > div > div.reco-job-cont > a
然后我们可以在上面这个基础上进行相应的修改,有效节省了我们的时间。
package com.jsoup;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.entity.JobInfo;
/**
 * Small command-line crawler: downloads the job-center page at {@link #url},
 * turns every element with class {@code reco-job-main} into a {@link JobInfo},
 * and prints the results to standard output.
 */
public class NiuKeSpider {
    private static final String url = "https://www.nowcoder.com/job/center";

    /**
     * Downloads the page, prints how many job elements were matched, then
     * prints each collected {@link JobInfo} on its own line.
     *
     * <p>If the download fails with an {@link IOException}, the stack trace is
     * printed and nothing else is output.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        // Download and parse the page source.
        final Document page;
        try {
            page = Jsoup.connect(url).get();
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }

        // Keep only the elements that describe a job posting.
        Elements jobNodes = page.getElementsByClass("reco-job-main");
        System.out.println(jobNodes.size());

        // First collect every posting, then print them all.
        List<JobInfo> collected = new ArrayList<>();
        for (int i = 0; i < jobNodes.size(); i++) {
            collected.add(toJobInfo(jobNodes.get(i)));
        }
        for (JobInfo info : collected) {
            System.out.println(info);
        }
    }

    /**
     * Reads description, link, company, address and salary out of one job element.
     *
     * @param node an element with class {@code reco-job-main}
     * @return a new {@link JobInfo} populated from the element's descendants
     */
    private static JobInfo toJobInfo(Element node) {
        String content = node.getElementsByClass("reco-job-cont").text();
        // "abs:href" asks jsoup for the link resolved to an absolute URL.
        String link = node.select("div.reco-job-cont > a").attr("abs:href");
        String company = node.getElementsByClass("reco-job-com").text();
        String address = node.getElementsByClass("job-address").text();
        String salary = node
                .select("div.reco-job-info > div:nth-child(1) > span:nth-child(2)")
                .text()
                .trim();

        JobInfo info = new JobInfo();
        info.setJobContent(content);
        info.setUrl(link);
        info.setCompany(company);
        info.setAddress(address);
        info.setSalary(salary);
        return info;
    }
}
3、封装所需的信息
package com.entity;
/**
 * Information about a single job posting: description, link, company,
 * address and salary. A plain mutable bean; every field is {@code null}
 * until its setter is called.
 *
 * @author Administrator
 */
public class JobInfo {
    private String jobContent;
    private String url;
    private String company;
    private String address;
    private String salary;

    /** @return the job description text, or {@code null} if not set */
    public String getJobContent() {
        return jobContent;
    }

    /** @param jobContent the job description text */
    public void setJobContent(String jobContent) {
        this.jobContent = jobContent;
    }

    /** @return the link to the job posting, or {@code null} if not set */
    public String getUrl() {
        return url;
    }

    /** @param url the link to the job posting */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the company name, or {@code null} if not set */
    public String getCompany() {
        return company;
    }

    /** @param company the company name */
    public void setCompany(String company) {
        this.company = company;
    }

    /** @return the job location, or {@code null} if not set */
    public String getAddress() {
        return address;
    }

    /** @param address the job location */
    public void setAddress(String address) {
        this.address = address;
    }

    /** @return the salary text, or {@code null} if not set */
    public String getSalary() {
        return salary;
    }

    /** @param salary the salary text */
    public void setSalary(String salary) {
        this.salary = salary;
    }

    /**
     * Renders all fields on one line. The labels (including the capitalised
     * {@code Salary=}) are kept exactly as they were so the printed output
     * does not change.
     */
    @Override
    public String toString() {
        return "job [jobContent=" + jobContent + ", url=" + url + ", company=" + company + ", address=" + address
                + ", Salary=" + salary + "]";
    }
}
4、运行结果:

总结:
积沙成塔

浙公网安备 33010602011771号