JAVA-爬虫
JAVA-爬虫
一、工具类
爬取网站URL:中国吉林网
https://s.chinajilin.com.cn/was5/web/search
源码:
package com.ruoyi.knowledge.controller;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.poi.xwpf.usermodel.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Command-line crawler for the chinajilin.com.cn search page.
 *
 * <p>It runs a keyword search, follows each hit to its detail page, extracts the
 * plain text of the {@code div.content} element and writes one {@code .docx}
 * file per hit (title in bold, followed by the article text).
 *
 * <p>All methods are best-effort: network and file errors are reported on
 * {@code System.err} and the affected item is skipped instead of aborting the run.
 */
public class JilinNewsCrawler {

    /** Search endpoint; the query string is appended per request. */
    private static final String SEARCH_URL = "https://s.chinajilin.com.cn/was5/web/search";

    /** Directory used by {@link #saveToDocx(List)} when no directory is given. */
    private static final String DEFAULT_OUTPUT_DIR = "F:\\胡玉亭";

    /** Without timeouts a single stalled server would hang the whole crawl. */
    private static final int CONNECT_TIMEOUT_MS = 10_000;
    private static final int READ_TIMEOUT_MS = 30_000;

    /** Characters that are not allowed in Windows file names, plus control characters. */
    private static final String ILLEGAL_FILENAME_CHARS = "[\\\\/:*?\"<>|\\p{Cntrl}]";

    /** Upper bound for the generated file name (without extension). */
    private static final int MAX_FILENAME_LENGTH = 200;

    /**
     * Runs a search and downloads the text of every hit.
     *
     * @param keyword search term; it is URL-encoded before being sent
     * @param perpage number of hits requested from the search page
     * @return one map per hit with the keys {@code "title"} and {@code "content"};
     *         empty when the keyword is {@code null}, nothing was found, or the
     *         search request failed
     */
    public static List<Map<String, String>> getSearchResults(String keyword, int perpage) {
        List<Map<String, String>> results = new ArrayList<>();
        if (keyword == null) {
            return results;
        }
        try {
            // The keyword is typically Chinese and may contain spaces or '&';
            // it has to be percent-encoded to form a valid query string.
            String searchUrl = SEARCH_URL + "?searchword=" + URLEncoder.encode(keyword, "UTF-8")
                    + "&channelid=269505&orderby=-IssueTime&perpage=" + perpage;
            Document doc = fetchDocument(searchUrl);

            // The hit list lives inside the "searchresult" table cell.
            Element searchResult = doc.select("tbody td.searchresult").first();
            if (searchResult == null) {
                return results;
            }
            Elements items = searchResult.select("li");
            for (Element item : items) {
                Map<String, String> result = new HashMap<>();
                // The title is stored as an HTML fragment in a hidden input; strip its tags.
                String title = Jsoup.parse(item.select("input.doctitle").attr("value")).text();
                result.put("title", title);
                // The detail page URL is stored in a second hidden input.
                String detailUrl = item.select("input.DOCPUBURL").attr("value");
                result.put("content", getUrlContent(detailUrl));
                results.add(result);
            }
        } catch (IOException | RuntimeException e) {
            System.err.println("获取搜索结果失败: " + e.getMessage());
        }
        return results;
    }

    /**
     * Downloads a detail page and returns the plain text of its {@code div.content} element.
     *
     * @param url detail page URL; {@code null} or blank yields an empty string
     * @return the article text without HTML tags, or {@code ""} when the element
     *         is missing or the page could not be fetched
     */
    private static String getUrlContent(String url) {
        if (url == null || url.trim().isEmpty()) {
            return "";
        }
        try {
            Element contentDiv = fetchDocument(url).select("div.content").first();
            return contentDiv != null ? contentDiv.text() : "";
        } catch (IOException | RuntimeException e) {
            System.err.println("获取URL内容失败: " + url + " (" + e + ")");
            return "";
        }
    }

    /**
     * Performs a GET request and parses the response body as UTF-8 HTML.
     *
     * @param url absolute URL to fetch
     * @return the parsed document
     * @throws IOException if the URL is malformed or the request fails
     */
    private static Document fetchDocument(String url) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
        conn.setRequestMethod("GET");
        conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
        conn.setReadTimeout(READ_TIMEOUT_MS);
        conn.setRequestProperty("User-Agent", "Mozilla/5.0");
        conn.setRequestProperty("Accept-Charset", "UTF-8");
        // Close the stream explicitly so the connection is released even if parsing fails.
        try (InputStream in = conn.getInputStream()) {
            return Jsoup.parse(in, "UTF-8", url);
        }
    }

    /**
     * Writes every result to its own {@code .docx} file in the default output directory.
     *
     * @param results hits as returned by {@link #getSearchResults(String, int)}
     */
    public static void saveToDocx(List<Map<String, String>> results) {
        saveToDocx(results, DEFAULT_OUTPUT_DIR);
    }

    /**
     * Writes every result to its own {@code .docx} file in {@code outputDir}.
     *
     * <p>The file name is derived from the title. Results with a missing or empty
     * title get a generated name, and results that would map to the same file name
     * within one call get a numeric suffix so they do not overwrite each other.
     *
     * @param results   hits as returned by {@link #getSearchResults(String, int)};
     *                  {@code null} is treated as "nothing to save"
     * @param outputDir directory to write into; created if it does not exist
     */
    public static void saveToDocx(List<Map<String, String>> results, String outputDir) {
        if (results == null) {
            return;
        }
        Path dir = Paths.get(outputDir);
        try {
            Files.createDirectories(dir);
        } catch (IOException e) {
            System.err.println("创建目录失败: " + e.getMessage());
            return;
        }

        // Lower-cased names already used in this run (Windows file names are case-insensitive).
        Set<String> usedNames = new HashSet<>();
        for (int i = 0; i < results.size(); i++) {
            Map<String, String> result = results.get(i);
            if (result == null) {
                continue;
            }
            String title = result.get("title");
            String content = result.get("content");

            String baseName = toSafeFilename(title, i);
            String uniqueName = baseName;
            int suffix = 1;
            while (!usedNames.add(uniqueName.toLowerCase(Locale.ROOT))) {
                uniqueName = baseName + "_" + suffix++;
            }
            Path file = dir.resolve(uniqueName + ".docx");

            try (XWPFDocument document = new XWPFDocument()) {
                // Title paragraph.
                XWPFRun titleRun = document.createParagraph().createRun();
                titleRun.setText(title == null || title.isEmpty() ? baseName : title);
                titleRun.setBold(true);
                titleRun.setFontSize(14);

                // Body paragraph: the content is already plain text (tags were stripped on download).
                XWPFRun contentRun = document.createParagraph().createRun();
                contentRun.setText(content == null ? "" : content);
                contentRun.setFontSize(12);

                try (OutputStream out = Files.newOutputStream(file)) {
                    document.write(out);
                    System.out.println("已保存: " + file);
                }
            } catch (IOException | RuntimeException e) {
                System.err.println("保存文档失败: " + file + ": " + e.getMessage());
            }
        }
    }

    /**
     * Turns a title into a file name (without extension) that is valid on Windows.
     *
     * @param title article title, may be {@code null}
     * @param index zero-based position of the result, used for the fallback name
     * @return the sanitized title, or {@code "untitled_<index+1>"} when nothing usable remains
     */
    private static String toSafeFilename(String title, int index) {
        String cleaned = title == null ? "" : title.replaceAll(ILLEGAL_FILENAME_CHARS, "_").trim();
        if (cleaned.isEmpty()) {
            return "untitled_" + (index + 1);
        }
        if (cleaned.length() > MAX_FILENAME_LENGTH) {
            int end = MAX_FILENAME_LENGTH;
            // Do not cut a surrogate pair in half.
            if (Character.isHighSurrogate(cleaned.charAt(end - 1))) {
                end--;
            }
            cleaned = cleaned.substring(0, end).trim();
        }
        return cleaned;
    }

    /**
     * Entry point. Expects two arguments: the search keyword and the number of hits to export.
     *
     * @param args {@code args[0]} keyword, {@code args[1]} positive hit count
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            System.out.println("请传入搜索关键词和导出条数作为参数,例如: java JilinNewsCrawler 胡玉亭 200");
            return;
        }
        String keyword = args[0];
        int perpage;
        try {
            perpage = Integer.parseInt(args[1].trim());
        } catch (NumberFormatException e) {
            perpage = -1; // reported by the range check below
        }
        if (perpage <= 0) {
            System.err.println("导出条数必须为正整数: " + args[1]);
            return;
        }

        System.out.println("正在搜索关键词: " + keyword + ",导出条数: " + perpage);
        List<Map<String, String>> results = getSearchResults(keyword, perpage);
        if (!results.isEmpty()) {
            saveToDocx(results);
            System.out.println("关键词 '" + keyword + "' 的结果已保存");
        } else {
            System.out.println("未找到关键词 '" + keyword + "' 的结果");
        }
    }
}
POM文件:
```xml
<!-- 爬数据-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.3</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.15.3</version>
</dependency>
```
本文来自博客园,作者:skystrivegao,转载请注明原文链接:https://www.cnblogs.com/skystrive/p/18850904
整理不易,如果对您有所帮助,请点赞收藏,谢谢~