接上文:从 QQ 获取图片后,处理图片重复的问题(半成品)
下面直接贴出完整代码,拿去就能用;存放重复图片的转移目录需要自己先新建一下。代码是找 dk 写的,还算能用,不过图片量过大时的表现还不清楚。程序会在原文件夹内生成一份转移图片的记录(说明每张图片是依据哪张图片被判定为重复的),可以找来看看。它利用分块直方图方法来比较图片,目前用三百张图片测试,基本上都能找出重复项,貌似只查错了一张——那两张图实在太像了。
目前的处理方式是:程序执行完后再人工扫一遍;因为重复图片只是被移走而没有被删除,所以有一定的容错空间。后面会换更多图片再测试一下,如果问题比较大,再试试换别的方法处理。
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Command-line tool that looks for visually similar images in one folder and moves the
 * smaller file of every similar pair into a separate folder.
 *
 * <p>Each image is split into a {@code BLOCK_SIZE x BLOCK_SIZE} grid, a grayscale histogram
 * is computed per grid cell, and two images are compared by averaging the Bhattacharyya
 * coefficient of their corresponding cell histograms. Nothing is deleted: files are only
 * moved, and a text report is written into the source folder.
 *
 * <p>Bookkeeping is held in static mutable fields without any synchronization, so the class
 * is not safe for concurrent use.
 */
public class ImageDuplicateFinderWithProgress {
    private static final DecimalFormat df = new DecimalFormat("0.00");
    /** Number of grid cells per axis (the image is cut into BLOCK_SIZE x BLOCK_SIZE cells). */
    private static final int BLOCK_SIZE = 4;
    /** Number of distinct gray values produced by averaging three 8-bit channels. */
    private static final int GRAY_LEVELS = 256;
    /** Similarity threshold used by {@link #main(String[])}. */
    private static final double DEFAULT_SIMILARITY_THRESHOLD = 0.95;

    private static int totalImages = 0;
    private static int processedImages = 0;
    /** One human-readable entry per file that was actually moved. */
    private static final List<String> detailedMoveReasons = new ArrayList<>();
    /** Chronological log of operations performed during a run. */
    private static final List<String> operationLogs = new ArrayList<>();
    /** Group id -> description lines; insertion-ordered so reports list G1, G2, ... in order. */
    private static final Map<String, List<String>> duplicateGroups = new LinkedHashMap<>();

    /**
     * Builds the grayscale histogram of the pixel rectangle {@code [startX, endX) x [startY, endY)}.
     * Gray is the integer mean of the red, green and blue channels; alpha is ignored.
     * Only gray values that occur at least once appear as keys.
     */
    private static Map<Integer, Integer> regionHistogram(BufferedImage image,
                                                         int startX, int startY,
                                                         int endX, int endY) {
        int[] counts = new int[GRAY_LEVELS];
        for (int y = startY; y < endY; y++) {
            for (int x = startX; x < endX; x++) {
                int rgb = image.getRGB(x, y);
                int gray = (((rgb >> 16) & 0xFF) + ((rgb >> 8) & 0xFF) + (rgb & 0xFF)) / 3;
                counts[gray]++;
            }
        }
        Map<Integer, Integer> histogram = new HashMap<>();
        for (int gray = 0; gray < GRAY_LEVELS; gray++) {
            if (counts[gray] > 0) {
                histogram.put(gray, counts[gray]);
            }
        }
        return histogram;
    }

    /**
     * Computes the grayscale histogram of a whole image.
     *
     * @param image the image to analyse
     * @return map from gray value (0-255) to the number of pixels having that value
     */
    public static Map<Integer, Integer> calculateHistogram(BufferedImage image) {
        return regionHistogram(image, 0, 0, image.getWidth(), image.getHeight());
    }

    /**
     * Computes one grayscale histogram per grid cell, in row-major order.
     *
     * <p>Cell size is {@code width / BLOCK_SIZE} by {@code height / BLOCK_SIZE}; the last
     * column and last row absorb any remainder. For images narrower or shorter than
     * {@code BLOCK_SIZE} pixels the leading cells are therefore empty.
     *
     * @param image the image to analyse
     * @return list of {@code BLOCK_SIZE * BLOCK_SIZE} histograms
     */
    public static List<Map<Integer, Integer>> calculateBlockHistograms(BufferedImage image) {
        int width = image.getWidth();
        int height = image.getHeight();
        int blockWidth = width / BLOCK_SIZE;
        int blockHeight = height / BLOCK_SIZE;

        List<Map<Integer, Integer>> blockHistograms = new ArrayList<>(BLOCK_SIZE * BLOCK_SIZE);
        for (int blockY = 0; blockY < BLOCK_SIZE; blockY++) {
            int startY = blockY * blockHeight;
            int endY = (blockY == BLOCK_SIZE - 1) ? height : startY + blockHeight;
            for (int blockX = 0; blockX < BLOCK_SIZE; blockX++) {
                int startX = blockX * blockWidth;
                int endX = (blockX == BLOCK_SIZE - 1) ? width : startX + blockWidth;
                blockHistograms.add(regionHistogram(image, startX, startY, endX, endY));
            }
        }
        return blockHistograms;
    }

    /**
     * Averages the per-cell similarity of two block-histogram lists.
     *
     * @param hist1 block histograms of the first image
     * @param hist2 block histograms of the second image
     * @return mean of {@link #compareHistograms} over all cells; {@code 0.0} when both lists are empty
     * @throws IllegalArgumentException if the lists have different lengths
     */
    public static double compareBlockHistograms(List<Map<Integer, Integer>> hist1,
                                                List<Map<Integer, Integer>> hist2) {
        if (hist1.size() != hist2.size()) {
            throw new IllegalArgumentException(
                    "Block histogram counts differ: " + hist1.size() + " vs " + hist2.size());
        }
        if (hist1.isEmpty()) {
            // Avoid 0/0 = NaN: with nothing to compare there is no evidence of similarity.
            return 0.0;
        }
        double totalSimilarity = 0;
        for (int i = 0; i < hist1.size(); i++) {
            totalSimilarity += compareHistograms(hist1.get(i), hist2.get(i));
        }
        return totalSimilarity / hist1.size();
    }

    /**
     * Computes the Bhattacharyya coefficient of two histograms after normalising each to
     * a probability distribution.
     *
     * @param hist1 first histogram (gray value -> count)
     * @param hist2 second histogram (gray value -> count)
     * @return a value near 1 for identical distributions and 0 for disjoint ones; two empty
     *         histograms count as identical (1.0), one empty histogram as disjoint (0.0)
     */
    public static double compareHistograms(Map<Integer, Integer> hist1, Map<Integer, Integer> hist2) {
        if (hist1.isEmpty() && hist2.isEmpty()) {
            // Both grid cells contain no pixels (tiny images): treat them as matching so that
            // identical tiny images are not penalised for their empty cells.
            return 1.0;
        }
        double sum1 = hist1.values().stream().mapToLong(Integer::longValue).sum();
        double sum2 = hist2.values().stream().mapToLong(Integer::longValue).sum();
        if (sum1 <= 0 || sum2 <= 0) {
            return 0.0;
        }
        double similarity = 0;
        for (Map.Entry<Integer, Integer> entry : hist1.entrySet()) {
            Integer otherCount = hist2.get(entry.getKey());
            if (otherCount != null) {
                double p1 = entry.getValue() / sum1;
                double p2 = otherCount / sum2;
                similarity += Math.sqrt(p1 * p2);
            }
        }
        return similarity;
    }

    /** Appends a detailed, human-readable explanation of one move to the report data. */
    private static void recordMoveReason(String groupId, File keptFile, File movedFile,
                                         long keptSize, long movedSize, double similarity) {
        String reason = String.format(
                "相似组 %s:\n" +
                " 保留文件: %s (%d KB)\n" +
                " 移动文件: %s (%d KB)\n" +
                " 分块相似度: %.2f%%\n" +
                " 移动原因: 分块相似度高(%.2f%%)且文件较小(%d KB < %d KB)",
                groupId,
                keptFile.getName(), keptSize / 1024,
                movedFile.getName(), movedSize / 1024,
                similarity * 100,
                similarity * 100,
                movedSize / 1024, keptSize / 1024
        );
        detailedMoveReasons.add(reason);
    }

    /** Writes the text report of the current run into {@code sourceFolder}. */
    private static void saveDetailedResults(String sourceFolder) {
        // Take the time once so the file name and the report header agree.
        Date now = new Date();
        String timestamp = new SimpleDateFormat("yyyyMMdd_HHmmss").format(now);
        String resultFileName = "BlockBasedDuplicateFinder_Result_" + timestamp + ".txt";
        Path resultPath = Paths.get(sourceFolder, resultFileName);
        try {
            List<String> outputLines = new ArrayList<>();
            // Report header
            outputLines.add("==== 基于分块直方图的图片相似度检测报告 ====");
            outputLines.add("生成时间: " + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(now));
            outputLines.add("分块数量: " + BLOCK_SIZE + "x" + BLOCK_SIZE + "=" + (BLOCK_SIZE * BLOCK_SIZE));
            outputLines.add("源文件夹: " + sourceFolder);
            outputLines.add("");
            // Summary
            outputLines.add("=== 处理摘要 ===");
            outputLines.add("总图片数: " + totalImages);
            outputLines.add("已处理图片数: " + processedImages);
            outputLines.add("发现的相似组数: " + duplicateGroups.size());
            outputLines.add("移动的重复文件数: " + detailedMoveReasons.size());
            outputLines.add("");
            // Per-move details
            outputLines.add("=== 详细移动原因 ===");
            outputLines.addAll(detailedMoveReasons);
            outputLines.add("");
            // Group overview
            outputLines.add("=== 相似组概览 ===");
            for (Map.Entry<String, List<String>> group : duplicateGroups.entrySet()) {
                outputLines.add("\n相似组 " + group.getKey() + ":");
                outputLines.addAll(group.getValue());
            }
            Files.write(resultPath, outputLines);
            System.out.println("\n详细结果报告已保存到: " + resultPath.toString());
        } catch (IOException e) {
            System.err.println("无法保存结果文件: " + e.getMessage());
        }
    }

    /**
     * Lists the image files (jpg, jpeg, png, bmp; case-insensitive) directly inside a folder.
     * Sub-folders are not scanned. The result is sorted by path so runs are reproducible.
     *
     * @param folderPath folder to scan
     * @return the image files found, or an empty list if the folder is missing or unreadable
     */
    public static List<File> getImageFiles(String folderPath) {
        System.out.println("[1/4] 正在扫描图片文件夹: " + folderPath);
        File folder = new File(folderPath);
        if (!folder.exists() || !folder.isDirectory()) {
            System.err.println("错误: 文件夹不存在或不是目录: " + folderPath);
            totalImages = 0;
            return Collections.emptyList();
        }
        // listFiles() returns null when the directory cannot be read.
        File[] entries = folder.listFiles();
        if (entries == null) {
            System.err.println("错误: 无法读取文件夹内容: " + folderPath);
            totalImages = 0;
            return Collections.emptyList();
        }
        List<File> imageFiles = Arrays.stream(entries)
                .filter(File::isFile)
                .filter(file -> {
                    String name = file.getName().toLowerCase(Locale.ROOT);
                    return name.endsWith(".jpg") || name.endsWith(".jpeg")
                            || name.endsWith(".png") || name.endsWith(".bmp");
                })
                .sorted()
                .collect(Collectors.toList());
        totalImages = imageFiles.size();
        System.out.println("√ 找到 " + totalImages + " 张图片\n");
        return imageFiles;
    }

    /**
     * Moves a file into the target folder, creating the folder if needed. If a file with the
     * same name already exists there, {@code _1}, {@code _2}, ... is inserted before the
     * extension (or appended when the name has no extension) until the name is free.
     *
     * @param sourceFile   file to move
     * @param targetFolder destination folder
     * @throws IOException if the folder cannot be created or the move fails
     */
    public static void moveSmallerDuplicate(File sourceFile, String targetFolder) throws IOException {
        Path targetPath = Paths.get(targetFolder);
        if (!Files.exists(targetPath)) {
            logOperation("创建目标文件夹: " + targetFolder);
            Files.createDirectories(targetPath);
        }

        String fileName = sourceFile.getName();
        // A dot at index 0 (e.g. ".hidden") or no dot at all means "no extension".
        int dot = fileName.lastIndexOf('.');
        String baseName = dot > 0 ? fileName.substring(0, dot) : fileName;
        String extension = dot > 0 ? fileName.substring(dot) : "";

        Path destination = targetPath.resolve(fileName);
        int counter = 1;
        while (Files.exists(destination)) {
            destination = targetPath.resolve(baseName + "_" + counter + extension);
            counter++;
        }
        Files.move(sourceFile.toPath(), destination);
    }

    /**
     * Redraws a single-line console progress bar.
     *
     * @param current number of completed steps
     * @param total   total number of steps; nothing is printed when this is not positive
     */
    public static void printProgress(int current, int total) {
        renderProgress(current, total);
    }

    /** Progress bar implementation working on {@code long} so huge pair counts do not overflow. */
    private static void renderProgress(long current, long total) {
        if (total <= 0) {
            return;
        }
        int width = 50; // bar width in characters
        double percent = (double) current / total;
        int progress = (int) (width * percent);
        StringBuilder bar = new StringBuilder("\r[");
        for (int i = 0; i < width; i++) {
            if (i < progress) {
                bar.append('=');
            } else if (i == progress) {
                bar.append('>');
            } else {
                bar.append(' ');
            }
        }
        bar.append("] ").append((int) (percent * 100)).append("% ")
                .append(current).append('/').append(total);
        System.out.print(bar);
    }

    /** Records a message in the operation log and echoes it to the console. */
    private static void logOperation(String message) {
        operationLogs.add(message);
        System.out.println(message);
    }

    /** Prints the summary, all similar groups and the operation log to the console. */
    private static void printDetailedReport() {
        System.out.println("\n===== 详细处理报告 =====");
        System.out.println("总图片数: " + totalImages);
        System.out.println("已处理图片数: " + processedImages);
        System.out.println("发现的相似组数: " + duplicateGroups.size());
        // One move reason is recorded per successful move, same source as the saved report.
        System.out.println("移动的重复文件数: " + detailedMoveReasons.size());
        System.out.println("\n=== 相似图片组详情 ===");
        for (Map.Entry<String, List<String>> group : duplicateGroups.entrySet()) {
            System.out.println("\n相似组 " + group.getKey() + ":");
            group.getValue().forEach(System.out::println);
        }
        System.out.println("\n=== 操作日志 ===");
        operationLogs.forEach(System.out::println);
    }

    /**
     * Scans {@code sourceFolder}, compares every pair of readable images and, for each pair
     * whose block similarity exceeds the threshold, moves the smaller file (the first one on
     * a size tie) to {@code targetFolder}. A report is written into {@code sourceFolder}.
     *
     * <p>A file that has been moved takes no further part in comparisons. A kept file may
     * absorb any number of duplicates, and may itself be moved later if an even larger
     * similar file is found.
     *
     * @param sourceFolder        folder containing the images to check
     * @param targetFolder        folder receiving the moved duplicates
     * @param similarityThreshold similarity in [0, 1] above which two images count as duplicates
     */
    public static void processDuplicates(String sourceFolder, String targetFolder, double similarityThreshold) {
        // Start from a clean slate so repeated calls do not mix results.
        processedImages = 0;
        detailedMoveReasons.clear();
        operationLogs.clear();
        duplicateGroups.clear();

        // Phase 1: collect image files
        List<File> imageFiles = getImageFiles(sourceFolder);
        if (imageFiles.isEmpty()) {
            return;
        }

        // Phase 2: compute block features (insertion-ordered to keep the scan order)
        System.out.println("[2/4] 正在计算图片分块特征...");
        Map<File, List<Map<Integer, Integer>>> blockHistograms = new LinkedHashMap<>();
        Map<File, Long> fileSizes = new HashMap<>();
        for (int i = 0; i < imageFiles.size(); i++) {
            File file = imageFiles.get(i);
            try {
                System.out.print("\r正在处理: " + (i + 1) + "/" + totalImages + " - " + file.getName());
                BufferedImage image = ImageIO.read(file);
                if (image == null) {
                    // ImageIO.read returns null when no registered reader understands the file.
                    System.err.println("\n× 无法识别的图片格式,已跳过: " + file.getName());
                    continue;
                }
                blockHistograms.put(file, calculateBlockHistograms(image));
                fileSizes.put(file, file.length());
                processedImages++;
            } catch (IOException | RuntimeException e) {
                // A single corrupt file must not abort the whole batch.
                System.err.println("\n× 无法读取图片: " + file.getName());
            }
        }
        System.out.println("\n√ 分块特征计算完成\n");

        // Phase 3: pairwise comparison
        System.out.println("[3/4] 正在比较图片分块相似度...");
        Set<File> movedFiles = new HashSet<>();
        Set<File> keptFiles = new HashSet<>();
        List<File> fileList = new ArrayList<>(blockHistograms.keySet());
        int imageCount = fileList.size();
        long totalComparisons = (long) imageCount * (imageCount - 1) / 2;
        long currentComparison = 0;

        for (int i = 0; i < imageCount; i++) {
            File file1 = fileList.get(i);
            if (movedFiles.contains(file1)) {
                continue;
            }
            for (int j = i + 1; j < imageCount; j++) {
                File file2 = fileList.get(j);
                if (movedFiles.contains(file2)) {
                    continue;
                }
                currentComparison++;
                renderProgress(currentComparison, totalComparisons);

                double similarity = compareBlockHistograms(
                        blockHistograms.get(file1), blockHistograms.get(file2));
                if (similarity <= similarityThreshold) {
                    continue;
                }

                long size1 = fileSizes.get(file1);
                long size2 = fileSizes.get(file2);
                boolean firstIsSmaller = size1 <= size2;
                File smallerFile = firstIsSmaller ? file1 : file2;
                File largerFile = firstIsSmaller ? file2 : file1;
                long smallerSize = firstIsSmaller ? size1 : size2;
                long largerSize = firstIsSmaller ? size2 : size1;

                // Move first: the report must only mention moves that really happened.
                try {
                    moveSmallerDuplicate(smallerFile, targetFolder);
                } catch (IOException e) {
                    System.err.println("\n× 移动文件失败: " + smallerFile.getName() + " - " + e);
                    continue;
                }

                String groupId = "G" + (duplicateGroups.size() + 1);
                List<String> groupFiles = new ArrayList<>();
                groupFiles.add("保留: " + largerFile.getName() + " (" + (largerSize / 1024) + "KB)");
                groupFiles.add("移动: " + smallerFile.getName() + " (" + (smallerSize / 1024) + "KB)");
                groupFiles.add("分块相似度: " + df.format(similarity * 100) + "%");
                duplicateGroups.put(groupId, groupFiles);
                recordMoveReason(groupId, largerFile, smallerFile, largerSize, smallerSize, similarity);

                movedFiles.add(smallerFile);
                keptFiles.remove(smallerFile); // it may have been a keeper of an earlier pair
                keptFiles.add(largerFile);

                String logMessage = String.format(
                        "发现相似图片组 %s:\n 保留: %s (%dKB)\n 移动: %s (%dKB)\n 分块相似度: %.2f%%",
                        groupId,
                        largerFile.getName(),
                        largerSize / 1024,
                        smallerFile.getName(),
                        smallerSize / 1024,
                        similarity * 100
                );
                operationLogs.add(logMessage);
                System.out.println("\n" + logMessage);

                if (firstIsSmaller) {
                    // file1 has left the source folder; comparing it further would try to
                    // move it again or report it as "kept".
                    break;
                }
            }
        }
        // Pairs involving moved files are skipped, so finish the bar explicitly.
        renderProgress(totalComparisons, totalComparisons);

        // Phase 4: summary
        System.out.println("\n\n[4/4] 处理结果:");
        System.out.println("总图片数: " + totalImages);
        System.out.println("已处理: " + processedImages);
        System.out.println("发现相似组: " + duplicateGroups.size());
        System.out.println("已移动到: " + targetFolder);
        System.out.println("保留在原处的最大文件数: " + keptFiles.size());
        saveDetailedResults(sourceFolder);
    }

    /**
     * Interactive entry point: asks for the source and target folders, asks for
     * confirmation, then runs the duplicate search with the default threshold.
     */
    public static void main(String[] args) {
        try (Scanner scanner = new Scanner(System.in)) {
            System.out.println("==== 基于分块直方图的图片相似度检测程序 ====");
            System.out.println("分块数量: " + BLOCK_SIZE + "x" + BLOCK_SIZE);
            System.out.print("请输入源文件夹路径(包含要检查的图片): ");
            String sourceFolder = scanner.nextLine().trim();
            System.out.print("请输入目标文件夹路径(用于存放重复图片): ");
            String targetFolder = scanner.nextLine().trim();
            if (sourceFolder.isEmpty() || targetFolder.isEmpty()) {
                System.err.println("错误: 源文件夹和目标文件夹路径不能为空");
                return;
            }
            // The threshold is fixed at DEFAULT_SIMILARITY_THRESHOLD; it is not prompted for.
            System.out.println("\n配置信息:");
            System.out.println("源文件夹: " + sourceFolder);
            System.out.println("目标文件夹: " + targetFolder);
            System.out.println("====================================\n");
            System.out.print("确认开始处理吗?(Y/N): ");
            String confirm = scanner.next();
            if (confirm.equalsIgnoreCase("Y")) {
                long startTime = System.currentTimeMillis();
                processDuplicates(sourceFolder, targetFolder, DEFAULT_SIMILARITY_THRESHOLD);
                long endTime = System.currentTimeMillis();
                System.out.println("\n处理完成,耗时: " + (endTime - startTime) / 1000.0 + "秒");
            } else {
                System.out.println("操作已取消");
            }
        }
    }
}

浙公网安备 33010602011771号