图片查重处理2

继续测试上文代码并重新做了调整:增加了一个用于存放比较后较大图片的文件夹,并增加了对无法识别图片的处理。目前性能尚未优化好,经测试,九千张图片需要处理3小时以上,仍需进一步优化。

 

 

import javax.imageio.IIOException;
import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.stream.ImageInputStream;

import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Console tool that looks for visually similar images in one folder.
 *
 * <p>Each image is split into {@code BLOCK_SIZE x BLOCK_SIZE} regions, a grayscale
 * histogram is computed per region, and two images are considered similar when the
 * average Bhattacharyya coefficient of their region histograms exceeds a threshold.
 * For every similar pair the larger file (by byte size) is moved to one folder and the
 * smaller file to another; a text report is written to a log folder.
 *
 * <p>All state is kept in static fields and the class is not thread-safe.
 */
public class ImageDuplicateFinderWithProgress {

    private static final DecimalFormat df = new DecimalFormat("0.00");

    /** Blocks per axis: every image is divided into BLOCK_SIZE x BLOCK_SIZE regions. */
    private static final int BLOCK_SIZE = 4;

    /** Number of distinct 8-bit gray values. */
    private static final int GRAY_LEVELS = 256;

    /** Lower-case file name suffixes accepted by {@link #getImageFiles(String)}. */
    private static final String[] IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp"};

    /** Marker that identifies "file moved" entries inside {@link #operationLogs}. */
    private static final String MOVED_LOG_PREFIX = "已移动";

    /** Threshold used by {@link #main(String[])}. */
    private static final double DEFAULT_SIMILARITY_THRESHOLD = 0.95;

    private static int totalImages = 0;
    private static int processedImages = 0;

    /** One human-readable entry per similar pair that was moved. */
    private static List<String> detailedMoveReasons = new ArrayList<>();

    /** Chronological log of folder creations and file moves. */
    private static List<String> operationLogs = new ArrayList<>();

    /** Group id -> description lines; insertion-ordered so reports list groups in discovery order. */
    private static Map<String, List<String>> duplicateGroups = new LinkedHashMap<>();

    /**
     * Computes a grayscale histogram of the whole image using the plain channel average.
     *
     * @param image image to analyse; must not be null
     * @return map from gray value (0-255) to pixel count; values that do not occur are absent
     */
    public static Map<Integer, Integer> calculateHistogram(BufferedImage image) {
        int width = image.getWidth();
        int height = image.getHeight();
        int[] counts = new int[GRAY_LEVELS];
        int[] row = new int[width];

        for (int y = 0; y < height; y++) {
            // One bulk read per row avoids a Color allocation per pixel.
            image.getRGB(0, y, width, 1, row, 0, width);
            for (int x = 0; x < width; x++) {
                int rgb = row[x];
                int gray = (((rgb >> 16) & 0xFF) + ((rgb >> 8) & 0xFF) + (rgb & 0xFF)) / 3;
                counts[gray]++;
            }
        }
        return toHistogramMap(counts);
    }

    /**
     * Computes one luminance histogram per block of the image.
     *
     * <p>Pixels are always read through {@link BufferedImage#getRGB}, which converts to
     * default ARGB regardless of the image's internal layout, so images whose raster
     * stores channels in another order (for example {@code TYPE_INT_BGR}) are handled
     * correctly.
     *
     * @param image input image; {@code null} yields an empty list
     * @return {@code BLOCK_SIZE * BLOCK_SIZE} histograms in row-major block order, or an
     *         empty list when the image is null, has a non-positive size or cannot be read
     */
    public static List<Map<Integer, Integer>> calculateBlockHistograms(BufferedImage image) {
        if (image == null) {
            System.err.println("错误:图像为null");
            return Collections.emptyList();
        }

        int width = image.getWidth();
        int height = image.getHeight();
        if (width <= 0 || height <= 0) {
            System.err.printf("错误:无效图像尺寸 %dx%d%n", width, height);
            return Collections.emptyList();
        }

        // Max(1, ...) keeps the block size non-zero for images narrower than BLOCK_SIZE.
        int blockWidth = Math.max(1, width / BLOCK_SIZE);
        int blockHeight = Math.max(1, height / BLOCK_SIZE);
        int[][] counts = new int[BLOCK_SIZE * BLOCK_SIZE][GRAY_LEVELS];
        int[] row = new int[width];

        try {
            for (int y = 0; y < height; y++) {
                // The last block absorbs the remainder rows/columns, hence the clamp.
                int blockY = Math.min(y / blockHeight, BLOCK_SIZE - 1);
                image.getRGB(0, y, width, 1, row, 0, width);
                for (int x = 0; x < width; x++) {
                    int blockX = Math.min(x / blockWidth, BLOCK_SIZE - 1);
                    int rgb = row[x];
                    int gray = (int) (0.299 * ((rgb >> 16) & 0xFF)
                            + 0.587 * ((rgb >> 8) & 0xFF)
                            + 0.114 * (rgb & 0xFF));
                    counts[blockY * BLOCK_SIZE + blockX][gray]++;
                }
            }
        } catch (RuntimeException e) {
            // A partially filled histogram would be misleading, so the image is rejected.
            System.err.printf("图像数据不可访问: %s (%dx%d)%n",
                    e.getClass().getSimpleName(), width, height);
            return Collections.emptyList();
        }

        List<Map<Integer, Integer>> blockHistograms = new ArrayList<>(counts.length);
        for (int[] blockCounts : counts) {
            blockHistograms.add(toHistogramMap(blockCounts));
        }
        return blockHistograms;
    }

    /** Converts a dense count array into a sparse map containing only non-zero bins. */
    private static Map<Integer, Integer> toHistogramMap(int[] counts) {
        Map<Integer, Integer> histogram = new HashMap<>();
        for (int gray = 0; gray < counts.length; gray++) {
            if (counts[gray] > 0) {
                histogram.put(gray, counts[gray]);
            }
        }
        return histogram;
    }

    /**
     * Reads one pixel, returning 0 (transparent black) for out-of-range coordinates or
     * when the read fails. Not called elsewhere in this class.
     */
    private static int safeGetRGB(BufferedImage image, int x, int y) {
        boolean inside = x >= 0 && x < image.getWidth() && y >= 0 && y < image.getHeight();
        if (!inside) {
            return 0;
        }
        try {
            return image.getRGB(x, y);
        } catch (Exception e) {
            return 0;
        }
    }

    /**
     * Checks that the four corner pixels and the centre pixel of the image can be read.
     *
     * @param image image to probe; may be null
     * @return {@code true} when all five probes succeed, {@code false} otherwise
     */
    public static boolean validateImage(BufferedImage image) {
        if (image == null) {
            return false;
        }
        try {
            int maxX = image.getWidth() - 1;
            int maxY = image.getHeight() - 1;
            image.getRGB(0, 0);
            image.getRGB(maxX, 0);
            image.getRGB(0, maxY);
            image.getRGB(maxX, maxY);
            image.getRGB(image.getWidth() / 2, image.getHeight() / 2);
            return true;
        } catch (Exception e) {
            return false;
        }
    }

    /**
     * Averages the per-block histogram similarity of two images.
     *
     * @param hist1 block histograms of the first image
     * @param hist2 block histograms of the second image
     * @return mean of {@link #compareHistograms} over all blocks; {@code 0.0} when either
     *         list is null or empty or the lists differ in length (not comparable)
     */
    public static double compareBlockHistograms(List<Map<Integer, Integer>> hist1,
                                                List<Map<Integer, Integer>> hist2) {
        if (hist1 == null || hist2 == null || hist1.isEmpty() || hist1.size() != hist2.size()) {
            return 0.0;
        }
        double totalSimilarity = 0;
        for (int i = 0; i < hist1.size(); i++) {
            totalSimilarity += compareHistograms(hist1.get(i), hist2.get(i));
        }
        return totalSimilarity / hist1.size();
    }

    /**
     * Computes the Bhattacharyya coefficient of two histograms after normalising each
     * to a probability distribution.
     *
     * @param hist1 first histogram (gray value -> count)
     * @param hist2 second histogram (gray value -> count)
     * @return sum over shared bins of {@code sqrt(p1 * p2)}; 0 when no bin is shared
     */
    public static double compareHistograms(Map<Integer, Integer> hist1, Map<Integer, Integer> hist2) {
        double sum1 = hist1.values().stream().mapToInt(Integer::intValue).sum();
        double sum2 = hist2.values().stream().mapToInt(Integer::intValue).sum();

        double similarity = 0;
        for (Map.Entry<Integer, Integer> bin : hist1.entrySet()) {
            Integer otherCount = hist2.get(bin.getKey());
            if (otherCount != null) {
                double p1 = bin.getValue() / sum1;
                double p2 = otherCount / sum2;
                similarity += Math.sqrt(p1 * p2);
            }
        }
        return similarity;
    }

    /**
     * Builds the description of one moved pair, stores it in {@link #detailedMoveReasons}
     * for the report and returns it so the caller can print it.
     */
    private static String recordMoveReason(String groupId,
                                           File largerFile, long largerSize, String largerTargetFolder,
                                           File smallerFile, long smallerSize, String smallerTargetFolder,
                                           double similarity) {
        String reason = String.format(
                "发现相似图片组 %s:\n  较大文件: %s (%dKB) → %s\n  较小文件: %s (%dKB) → %s\n  分块相似度: %.2f%%",
                groupId,
                largerFile.getName(), largerSize / 1024, largerTargetFolder,
                smallerFile.getName(), smallerSize / 1024, smallerTargetFolder,
                similarity * 100);
        detailedMoveReasons.add(reason);
        return reason;
    }

    /** Counts the "file moved" entries recorded in {@link #operationLogs}. */
    private static long countMovedFiles() {
        return operationLogs.stream().filter(log -> log.contains(MOVED_LOG_PREFIX)).count();
    }

    /**
     * Writes the text report into {@code sourceFolder}, creating the folder if needed.
     * Failures are reported on stderr and do not propagate.
     *
     * @param sourceFolder        folder that receives the report file
     * @param similarityThreshold threshold that was used, as a fraction (0-1)
     */
    private static void saveDetailedResults(String sourceFolder, double similarityThreshold) {
        String timestamp = new SimpleDateFormat("yyyyMMdd_HHmmss").format(new Date());
        String resultFileName = "BlockBasedDuplicateFinder_Result_" + timestamp + ".txt";
        Path resultPath = Paths.get(sourceFolder, resultFileName);

        // Guard against division by zero when nothing was scanned.
        double validRatio = totalImages > 0 ? (double) processedImages / totalImages * 100 : 0.0;

        List<String> outputLines = new ArrayList<>();

        outputLines.add("=== 处理参数 ===");
        outputLines.add("分块策略: " + BLOCK_SIZE + "x" + BLOCK_SIZE + "分块");
        outputLines.add("相似度阈值: " + (int) (similarityThreshold * 100) + "%");
        outputLines.add("有效图片比例: " + df.format(validRatio) + "%");

        outputLines.add("==== 基于分块直方图的图片相似度检测报告 ====");
        outputLines.add("生成时间: " + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date()));
        outputLines.add("分块数量: " + BLOCK_SIZE + "x" + BLOCK_SIZE + "=" + (BLOCK_SIZE * BLOCK_SIZE));
        outputLines.add("源文件夹: " + sourceFolder);
        outputLines.add("");

        outputLines.add("=== 处理摘要 ===");
        outputLines.add("总图片数: " + totalImages);
        outputLines.add("已处理图片数: " + processedImages);
        outputLines.add("发现的相似组数: " + duplicateGroups.size());
        outputLines.add("移动的重复文件数: " + countMovedFiles());
        outputLines.add("");

        outputLines.add("=== 详细移动原因 ===");
        outputLines.addAll(detailedMoveReasons);
        outputLines.add("");

        outputLines.add("=== 相似组概览 ===");
        duplicateGroups.forEach((groupId, files) -> {
            outputLines.add("\n相似组 " + groupId + ":");
            outputLines.addAll(files);
        });

        try {
            // The log folder is user-supplied and may not exist yet.
            Files.createDirectories(resultPath.toAbsolutePath().getParent());
            Files.write(resultPath, outputLines);
            System.out.println("\n详细结果报告已保存到: " + resultPath.toString());
        } catch (IOException e) {
            System.err.println("无法保存结果文件: " + e.getMessage());
        }
    }

    /**
     * Lists candidate image files directly inside a folder (no recursion).
     *
     * <p>A file qualifies when it is a regular file, its lower-cased name ends in
     * jpg/jpeg/png/bmp, the name does not contain "corrupt", and it is larger than 1 KB.
     * As a side effect the static total-image counter is updated.
     *
     * @param folderPath folder to scan
     * @return matching files; empty when the folder is missing, not a directory or unreadable
     */
    public static List<File> getImageFiles(String folderPath) {
        System.out.println("[1/4] 正在扫描图片文件夹: " + folderPath);
        File folder = new File(folderPath);
        if (!folder.exists() || !folder.isDirectory()) {
            System.err.println("错误: 文件夹不存在或不是目录: " + folderPath);
            return Collections.emptyList();
        }

        // listFiles() returns null on I/O errors or missing permissions.
        File[] entries = folder.listFiles();
        if (entries == null) {
            System.err.println("错误: 文件夹不存在或不是目录: " + folderPath);
            return Collections.emptyList();
        }

        List<File> imageFiles = Arrays.stream(entries)
                .filter(ImageDuplicateFinderWithProgress::isCandidateImage)
                .collect(Collectors.toList());

        totalImages = imageFiles.size();
        System.out.println("√ 找到 " + totalImages + " 张图片\n");
        return imageFiles;
    }

    /** Applies the name, type and size filter described on {@link #getImageFiles(String)}. */
    private static boolean isCandidateImage(File file) {
        if (!file.isFile() || file.length() <= 1024) {
            return false;
        }
        String name = file.getName().toLowerCase(Locale.ROOT);
        if (name.contains("corrupt")) {
            return false;
        }
        for (String extension : IMAGE_EXTENSIONS) {
            if (name.endsWith(extension)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Moves a file into the target folder, creating the folder and picking a unique
     * name if necessary. Unlike {@link #moveFileToFolder} no "moved" log entry is added.
     *
     * @param sourceFile   file to move
     * @param targetFolder destination folder
     * @throws IOException if the folder cannot be created or the move fails
     */
    public static void moveSmallerDuplicate(File sourceFile, String targetFolder) throws IOException {
        moveWithUniqueName(sourceFile, targetFolder);
    }

    /**
     * Moves {@code sourceFile} into {@code targetFolder} without overwriting: on a name
     * clash "_1", "_2", ... is inserted before the extension (or appended when the name
     * has no extension).
     */
    private static void moveWithUniqueName(File sourceFile, String targetFolder) throws IOException {
        Path targetPath = Paths.get(targetFolder);
        if (!Files.exists(targetPath)) {
            logOperation("创建目标文件夹: " + targetFolder);
            Files.createDirectories(targetPath);
        }

        String fileName = sourceFile.getName();
        int dot = fileName.lastIndexOf('.');
        // A name without a dot has no extension; substring(0, -1) would throw.
        String baseName = dot >= 0 ? fileName.substring(0, dot) : fileName;
        String extension = dot >= 0 ? fileName.substring(dot) : "";

        Path destination = targetPath.resolve(fileName);
        for (int counter = 1; Files.exists(destination); counter++) {
            destination = targetPath.resolve(baseName + "_" + counter + extension);
        }

        Files.move(sourceFile.toPath(), destination);
    }

    /**
     * Prints a 50-character progress bar on the current console line.
     *
     * @param current number of finished steps
     * @param total   total number of steps; values {@code <= 0} are shown as 0%
     */
    public static void printProgress(int current, int total) {
        printProgress((long) current, (long) total);
    }

    /** {@code long} variant of {@link #printProgress(int, int)} for large comparison counts. */
    private static void printProgress(long current, long total) {
        final int width = 50;
        long percent = total > 0 ? current * 100 / total : 0;
        long progress = total > 0 ? current * width / total : 0;

        StringBuilder bar = new StringBuilder(width + 32);
        bar.append("\r[");
        for (int i = 0; i < width; i++) {
            if (i < progress) {
                bar.append('=');
            } else if (i == progress) {
                bar.append('>');
            } else {
                bar.append(' ');
            }
        }
        bar.append("] ").append(percent).append("% ").append(current).append('/').append(total);
        System.out.print(bar);
    }

    /** Appends a message to the operation log and echoes it to stdout. */
    private static void logOperation(String message) {
        operationLogs.add(message);
        System.out.println(message);
    }

    /** Prints the summary, the similar groups and the operation log to stdout. */
    private static void printDetailedReport() {
        System.out.println("\n===== 详细处理报告 =====");
        System.out.println("总图片数: " + totalImages);
        System.out.println("已处理图片数: " + processedImages);
        System.out.println("发现的相似组数: " + duplicateGroups.size());
        System.out.println("移动的重复文件数: " + countMovedFiles());

        System.out.println("\n=== 相似图片组详情 ===");
        duplicateGroups.forEach((groupKey, files) -> {
            System.out.println("\n相似组 " + groupKey + ":");
            files.forEach(System.out::println);
        });

        System.out.println("\n=== 操作日志 ===");
        operationLogs.forEach(System.out::println);
    }

    /**
     * Runs the full pipeline: scan, compute block histograms, compare all pairs, move
     * similar pairs and write the report.
     *
     * <p>Each file takes part in at most one pair: once a file has been moved it is no
     * longer compared, because it no longer exists at its original location.
     *
     * @param sourceFolder        folder containing the images to check
     * @param largerTargetFolder  destination for the larger file of each similar pair
     * @param smallerTargetFolder destination for the smaller file of each similar pair
     * @param logFolder           folder that receives the text report
     * @param similarityThreshold pairs with similarity strictly above this fraction (0-1) are moved
     */
    public static void processDuplicates(String sourceFolder, String largerTargetFolder,
                                         String smallerTargetFolder, String logFolder,
                                         double similarityThreshold) {
        // Static state would otherwise accumulate across repeated calls.
        processedImages = 0;
        detailedMoveReasons.clear();
        operationLogs.clear();
        duplicateGroups.clear();

        // Stage 1: collect image files.
        List<File> imageFiles = getImageFiles(sourceFolder);
        if (imageFiles.isEmpty()) {
            return;
        }

        // Stage 2: compute block histograms.
        System.out.println("[2/4] 正在计算图片分块特征...");
        // Insertion order keeps the comparison order identical to the scan order.
        Map<File, List<Map<Integer, Integer>>> blockHistograms = new LinkedHashMap<>();
        Map<File, Long> fileSizes = new HashMap<>();

        for (int i = 0; i < imageFiles.size(); i++) {
            File file = imageFiles.get(i);

            BufferedImage image = safeImageRead(file);
            if (image == null || !validateImage(image)) {
                System.err.println("跳过不可用图像: " + file.getName());
                continue;
            }

            if (image.getWidth() < 10 || image.getHeight() < 10) {
                System.err.printf("\n! 跳过极小图片: %s (%dx%d)%n",
                        file.getName(), image.getWidth(), image.getHeight());
                continue;
            }

            List<Map<Integer, Integer>> histograms = calculateBlockHistograms(image);
            if (histograms.isEmpty()) {
                System.err.println("无法计算直方图: " + file.getName());
                continue;
            }

            System.out.printf("\r处理中: %d/%d - %s (%dx%d)",
                    i + 1, totalImages, file.getName(), image.getWidth(), image.getHeight());
            blockHistograms.put(file, histograms);
            fileSizes.put(file, file.length());
            processedImages++;
        }
        System.out.println("\n√ 分块特征计算完成\n");

        // Stage 3: pairwise comparison.
        System.out.println("[3/4] 正在比较图片分块相似度...");
        Set<File> processedFiles = new HashSet<>();
        List<File> fileList = new ArrayList<>(blockHistograms.keySet());
        long fileCount = fileList.size();
        // long: n*(n-1)/2 overflows int above roughly 65k images.
        long totalComparisons = fileCount * (fileCount - 1) / 2;
        long currentComparison = 0;
        long lastPercent = -1;
        int movedLargerFiles = 0;
        int movedSmallerFiles = 0;

        for (int i = 0; i < fileList.size(); i++) {
            File file1 = fileList.get(i);
            if (processedFiles.contains(file1)) {
                continue;
            }

            for (int j = i + 1; j < fileList.size(); j++) {
                File file2 = fileList.get(j);
                if (processedFiles.contains(file2)) {
                    continue;
                }

                currentComparison++;
                // Redraw only when the percentage changes; printing on every one of
                // millions of comparisons dominates the run time.
                long percent = currentComparison * 100 / totalComparisons;
                if (percent != lastPercent) {
                    lastPercent = percent;
                    printProgress(currentComparison, totalComparisons);
                }

                double similarity = compareBlockHistograms(
                        blockHistograms.get(file1), blockHistograms.get(file2));
                if (!(similarity > similarityThreshold)) {
                    continue;
                }

                System.out.printf("\n发现相似图片: %s ↔ %s (相似度: %.2f%%)%n",
                        file1.getName(), file2.getName(), similarity * 100);

                long size1 = fileSizes.get(file1);
                long size2 = fileSizes.get(file2);
                // On equal sizes file2 counts as the larger one.
                File largerFile = size1 > size2 ? file1 : file2;
                File smallerFile = largerFile == file1 ? file2 : file1;
                long largerSize = fileSizes.get(largerFile);
                long smallerSize = fileSizes.get(smallerFile);

                try {
                    moveFileToFolder(largerFile, largerTargetFolder, "较大");
                    movedLargerFiles++;
                    processedFiles.add(largerFile);

                    moveFileToFolder(smallerFile, smallerTargetFolder, "较小");
                    movedSmallerFiles++;
                    processedFiles.add(smallerFile);

                    // Record the group only after both moves succeeded so the report
                    // never lists a group whose files were not actually moved.
                    String groupId = "G" + (duplicateGroups.size() + 1);
                    List<String> groupFiles = new ArrayList<>();
                    groupFiles.add("较大文件: " + largerFile.getName() + " (" + (largerSize / 1024) + "KB)");
                    groupFiles.add("较小文件: " + smallerFile.getName() + " (" + (smallerSize / 1024) + "KB)");
                    groupFiles.add("分块相似度: " + df.format(similarity * 100) + "%");
                    duplicateGroups.put(groupId, groupFiles);

                    String logMessage = recordMoveReason(groupId,
                            largerFile, largerSize, largerTargetFolder,
                            smallerFile, smallerSize, smallerTargetFolder,
                            similarity);
                    System.out.println("\n" + logMessage);
                } catch (IOException e) {
                    System.err.println("\n× 移动文件失败: " + e.getMessage());
                }

                // file1 is gone from the source folder; comparing it further would only
                // produce failing moves.
                if (processedFiles.contains(file1)) {
                    break;
                }
            }
        }

        // Stage 4: summary and report.
        System.out.println("\n\n[4/4] 处理结果:");
        System.out.println("总图片数: " + totalImages);
        System.out.println("已处理: " + processedImages);
        System.out.println("发现相似组: " + duplicateGroups.size());
        System.out.println("移动到较大文件文件夹: " + movedLargerFiles + " (" + largerTargetFolder + ")");
        System.out.println("移动到较小文件文件夹: " + movedSmallerFiles + " (" + smallerTargetFolder + ")");

        saveDetailedResults(logFolder, similarityThreshold);
    }

    /**
     * Moves a file into the target folder (creating it and avoiding name clashes) and
     * records a "moved" entry in the operation log.
     *
     * @param sourceFile   file to move
     * @param targetFolder destination folder
     * @param fileType     label inserted into the log entry (e.g. larger/smaller)
     * @throws IOException if the folder cannot be created or the move fails
     */
    public static void moveFileToFolder(File sourceFile, String targetFolder, String fileType) throws IOException {
        moveWithUniqueName(sourceFile, targetFolder);
        operationLogs.add(MOVED_LOG_PREFIX + fileType + "文件: " + sourceFile.getName() + " 到 " + targetFolder);
    }

    /**
     * Decodes an image using the reader that matches the file's magic bytes rather
     * than its extension.
     *
     * @return the decoded image, or {@code null} when the format is unknown, no decoder
     *         is available, or decoding fails
     */
    private static BufferedImage safeImageRead(File file) {
        try {
            String realFormat = getRealImageFormat(file);
            if (realFormat == null) {
                System.err.printf("无法识别图片格式: %s%n", file.getName());
                return null;
            }

            Iterator<ImageReader> readers = ImageIO.getImageReadersByFormatName(realFormat);
            if (!readers.hasNext()) {
                System.err.printf("无%s解码器: %s%n", realFormat.toUpperCase(), file.getName());
                return null;
            }

            ImageReader reader = readers.next();
            try (ImageInputStream stream = ImageIO.createImageInputStream(file)) {
                if (stream == null) {
                    // No ImageInputStream provider could open the file.
                    System.err.printf("文件读取失败: %s (%s)%n", file.getName(), "ImageInputStream");
                    return null;
                }
                reader.setInput(stream);
                return reader.read(0);
            } finally {
                reader.dispose();
            }
        } catch (IOException | RuntimeException e) {
            // Decoders also throw unchecked exceptions on some corrupt files; one bad
            // file must not abort the whole run.
            System.err.printf("文件读取失败: %s (%s)%n",
                    file.getName(), e.getClass().getSimpleName());
            return null;
        }
    }

    /**
     * Detects the image format from the first eight bytes of the file.
     *
     * @return "jpeg", "png" or "bmp", or {@code null} when the file is shorter than
     *         eight bytes or the signature is not recognised
     * @throws IOException if the file cannot be read
     */
    private static String getRealImageFormat(File file) throws IOException {
        byte[] header = new byte[8];
        try (InputStream is = Files.newInputStream(file.toPath())) {
            // A single read() may legally return fewer bytes than requested.
            int filled = 0;
            while (filled < header.length) {
                int read = is.read(header, filled, header.length - filled);
                if (read < 0) {
                    break;
                }
                filled += read;
            }
            if (filled < header.length) {
                return null;
            }
        }

        if (header[0] == (byte) 0xFF && header[1] == (byte) 0xD8) {
            return "jpeg";
        }
        if (header[1] == 'P' && header[2] == 'N' && header[3] == 'G') {
            return "png";
        }
        if (header[0] == 'B' && header[1] == 'M') {
            return "bmp";
        }
        return null;
    }

    /**
     * Fallback reader that lets ImageIO pick the decoder itself. Not called elsewhere
     * in this class.
     *
     * <p>The earlier Toolkit-based path was removed: Toolkit images load asynchronously,
     * so their size is reported as -1 right after creation and no pixels were ever copied.
     *
     * @return the decoded image, or {@code null} when no decoder accepts the file or reading fails
     */
    private static BufferedImage tryAlternativeRead(File file) {
        try {
            return ImageIO.read(file);
        } catch (Exception e) {
            System.err.printf("备用读取方案失败: %s%n", file.getName());
            return null;
        }
    }

    /**
     * Interactive entry point: asks for the source, target and log folders, asks for
     * confirmation and runs {@link #processDuplicates} with a 95% threshold.
     */
    public static void main(String[] args) {
        Scanner scanner = new Scanner(System.in);
        System.setProperty("jdk.imageio.plugins.disabled", "com.sun.imageio.plugins.gif");
        System.out.println("==== 基于分块直方图的图片相似度检测程序 ====");
        System.out.println("分块数量: " + BLOCK_SIZE + "x" + BLOCK_SIZE);

        System.out.print("请输入源文件夹路径(包含要检查的图片): ");
        String sourceFolder = scanner.nextLine().trim();

        System.out.print("请输入目标文件夹路径(用于存放比较后较小图片): ");
        String smallerTargetFolder = scanner.nextLine().trim();

        System.out.print("请输入目标文件夹路径(用于存放比较后较大图片): ");
        String largerTargetFolder = scanner.nextLine().trim();

        System.out.print("请输入日志存放文件夹路径: ");
        String logFolder = scanner.nextLine().trim();

        System.out.println("\n配置信息:");
        System.out.println("源文件夹: " + sourceFolder);
        System.out.println("较大图片目标文件夹: " + largerTargetFolder);
        System.out.println("较小图片目标文件夹: " + smallerTargetFolder);
        System.out.println("日志目标文件夹: " + logFolder);
        System.out.println("====================================\n");

        System.out.print("确认开始处理吗?(Y/N): ");
        String confirm = scanner.next();
        if (confirm.equalsIgnoreCase("Y")) {
            long startTime = System.currentTimeMillis();
            processDuplicates(sourceFolder, largerTargetFolder, smallerTargetFolder, logFolder,
                    DEFAULT_SIMILARITY_THRESHOLD);
            long endTime = System.currentTimeMillis();

            System.out.println("\n处理完成,耗时: " + (endTime - startTime) / 1000.0 + "秒");
        } else {
            System.out.println("操作已取消");
        }

        scanner.close();
    }
}

posted @ 2025-07-11 16:45  小周^^  阅读(17)  评论(0)    收藏  举报