# -*- coding: utf-8 -*-
'''
Python用哈希算法查找相似图片并放入[_df]的文件夹中
相似图片包括不同分辨率,不同大小,不同格式,只要图片相似就会算重复文件
安装cv2
pip install opencv-python
'''
import os
import cv2
import numpy as np
import shutil
import random
class DuplicateFiles(object):
    '''Find visually similar images under a directory and move them aside.

    Every image found (recursively) under ``dir`` is compared against the
    images listed after it using three perceptual hashes (average,
    difference, DCT based) and two histogram measures.  When any measure
    exceeds the similarity threshold, the later image and finally the
    "master" image itself are moved into the sibling folder ``<dir>_df``.
    '''

    dir = ''
    # File extensions (lower case, without the dot) treated as images.
    _IMAGE_EXTENSIONS = frozenset({'jpg', 'jpeg', 'bmp', 'png', 'gif', 'jfif'})
    # Two images are considered duplicates when any score exceeds this value.
    _SIMILARITY_THRESHOLD = 0.90
    # Upper bound of cv2.calcHist ranges is exclusive, so 256 keeps value 255.
    _HIST_RANGE = [0, 256]

    def __init__(self, dir):
        '''Remember the root directory that ``mvPhoto`` will scan.'''
        self.dir = dir  # instance attribute

    @staticmethod
    def _bits_to_string(mask):
        '''Flatten a boolean array row by row into a string of '1'/'0'.'''
        return ''.join('1' if bit else '0' for bit in mask.ravel())

    @staticmethod
    def _average_hash_bits(gray):
        '''Return the average-hash string of an already resized gray image.'''
        # ndarray.mean() accumulates in float64, so uint8 pixels cannot wrap
        # around the way a running uint8 scalar sum can.
        return DuplicateFiles._bits_to_string(gray > gray.mean())

    @staticmethod
    def _difference_hash_bits(gray):
        '''Return the difference-hash string of an already resized gray image.'''
        # A bit is 1 when a pixel is brighter than its right-hand neighbour.
        return DuplicateFiles._bits_to_string(gray[:, :-1] > gray[:, 1:])

    def aHash(self, img, shape=(10, 10)):
        '''Average hash: one bit per pixel, 1 where the pixel exceeds the mean.

        ``img`` is a BGR image; ``shape`` is the (width, height) it is
        shrunk to.  Returns a string of ``shape[0] * shape[1]`` characters.
        '''
        small = cv2.resize(img, tuple(shape))
        gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
        return self._average_hash_bits(gray)

    def dHash(self, img, shape=(10, 10)):
        '''Difference hash: compares each pixel with its right-hand neighbour.

        The image is shrunk to ``(shape[0] + 1, shape[1])`` so that every row
        yields ``shape[0]`` comparisons.  Returns a '1'/'0' string.
        '''
        small = cv2.resize(img, (shape[0] + 1, shape[1]))
        gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
        return self._difference_hash_bits(gray)

    def pHash(self, img, shape=(10, 10)):
        '''Perceptual hash based on the low-frequency corner of the DCT.

        The image is shrunk to 32x32, transformed with a DCT, and the
        top-left ``shape`` (width, height) coefficients are thresholded
        against their mean.  Returns a list of 0/1 integers.
        '''
        small = cv2.resize(img, (32, 32))
        gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
        # cv2.dct needs floating point input.
        dct = cv2.dct(np.float32(gray))
        cols, rows = shape
        dct_roi = dct[0:rows, 0:cols]
        return (dct_roi > np.mean(dct_roi)).astype(int).ravel().tolist()

    def classify_hist_with_split(self, image1, image2, size=(256, 256)):
        '''Average the per-channel histogram similarity of two images.

        Both images are resized to ``size``, split into channels, and each
        channel pair is scored with ``calculate``.  Returns a float.
        '''
        channels1 = cv2.split(cv2.resize(image1, size))
        channels2 = cv2.split(cv2.resize(image2, size))
        scores = [self.calculate(c1, c2) for c1, c2 in zip(channels1, channels2)]
        if not scores:
            return 0.0
        # Divide by the real channel count instead of assuming three.
        return sum(scores) / len(scores)

    def calculate(self, image1, image2):
        '''Return the histogram overlap of two images as a float in [0, 1].

        Only channel 0 of each image is used.  For every one of the 256 bins
        the score is ``1 - |h1 - h2| / max(h1, h2)`` (1 when the bins are
        equal, including both empty); the result is the mean over all bins.
        '''
        hist1 = cv2.calcHist([image1], [0], None, [256], self._HIST_RANGE).ravel()
        hist2 = cv2.calcHist([image2], [0], None, [256], self._HIST_RANGE).ravel()
        larger = np.maximum(hist1, hist2)
        overlap = np.ones_like(larger)
        # Bins where both counts are zero stay at 1 and avoid a 0/0 division.
        filled = larger > 0
        overlap[filled] = 1 - np.abs(hist1[filled] - hist2[filled]) / larger[filled]
        return float(overlap.mean())

    def cmpHash(self, hash1, hash2, shape=(10, 10)):
        '''Return the fraction of positions at which two hashes agree.

        Returns -1 when the hashes differ in length (invalid arguments) and
        0.0 for two empty hashes.  ``shape`` is accepted for backward
        compatibility only; the ratio is taken over the real hash length.
        '''
        if len(hash1) != len(hash2):
            return -1
        if len(hash1) == 0:
            return 0.0
        matches = sum(1 for a, b in zip(hash1, hash2) if a == b)
        return matches / len(hash1)

    def mymovefile(self, srcfile, dstpath, ffname=None):
        '''Move ``srcfile`` into the folder ``dstpath``, creating it if needed.

        When ``ffname`` is given (and non-empty) it is used as the new file
        name; otherwise the original base name is kept.  A missing source
        file is reported on stdout and otherwise ignored.
        '''
        if not os.path.isfile(srcfile):
            print("%s not exist!" % (srcfile))
            return
        fname = ffname if ffname else os.path.basename(srcfile)
        os.makedirs(dstpath, exist_ok=True)
        # os.path.join works whether or not dstpath ends with a separator.
        shutil.move(srcfile, os.path.join(dstpath, fname))

    def list_all_files(self, rootdir):
        '''Return the paths of all regular files below ``rootdir``, recursively.'''
        _files = []
        for entry in os.listdir(rootdir):
            path = os.path.join(rootdir, entry)
            if os.path.isdir(path):
                _files.extend(self.list_all_files(path))
            elif os.path.isfile(path):
                _files.append(path)
        return _files

    def _is_image(self, filename):
        '''Tell whether ``filename`` has one of the supported image extensions.'''
        extension = os.path.splitext(filename)[1]
        return extension[1:].lower() in self._IMAGE_EXTENSIONS

    def _read_image(self, path):
        '''Decode ``path`` as a BGR image; return None when that is impossible.'''
        data = np.fromfile(path, dtype=np.uint8)
        if data.size == 0:
            return None
        # imdecode yields None for data it cannot decode.
        return cv2.imdecode(data, cv2.IMREAD_COLOR)

    def _is_similar(self, img1, hashes1, img2):
        '''Return True when any of the five similarity scores passes the threshold.

        ``hashes1`` is the (aHash, dHash, pHash) triple of ``img1``, computed
        once by the caller so it is not redone for every candidate.
        '''
        limit = self._SIMILARITY_THRESHOLD
        a_hash, d_hash, p_hash = hashes1
        if self.cmpHash(a_hash, self.aHash(img2)) > limit:
            return True
        if self.cmpHash(d_hash, self.dHash(img2)) > limit:
            return True
        if self.cmpHash(p_hash, self.pHash(img2)) > limit:
            return True
        if self.classify_hist_with_split(img1, img2) > limit:
            return True
        return self.calculate(img1, img2) > limit

    def mvPhoto(self):
        '''Move every group of similar images under ``self.dir`` to ``<dir>_df``.

        Each image acts in turn as the "master" and is compared with all
        files listed after it.  Matching files are moved right away (renamed
        to ``<master stem>_<own name>``); the master follows once all of its
        candidates have been checked.
        '''
        target = self.dir + '_df'
        photoList = self.list_all_files(self.dir)
        for i, photo in enumerate(photoList):
            # Files already moved as a duplicate of an earlier master are gone.
            if not os.path.isfile(photo):
                continue
            fname = os.path.basename(photo)
            print('Master:' + fname)
            if not self._is_image(fname):
                continue
            img1 = self._read_image(photo)
            if img1 is None:
                continue
            stem = os.path.splitext(fname)[0]
            hashes1 = (self.aHash(img1), self.dHash(img1), self.pHash(img1))
            move_master = False
            for candidate in photoList[i + 1:]:
                if not os.path.isfile(candidate):
                    continue
                sname = os.path.basename(candidate)
                if not self._is_image(sname):
                    continue
                img2 = self._read_image(candidate)
                if img2 is None:
                    continue
                if self._is_similar(img1, hashes1, img2):
                    move_master = True
                    print(' move file:' + candidate)
                    print('ffname[0]:' + stem)
                    self.mymovefile(candidate, target, stem + '_' + sname)
            # The master is moved last so it stays readable during comparison.
            if move_master:
                self.mymovefile(photo, target, fname)
if __name__ == "__main__":
    # Folder to scan: the current working directory.  To scan a fixed folder
    # instead, assign its path here, e.g. r'E:\python\photoCompare\328'.
    # NOTE(review): the module-level name `dir` is deliberately left as is,
    # in case other code in this file refers to it.
    dir = os.getcwd()
    DuplicateFiles(dir).mvPhoto()