I wrote some code, included below, to help me find near-duplicate images in a folder containing roughly 65K images. Right now I do it in three steps: (1) compute a hash for every image, (2) compare the hashes to group near-duplicates, and (3) keep the largest file in each group and delete the rest.
On the large folder, step 1 takes only about 3 minutes, but step 2 takes an eternity: it has now been running for over 5 hours with no result. I assume there is a better way to do this, but I don't know much about speeding up computations.
What is a good way to speed this up?
import imagehash
import PIL.Image
import os
import time
import math
import matplotlib.pyplot as plt  # needed for the histogram plot in get_dups
def timer(start, end):
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    return "{:0>2}:{:0>2}:{:02.0f}".format(int(hours), int(minutes), seconds)
#step 1: make hashes for every image in the folder
def get_hash_dict(mypath, hash_size=16):
    start = time.time()
    d = {}
    file_list = os.listdir(mypath)
    for i in file_list:
        try:
            im = PIL.Image.open(os.path.join(mypath, i))
        except OSError:  # skip files PIL cannot open
            continue
        d[i] = imagehash.average_hash(im, hash_size=hash_size)
    print(len(d), len(file_list))
    print("Elapsed time {}".format(timer(start, time.time())))
    return d
#step 2: remove all pairs below a small cutoff,
#double-check those below a larger cutoff
def get_dups(file_list, d, hash_size=16):
    start = time.time()
    cutoff_small = int(.06 * hash_size**2)  # <=6% difference is identical
    cutoff_large = int(.15 * hash_size**2)  # >=15% has many false positives
    cutoff_rms = 580
    res = []
    double_check = []
    dont_check = set()
    to_delete = []
    freqs = []
    another_check = []
    for i, f1 in enumerate(file_list[:-1]):  # [:-1] so the last pair is compared too
        hash0 = d[f1]
        temp1 = [f1]
        temp2 = [f1]
        temp3 = [f1]
        for f2 in file_list[i+1:]:
            if f2 not in dont_check:
                hash1 = d[f2]
                temp = hash0 - hash1
                if temp < cutoff_small:
                    temp1.append([f2, temp])
                    temp2.append(f2)
                    dont_check.add(f2)
                    freqs.append(temp)
                # double-check via RMS histogram distance if the hash distance is borderline
                elif temp < cutoff_large:
                    freqs.append(temp)
                    i1 = PIL.Image.open(os.path.join(mypath, f1)).histogram()  # mypath is read as a global here
                    i2 = PIL.Image.open(os.path.join(mypath, f2)).histogram()
                    rms = math.sqrt(sum((a - b)**2 for (a, b) in zip(i1, i2)) / len(i1))
                    if rms < cutoff_rms:
                        temp3.append(f2)
                        another_check.append([f1, f2])
        if len(temp1) >= 2:
            res.append(temp1)
            to_delete.append(temp2)
        if len(temp3) >= 2:
            double_check.extend(temp3)
    # second pass over the borderline candidates
    double_check_2 = list(set(double_check) - dont_check)
    res_doubles = []
    dont_check = set()
    for i, f1 in enumerate(double_check_2):
        if f1 not in dont_check:
            temp = [f1]
            for f2 in double_check_2[i+1:]:
                i1 = PIL.Image.open(os.path.join(mypath, f1)).histogram()
                i2 = PIL.Image.open(os.path.join(mypath, f2)).histogram()
                rms = math.sqrt(sum((a - b)**2 for (a, b) in zip(i1, i2)) / len(i1))
                if rms < cutoff_rms and d[f1] - d[f2] <= cutoff_large:
                    temp.append(f2)
                    dont_check.add(f2)
            res_doubles.append(temp)
    print("Elapsed time {}".format(timer(start, time.time())))
    plt.hist(freqs, color='g')
    plt.show()
    return to_delete, res_doubles, another_check
#step 3: find the biggest file in each duplicate group, remove the others
def keep_largest(all_files):
    count = 0
    to_keep = []
    for i in all_files:
        s_max = 0
        for j in i:
            temp = os.stat(os.path.join(mypath, j)).st_size
            if temp > s_max:
                keep_temp = j
                s_max = temp
        to_keep.append(keep_temp)
    to_keep = set(to_keep)
    print('Keeping ', len(to_keep))
    for i in all_files:
        for j in i:
            if j not in to_keep:
                try:
                    #os.remove(os.path.join(mypath, j))  # uncomment to actually delete
                    count += 1
                except OSError:
                    continue
    print("Deleted {} files".format(count))
Answer 0 (score: 0)
Rather than recomputing each image's histogram on every comparison, create a getHistogram function that caches histograms, so the histogram for the same file is not generated multiple times.
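A minimal sketch of that suggestion, assuming the same PIL-based loading as in the question (the name getHistogram comes from the answer; functools.lru_cache does the caching, so the histogram for a given path is computed only once):

import functools

@functools.lru_cache(maxsize=None)
def getHistogram(path):
    # compute the histogram once per file; later calls return the cached result
    return PIL.Image.open(path).histogram()

Inside get_dups, the two Image.open(...).histogram() calls then become getHistogram(os.path.join(mypath, f1)) and getHistogram(os.path.join(mypath, f2)), which avoids reopening and re-histogramming the same files over and over in the O(n²) comparison loop.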