I wrote some code, included below, to help me find near-duplicate images in a folder containing roughly 65K images. Right now I do it in three steps: (1) compute a hash for every image, (2) compare the hashes to group near-duplicates, and (3) keep the largest file in each group and delete the rest.
On the large folder, step 1 takes only about 3 minutes, but step 2 takes an eternity: it has now been running for over 5 hours with no result. I assume there is a better way to do this, but I don't know much about speeding up computations.
What is a good way to speed this up?
import imagehash
import PIL.Image
import os
import time
import math
import matplotlib.pyplot as plt  # needed for the histogram plot in get_dups
def timer(start, end):
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    return "{:0>2}:{:0>2}:{:02.0f}".format(int(hours), int(minutes), seconds)
#step 1: make hashes for every image in the folder
def get_hash_dict(mypath, hash_size=16):
    start = time.time()
    d = {}
    file_list = os.listdir(mypath)
    for i in file_list:
        try:
            im = PIL.Image.open(os.path.join(mypath, i))
        except OSError:  # skip files PIL cannot open
            continue
        d[i] = imagehash.average_hash(im, hash_size=hash_size)
    print(len(d), len(file_list))
    print("Elapsed time {}".format(timer(start, time.time())))
    return d
#step 2: remove all pairs below a small cutoff,
#double-check those below a larger cutoff
def get_dups(file_list, d, hash_size=16):
    start = time.time()
    cutoff_small = int(.06 * hash_size**2)  # <=6% difference is identical
    cutoff_large = int(.15 * hash_size**2)  # >=15% has many false positives
    cutoff_rms = 580
    res = []
    double_check = []
    dont_check = set()
    to_delete = []
    freqs = []
    another_check = []
    for i, f1 in enumerate(file_list[:-1]):  # [:-1] so the last pair is compared too
        hash0 = d[f1]
        temp1 = [f1]
        temp2 = [f1]
        temp3 = [f1]
        for f2 in file_list[i+1:]:
            if f2 not in dont_check:
                hash1 = d[f2]
                temp = hash0 - hash1
                if temp < cutoff_small:
                    temp1.append([f2, temp])
                    temp2.append(f2)
                    dont_check.add(f2)
                    freqs.append(temp)
                # double-check via RMS histogram distance if the hash distance is borderline
                elif temp < cutoff_large:
                    freqs.append(temp)
                    i1 = PIL.Image.open(os.path.join(mypath, f1)).histogram()  # mypath is read as a global here
                    i2 = PIL.Image.open(os.path.join(mypath, f2)).histogram()
                    rms = math.sqrt(sum((a - b)**2 for (a, b) in zip(i1, i2)) / len(i1))
                    if rms < cutoff_rms:
                        temp3.append(f2)
                        another_check.append([f1, f2])
        if len(temp1) >= 2:
            res.append(temp1)
            to_delete.append(temp2)
        if len(temp3) >= 2:
            double_check.extend(temp3)
    # second pass over the borderline candidates
    double_check_2 = list(set(double_check) - dont_check)
    res_doubles = []
    dont_check = set()
    for i, f1 in enumerate(double_check_2):
        if f1 not in dont_check:
            temp = [f1]
            for f2 in double_check_2[i+1:]:
                i1 = PIL.Image.open(os.path.join(mypath, f1)).histogram()
                i2 = PIL.Image.open(os.path.join(mypath, f2)).histogram()
                rms = math.sqrt(sum((a - b)**2 for (a, b) in zip(i1, i2)) / len(i1))
                if rms < cutoff_rms and d[f1] - d[f2] <= cutoff_large:
                    temp.append(f2)
                    dont_check.add(f2)
            res_doubles.append(temp)
    print("Elapsed time {}".format(timer(start, time.time())))
    plt.hist(freqs, color='g')
    plt.show()
    return to_delete, res_doubles, another_check
#step 3: find the biggest file in each duplicate group, remove the others
def keep_largest(all_files):
    count = 0
    to_keep = []
    for i in all_files:
        s_max = 0
        for j in i:
            temp = os.stat(os.path.join(mypath, j)).st_size
            if temp > s_max:
                keep_temp = j
                s_max = temp
        to_keep.append(keep_temp)
    to_keep = set(to_keep)
    print('Keeping ', len(to_keep))
    for i in all_files:
        for j in i:
            if j not in to_keep:
                try:
                    #os.remove(os.path.join(mypath, j))  # uncomment to actually delete
                    count += 1
                except OSError:
                    continue
    print("Deleted {} files".format(count))
Answer 0 (score: 0)
Rather than recomputing each image's histogram on every comparison, create a getHistogram function that caches histograms, so the histogram for the same file is not generated multiple times.
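A minimal sketch of that suggestion, assuming the same PIL-based loading as in the question (the name getHistogram comes from the answer; functools.lru_cache does the caching, so the histogram for a given path is computed only once):

import functools

@functools.lru_cache(maxsize=None)
def getHistogram(path):
    # compute the histogram once per file; later calls return the cached result
    return PIL.Image.open(path).histogram()

Inside get_dups, the two Image.open(...).histogram() calls then become getHistogram(os.path.join(mypath, f1)) and getHistogram(os.path.join(mypath, f2)), which avoids reopening and re-histogramming the same files over and over in the O(n²) comparison loop.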