I have a directory of images that contains many unidentified duplicates. My goal is to identify those duplicates. Because the duplicates have been cropped, resized, or converted to other image formats, they cannot be detected by comparing their hashes.
I wrote a script that detects the duplicates successfully, but it has one major drawback: it is slow. In a test run on a folder containing 60 items it took five hours to finish (which may also reflect my increasingly buggy code and slow computer). Since my directory holds roughly 66,000 images, I estimate the script would take 229 days to complete.
Can anyone suggest a solution? My research turned up the idea that you can free memory by "releasing" the image stored in a variable once a loop iteration is done, but all the information on how to do this seems to be written for C rather than Python. I have also considered trying ORB instead of SIFT (see the sketch after my script), but I have concerns about its accuracy. Does anyone have advice on which of these two options is better, or on a way to rewrite the script so it uses less memory? Many thanks in advance.
from __future__ import division
import cv2
import numpy as np
import glob
import pandas as pd

listOfTitles1 = []
listOfTitles2 = []
listOfSimilarities = []

# Sift and Flann
sift = cv2.xfeatures2d.SIFT_create()
index_params = dict(algorithm=0, trees=5)
search_params = dict()
flann = cv2.FlannBasedMatcher(index_params, search_params)

# Load all the images1
countInner = 0
countOuter = 1

folder = r"/Downloads/images/**/*"

for a in glob.iglob(folder,recursive=True):
    for b in glob.iglob(folder,recursive=True):

        if not a.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
            continue

        if not b.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
            continue

        if b.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
            countInner += 1

        print(countInner, "", countOuter)

        if countInner <= countOuter:
            continue

        image1 = cv2.imread(a)
        kp_1, desc_1 = sift.detectAndCompute(image1, None)

        image2 = cv2.imread(b)
        kp_2, desc_2 = sift.detectAndCompute(image2, None)

        matches = flann.knnMatch(desc_1, desc_2, k=2)
        good_points = []

        if good_points == 0:
            continue

        for m, n in matches:
            if m.distance < 0.6*n.distance:
                good_points.append(m)

        number_keypoints = 0
        if len(kp_1) >= len(kp_2):
            number_keypoints = len(kp_1)
        else:
            number_keypoints = len(kp_2)

        percentage_similarity = float(len(good_points)) / number_keypoints * 100

        listOfSimilarities.append(str(int(percentage_similarity)))
        listOfTitles2.append(b)
        listOfTitles1.append(a)

    countInner = 0
    if a.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
        countOuter += 1

zippedList = list(zip(listOfTitles1,listOfTitles2, listOfSimilarities))

print(zippedList)

dfObj = pd.DataFrame(zippedList, columns = ['Original', 'Title' , 'Similarity'])

dfObj.to_csv(r"/Downloads/images/DuplicateImages3.csv")
Answer 0 (score: 3)
I think you can get a significant performance improvement with a simple change:
files = ... # preload all file names with glob

for a_idx in range(len(files)):
    for b_idx in range(a_idx, len(files)): # notice loop here
        image_1 = cv2.imread(files[a_idx])
        image_2 = cv2.imread(files[b_idx])
This considers every pair only once, instead of repeating it as both (a, b) and (b, a). On top of that, never recompute SIFT for the outer image inside the inner loop:
for a_idx in range(len(files)):
    image_1 = cv2.imread(files[a_idx])
    kp_1, desc_1 = sift.detectAndCompute(image_1, None) # never recompute SIFT!
    for b_idx in range(a_idx, len(files)):
        image_2 = cv2.imread(files[b_idx])
        kp_2, desc_2 = sift.detectAndCompute(image_2, None)
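Putting both ideas together, a condensed, untested sketch (the matching and scoring are folded in from your original code, the glob pattern is only an example, and holding every descriptor in memory may not be feasible for 66,000 images, in which case process the files in chunks):

import glob
import cv2

# collect the image paths once instead of re-globbing inside the loops
files = [f for f in glob.iglob("/Downloads/images/**/*", recursive=True)
         if f.lower().endswith(('.jpg', '.png', '.tif', '.tiff', '.gif'))]

sift = cv2.xfeatures2d.SIFT_create()
flann = cv2.FlannBasedMatcher(dict(algorithm=0, trees=5), dict())

# compute SIFT exactly once per file
descriptors = []
for f in files:
    kp, desc = sift.detectAndCompute(cv2.imread(f), None)
    descriptors.append((kp, desc))

results = []
for a_idx in range(len(files)):
    kp_1, desc_1 = descriptors[a_idx]
    for b_idx in range(a_idx + 1, len(files)):  # each unordered pair exactly once
        kp_2, desc_2 = descriptors[b_idx]
        if desc_1 is None or desc_2 is None:
            continue
        matches = flann.knnMatch(desc_1, desc_2, k=2)
        good_points = [m for m, n in matches if m.distance < 0.6 * n.distance]
        similarity = 100.0 * len(good_points) / max(len(kp_1), len(kp_2))
        results.append((files[a_idx], files[b_idx], similarity))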
Answer 1 (score: 0)
I ran your existing implementation on my computer with 100 images. That code took 6 hours and 31 minutes to run. I then changed the implementation, as suggested in the comments, to compute sift.detectAndCompute only once per image, cache the results, and use the cached results in the comparisons. This reduced the execution time on my computer, on those same 100 images, from 6 hours 31 minutes to 6 minutes 29 seconds. I don't know whether that will be fast enough on all of your images, but it is a significant reduction.
See the modified implementation below.
from __future__ import division
import cv2
import numpy as np
import glob
import pandas as pd

listOfTitles1 = []
listOfTitles2 = []
listOfSimilarities = []

# Sift and Flann
sift = cv2.xfeatures2d.SIFT_create()
index_params = dict(algorithm=0, trees=5)
search_params = dict()
flann = cv2.FlannBasedMatcher(index_params, search_params)

# Load all the images1
countInner = 0
countOuter = 1

folder = r"/Downloads/images/**/*"
folder = "SiftImages/*"

siftOut = {}
for a in glob.iglob(folder,recursive=True):
    if not a.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
        continue
    image1 = cv2.imread(a)
    kp_1, desc_1 = sift.detectAndCompute(image1, None)
    siftOut[a]=(kp_1,desc_1)

for a in glob.iglob(folder,recursive=True):
    if not a.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
        continue

    (kp_1,desc_1) = siftOut[a]

    for b in glob.iglob(folder,recursive=True):

        if not b.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
            continue

        if b.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
            countInner += 1

        print(countInner, "", countOuter)

        if countInner <= countOuter:
            continue

        #### image1 = cv2.imread(a)
        #### kp_1, desc_1 = sift.detectAndCompute(image1, None)
        ####
        #### image2 = cv2.imread(b)
        #### kp_2, desc_2 = sift.detectAndCompute(image2, None)

        (kp_2,desc_2) = siftOut[b]

        matches = flann.knnMatch(desc_1, desc_2, k=2)
        good_points = []

        if good_points == 0:
            continue

        for m, n in matches:
            if m.distance < 0.6*n.distance:
                good_points.append(m)

        number_keypoints = 0
        if len(kp_1) >= len(kp_2):
            number_keypoints = len(kp_1)
        else:
            number_keypoints = len(kp_2)

        percentage_similarity = float(len(good_points)) / number_keypoints * 100

        listOfSimilarities.append(str(int(percentage_similarity)))
        listOfTitles2.append(b)
        listOfTitles1.append(a)

    countInner = 0
    if a.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
        countOuter += 1

zippedList = list(zip(listOfTitles1,listOfTitles2, listOfSimilarities))

print(zippedList)

dfObj = pd.DataFrame(zippedList, columns = ['Original', 'Title' , 'Similarity'])

### dfObj.to_csv(r"/Downloads/images/DuplicateImages3.csv")
dfObj.to_csv(r"DuplicateImages3.2.csv")