我正在编写一个Python程序来查找和删除文件夹中的重复文件。
我有多个mp3文件副本和一些其他文件。我正在使用SHA-1算法。
如何找到这些重复文件并将其删除?
答案 0 :(得分:40)
其他解决方案中的方法非常酷,但是它们忽略了重复文件的一个重要属性——重复文件的大小必然相同。只对大小相同的文件计算昂贵的哈希,可以节省大量CPU时间;文末附有性能比较,下面先做说明。
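在@nosklo给出的可靠答案的基础上,并借用@Raffi的思路(先对每个文件的开头做一次快速哈希,只对快速哈希发生冲突的文件计算完整哈希),步骤如下:
1. 按文件大小建立文件的哈希表;
2. 对大小相同的文件,用它们前1024个字节的哈希建立哈希表;没有冲突的文件是唯一的;
3. 对前1k字节哈希相同的文件,计算完整内容的哈希;完整哈希相同的文件就是重复文件。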
代码:
#!/usr/bin/env python
import sys
import os
import hashlib
def chunk_reader(fobj, chunk_size=1024):
    """Yield successive pieces of fobj, each at most chunk_size long.

    Reading stops at the first empty read, i.e. at end of file.
    """
    chunk = fobj.read(chunk_size)
    while chunk:
        yield chunk
        chunk = fobj.read(chunk_size)
def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
    """Return the raw digest of a file's content.

    filename         -- path of the file to read (opened in binary mode).
    first_chunk_only -- when true, hash only the first 1024 bytes, which is
                        a cheap pre-filter before hashing the whole file.
    hash             -- hashlib-style constructor; defaults to SHA-1.

    Errors raised while opening or reading the file propagate to the caller.
    """
    hashobj = hash()
    # The with-block closes the file even if a read fails part-way.
    with open(filename, 'rb') as file_object:
        if first_chunk_only:
            hashobj.update(file_object.read(1024))
        else:
            for chunk in chunk_reader(file_object):
                hashobj.update(chunk)
    return hashobj.digest()
def check_for_duplicates(paths, hash=hashlib.sha1):
    """Report files with identical content found under the given directories.

    Candidates are narrowed in three passes so that the expensive
    full-content hash is computed as rarely as possible: group by file
    size, then by a hash of the first 1024 bytes, then by a hash of the
    whole file.

    paths -- iterable of directory paths; each is walked recursively.
    hash  -- hashlib-style constructor forwarded to get_hash().

    Prints one "Duplicate found" line per duplicate, pairing it with the
    first file seen with the same content. Returns None.
    """
    files_by_size = {}
    seen_paths = set()
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the target is a symlink (soft one), this will
                    # dereference it - change the value to the actual target file
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                except OSError:
                    # not accessible (permissions, etc) - pass on
                    continue
                if full_path in seen_paths:
                    # The same file reached twice (via a symlink or
                    # overlapping roots) is not a duplicate of itself.
                    continue
                seen_paths.add(full_path)
                files_by_size.setdefault(file_size, []).append(full_path)

    # For all files with the same file size, hash the first 1024 bytes.
    # The size is part of the key so that equal prefixes of files with
    # different sizes do not trigger a needless full hash.
    files_by_small_hash = {}
    for file_size, files in files_by_size.items():
        if len(files) < 2:
            continue  # this file size is unique, no need to hash
        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True,
                                      hash=hash)
            except (IOError, OSError):
                # the file access might've changed till the exec point got
                # here; open() raises IOError on Python 2, OSError on 3
                continue
            files_by_small_hash.setdefault(
                (file_size, small_hash), []).append(filename)

    # For all files sharing size and 1k hash, hash the full content;
    # collisions here are duplicates.
    first_by_full_hash = {}
    for files in files_by_small_hash.values():
        if len(files) < 2:
            continue  # this prefix hash is unique, no need to hash fully
        for filename in files:
            try:
                full_hash = get_hash(filename, first_chunk_only=False,
                                     hash=hash)
            except (IOError, OSError):
                continue
            duplicate = first_by_full_hash.get(full_hash)
            if duplicate:
                print("Duplicate found: %s and %s" % (filename, duplicate))
            else:
                first_by_full_hash[full_hash] = filename
# Script entry: every command-line argument is a directory to scan.
if sys.argv[1:]:
    check_for_duplicates(sys.argv[1:])
else:
    # A parenthesised single-argument print works on Python 2 and 3 alike.
    print("Please pass the paths to check as parameters to the script")
而且,这是有趣的部分 - 性能比较。
基线 -
处理器:Feroceon 88FR131 rev 1(v5l) BogoMIPS:1599.07
(即我的低端NAS :),运行Python 2.7.11。
所以,@ nosklo非常方便的解决方案的输出:
root@NAS:InstantUpload# time ~/scripts/checkDuplicates.py
Duplicate found: ./IMG_20151231_143053 (2).jpg and ./IMG_20151231_143053.jpg
Duplicate found: ./IMG_20151125_233019 (2).jpg and ./IMG_20151125_233019.jpg
Duplicate found: ./IMG_20160204_150311.jpg and ./IMG_20160204_150311 (2).jpg
Duplicate found: ./IMG_20160216_074620 (2).jpg and ./IMG_20160216_074620.jpg
real 5m44.198s
user 4m44.550s
sys 0m33.530s
下面是先按文件大小过滤、再计算小哈希、仅在发现冲突时才计算完整哈希的版本:
root@NAS:InstantUpload# time ~/scripts/checkDuplicatesSmallHash.py . "/i-data/51608399/photo/Todor phone"
Duplicate found: ./IMG_20160216_074620 (2).jpg and ./IMG_20160216_074620.jpg
Duplicate found: ./IMG_20160204_150311.jpg and ./IMG_20160204_150311 (2).jpg
Duplicate found: ./IMG_20151231_143053 (2).jpg and ./IMG_20151231_143053.jpg
Duplicate found: ./IMG_20151125_233019 (2).jpg and ./IMG_20151125_233019.jpg
real 0m1.398s
user 0m1.200s
sys 0m0.080s
两个版本各运行了3次,以取得平均耗时。
所以v1(user + sys)约为 284s,而另一个版本约为 2s;差别相当大,呵呵 :) 有了这样的提速,完全可以改用SHA512甚至更强的哈希算法——由于需要计算的数据少得多,额外的性能开销可以忽略。
缺点:磁盘访问次数比其他版本多——每个文件都要读取一次大小信息(开销很小,但仍然是磁盘IO),而每个重复文件会被打开两次(一次计算前1k字节的哈希,一次计算完整内容的哈希)。
答案 1 :(得分:38)
此版本使用文件大小和内容的哈希来查找重复项。 您可以传递多个路径,它将递归扫描所有路径并报告找到的所有重复项。
import sys
import os
import hashlib
def chunk_reader(fobj, chunk_size=1024):
    """Iterate over fobj in pieces of up to chunk_size, ending at EOF."""
    keep_reading = True
    while keep_reading:
        piece = fobj.read(chunk_size)
        if piece:
            yield piece
        else:
            # An empty read marks the end of the file.
            keep_reading = False
def check_for_duplicates(paths, hash=hashlib.sha1):
    """Print every file whose (digest, size) matches an earlier file.

    paths -- iterable of directories; each is walked recursively.
    hash  -- hashlib-style constructor used for the content digest.

    The first file seen with a given content is kept as the reference and
    each later match is reported against it. Returns None.
    """
    hashes = {}
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                hashobj = hash()
                # Close each file as soon as it has been hashed instead of
                # leaving the handle to the garbage collector.
                with open(full_path, 'rb') as fobj:
                    for chunk in chunk_reader(fobj):
                        hashobj.update(chunk)
                file_id = (hashobj.digest(), os.path.getsize(full_path))
                duplicate = hashes.get(file_id)
                if duplicate:
                    print("Duplicate found: %s and %s" % (full_path, duplicate))
                else:
                    hashes[file_id] = full_path
# Script entry: every command-line argument is a directory to scan.
if sys.argv[1:]:
    check_for_duplicates(sys.argv[1:])
else:
    # A parenthesised single-argument print works on Python 2 and 3 alike.
    print("Please pass the paths to check as parameters to the script")
答案 2 :(得分:16)
def remove_duplicates(dir):
    """Delete files in dir whose content repeats an earlier file's content.

    dir -- directory to clean; only its direct entries are examined and
           subdirectories are left alone.

    The first file met with a given content is kept; which one that is
    depends on the order os.listdir() returns. Returns None.
    """
    import hashlib  # local import: replaces the Python 2-only md5 module

    seen_digests = set()
    for filename in os.listdir(dir):
        # os.listdir() yields bare names, so join them with dir; otherwise
        # the checks below only work when dir is the current directory.
        filepath = os.path.join(dir, filename)
        if not os.path.isfile(filepath):
            continue
        # Binary mode hashes the exact bytes; the with-block closes the
        # file before it may be removed.
        with open(filepath, 'rb') as fobj:
            filehash = hashlib.md5(fobj.read()).hexdigest()
        if filehash in seen_digests:
            os.remove(filepath)
        else:
            seen_digests.add(filehash)
//编辑:
对于mp3,您可能也对此主题感兴趣:Detect duplicate MP3 files with different bitrates and/or different ID3 tags?
答案 3 :(得分:7)
我前段时间用Python写了一篇 - 欢迎你使用它。
import sys
import os
import hashlib
def check_path(filepath, hashes, p=sys.stdout.write):
    """Record filepath by content digest, reporting it if already known.

    filepath -- file to hash.
    hashes   -- dict mapping SHA-1 hex digest to the first path seen with
                that content; updated in place.
    p        -- callable that receives the report text for a duplicate.

    Returns the path stored in hashes for this content: filepath itself
    the first time, the earlier file for a duplicate.
    """
    # Binary mode plus a with-block: the exact bytes are hashed and the
    # handle is closed (the Python 2 file() builtin call did neither).
    with open(filepath, 'rb') as fobj:
        digest = hashlib.sha1(fobj.read()).hexdigest()
    if digest in hashes:
        p('DUPLICATE FILE\n'
          ' %s\n'
          'of %s\n' % (filepath, hashes[digest]))
    return hashes.setdefault(digest, filepath)
def scan(dirpath, hashes=None):
    """Run check_path() on every file below dirpath.

    dirpath -- directory to walk recursively.
    hashes  -- optional digest-to-path dict handed to check_path(); pass
               the same dict to several calls to detect duplicates across
               directories. A fresh dict is used when omitted, so separate
               calls no longer share state through a mutable default.

    Returns one list per visited directory, holding check_path()'s result
    for each of that directory's files.
    """
    if hashes is None:
        hashes = {}
    results = []
    # Explicit loops: map() is lazy on Python 3 and would check nothing,
    # and tuple-unpacking lambda parameters are Python 2-only syntax.
    for root, dirs, files in os.walk(dirpath):
        results.append([check_path(os.path.join(root, filename), hashes)
                        for filename in files])
    return results
# Scan the directory named by the first command-line argument, if any.
if len(sys.argv) > 1:
    scan(sys.argv[1])
答案 4 :(得分:4)
如果应该分析许多“大尺寸”文件(图像,mp3,pdf文档),那么使用以下比较算法会更有趣/更快:
先对文件的前N个字节(比如1KB)做一次快速哈希。这个哈希可以确定无疑地判断两个文件不同,但不能判断两个文件完全相同(受限于哈希的精度,以及只从磁盘读取了有限的数据)
如果第一阶段发生冲突,再做第二次较慢的哈希:它对文件的全部内容进行计算,因此更准确
以下是此算法的实现:
import hashlib
def Checksum(current_file_name, check_type = 'sha512', first_block = False,
             size_block = 1024 * 1024):
    """Return the upper-case hex digest of the given file.

    current_file_name -- path of the file to hash (read in binary mode).
    check_type        -- 'sha1', 'md5' or 'sha512'.
    first_block       -- if True, only the first size_block bytes are hashed.
    size_block        -- number of bytes per read; defaults to 1 MiB.

    Raises ValueError (an Exception subclass, so existing handlers still
    match) when check_type is not one of the supported names.
    """
    constructors = {'sha1': hashlib.sha1, 'md5': hashlib.md5,
                    'sha512': hashlib.sha512}
    if check_type not in constructors:
        raise ValueError("Unknown checksum method")
    key = constructors[check_type]()
    with open(current_file_name, 'rb') as f:
        while True:
            s = f.read(size_block)
            key.update(s)
            # An empty or short read means the end of the file was reached.
            if not s or len(s) < size_block or first_block:
                break
    return key.hexdigest().upper()
def find_duplicates(files):
    """Find duplicates among a set of files.

    Two kinds of hashes are used:
    - a small, fast one on the first block of each file,
    - and, only when two files collide on the small hash, a complete hash
      of the file. The complete hash is never computed twice for a file.

    files -- sequence of file paths to compare.

    Prints the groups of files that share a full hash and returns a dict
    mapping full hash -> list of files. The dict also holds single-file
    entries for files that collided on the small hash only.
    """
    print('Analyzing %d files' % len(files))
    text_type = type(u'')

    # small hash -> list of (file, full hash or None)
    by_small_hash = {}
    # full hash -> files; filled only after a collision on the small hash
    duplicates = {}

    for f in files:
        # small hash to be fast
        check = Checksum(f, first_block = True, check_type = 'sha1')
        group = by_small_hash.setdefault(check, [])
        group.append((f, None))
        if len(group) < 2:
            continue
        for index, (ff, checkfull) in enumerate(group):
            if checkfull is None:
                # computes the full hash in case of collision
                checkfull = Checksum(ff, first_block = False)
                group[index] = (ff, checkfull)
                # each newly computed full hash is filed in the duplicate
                # dictionary
                duplicates.setdefault(checkfull, []).append(ff)

    # Print the header only when at least one group really has duplicates.
    groups = [(h, lf) for h, lf in duplicates.items() if len(lf) > 1]
    if groups:
        print('')
        print("The following files have the same sha512 hash")
        for h, lf in groups:
            print('Hash value %s' % h)
            for f in lf:
                # Text (unicode) names are escaped so that printing cannot
                # fail on characters the output encoding lacks.
                if isinstance(f, text_type):
                    shown = f.encode('unicode_escape').decode('ascii')
                else:
                    shown = f
                print('\t %s' % shown)
    return duplicates
find_duplicates 函数接收一个文件列表。这样也可以用它比较两个目录(例如,为了更好地同步它们的内容)。下面是一个示例函数,它生成具有指定扩展名的文件列表,并可以跳过某些目录:
def getFiles(_path, extensions = ['.png'],
             subdirs = False, avoid_directories = None):
    """Collect the files found under '_path'.

    extensions: file extensions to keep, compared case-insensitively;
        [] or None keeps every file.
    subdirs: when False only '_path' itself is listed, otherwise the
        whole tree is walked.
    avoid_directories: when set (and subdirs is True), subdirectories
        whose name equals one of its elements are not entered.

    Returns the list of matching paths, each joined with its directory.
    """
    wanted = None
    if extensions is not None:
        wanted = [ext.lower() for ext in extensions]

    found = []
    for root, dirs, files in os.walk(_path, topdown=True):
        if not wanted:
            found.extend(os.path.join(root, name) for name in files)
        else:
            found.extend(os.path.join(root, name) for name in files
                         if os.path.splitext(name)[1].lower() in wanted)

        # Pruning 'dirs' in place is what stops os.walk from descending.
        if not subdirs:
            del dirs[:]
        elif avoid_directories is not None:
            for skipped in avoid_directories:
                if skipped in dirs:
                    dirs.remove(skipped)
    return found
此方法便于跳过 .svn 之类的路径,否则其中的文件必然会在 find_duplicates 中触发冲突。
欢迎提供反馈。
答案 5 :(得分:3)
import hashlib
import os
import sys
from sets import Set
def read_chunk(fobj, chunk_size = 2048):
    """ Yield the content of fobj piece by piece, so that a huge file
    never has to be held in memory at once. """
    chunk = fobj.read(chunk_size)
    while chunk:
        yield chunk
        chunk = fobj.read(chunk_size)
def remove_duplicates(dir, hashfun = hashlib.sha512):
    """Delete files in dir whose content repeats an earlier file's content.

    dir     -- directory to clean; only its direct entries are examined.
    hashfun -- hashlib-style constructor used for the content digest.

    The first file met with a given digest is kept; later files with the
    same digest are removed. Returns None.
    """
    unique = set()
    for filename in os.listdir(dir):
        filepath = os.path.join(dir, filename)
        if not os.path.isfile(filepath):
            continue
        hashobj = hashfun()
        # Close the file before it may be removed: deleting a file that is
        # still open fails on some platforms.
        with open(filepath, 'rb') as fobj:
            for chunk in read_chunk(fobj):
                hashobj.update(chunk)
        hashfile = hashobj.hexdigest()
        if hashfile in unique:
            os.remove(filepath)
        else:
            unique.add(hashfile)
# Script entry: the first command-line argument is the directory to clean.
hashfun = hashlib.sha256
try:
    # Only the argument lookup is guarded, so an IndexError raised inside
    # remove_duplicates() is not mistaken for a missing argument.
    target_dir = sys.argv[1]
except IndexError:
    print("""Please pass a path to a directory with
duplicate files as a parameter to the script.""")
else:
    remove_duplicates(target_dir, hashfun)
答案 6 :(得分:3)
@IanLee1521 在这里给出了一个很好的解决方案。它非常高效,因为它首先根据文件大小筛选可能的重复文件。
#! /usr/bin/env python
# Originally taken from:
# http://www.pythoncentral.io/finding-duplicate-files-with-python/
# Original Author: Andres Torres
# Adapted to only compute the md5sum of files with the same size
import argparse
import os
import sys
import hashlib
def find_duplicates(folders):
    """
    Look for duplicate files below each folder of the given iterable.

    Files are first grouped by size; only groups with more than one file
    are hashed. The result is printed and returned as {hash: [paths]}.
    An invalid folder is reported and an empty dict returned at once.
    """
    by_size = {}
    for folder in folders:
        if not os.path.exists(folder):
            print('%s is not a valid path, please verify' % folder)
            return {}
        # Merge this folder's size groups into the overall table
        join_dicts(by_size, find_duplicate_size(folder))

    print('Comparing files with the same size...')
    by_hash = {}
    for same_size in by_size.values():
        if len(same_size) > 1:
            join_dicts(by_hash, find_duplicate_hash(same_size))
    print_results(by_hash)
    return by_hash
def find_duplicate_size(parent_dir):
    """Group every file below parent_dir by its size in bytes.

    Prints a "Scanning" line per directory and returns {size: [paths]}.
    Files whose size cannot be read (for example broken symlinks or files
    removed during the scan) are skipped.
    """
    # Dups in format {size:[names]}
    dups = {}
    for dirName, subdirs, fileList in os.walk(parent_dir):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            # Get the path to the file
            path = os.path.join(dirName, filename)
            try:
                # Ask for the size directly rather than testing exists()
                # first: the file could vanish between the two calls.
                file_size = os.path.getsize(path)
            except OSError:
                continue
            # Add or append the file path
            dups.setdefault(file_size, []).append(path)
    return dups
def find_duplicate_hash(file_list):
    """List the given files, then group them by the value of hashfile().

    Returns a dict {hash: [paths]} with paths in file_list order.
    """
    print('Comparing: ')
    for candidate in file_list:
        print(' {}'.format(candidate))

    by_hash = {}
    for path in file_list:
        by_hash.setdefault(hashfile(path), []).append(path)
    return by_hash
# Merges dict2 into dict1 in place
def join_dicts(dict1, dict2):
    """Fold dict2's entries into dict1.

    Values under a key present in both are concatenated (dict1's first)
    into a new object; a key only in dict2 gets dict2's value itself.
    """
    for key, incoming in dict2.items():
        dict1[key] = dict1[key] + incoming if key in dict1 else incoming
def hashfile(path, blocksize=65536):
    """Return the MD5 hex digest of the file at path.

    path      -- file to read (opened in binary mode).
    blocksize -- bytes per read, so large files are never loaded whole.
    """
    hasher = hashlib.md5()
    # The with-block closes the file even when a read raises.
    with open(path, 'rb') as afile:
        buf = afile.read(blocksize)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(blocksize)
    return hasher.hexdigest()
def print_results(dict1):
    """Print every group in dict1's values that holds more than one file.

    Each group is followed by a separator line; when no group qualifies a
    single "no duplicates" line is printed instead.
    """
    groups = [names for names in dict1.values() if len(names) > 1]
    if not groups:
        print('No duplicate files found.')
        return

    print('Duplicates Found:')
    print(
        'The following files are identical. The name could differ, but the'
        ' content is identical'
    )
    print('___________________')
    for names in groups:
        for name in names:
            print('\t\t%s' % name)
        print('___________________')
def main():
    """Read one or more directories from the command line and report the
    duplicate files found in them."""
    cli = argparse.ArgumentParser(description='Find duplicate files')
    folder_option = dict(
        metavar='dir', type=str, nargs='+',
        help='A directory to parse for duplicates',
    )
    cli.add_argument('folders', **folder_option)
    find_duplicates(cli.parse_args().folders)
# Run main() only when executed as a script; its result is the exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)
答案 7 :(得分:0)
为了安全起见(万一出了问题,自动删除文件会很危险!),下面是我在@zalew的回答基础上改写并实际使用的版本:它只报告重复文件,不做删除。
请注意,这里的md5用法和代码与@zalew的略有不同,因为他的代码会误报太多重复文件(这也正是我说自动删除很危险的原因!)。
import hashlib, os
# Maps each MD5 hex digest to the first file seen with that content.
unique = dict()
for filename in os.listdir('.'):
    if os.path.isfile(filename):
        # Read inside a with-block so the handle is closed right away.
        with open(filename, 'rb') as fobj:
            filehash = hashlib.md5(fobj.read()).hexdigest()
        if filehash not in unique:
            unique[filehash] = filename
        else:
            # Report only; nothing is deleted.
            print(filename + ' is a duplicate of ' + unique[filehash])