I have 3 folders - 1 master and 2 supplementary. I am writing a script that identifies duplicate files across all three by SHA-1 hash. For any duplicate found in both the master and a supplement (or any of their subdirectories), I want to delete the file from the supplementary folder and keep the copy in the master folder. If a duplicate exists only among the supplementary folders and not in the master, I want to keep it and eventually merge it into the master.
I wrote a script (below) that successfully deletes duplicate files from the supplementary folders. However, it deletes every duplicate, even when the file is not found anywhere in the master folder tree. Logically, I only want to delete duplicates from the supplementary folders if they already exist in the master folder. Any suggestions, advice, or tips would be greatly appreciated!
import os
import hashlib

def deleteDups(maindirectory, pnhpdirectory, dupdirectories):
    # (pnhpdirectory is currently unused)
    # Hash every file under the master tree, grouping paths by SHA-1 hash.
    hashmap = {}
    for path, dirs, files in os.walk(maindirectory):
        for name in files:
            fullname = os.path.join(path, name)
            with open(fullname, 'rb') as f:
                d = f.read()
            h = hashlib.sha1(d).hexdigest()
            filelist = hashmap.setdefault(h, [])
            filelist.append(fullname)
    # Delete records in the dictionary that have only 1 item (meaning no duplicate).
    # Iterate over a copy so entries can be removed while looping.
    for k, v in list(hashmap.items()):
        if len(v) == 1:
            del hashmap[k]
    # Flatten the dictionary values into one list of duplicate paths.
    try:
        dups = reduce(lambda x, y: x + y, hashmap.values())
        paths = []  # list of all files in the duplicate directories
        for directory in dupdirectories:
            for root, dirs, files in os.walk(directory):
                for name in files:
                    paths.append(os.path.join(root, name))
        # If a file in a duplicate directory is also in the duplicates list, delete it.
        DeletedFileSize = 0.00
        for file in paths:
            if file in dups:
                FileSize = os.path.getsize(file)
                DeletedFileSize = DeletedFileSize + FileSize
                print "Deleting file: " + file
                os.remove(file)
        if DeletedFileSize == 0:
            print "No duplicate files found"
            print "Space saved: " + str(DeletedFileSize) + " gigabytes"
        else:
            DeletedFileSize = DeletedFileSize / 1073741824
            print "Space saved: " + str(DeletedFileSize) + " gigabytes"
    except TypeError:
        # reduce() raises TypeError on an empty sequence, i.e. no duplicates at all.
        print "No duplicate files found."