是否有简单、快速的 Python 代码,可以根据文件大小和上次写入时间来识别目录树中的重复文件?(出现一些误报也没关系。先不考虑哈希:它太慢,而且在初步找出潜在的真实重复文件时并不需要。)
Stack Overflow 上有很多类似的问题,但那些答案大多使用 MD5 或逐字节比较。有什么建议吗?还是说,我需要运行下面的代码,然后在前两列中查找重复的行?(也许可以只对上次写入时间(LWT)和大小都匹配的文件再运行哈希?)
def get_size(filename):
    """Return the size of *filename* in bytes, as a decimal string.

    The value comes from ``os.stat(filename).st_size``.  Any exception
    raised by ``os.stat`` (for example when the path does not exist)
    propagates to the caller.
    """
    return str(os.stat(filename).st_size)
def get_last_write_time(filename):
    """Return the last-modification time of *filename* as a string.

    The timestamp is ``os.stat(filename).st_mtime`` converted to local
    time and formatted as ``YYYY-MM-DD HH:MM:SS``.  Any exception raised
    by ``os.stat`` propagates to the caller.
    """
    mtime = os.stat(filename).st_mtime
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(mtime))
答案 0 :(得分:0)
LOL!那是我的代码! :)))))))
试一试(最后更新):
import os, hashlib, time
your_target_folder = "."  # root of the directory tree passed to os.walk; change to your target folder.
def size_check(get_path):
    """Return the size of *get_path* in bytes as a string, or ``"Error"``.

    The sentinel string ``"Error"`` is returned when ``os.stat`` fails
    with ``OSError`` (missing file, permission denied, ...) or with
    ``ValueError`` (e.g. a path containing an embedded NUL character).
    Other exceptions, such as ``KeyboardInterrupt``, are deliberately not
    swallowed so the scan can still be interrupted.
    """
    try:
        return str(os.stat(get_path).st_size)
    except (OSError, ValueError):
        return "Error"
def md5_check(get_path):
    """Return the hex MD5 digest of the file at *get_path*, or ``"Error"``.

    The file is read in binary mode in 1 MiB chunks so that large files
    are never loaded into memory at once.  The sentinel string
    ``"Error"`` is returned when hashing or reading fails with
    ``OSError`` or ``ValueError``; other exceptions (notably
    ``KeyboardInterrupt``) propagate so the scan can be interrupted.
    """
    try:
        hash_md5 = hashlib.md5()
        with open(get_path, "rb") as f:
            # iter(callable, sentinel) keeps calling read() until it
            # returns b"", i.e. end of file.
            for chunk in iter(lambda: f.read(2 ** 20), b""):
                hash_md5.update(chunk)
    except (OSError, ValueError):
        return "Error"
    return hash_md5.hexdigest()
def save_data(get_output):
    """Append the string *get_output* to ``./data.txt``.

    The file is created if it does not exist.  It is written as UTF-8 so
    that non-ASCII file paths can be recorded regardless of the
    platform's default locale encoding.
    """
    with open("./data.txt", "a", encoding="utf-8") as output_data:
        output_data.write(get_output)
# Duplicate finder, in three passes:
#   1. walk the tree and bucket every file by its size (size_check),
#   2. MD5-hash only the files whose size is shared with another file,
#   3. print and record (save_data) every file whose hash occurs twice or more.
# Files that could not be stat'ed or read are collected in error_box.
print("Waking On All Files In Your Target Directory and Grabbing Their Personal Hashes, Plz Wait ... \n")

# Pass 1: size (as returned by size_check) -> list of absolute paths.
files_and_sizes = {}
for dirpath, _, filenames in os.walk(your_target_folder):
    for file_name in filenames:
        file_full_path = os.path.abspath(os.path.join(dirpath, file_name))
        files_and_sizes.setdefault(size_check(file_full_path), []).append(file_full_path)

# Pass 2: MD5 digest -> list of paths, restricted to sizes seen more than once.
new_dict = {}
error_box = []
for key, box_name in files_and_sizes.items():
    if key == "Error":
        # size_check failed for every path in this bucket.
        error_box.extend(box_name)
        continue
    if len(box_name) < 2:
        # A size that occurs once cannot belong to a duplicate; skip hashing.
        continue
    for candidate in box_name:
        file_hash = md5_check(candidate)
        if file_hash == "Error":
            error_box.append(candidate)
        else:
            new_dict.setdefault(file_hash, []).append(candidate)

# Pass 3: report every path whose digest is shared with at least one other file.
for names in new_dict.values():
    if len(names) > 1:
        for each_file in names:
            result = each_file + "\n"
            print(result)
            save_data(result)

if error_box:
    print("Something Went Wrong On These Files ( I could not access them ): " + str(error_box) + "\n")
print("Good By.")
祝你好运......