Quick-and-dirty duplicate finder based on size and last write time

Time: 2017-11-09 05:32:31

Tags: python python-3.x list hash duplicates

Is there a simple, fast Python approach to identifying duplicate files in a directory tree based on file size and last write time? (Some false positives are fine. Forget hashing; it is too slow, and an initial pass at identifying potential real duplicates doesn't need it.)

S/O has plenty of similar questions, but they tend to use md5 or byte-by-byte comparison.

Any suggestions? Or do I need to run the code below and look for rows that repeat in the first two columns? (And maybe run a hash only on the files that match on both LWT and size?)

import os, time

def get_size(filename):
    # File size in bytes, returned as a string for easy column output.
    st = os.stat(filename)
    return str(st.st_size)

def get_last_write_time(filename):
    # Last modification time, formatted as a human-readable timestamp.
    st = os.stat(filename)
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
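
For concreteness, the grouping pass I have in mind would look roughly like this (just a sketch reusing the two helpers above; `root_dir` is a placeholder for the tree to scan):

from collections import defaultdict

def candidate_duplicates(root_dir):
    # Key every file on its (size, last-write-time) column pair.
    groups = defaultdict(list)
    for dirpath, _, filenames in os.walk(root_dir):
        for name in filenames:
            path = os.path.join(dirpath, name)
            try:
                key = (get_size(path), get_last_write_time(path))
            except OSError:
                continue  # unreadable file; skip it
            groups[key].append(path)
    # Any key shared by two or more files is a potential duplicate set.
    return {key: paths for key, paths in groups.items() if len(paths) > 1}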

1 Answer:

Answer 0 (score: 0)

LOL! That's my code! :)))))))

Give it a try (latest update):

import os, hashlib

your_target_folder = "."  # change to your target folder


def size_check(get_path):
    # Return the file size as a string, or "Error" if the file is unreadable.
    try:
        st = os.stat(get_path)
    except OSError:
        return "Error"
    else:
        return str(st.st_size)


def md5_check(get_path):
    # Hash the file in 1 MiB chunks so large files never load fully into memory.
    try:
        hash_md5 = hashlib.md5()
        with open(get_path, "rb") as f:
            for chunk in iter(lambda: f.read(2 ** 20), b""):
                hash_md5.update(chunk)
    except OSError:
        return "Error"
    else:
        return hash_md5.hexdigest()


def save_data(get_output):
    # Append each reported path to a results file next to the script.
    with open("./data.txt", 'a') as output_data:
        output_data.write(get_output)



print("Waking On All Files In Your Target Directory and Grabbing Their Personal Hashes, Plz Wait ... \n")

# First pass: bucket every file by its size (cheap); only files with
# matching sizes can possibly be duplicates.
files_and_sizes = {}
for dirpath, _, filenames in os.walk(your_target_folder):

    for items in filenames:

        file_full_path = os.path.abspath(os.path.join(dirpath, items))
        get_size = size_check(file_full_path)

        if get_size in files_and_sizes:
            files_and_sizes[get_size].append(file_full_path)

        else:
            files_and_sizes[get_size] = [file_full_path]


# Second pass: hash only the files that share a size with at least one
# other file, and bucket them by digest.
new_dict = {}
error_box = []

for key, box_name in files_and_sizes.items():

    if key != "Error" and len(box_name) > 1:

        for files in box_name:

            get_file_hash = md5_check(files)

            if get_file_hash != "Error":

                if get_file_hash in new_dict:
                    new_dict[get_file_hash].append(files)

                else:
                    new_dict[get_file_hash] = [files]

            else:
                error_box.append(files)

    elif key == "Error" and len(box_name) > 0:
        # Files whose size could not be read go straight to the error list.
        error_box.extend(box_name)


# Report: any digest shared by two or more files is a confirmed duplicate set.
for hashes, names in new_dict.items():

    if len(names) > 1:

        for each_files in names:

            result = each_files + "\n"
            print(result)
            save_data(result)

if len(error_box) > 0:
    print("Something Went Wrong On These Files (I could not access them): " + str(error_box) + "\n")


print("Good By.")

Good luck ...