
时间:2016-01-29 14:11:24

标签: python

我正在处理大量非常大的文件(例如 1000 个 512MB 的文件)。为了加快处理速度,我实现了一种方法:把某些信息写入数据库,这样重新运行软件时就可以直接读取这些数据。出于这个原因,我需要能够为这些文件的任意子集生成唯一的文件名。



import os
import glob
import datetime

file_paths = glob.glob("path/to/files/*.foo")


def modification_date(file_path):
    """Return the last-modification time of ``file_path`` as a ``datetime``.

    The timestamp reported by the filesystem is converted with
    ``datetime.fromtimestamp``, so the result is a naive local-time value.
    Raises ``OSError`` if the file does not exist or cannot be accessed.
    """
    # Pass the path positionally: the documentation names this parameter
    # ``path``, so relying on a ``filename=`` keyword is fragile.
    return datetime.datetime.fromtimestamp(os.path.getmtime(file_path))


# Collapse each file's modification time into one small integer by adding
# up its calendar and clock fields.
date_sums = [
    d.year + d.month + d.day + d.hour + d.minute + d.second
    for d in map(modification_date, file_paths)
]
total_size = sum(os.path.getsize(f) for f in file_paths)

# NOTE: this is a heuristic fingerprint (average of the date sums plus the
# total size in bytes); different subsets can collide.
# An empty glob result would otherwise divide by zero, so it maps to 0.
uid = (sum(date_sums) // len(date_sums) if date_sums else 0) + total_size

2 个答案:

答案 0 :(得分:1)


import hashlib, glob
# Derive a fixed-length name from the list of matched paths.
# glob returns matches in arbitrary order, so the list is sorted before
# hashing to give the same name for the same set of files on every run.
# hashlib only accepts bytes, hence the explicit encode.
file_paths = glob.glob("/home/rolf/StackOverflow/*.py")
hash_object = hashlib.sha256(str(sorted(file_paths)).encode("utf-8"))
file_name = hash_object.hexdigest()

# A different set of files yields a different name.
file_paths = glob.glob("/home/rolf/Research/*.py")
hash_object = hashlib.sha256(str(sorted(file_paths)).encode("utf-8"))
file_name = hash_object.hexdigest()

答案 1 :(得分:0)


import struct
import base64

def string_from_int(value):
    """Encode a non-negative integer of any size as a packed byte string.

    The integer is split into base-256 digits, least significant first, and
    each digit is stored as one native C ``int`` via ``struct.pack``.
    Zero is encoded as a single zero digit so the result is never empty.

    Raises ``ValueError`` if ``value`` is negative.
    """
    if value < 0:
        raise ValueError("value must be non-negative, got %r" % (value,))
    digits = []
    while value > 0:
        digits.append(value & 0xFF)
        value >>= 8
    if not digits:
        # Without this, 0 would pack to an empty string (and, downstream,
        # an empty filename).
        digits.append(0)
    return struct.pack("i" * len(digits), *digits)

def int_from_string(s):
    """Decode a packed byte string back into a non-negative integer.

    ``s`` is read as a sequence of native C ``int`` words, each holding one
    base-256 digit, least significant first (the layout written by
    ``string_from_int``). An empty input decodes to 0.

    Raises ``struct.error`` if the length of ``s`` is not a whole number of
    words.
    """
    word_size = struct.calcsize("i")
    # Floor division keeps the repeat count an int on every Python version.
    digits = struct.unpack("i" * (len(s) // word_size), s)
    return sum(digit << (8 * i) for i, digit in enumerate(digits))

def encode(master_file_list, subset):
    """Encode a subset of ``master_file_list`` as a filename-safe string.

    Bit ``i`` of an integer bitmask is set when ``master_file_list[i]`` is in
    ``subset``. The bitmask is packed with ``string_from_int`` and the result
    is URL-safe base64 encoded. Entries of ``subset`` that do not appear in
    ``master_file_list`` do not contribute to the result.
    """
    # Build the lookup set once so each membership test is O(1) rather than
    # a scan of the whole subset.
    members = frozenset(subset)
    int_value = sum(
        1 << i for i, name in enumerate(master_file_list) if name in members
    )
    string_value = string_from_int(int_value)
    # The packed string may contain slashes or other characters that are
    # illegal in filenames, so base64-encode it, at the expense of growing
    # the length by about a third.
    return base64.urlsafe_b64encode(string_value)

def decode(master_file_list, filename):
    """Recover the subset of ``master_file_list`` encoded in ``filename``.

    This is the inverse of ``encode``: the name is base64-decoded, unpacked
    into an integer bitmask, and every master entry whose bit is set is
    returned, in master-list order.
    """
    mask = int_from_string(base64.urlsafe_b64decode(filename))
    selected = []
    for position, name in enumerate(master_file_list):
        if mask & (1 << position):
            selected.append(name)
    return selected

# Round-trip demonstration: encode a subset into a filename, then decode it.
master_file_list = ['caddy.jpg', 'fjeld.dat', 'gyve.ini', 'karts.png', 'laves.txt', 'nimbi.jpg', 'ocas.ini', 'sipes.dat', 'wacky.png', 'waff.png']
subset = ["fjeld.dat", "laves.txt", "ocas.ini"]
filename = encode(master_file_list, subset)
# Single-argument print(...) with %-formatting behaves the same whether
# print is a statement or a function.
print("Subset is: %s" % (subset,))
print("Filename is: %s" % (filename,))
print("Filename decodes back to: %s" % (decode(master_file_list, filename),))


Subset is: ['fjeld.dat', 'laves.txt', 'ocas.ini']
Filename is: UgAAAA==
Filename decodes back to: ['fjeld.dat', 'laves.txt', 'ocas.ini']