What I want to do is extract the metadata of the files under a given directory and store it in a .db file using Python's sqlite3. I use Python's multiprocessing so that the hashes of each file can be computed in parallel (one core working on one file at a time), yet the serial code consistently takes less computation time than the parallel code. I have tested this on several machines with 2-8 GB of RAM and 2-4 cores, gathering data from many files (some of them larger than 1 GB), and the result is always the same. Below are the parallel and serial Python scripts; any ideas would be very helpful.
Multiprocessing script:
import itertools
import multiprocessing
from multiprocessing import Pool
import os, sys
import stat
import sqlite3
import time
import hashlib

def worker(filename):
    conn = sqlite3.connect('metadata.db', timeout=30.0)
    c = conn.cursor()  # database cursor
    result = os.stat(filename)  # stat instance to get info about the current file
    print("Gathering metadata for file: " + filename)
    split_filename = filename.split('/')
    path_to_file = '/'.join(split_filename[:-1])
    file_name = split_filename[len(split_filename) - 1]
    # split the file name into base name and extension
    if '.' in file_name:
        file_type = file_name.split('.', 1)
        name = file_type[0]
        file_type = file_type[1]
    else:
        file_type = 'null'
        name = file_name
    hash_md5 = hashlib.md5()
    with open(path_to_file + '/' + file_name, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    md5 = hash_md5.hexdigest()
    hash_sha256 = hashlib.sha256()
    with open(path_to_file + '/' + file_name, 'rb') as f:
        for chunk in iter(lambda: f.read(4096), b''):
            hash_sha256.update(chunk)
    sha256 = hash_sha256.hexdigest()
    print('Current process: ' + str(multiprocessing.current_process()))
    # insert into database
    try:
        c.execute("INSERT INTO metadata VALUES (null,?,?,?,?,?,?,?,?,?,?,?)",
                  (name, file_name, result.st_mode, result.st_size,
                   result.st_atime, result.st_mtime, result.st_ctime,
                   path_to_file, file_type, md5, sha256))
    except sqlite3.Error as e:
        print('!!Error sqlite3: ' + str(e))
    conn.commit()

def main():
    conn = sqlite3.connect('metadata.db', timeout=30.0)
    c = conn.cursor()
    # check if the table exists, otherwise create it
    tb_exists = "SELECT name FROM sqlite_master WHERE type='table' AND name='metadata'"
    if not conn.execute(tb_exists).fetchone():
        c.execute('''CREATE TABLE metadata
                     (unique_id INTEGER PRIMARY KEY AUTOINCREMENT, name text, full_name text, mode text, size real,
                      atime real, mtime real, ctime real, location text, type text, md5 text, sha256 text)''')
    conn.close()
    print('Number of CPUs: ' + str(multiprocessing.cpu_count()))
    pool = Pool(multiprocessing.cpu_count())  # pool of cpu_count processes
    walk = os.walk("/directory/you/want/to/make/extraction")
    fn_gen = itertools.chain.from_iterable((os.path.join(root, file)
                                            for file in files)
                                           for root, dirs, files in walk)
    t1 = time.time()
    results_of_work = pool.map(worker, fn_gen)  # this does the parallel processing
    pool.close()
    pool.join()
    print('Entire Computation took: ' + str(time.time() - t1) + ' seconds')

if __name__ == '__main__':
    main()
Serial script:
import itertools
import multiprocessing
from multiprocessing import Pool
import os, sys
import stat
import sqlite3
import time
import hashlib

def worker(file_list, conn):
    c = conn.cursor()  # database cursor
    for file_name in file_list:
        result = os.stat(file_name)  # stat instance to get info about the current file
        print("Gathering metadata for file: " + file_name)
        split_filename = file_name.split('/')
        path_to_file = '/'.join(split_filename[:-1])
        file_name = split_filename[len(split_filename) - 1]
        # split the file name into base name and extension
        if '.' in file_name:
            file_type = file_name.split('.', 1)
            name = file_type[0]
            file_type = file_type[1]
        else:
            file_type = 'null'
            name = file_name
        hash_md5 = hashlib.md5()
        with open(path_to_file + '/' + file_name, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        md5 = hash_md5.hexdigest()
        hash_sha256 = hashlib.sha256()
        with open(path_to_file + '/' + file_name, 'rb') as f:
            for chunk in iter(lambda: f.read(4096), b''):
                hash_sha256.update(chunk)
        sha256 = hash_sha256.hexdigest()
        # insert into database
        try:
            c.execute("INSERT INTO metadata VALUES (null,?,?,?,?,?,?,?,?,?,?,?)",
                      (name, file_name, result.st_mode, result.st_size,
                       result.st_atime, result.st_mtime, result.st_ctime,
                       path_to_file, file_type, md5, sha256))
        except sqlite3.Error as e:
            print('!!Error sqlite3: ' + str(e))
        conn.commit()

def main():
    conn = sqlite3.connect('metadata_serial.db', timeout=30.0)
    c = conn.cursor()
    # check if the table exists, otherwise create it
    tb_exists = "SELECT name FROM sqlite_master WHERE type='table' AND name='metadata'"
    if not conn.execute(tb_exists).fetchone():
        c.execute('''CREATE TABLE metadata
                     (unique_id INTEGER PRIMARY KEY AUTOINCREMENT, name text, full_name text, mode text, size real,
                      atime real, mtime real, ctime real, location text, type text, md5 text, sha256 text)''')
    walk = os.walk("/directory/you/want/to/make/extraction")
    fn_gen = itertools.chain.from_iterable((os.path.join(root, file)
                                            for file in files)
                                           for root, dirs, files in walk)
    file_list = list(fn_gen)
    t1 = time.time()
    worker(file_list, conn)
    print('Entire Computation took: ' + str(time.time() - t1) + ' seconds')
    conn.close()

if __name__ == '__main__':
    main()
Answer 0 (score: 2)
It looks like the root cause is that sqlite3 does not work well with multiprocessing:

SQLite itself does not welcome highly concurrent transactions, because it locks the file while writing. To work around this limitation, a queuing system should be set up. ...

Source: How to do multiprocessing/multi-threading in Python with SQLite
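A minimal sketch of what such a queuing / single-writer setup could look like, reusing the table layout and the placeholder path from the question. It assumes the metadata table has already been created as in main() above; the function name hash_worker, the chunksize value and the 1 MB read size are illustrative choices, not part of the original code:

import hashlib
import itertools
import os
import sqlite3
from multiprocessing import Pool

def hash_worker(filename):
    # Heavy part only: stat the file and compute both digests in one pass.
    st = os.stat(filename)
    md5, sha256 = hashlib.md5(), hashlib.sha256()
    with open(filename, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            md5.update(chunk)
            sha256.update(chunk)
    path_to_file, file_name = os.path.split(filename)
    name, _, ext = file_name.partition('.')
    # Return a plain tuple matching the columns of the metadata table.
    return (name, file_name, st.st_mode, st.st_size, st.st_atime,
            st.st_mtime, st.st_ctime, path_to_file, ext or 'null',
            md5.hexdigest(), sha256.hexdigest())

def main():
    conn = sqlite3.connect('metadata.db', timeout=30.0)
    walk = os.walk("/directory/you/want/to/make/extraction")
    files = itertools.chain.from_iterable(
        (os.path.join(root, f) for f in fs) for root, _, fs in walk)
    with Pool() as pool:
        # imap_unordered plays the role of the queue: finished rows stream
        # back to the parent, which is the only process touching the database.
        for row in pool.imap_unordered(hash_worker, files, chunksize=16):
            conn.execute(
                "INSERT INTO metadata VALUES (null,?,?,?,?,?,?,?,?,?,?,?)", row)
        conn.commit()
    conn.close()

if __name__ == '__main__':
    main()

With this split the children only compute, while every INSERT and the commit happen in a single process, so SQLite's file lock is never contested; as a side effect each file is also read only once for both digests instead of twice.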