I have a program in Python 2.7 that needs to read all the files in a directory and aggregate their contents. Right now I do it one file at a time, in a single thread:
def read_file(path):
    with open(path, 'r') as f:
        return f.read()

files = map(read_file, paths)
But I want to optimize this so that I don't have to wait for each file to finish reading before moving on to the next one. I looked into how to do this in parallel and came up with several solutions, including ones using multiprocessing, threading, and queues, of which the fastest was the following:
from threading import Thread
import Queue

def add_to_queue(q, f):
    q.put(read_file(f))

q = Queue.Queue()
files = []
for f in paths:
    t = Thread(target=add_to_queue, args=(q, f))
    t.daemon = True
    t.start()
for f in paths:
    files.append(q.get())
However, after trying many options, reading the files one after another in a single thread still seems to be the fastest way. Am I missing something here? What is the most efficient way to do this?
Answer 0 (score: 1)
Assuming you really are reading multiple files from a single disk, your operation is I/O-bound, not CPU-bound.

No amount of multiprocessing, queueing, multithreading, bending, folding, twisting, hoop-jumping, or other gimcrackery will make the disk spindle spin faster or the heads move across the cylinders faster.

To get more performance out of this, either look at improving your I/O performance or consider a different approach to the solution. (Could you redesign the architecture so that different servers serve different parts of the data, or fewer parts, or ...?)

Depending on the size and number of the files, you might consider: using SSD drives, using multiple drives, using a RAID controller, using a SAN, or using a cluster of servers.
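A quick way to check that the workload really is I/O-bound is to time the sequential version against a threaded one (a sketch; `paths` is the list from the question, and the pool size of 4 is an arbitrary choice):

import time
from multiprocessing.dummy import Pool  # thread-based Pool from the stdlib

def read_file(path):
    with open(path, 'r') as f:
        return f.read()

start = time.time()
files = map(read_file, paths)       # sequential (Python 2.7 map is eager)
print ("sequential: %.2fs" % (time.time() - start))

pool = Pool(4)
start = time.time()
files = pool.map(read_file, paths)  # 4 reader threads
print ("threaded:   %.2fs" % (time.time() - start))
pool.close()
pool.join()

If both timings come out about the same, the disk is the bottleneck and adding threads won't help. Note that the OS page cache will skew whichever run goes second, so alternate the order (or drop caches) between measurements.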
Answer 1 (score: 0)
Reading from disk is a sequential operation; no matter how many processes you have, essentially only one of them is reading at any given time. Concurrency becomes useful when you need to do additional work while the files are being read, for example searching the loaded file contents for a particular string or matching a regular expression against them.
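As a minimal sketch of that idea (the pattern, the pool size, and the `paths` list from the question are all assumptions), a thread pool lets the per-file regex work of one file overlap with the disk reads of the others:

from multiprocessing.dummy import Pool  # thread-based Pool from the stdlib
import re

PATTERN = re.compile(r'ERROR')  # hypothetical pattern to look for

def read_and_search(path):
    # Read one file and count occurrences of the pattern in it
    with open(path, 'r') as f:
        text = f.read()
    return path, len(PATTERN.findall(text))

pool = Pool(4)  # a handful of worker threads is usually enough
results = pool.map(read_and_search, paths)
pool.close()
pool.join()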
Answer 2 (score: 0)
Use threads to fill a data lake in a database, with checkpointed refreshes and queries keyed on the records for the EDA. Speeding up the overall processing and aggregation of the files can make the cleanup step more time-consuming. I just did this with BSON files and an FP-growth algorithm:
class FPTreeNode():
    def __init__(self, item=None, support=1):
        # 'Value' of the item
        self.item = item
        # Number of times the item occurs in a
        # transaction
        self.support = support
        # Child nodes in the FP Growth Tree
        self.children = {}


class FPGrowth():
    def __init__(self, min_sup=0.3):
        self.min_sup = min_sup
        # The root of the initial FP Growth Tree
        self.tree_root = None
        # Prefixes of itemsets in the FP Growth Tree
        self.prefixes = {}
        self.frequent_itemsets = []

    # Count the number of transactions that contain item.
    def _calculate_support(self, item, transactions):
        count = 0
        for transaction in transactions:
            if item in transaction:
                count += 1
        support = count
        return support

    # Returns a set of frequent items. An item is determined to
    # be frequent if there are at least min_sup transactions
    # that contain it.
    def _get_frequent_items(self, transactions):
        # Get all unique items in the transactions
        unique_items = set(
            item for transaction in transactions for item in transaction)
        items = []
        for item in unique_items:
            sup = self._calculate_support(item, transactions)
            if sup >= self.min_sup:
                items.append([item, sup])
        # Sort by support - highest to lowest
        items.sort(key=lambda item: item[1], reverse=True)
        frequent_items = [[el[0]] for el in items]
        # Only return the items
        return frequent_items

    # Recursive method which adds nodes to the tree.
    def _insert_tree(self, node, children):
        if not children:
            return
        # Create new node as the first item in children list
        child_item = children[0]
        child = FPTreeNode(item=child_item)
        # If parent already contains item => increase the support
        if child_item in node.children:
            node.children[child.item].support += 1
        else:
            node.children[child.item] = child
        # Execute _insert_tree on the rest of the children list
        # from the new node
        self._insert_tree(node.children[child.item], children[1:])

    def _construct_tree(self, transactions, frequent_items=None):
        if not frequent_items:
            # Get frequent items sorted by support
            frequent_items = self._get_frequent_items(transactions)
        unique_frequent_items = list(
            set(item for itemset in frequent_items for item in itemset))
        # Construct the root of the FP Growth tree
        root = FPTreeNode()
        for transaction in transactions:
            # Remove items that are not frequent according to
            # unique_frequent_items
            transaction = [item for item in transaction
                           if item in unique_frequent_items]
            transaction.sort(key=lambda item: frequent_items.index([item]))
            self._insert_tree(root, transaction)
        return root

    # Recursive method which prints the FP Growth Tree
    def print_tree(self, node=None, indent_times=0):
        if not node:
            node = self.tree_root
        indent = "    " * indent_times
        print ("%s%s:%s" % (indent, node.item, node.support))
        for child_key in node.children:
            child = node.children[child_key]
            self.print_tree(child, indent_times + 1)

    # Makes sure that the first item in itemset
    # is a child of node and that every following item
    # in itemset is reachable via that path
    def _is_prefix(self, itemset, node):
        for item in itemset:
            if not item in node.children:
                return False
            node = node.children[item]
        return True

    # Recursive method that adds prefixes to the itemset by
    # traversing the FP Growth Tree
    def _determine_prefixes(self, itemset, node, prefixes=None):
        if not prefixes:
            prefixes = []
        # If the current node is a prefix to the itemset,
        # add the current prefixes value as prefix to the itemset
        if self._is_prefix(itemset, node):
            itemset_key = self._get_itemset_key(itemset)
            if not itemset_key in self.prefixes:
                self.prefixes[itemset_key] = []
            self.prefixes[itemset_key] += [{"prefix": prefixes,
                                            "support": node.children[itemset[0]].support}]
        for child_key in node.children:
            child = node.children[child_key]
            # Recursive call with child as new node. Add the child
            # item as a potential prefix.
            self._determine_prefixes(itemset, child, prefixes + [child.item])

    # Determines the look of the hashmap key for self.prefixes.
    # A list of more than one string gets joined by '-'.
    def _get_itemset_key(self, itemset):
        if len(itemset) > 1:
            itemset_key = "-".join(itemset)
        else:
            itemset_key = str(itemset[0])
        return itemset_key

    def _determine_frequent_itemsets(self, conditional_database, suffix):
        # Calculate new frequent items from the conditional
        # database of suffix
        frequent_items = self._get_frequent_items(conditional_database)
        cond_tree = None
        if suffix:
            cond_tree = self._construct_tree(conditional_database,
                                             frequent_items)
            # Output new frequent itemset as the suffix added to the
            # frequent items
            self.frequent_itemsets += [el + suffix for el in frequent_items]
        # Find larger frequent itemsets by finding prefixes
        # of the frequent items in the FP Growth Tree for the
        # conditional database.
        self.prefixes = {}
        for itemset in frequent_items:
            # If no suffix (first run)
            if not cond_tree:
                cond_tree = self.tree_root
            # Determine prefixes to itemset
            self._determine_prefixes(itemset, cond_tree)
            conditional_database = []
            itemset_key = self._get_itemset_key(itemset)
            # Build new conditional database
            if itemset_key in self.prefixes:
                for el in self.prefixes[itemset_key]:
                    # If support = 4 => add 4 of the corresponding
                    # prefix set
                    for _ in range(el["support"]):
                        conditional_database.append(el["prefix"])
                # Create new suffix
                new_suffix = itemset + suffix if suffix else itemset
                self._determine_frequent_itemsets(conditional_database,
                                                  suffix=new_suffix)

    def find_frequent_itemsets(self, transactions, suffix=None,
                               show_tree=False):
        self.transactions = transactions
        # Build the FP Growth Tree
        self.tree_root = self._construct_tree(transactions)
        if show_tree:
            print ("FP-Growth Tree:")
            self.print_tree(self.tree_root)
        self._determine_frequent_itemsets(transactions, suffix=None)
        return self.frequent_itemsets
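# For reference, a minimal sketch of how the class above might be
# driven (hypothetical values: the transactions and the min_sup
# count are made up; note that _calculate_support returns a raw
# count, so min_sup is compared against counts):
#
#   transactions = [["milk", "bread"],
#                   ["milk", "bread", "butter"],
#                   ["bread"]]
#   fp = FPGrowth(min_sup=2)
#   print (fp.find_frequent_itemsets(transactions, show_tree=True))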
def main():