I have a program in Python 2.7 that needs to read all the files in a directory and aggregate their contents. Right now I do it one file at a time, in a single thread:
def read_file(path):
    with open(path, 'r') as f:
        return f.read()

files = map(read_file, paths)
But I want to optimize this so that I don't have to wait for each file to finish reading before moving on to the next one. I looked into how to do this in parallel and came up with several solutions, including ones using multiprocessing, threading, and queues, of which the fastest was the following:
from threading import Thread
import Queue

def add_to_queue(q, f):
    q.put(read_file(f))

q = Queue.Queue()
files = []
for f in paths:
    t = Thread(target=add_to_queue, args=(q, f))
    t.daemon = True
    t.start()
for f in paths:
    files.append(q.get())
However, after trying many options, reading the files one after another in a single thread still seems to be the fastest way. Am I missing something here? What is the most efficient way to do this?
Answer 0 (score: 1)
Assuming you really are reading multiple files from a single disk, your operation is I/O-bound, not CPU-bound.

No amount of multiprocessing, queueing, multithreading, bending, folding, twisting, hoop-jumping, or other gimcrackery will make the disk spindle spin faster or the heads move across the cylinders faster.

To get more performance out of this, either look at improving your I/O performance or consider a different approach to the solution. (Could you redesign the architecture so that different servers serve different parts of the data, or fewer parts, or ...?)

Depending on the size and number of the files, you might consider: using SSD drives, using multiple drives, using a RAID controller, using a SAN, or using a cluster of servers.
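A quick way to check that the workload really is I/O-bound is to time the sequential version against a threaded one (a sketch; `paths` is the list from the question, and the pool size of 4 is an arbitrary choice):

import time
from multiprocessing.dummy import Pool  # thread-based Pool from the stdlib

def read_file(path):
    with open(path, 'r') as f:
        return f.read()

start = time.time()
files = map(read_file, paths)       # sequential (Python 2.7 map is eager)
print ("sequential: %.2fs" % (time.time() - start))

pool = Pool(4)
start = time.time()
files = pool.map(read_file, paths)  # 4 reader threads
print ("threaded:   %.2fs" % (time.time() - start))
pool.close()
pool.join()

If both timings come out about the same, the disk is the bottleneck and adding threads won't help. Note that the OS page cache will skew whichever run goes second, so alternate the order (or drop caches) between measurements.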
Answer 1 (score: 0)
Reading from disk is a sequential operation; no matter how many processes you have, essentially only one of them is reading at any given time. Concurrency becomes useful when you need to do additional work while the files are being read, for example searching the loaded file contents for a particular string or matching a regular expression against them.
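As a minimal sketch of that idea (the pattern, the pool size, and the `paths` list from the question are all assumptions), a thread pool lets the per-file regex work of one file overlap with the disk reads of the others:

from multiprocessing.dummy import Pool  # thread-based Pool from the stdlib
import re

PATTERN = re.compile(r'ERROR')  # hypothetical pattern to look for

def read_and_search(path):
    # Read one file and count occurrences of the pattern in it
    with open(path, 'r') as f:
        text = f.read()
    return path, len(PATTERN.findall(text))

pool = Pool(4)  # a handful of worker threads is usually enough
results = pool.map(read_and_search, paths)
pool.close()
pool.join()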
Answer 2 (score: 0)
Use threads to fill a data lake in a database, with checkpointed refreshes and queries keyed on the records for the EDA. Speeding up the overall processing and aggregation of the files can make the cleanup step more time-consuming. I just did this with BSON files and an FP-growth algorithm:
class FPTreeNode():
    def __init__(self, item=None, support=1):
        # 'Value' of the item
        self.item = item
        # Number of times the item occurs in a
        # transaction
        self.support = support
        # Child nodes in the FP Growth Tree
        self.children = {}


class FPGrowth():
    def __init__(self, min_sup=0.3):
        self.min_sup = min_sup
        # The root of the initial FP Growth Tree
        self.tree_root = None
        # Prefixes of itemsets in the FP Growth Tree
        self.prefixes = {}
        self.frequent_itemsets = []

    # Count the number of transactions that contain item.
    def _calculate_support(self, item, transactions):
        count = 0
        for transaction in transactions:
            if item in transaction:
                count += 1
        support = count
        return support

    # Returns a set of frequent items. An item is determined to
    # be frequent if there are at least min_sup transactions
    # that contain it.
    def _get_frequent_items(self, transactions):
        # Get all unique items in the transactions
        unique_items = set(
            item for transaction in transactions for item in transaction)
        items = []
        for item in unique_items:
            sup = self._calculate_support(item, transactions)
            if sup >= self.min_sup:
                items.append([item, sup])
        # Sort by support - highest to lowest
        items.sort(key=lambda item: item[1], reverse=True)
        frequent_items = [[el[0]] for el in items]
        # Only return the items
        return frequent_items

    # Recursive method which adds nodes to the tree.
    def _insert_tree(self, node, children):
        if not children:
            return
        # Create new node as the first item in children list
        child_item = children[0]
        child = FPTreeNode(item=child_item)
        # If parent already contains item => increase the support
        if child_item in node.children:
            node.children[child.item].support += 1
        else:
            node.children[child.item] = child
        # Execute _insert_tree on the rest of the children list
        # from the new node
        self._insert_tree(node.children[child.item], children[1:])

    def _construct_tree(self, transactions, frequent_items=None):
        if not frequent_items:
            # Get frequent items sorted by support
            frequent_items = self._get_frequent_items(transactions)
        unique_frequent_items = list(
            set(item for itemset in frequent_items for item in itemset))
        # Construct the root of the FP Growth tree
        root = FPTreeNode()
        for transaction in transactions:
            # Remove items that are not frequent according to
            # unique_frequent_items
            transaction = [item for item in transaction
                           if item in unique_frequent_items]
            transaction.sort(key=lambda item: frequent_items.index([item]))
            self._insert_tree(root, transaction)
        return root

    # Recursive method which prints the FP Growth Tree
    def print_tree(self, node=None, indent_times=0):
        if not node:
            node = self.tree_root
        indent = "    " * indent_times
        print ("%s%s:%s" % (indent, node.item, node.support))
        for child_key in node.children:
            child = node.children[child_key]
            self.print_tree(child, indent_times + 1)

    # Makes sure that the first item in itemset
    # is a child of node and that every following item
    # in itemset is reachable via that path
    def _is_prefix(self, itemset, node):
        for item in itemset:
            if not item in node.children:
                return False
            node = node.children[item]
        return True

    # Recursive method that adds prefixes to the itemset by
    # traversing the FP Growth Tree
    def _determine_prefixes(self, itemset, node, prefixes=None):
        if not prefixes:
            prefixes = []
        # If the current node is a prefix to the itemset,
        # add the current prefixes value as prefix to the itemset
        if self._is_prefix(itemset, node):
            itemset_key = self._get_itemset_key(itemset)
            if not itemset_key in self.prefixes:
                self.prefixes[itemset_key] = []
            self.prefixes[itemset_key] += [{"prefix": prefixes,
                                            "support": node.children[itemset[0]].support}]
        for child_key in node.children:
            child = node.children[child_key]
            # Recursive call with child as new node. Add the child
            # item as a potential prefix.
            self._determine_prefixes(itemset, child, prefixes + [child.item])

    # Determines the look of the hashmap key for self.prefixes.
    # A list of more than one string gets joined by '-'.
    def _get_itemset_key(self, itemset):
        if len(itemset) > 1:
            itemset_key = "-".join(itemset)
        else:
            itemset_key = str(itemset[0])
        return itemset_key

    def _determine_frequent_itemsets(self, conditional_database, suffix):
        # Calculate new frequent items from the conditional
        # database of suffix
        frequent_items = self._get_frequent_items(conditional_database)
        cond_tree = None
        if suffix:
            cond_tree = self._construct_tree(conditional_database,
                                             frequent_items)
            # Output new frequent itemset as the suffix added to the
            # frequent items
            self.frequent_itemsets += [el + suffix for el in frequent_items]
        # Find larger frequent itemsets by finding prefixes
        # of the frequent items in the FP Growth Tree for the
        # conditional database.
        self.prefixes = {}
        for itemset in frequent_items:
            # If no suffix (first run)
            if not cond_tree:
                cond_tree = self.tree_root
            # Determine prefixes to itemset
            self._determine_prefixes(itemset, cond_tree)
            conditional_database = []
            itemset_key = self._get_itemset_key(itemset)
            # Build new conditional database
            if itemset_key in self.prefixes:
                for el in self.prefixes[itemset_key]:
                    # If support = 4 => add 4 of the corresponding
                    # prefix set
                    for _ in range(el["support"]):
                        conditional_database.append(el["prefix"])
                # Create new suffix
                new_suffix = itemset + suffix if suffix else itemset
                self._determine_frequent_itemsets(conditional_database,
                                                  suffix=new_suffix)

    def find_frequent_itemsets(self, transactions, suffix=None,
                               show_tree=False):
        self.transactions = transactions
        # Build the FP Growth Tree
        self.tree_root = self._construct_tree(transactions)
        if show_tree:
            print ("FP-Growth Tree:")
            self.print_tree(self.tree_root)
        self._determine_frequent_itemsets(transactions, suffix=None)
        return self.frequent_itemsets
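# For reference, a minimal sketch of how the class above might be
# driven (hypothetical values: the transactions and the min_sup
# count are made up; note that _calculate_support returns a raw
# count, so min_sup is compared against counts):
#
#   transactions = [["milk", "bread"],
#                   ["milk", "bread", "butter"],
#                   ["bread"]]
#   fp = FPGrowth(min_sup=2)
#   print (fp.find_frequent_itemsets(transactions, show_tree=True))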
def main():