Question

问题陈述

我正在寻找一种有效的方法来生成完整的二进制笛卡尔积（具有True和False的所有组合的表，并具有一定的列数），通过某些排除条件过滤。例如，对于三列/位n=3，我们会得到完整的表格

df_combs = pd.DataFrame(itertools.product(*([[True, False]] * n)))
       0      1      2
0   True   True   True
1   True   True  False
2   True  False   True
3   True  False  False
...

应该由定义互斥的字典过滤组合如下：

mutually_excl = [{0: False, 1: False, 2: True},
                 {0: True, 2: True}]

其中的键表示上表中的列。该示例将被读取为：

如果0为False，而1为False，则2不能为True
如果0为True，则2不能为True

基于这些过滤器，预期输出为：

       0      1      2
1   True   True  False
3   True  False  False
4  False   True   True
5  False   True  False
7  False  False  False

在我的用例中，过滤后的表格比完整的笛卡尔积小几个数量级（例如，约1000而不是2**24 (16777216)）。

下面是我目前的三个解决方案，每个解决方案都有各自的优缺点，最后进行了讨论。

import random
import pandas as pd
import itertools
import wrapt
import time
import operator
import functools

def get_mutually_excl(n, nfilt):  # generate random example filter
    ''' Example: `get_mutually_excl(9, 2)` creates a list of two filters with
    maximum index `n=9` and each filter length between 2 and `int(n/3)`:
    `[{1: True, 2: False}, {3: False, 2: True, 6: False}]` '''
    random.seed(2)
    return [{random.choice(range(n)): random.choice([True, False])
                           for _ in range(random.randint(2, int(n/3)))}
                           for _ in range(nfilt)]

@wrapt.decorator
def timediff(f, _, args, kwargs):
    t = time.perf_counter()
    res = f(*args)
    return res, time.perf_counter() - t

解决方案1：先过滤，然后合并。

将每个单个过滤器条目（例如{0: True, 2: True}）展开为子表，其列对应于此过滤器条目（[0, 2]）中的索引。从此子表（[True, True]）中删除单个过滤的行。合并全表以获取已过滤组合的完整列表。

@timediff
def make_df_comb_filt_merge(n, nfilt):

    mutually_excl = get_mutually_excl(n, nfilt)

    # determine missing (unfiltered) columns
    cols_missing = set(range(n)) - set(itertools.chain.from_iterable(mutually_excl))

    # complete dataframe of unfiltered columns with column "temp" for full outer merge
    df_comb = pd.DataFrame(itertools.product(*([[True, False]] * len(cols_missing))),
                            columns=cols_missing).assign(temp=1)

    for filt in mutually_excl:  # loop through individual filters

        # get columns and bool values of this filters as two tuples with same order
        list_col, list_bool = zip(*filt.items())

        # construct dataframe
        df = pd.DataFrame(itertools.product(*([[True, False]] * len(list_col))),
                                columns=list_col)

        # filter remove a *single* row (by definition)
        df = df.loc[df.apply(tuple, axis=1) != list_bool]

        # determine which rows to merge on
        merge_cols = list(set(df.columns) & set(df_comb.columns))
        if not merge_cols:
            merge_cols = ['temp']
            df['temp'] = 1

        # merge with full dataframe
        df_comb = pd.merge(df_comb, df, on=merge_cols)

    df_comb.drop('temp', axis=1, inplace=True)
    df_comb = df_comb[range(n)]
    df_comb = df_comb.sort_values(df_comb.columns.tolist(), ascending=False)

    return df_comb.reset_index(drop=True)

解决方案2：完全展开，然后过滤

完全生成DataFrame 笛卡尔积：整个事情最终在内存中。循环过滤并为每个蒙版创建一个蒙版。将每个蒙版应用到表格上。


@timediff
def make_df_comb_exp_filt(n, nfilt):

    mutually_excl = get_mutually_excl(n, nfilt)

    # expand all bool combinations into dataframe
    df_comb = pd.DataFrame(itertools.product(*([[True, False]] * n)),
                           dtype=bool)

    for filt in mutually_excl:

        # generate total filter mask for given excluded combination
        mask = pd.Series(True, index=df_comb.index)
        for col, bool_act in filt.items():
            mask = mask & (df_comb[col] == bool_act)

        # filter dataframe
        df_comb = df_comb.loc[~mask]

    return df_comb.reset_index(drop=True)

解决方案3：过滤器迭代器

使整个笛卡尔积保持迭代器。循环播放检查每一行是否被任何过滤器排除。

@timediff
def make_df_iter_filt(n, nfilt):

    mutually_excl = get_mutually_excl(n, nfilt)

    # switch to [[(1, 13), (True, False)], [(4, 9), (False, True)], ...]
    mutually_excl_index = [list(zip(*comb.items()))
                                for comb in mutually_excl]

    # create iterator
    combs_iter = itertools.product(*([[True, False]] * n))

    @functools.lru_cache(maxsize=1024, typed=True)  # small benefit
    def get_getter(list_):
        # Used to access combs_iter row values as indexed by the filter
        return operator.itemgetter(*list_)

    def check_comb(comb_inp, comb_check):
        return get_getter(comb_check[0])(comb_inp) == comb_check[1]

    # loop through the iterator
    # drop row if any of the filter matches
    df_comb = pd.DataFrame([comb_inp for comb_inp in combs_iter
                       if not any(check_comb(comb_inp, comb_check)
                                  for comb_check in mutually_excl_index)])

    return df_comb.reset_index(drop=True)

运行示例

dict_time = dict.fromkeys(itertools.product(range(16, 23, 2), range(3, 20)))

for n, nfilt in dict_time:
    dict_time[(n, nfilt)] = {'exp_filt': make_df_comb_exp_filt(n, nfilt)[1],
                             'filt_merge': make_df_comb_filt_merge(n, nfilt)[1],
                             'iter_filt': make_df_iter_filt(n, nfilt)[1]}

分析

import seaborn as sns
import matplotlib.pyplot as plt

df_time = pd.DataFrame.from_dict(dict_time, orient='index',
                                 ).rename_axis(["n", "nfilt"]
                                 ).stack().reset_index().rename(columns={'level_2': 'solution', 0: 'time'})

g = sns.FacetGrid(df_time.query('n in %s' % str([16,18,20,22])),
                  col="n",  hue="solution", sharey=False)
g = (g.map(plt.plot, "nfilt", "time", marker="o").add_legend())

解决方案3 ：基于迭代器的方法（comb_iterator）运行时间短，但没有明显的用途的记忆。我觉得仍有改进的空间，尽管不可避免可能会在运行时间方面施加硬限制。

解决方案2 ：将整个笛卡尔积扩展为DataFrame（exp_filt）会导致内存峰值，我想避免。运行时间还可以。

解决方案1 ：合并由各个过滤器（filt_merge）创建的DataFrame感觉很好我的实际应用程序的解决方案（请注意，由于使用较小的cols_missing表，可以减少过滤器数量，从而减少运行时间）。不过，这种方法并不完全令人满意：如果单个过滤器包含所有列，则整个笛卡尔积（2**n）最终将存储在内存中，从而使该解决方案比comb_iterator更糟。

问题：还有其他想法吗？疯狂的聪明的麻木两线？可以以某种方式改进基于迭代器的方法吗？

Answer 1

基于@ayhan的评论，我实现了一个基于or-tools SAT的解决方案。尽管这个主意很棒，但是对于大量的二进制变量来说，这确实很困难。我怀疑这类似于大型IP问题，它们也不是在公园里散步的。但是，对过滤器编号的强烈依赖性可能使它成为某些参数配置的有效选项。但作为一般解决方案，我不会使用它。

from ortools.sat.python import cp_model

class VarArraySolutionCollector(cp_model.CpSolverSolutionCallback):

    def __init__(self, variables):
        cp_model.CpSolverSolutionCallback.__init__(self)
        self.__variables = variables
        self.solution_list = []

    def on_solution_callback(self):
        self.solution_list.append([self.Value(v) for v in self.__variables])


@timediff
def make_df_comb_sat(n, nfilt):

    mutually_excl = get_mutually_excl(n, nfilt)

    model = cp_model.CpModel()

    make_var_name = 'x{:02d}'.format
    vrs = dict.fromkeys(map(make_var_name, range(n)))
    for var_name in vrs:
        vrs[var_name] = model.NewBoolVar(var_name)

    for filt in mutually_excl:
        list_expr = [vrs[make_var_name(iv)]
                     if not bool_ else getattr(vrs[make_var_name(iv)], 'Not')()
                     for iv, bool_ in filt.items()]
        model.AddBoolOr(list_expr)

    solver = cp_model.CpSolver()
    solution_printer = VarArraySolutionCollector(vrs.values())
    solver.SearchForAllSolutions(model, solution_printer)

    df_comb = pd.DataFrame(solution_printer.solution_list).astype(bool)
    df_comb = df_comb.sort_values(df_comb.columns.tolist(), ascending=False)
    df_comb = df_comb.reset_index(drop=True)

    return df_comb

Answer 2

尝试计时以下内容：

def in_filter(arr, arr_filt, n):
    return ((arr[:, None] >> (n-1-arr_filt[:, 0])) & 1 == arr_filt[:, 1]).all(axis=1)

def bits_to_boolean(arr, n):
    return ((arr[:, None] >> np.arange(n, dtype=arr.dtype)[::-1]) & 1).astype(bool)

@timediff
def recursive_filter(n, nfilt, dtype='uint32'):
    filts = get_mutually_excl(n, nfilt)
    out = np.arange(2**n, dtype=dtype)
    for filt in filts:
        arr_filt = np.array(list(filt.items()))
        out = out[~in_filter(out, arr_filt, n)]
    return bits_to_boolean(out, n)[::-1]

它将笛卡尔二进制乘积视为在整数0..<2**n范围内编码的位，并使用矢量化函数递归删除具有与给定过滤器匹配的位序列的数字。

与分配所有[True, False]笛卡尔积相比，内存效率要好得多，因为每个布尔值将至少存储8位（比要求多使用7位），但是它将比基于迭代器的存储使用更多的内存方法。如果您需要大型n的解决方案，则可以通过一次分配和操作一个子范围来分解此任务。我在第一个实现中确实做到了这一点，但是它对n<=22并没有太大的好处，它需要计算输出数组的大小，当有重叠的过滤器时，这会变得很复杂。

生成过滤后的二元笛卡尔积

问题陈述

解决方案1：先过滤，然后合并。

解决方案2：完全展开，然后过滤

解决方案3：过滤器迭代器

运行示例

分析

2 个答案: