Question

考虑以下函数：

import numpy
import scipy.stats


def return_category(values, categories):
    """Label each value with the equal-probability normal bin it falls in.

    The real line is cut into ``len(categories)`` half-open intervals
    ``[a, b)`` whose edges are standard-normal quantiles at
    ``0, 1/n, ..., 1``, so a sample drawn from N(0, 1) lands in every
    category with the same probability.

    Args:
        values: NumPy array of numbers (any shape).
        categories: Sequence of labels, one per interval, ordered from the
            lowest interval to the highest.

    Returns:
        Array of dtype ``'U25'`` (at most 25 characters per label) with the
        same shape as ``values``. Entries that fall in no interval (NaN and
        +inf, because the last interval is open on the right) are ``''``.
    """
    n = len(categories)

    # Zero-initialised (not numpy.empty) so entries matching no interval are
    # a well-defined '' instead of whatever bytes the allocator handed back.
    result = numpy.zeros(values.shape, dtype='U25')
    if n == 0:
        # No intervals to assign; also avoids dividing by zero below.
        return result

    # n + 1 edges: ppf(0) = -inf, ppf(1) = +inf, quantiles in between.
    boundaries = scipy.stats.norm.ppf(numpy.arange(0, n + 1, 1) / n)
    for i, category in enumerate(categories):
        a, b = boundaries[i], boundaries[i + 1]
        # One full pass over `values` per category.
        numpy.putmask(result, (values < b) & (values >= a), category)

    return result


# Example call with three categories; the comment below shows the expected output.
print(return_category(numpy.array([0.1, -100, 100, 0.44]), ['a', 'b', 'c']))
# ['b' 'a' 'c' 'c']

也就是说，它根据每个值所落入的区间，从类别列表中为该值分配一个类别；区间的划分方式保证了：如果 values 取自标准正态分布 N(0, 1)，则每个类别出现的概率相同。

问题是：如何将其向量化？即如何去掉这个循环——当类别和值的数量都很大时，循环的开销很高。

这个问题可以推广为：存在一个映射 M = {I1: c1, I2: c2, ...}，其中各 Ii 是两两不相交的区间，且所有区间的并集为 ]-inf, inf[，ci 是区间 Ii 对应的类别。给定一个值数组 [a1, a2, ..., aM]，创建一个新数组：

[
 M[Ii such that a1 in Ii],
 M[Ii such that a2 in Ii], 
 ...
 M[Ii such that aM in Ii],
]

在上述特定情况下，各区间的边界由 scipy.stats.norm.ppf(numpy.arange(0, n+1, 1)/n) 给出。

Answer 1

我认为这可能会满足您的要求

import numpy 
import scipy.stats


def return_category(values, categories):
    """Label each value with the equal-probability normal bin it falls in.

    Vectorised: a single ``numpy.searchsorted`` call finds the interval of
    every value, so there is no Python-level loop over categories and no
    (categories x values) boolean matrix. Works for ``values`` of any shape.

    Args:
        values: Array-like of numbers (any shape).
        categories: Sequence of labels, one per interval, ordered from the
            lowest interval to the highest.

    Returns:
        Array of dtype ``'U25'`` (at most 25 characters per label) with the
        same shape as ``values``. Entries that fall in no interval (NaN and
        +inf, because the last interval is open on the right) are ``''``.
    """
    values = numpy.asarray(values)
    n = len(categories)
    result = numpy.zeros(values.shape, dtype='U25')
    if n == 0:
        # No intervals to assign; also avoids dividing by zero below.
        return result

    # n + 1 sorted edges: ppf(0) = -inf, ppf(1) = +inf, quantiles in between.
    boundaries = scipy.stats.norm.ppf(numpy.arange(0, n + 1, 1) / n)

    # Label table with one extra '' slot at index n for "no interval".
    lookup = numpy.zeros(n + 1, dtype='U25')
    lookup[:n] = categories

    # side='right' makes each interval closed on the left and open on the
    # right: a value v with boundaries[i] <= v < boundaries[i + 1] gets
    # index i. +inf and NaN sort past the last edge and get index n, the
    # '' slot. Index -1 cannot occur because boundaries[0] is -inf.
    idx = numpy.searchsorted(boundaries, values, side='right') - 1

    # Assign through `result` so shape and dtype hold even for 0-d input.
    result[...] = lookup[idx]
    return result


# Example call with three categories; the comment below shows the expected output.
print(return_category(numpy.array([0.1, -100, 100, 0.44]), ['a', 'b', 'c']))
# ['b' 'a' 'c' 'c']

numpy：根据条件对值的向量化

1 个答案: