我想用某个随机分布生成的样本构建一个二维 NumPy 数组,使每一行最后一列的值都超过给定阈值。
下面是我目前可以运行的代码。有没有更简洁的方法来构建满足任意条件的 NumPy 数组?
def new_array(
        num_rows: int,
        dist: Callable[[int], np.ndarray],
        min_hours: int) -> np.ndarray:
    """Build a 2-D array of row-wise cumulative sums of samples from ``dist``.

    Batches of ``num_cols`` samples per row are drawn and accumulated along
    axis 1 until the smallest value in the last column is at least
    ``min_hours``. The result is then cut at the first column in which every
    row is strictly greater than ``min_hours``; that column and everything
    after it are dropped.

    NOTE(review): because the first all-exceeding column is itself dropped,
    the last returned column may still contain values <= ``min_hours``.
    Confirm with the caller whether that column should be kept instead.

    Args:
        num_rows: Number of rows to generate.
        dist: Callable that takes a sample count and returns a 1-D array
            with that many samples.
        min_hours: Threshold the cumulative sums have to reach.

    Returns:
        Array with ``num_rows`` rows holding the cumulative sums.

    Raises:
        AssertionError: If the threshold check fails on all 20 passes of
            the extension loop.
    """
    # Get the 40th percentile as a reasonable guess for how many samples we
    # need. Use a lower percentile to increase num_cols and avoid looping in
    # most cases.
    p40_val = np.quantile(dist(20), 0.4)
    # Generate at least 10 columns each time. A non-positive percentile
    # cannot be used to estimate a column count, so fall back to 10.
    num_cols = max(int(min_hours / p40_val), 10) if p40_val > 0 else 10

    def create_starts() -> np.ndarray:
        samples = dist(num_rows * num_cols)
        return samples.reshape((num_rows, num_cols)).cumsum(axis=1)

    max_iters = 20
    starts = create_starts()
    for _ in range(max_iters):
        if np.min(starts[:, -1]) >= min_hours:
            # All the last columns reached min_hours.
            break
        # Continue every row from its current total. The (num_rows, 1) slice
        # broadcasts against the new batch, so this keeps working after
        # `starts` has grown wider than a single batch.
        next_starts = create_starts() + starts[:, -1:]
        starts = np.concatenate((starts, next_starts), axis=1)
    else:
        # We didn't break out of the for loop, so we hit the max iterations.
        raise AssertionError('Failed to create enough samples to exceed '
                             'sim duration for all columns')

    # Only keep columns before the first column where each value > min_hours.
    mins_per_col = np.min(starts, axis=0)
    cols_exceeding_sim_duration = np.flatnonzero(mins_per_col > min_hours)
    if cols_exceeding_sim_duration.size == 0:
        # The loop stops on ">=", so the last column can equal min_hours
        # exactly without any column strictly exceeding it; nothing to drop.
        return starts
    return starts[:, :cols_exceeding_sim_duration[0]]
# Example call: 5 rows, samples drawn from a normal distribution centred on 3,
# with 7 as the threshold.
new_array(
    num_rows=5,
    dist=lambda size: np.random.normal(loc=3, size=size),
    min_hours=7,
)
# Example output
array([[1.47584632, 4.04034105, 7.19592256],
[3.10804306, 6.46487043, 9.74177227],
[1.03633165, 2.62430309, 6.92413189],
[3.46100139, 6.53068143, 7.37990547],
[2.70152742, 6.09488369, 9.58376664]])
答案 0 :(得分:1)
我简化了几处代码,并改用 NumPy 的布尔(逻辑)索引。原来的 for 循环现在改成了 while 循环,因此无需再处理该错误,因为它会一直运行,直到得到足够多的行为止。
这是否仍按预期工作?
def new_array(num_rows, dist, min_hours):
    """Build a 2-D array of cumulative sums by keeping only qualifying rows.

    Batches of shape ``(num_rows, num_cols)`` are drawn from ``dist`` and
    accumulated along axis 1. Rows whose last column is at least
    ``min_hours`` are kept, the rest of the batch is discarded, and new
    batches are drawn until ``num_rows`` rows have been collected. The result
    is cut at the first column in which every row is strictly greater than
    ``min_hours``; that column and everything after it are dropped.

    NOTE: there is no iteration limit. If ``dist`` never produces a row that
    reaches ``min_hours`` within ``num_cols`` samples, this loops forever.

    Args:
        num_rows: Number of rows to return.
        dist: Callable accepting either a sample count or a shape tuple and
            returning an array of that size.
        min_hours: Threshold the last column of every kept row has to reach.

    Returns:
        Array with exactly ``num_rows`` rows of cumulative sums.
    """
    # Get the 40th percentile as a reasonable guess for how many samples we
    # need. Use a lower percentile to increase num_cols and avoid looping in
    # most cases.
    p40_val = np.quantile(dist(20), 0.4)
    # Generate at least 10 columns each time. A non-positive percentile
    # cannot be used to estimate a column count, so fall back to 10.
    num_cols = max(int(min_hours / p40_val), 10) if p40_val > 0 else 10

    # Collect the accepted rows in a list and stack once at the end; this
    # avoids re-allocating a growing array on every batch.
    accepted = []
    num_accepted = 0
    while True:
        # No reshape needed: the size argument can be a shape tuple.
        batch = dist((num_rows, num_cols)).cumsum(axis=1)
        # Boolean indexing keeps the rows whose last column reached the
        # threshold; the selection may be empty, which stacks harmlessly.
        good_rows = batch[batch[:, -1] >= min_hours]
        accepted.append(good_rows)
        num_accepted += good_rows.shape[0]
        if num_accepted >= num_rows:
            break

    # Several batches together can yield more rows than requested, so trim
    # the stack back to num_rows.
    starts = np.vstack(accepted)[:num_rows]

    # Only keep columns before the first column where each value > min_hours.
    # Cutting a prefix (instead of masking) keeps the columns contiguous even
    # when the cumulative sums are not monotonic.
    cols_exceeding = np.flatnonzero(np.all(starts > min_hours, axis=0))
    if cols_exceeding.size == 0:
        return starts
    return starts[:, :cols_exceeding[0]]