import pandas as pd
import numpy as np
# Balance the dataset: keep every buy/sell row (signal 1 or 2) and add an
# equal-sized random sample of zero-signal rows, since zeros vastly
# outnumber the non-zero signals.
data_dir = 'data_r14.csv'
data = pd.read_csv(data_dir)

signals = data['signal']
value_counts = signals.value_counts()
# Number of buy rows (signal == 1); used as the sample size for zeros.
buy_count = value_counts[1]

signals_code = [1, 2]
# Rows that already carry a buy/sell signal are kept as-is.
buy_sell_rows = data.loc[data['signal'].isin(signals_code)]
# The (much larger) remainder: rows with signal == 0 to sample from.
data_without_signals = data[~data['signal'].isin(signals_code)]

# BUG FIX: sample WITHOUT replacement so the same zero-signal row is
# never drawn twice (np.random.choice replaces by default).
random_0_indexes = np.random.choice(
    data_without_signals.index.values, buy_count, replace=False
)

# BUG FIX: the original looped `buy_sell_rows.append(row)`, but
# DataFrame.append returned a NEW frame (silently discarded each
# iteration) and was removed entirely in pandas 2.0 — so no zero rows
# were ever added. Select all sampled rows in one .loc call and
# concatenate once instead.
random_0_rows = data.loc[random_0_indexes]
buy_sell_rows = pd.concat([buy_sell_rows, random_0_rows])

print(buy_sell_rows)
# print(buy_sell_rows['signal'].value_counts())
所以我有一个数据框,其中有一个名为 signal
的列,其中的值是 0、1 或 2,我想通过为每个值设置相等数量的行来平衡它们,因为它们非常不平衡我只有 1984 行非零值和 20000 多行零值。
所以我创建了一个新的数据框,其中所有的值都为零,并将其命名为 data_without_signals
,然后从中获取一个随机索引列表,然后运行一个循环以获取该 row
以附加它到我创建的另一个名为 buy_sell_rows
的数据框（其中只有非零值），但问题是 row
并没有被成功附加上去。
答案 0（得分：1）：
正如我在评论中所说,我认为可以通过随机采样不同的信号来简化您的一般方法:
# A small demo column of 0s, 1s and 2s with deliberately unequal counts.
test = pd.DataFrame({"data": [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2]})

# Every group must be reduced to the size of the smallest group.
group_size = test.groupby("data")["data"].count().min()

# Draw group_size random rows from each signal value, then flatten the
# per-group lists back into a single balanced column.
grouped = test.groupby(["data"])
sampled = grouped.agg(sample=("data", lambda g: g.sample(group_size).to_list()))
output = sampled.explode("sample").reset_index(drop=True)
这个测试的输出是:
|   | sample |
|---|--------|
| 0 | 0 |
| 1 | 0 |
| 2 | 0 |
| 3 | 1 |
| 4 | 1 |
| 5 | 1 |
| 6 | 2 |
| 7 | 2 |
| 8 | 2 |