我试图确定某一列连续多少天高于或低于某个阈值。
>>> df.head()
Open High Low Close Volume
Date
2004-08-19 49.96 51.98 47.93 50.12 NaN
2004-08-20 50.69 54.49 50.20 54.10 NaN
2004-08-23 55.32 56.68 54.47 54.65 NaN
2004-08-24 55.56 55.74 51.73 52.38 NaN
2004-08-25 52.43 53.95 51.89 52.95 NaN
>>>
对于上面的示例,我希望另一列df [' RDA']连续每天增加,即Open列超过50.连续每天低于50,我&# 39; d喜欢第二列df [' RDB']增加和df [' RDA']重置为0.我已尝试if / then逻辑但它没有& #39; t那样,并给我一个值错误:
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all(). how can i sort it out
我希望我的数据框的输出看起来像这样:
>>> df.head()
Open High Low Close Volume RDA RDB
Date
2004-08-19 51.96 51.98 47.93 50.12 NaN 1 0
2004-08-20 50.69 54.49 50.20 54.10 NaN 2 0
2004-08-23 55.32 56.68 54.47 54.65 NaN 3 0
2004-08-24 45.56 55.74 51.73 52.38 NaN 0 1
2004-08-25 42.43 53.95 51.89 52.95 NaN 0 2
2004-08-26 41.96 51.98 47.93 50.12 NaN 0 3
2004-08-27 40.69 54.49 50.20 54.10 NaN 0 4
2004-08-28 55.32 56.68 54.47 54.65 NaN 1 0
2004-08-29 55.56 55.74 51.73 52.38 NaN 2 0
2004-08-30 52.43 53.95 51.89 52.95 NaN 3 0
>>>
这是熊猫可以做到的事情吗?我知道你可以计算一列中的值,但我到目前为止还没能找到连续值的方法。带有2个变量的if / then语句可以工作,但就像我上面提到的那样,当我尝试这个时,我得到一个值错误。任何帮助将不胜感激。
答案 0 :(得分:2)
np.sign
和Open
之间的差异使用50
。当小于-1
时,50
,0
时50
,1
大于50
时为np.diff
。 cumsum
来确定何时从一个值切换到另一个值cumcount
来定义连续符号组np.where
来获取群组内的计数cumcounts
拆分o = df.Open.values - 50
signs = np.sign(o)
changes = np.append(False, signs[:-1] != signs[1:])
g = changes.cumsum()
cumcounts = df.groupby(g).cumcount() + 1
a = np.where(signs == 1, cumcounts, 0)
b = np.where(signs == -1, cumcounts, 0)
df.assign(RDA=a, RDB=b)
Open High Low Close Volume RDA RDB
Date
2004-08-19 51.96 51.98 47.93 50.12 NaN 1 0
2004-08-20 50.69 54.49 50.20 54.10 NaN 2 0
2004-08-23 55.32 56.68 54.47 54.65 NaN 3 0
2004-08-24 45.56 55.74 51.73 52.38 NaN 0 1
2004-08-25 42.43 53.95 51.89 52.95 NaN 0 2
2004-08-26 41.96 51.98 47.93 50.12 NaN 0 3
2004-08-27 40.69 54.49 50.20 54.10 NaN 0 4
2004-08-28 55.32 56.68 54.47 54.65 NaN 1 0
2004-08-29 55.56 55.74 51.73 52.38 NaN 2 0
2004-08-30 52.43 53.95 51.89 52.95 NaN 3 0
" Looks for a pattern in the buffers.
" Usage :GrepBuffers [pattern] [matchCase] [matchWholeWord] [prefix]
" If pattern is not specified then usage instructions will get printed.
" If matchCase = '1' then exclude matches that do not have the same case. If matchCase = '0' then ignore case.
" If prefix == 'c' then put results in the QuickFix list. If prefix == 'l' then put results in the location list for the current window.
function! s:GrepBuffers(...)
if a:0 > 4
throw "Too many arguments"
endif
if a:0 >= 1
let l:pattern = a:1
else
echo 'Usage :GrepBuffers [pattern] [matchCase] [matchWholeWord] [prefix]'
return
endif
let l:matchCase = 0
if a:0 >= 2
if a:2 !~ '^\d\+$' || a:2 > 1 || a:2 < 0
throw "ArgumentException: matchCase value '" . a:2 . "' is not in the bounds [0,1]."
endif
let l:matchCase = a:2
endif
let l:matchWholeWord = 0
if a:0 >= 3
if a:3 !~ '^\d\+$' || a:3 > 1 || a:3 < 0
throw "ArgumentException: matchWholeWord value '" . a:3 . "' is not in the bounds [0,1]."
endif
let l:matchWholeWord = a:3
endif
let l:prefix = 'c'
if a:0 >= 4
if a:4 != 'c' && a:4 != 'l'
throw "ArgumentException: prefix value '" . a:4 . "' is not 'c' or 'l'."
endif
let l:prefix = a:4
endif
let ignorecase = &ignorecase
let &ignorecase = l:matchCase == 0
try
if l:prefix == 'c'
let l:vimgrep = 'vimgrep'
elseif l:prefix == 'l'
let l:vimgrep = 'lvimgrep'
endif
if l:matchWholeWord
let l:pattern = '\<' . l:pattern . '\>'
endif
let str = 'silent ' . l:vimgrep . ' /' . l:pattern . '/'
for buf in getbufinfo()
if buflisted(buf.bufnr) " Skips unlisted buffers because they are not used for normal editing
if !bufexists(buf.bufnr)
throw 'Buffer does not exist: "' . buf.bufnr . '"'
elseif empty(bufname(buf.bufnr)) && getbufvar(buf.bufnr, '&buftype') != 'quickfix'
if len(getbufline(buf.bufnr, '2')) != 0 || strlen(getbufline(buf.bufnr, '1')[0]) != 0
echohl warningmsg | echomsg 'Skipping unnamed buffer: [' . buf.bufnr . ']' | echohl normal
endif
else
let str = str . ' ' . fnameescape(bufname(buf.bufnr))
endif
endif
endfor
try
execute str
catch /^Vim\%((\a\+)\)\=:E\%(683\|480\):/ "E683: File name missing or invalid pattern --- E480: No match:
" How do you want to handle this exception?
echoerr v:exception
return
endtry
execute l:prefix . 'window'
"catch /.*/
finally
let &ignorecase = ignorecase
endtry
endfunction
答案 1 :(得分:2)
首先,在数据框中添加一个标志列,以指示Open是否高于目标价格50(真或假)。
然后,您可以使用compare-cumsum-groupby pattern来识别此标记的累积分组,并将cumsum
应用于每个此类组。
我们现在需要反转标志,使其为零,零为1,然后使用相同的策略计算rdb
。
最后,我们删除flag
列(我使用.iloc[:, :-1]
删除它,因为我将其添加为最后一列)并附加新的RDA
和RDB
列
target_price = 50
df = df.assign(flag=df.Open.gt(target_price)) # True if `Open` greater than `target_price`, otherwise False.
rda = df.groupby((df['flag'] != df['flag'].shift()).cumsum()).flag.cumsum()
df['flag'] = ~df['flag'] # Invert flag for RDB.
rdb = df.groupby((df['flag'] != df['flag'].shift()).cumsum()).flag.cumsum()
df = df.iloc[:, :-1].assign(RDA=rda, RDB=rdb)
>>> df
Date Open High Low Close Volume RDA RDB
0 8/19/04 51.96 51.98 47.93 50.12 NaN 1 0
1 8/20/04 50.69 54.49 50.20 54.10 NaN 2 0
2 8/23/04 55.32 56.68 54.47 54.65 NaN 3 0
3 8/24/04 45.56 55.74 51.73 52.38 NaN 0 1
4 8/25/04 42.43 53.95 51.89 52.95 NaN 0 2
5 8/26/04 41.96 51.98 47.93 50.12 NaN 0 3
6 8/27/04 40.69 54.49 50.20 54.10 NaN 0 4
7 8/28/04 55.32 56.68 54.47 54.65 NaN 1 0
8 8/29/04 55.56 55.74 51.73 52.38 NaN 2 0
9 8/30/04 52.43 53.95 51.89 52.95 NaN 3 0
答案 2 :(得分:1)
这也可以使用Python提供的functools.reduce
方法完成。首先创建一个可迭代的目标数据,所以在你的情况下:
target = df.Open > 50
这将是您稍后将functools.reduce
传递给&#34;减少&#34;。 Reduce基本上是map
,但在列表元素中保留一个值。这可以用来做你想要的。
我会尝试分解您可以使用的功能(在帖子末尾完整显示)。
functools.reduce
可让您访问两个参数。您的累计值以及您当前的列表项。它还允许您传入自己的初始化程序(在查看任何内容之前的第一项)。有了这个,我们可以查看我们的列表,如果它是True
,由上面的目标系列确定,我们可以在列表的最后一个元素中加1,否则在我们的累加器中添加0
通过将初始化程序设置为0
中的值为[0]
的列表,这需要一点点精力,因此在第一遍中它可以采用&#34; last&# 34;元素,并做一些事情,而不是错误。
完成后,列表前面会出现一个拖尾者0
,您可以使用切片[1:]
删除该切片器RDB
,仅使用第二个元素及之后的颜色。
您的True
列完全相同,但您要确保目标列表中不是not
,只需要在条件语句中添加import functools
# Create a boolean series of your Open column
target = df.Open > 50
# For every item in your boolean series add a 1 to the previous value if it's over 50, otherwise reset
df['RDA'] = functools.reduce(lambda x, y: x + ([x[-1] + 1] if y else [0]), target, [0])[1:]
# Repeat, but for every `False` value in the series
df['RDB'] = functools.reduce(lambda x, y: x + ([x[-1] + 1] if not y else [0]), target, [0])[1:]
>>> df.head()
Open High Low Close Volume RDA RDB
Date
2004-08-19 49.96 51.98 47.93 50.12 NaN 0 1
2004-08-20 50.69 54.49 50.20 54.10 NaN 1 0
2004-08-23 55.32 56.68 54.47 54.65 NaN 2 0
2004-08-24 55.56 55.74 51.73 52.38 NaN 3 0
2004-08-25 52.43 53.95 51.89 52.95 NaN 4 0
。< / p>
完整代码如下所示:
{
"compilerOptions":{
"allowSyntheticDefaultImports":true,
"declaration":false,
"emitDecoratorMetadata":true,
"experimentalDecorators":true,
"lib":[
"dom",
"es2015"
],
"module":"es2015",
"moduleResolution":"node",
"sourceMap":true,
"target":"es5"
},
"include":[
"src/**/*.ts"
],
"exclude":[
"node_modules"
],
"compileOnSave":false,
"atom":{
"rewriteTsconfig":false
}
答案 3 :(得分:0)
<bean id="geoFilter"
class="com.package.CountryGeoFilter" />
答案 4 :(得分:0)
我最初误解了,并没有意识到你想要在它发生变化时重新启动计数。我认为最好的方法是在行中使用DataFrame.apply
,如下所示:
In [226]: def increment(row):
...: global rda
...: global rdb
...: if row.Open > 50:
...: row.RDA = int(next(rda))
...: rdb = count()
...: else:
...: row.RDB = next(rdb)
...: rda = int(count())
...: return row
In [227]: df['RDA'] = 0
In [228]: df['RDB'] = 0
In [229]: df.apply(increment, axis=1)
Open High Low Close Volume RDA RDB
Date
2004-08-19 49.96 51.98 47.93 50.12 NaN 0.0 1.0
2004-08-20 50.69 54.49 50.20 54.10 NaN 0.0 0.0
2004-08-23 55.32 56.68 54.47 54.65 NaN 1.0 0.0
2004-08-24 55.56 55.74 51.73 52.38 NaN 2.0 0.0
2004-08-25 52.43 53.95 51.89 52.95 NaN 3.0 0.0
我不知道为什么他们会在列中出现漂浮物,我猜大熊猫认为这就是你想要的。数据最初来自count
int
。我通常不喜欢全局变量,但DataFrame.apply
在变量超出increment
函数时无法访问变量。