我有一个看起来像这样的数据框:
# Sample frame for the question: 19 rows keyed by their original row labels,
# with ISO-formatted date strings, a single id value and a 0/1 'limit' flag.
# NOTE(review): `DataFrame` is used unqualified — presumably
# `from pandas import DataFrame` is in scope; confirm.
df = DataFrame({'date': {379724: '2017-01-31',
                         379725: '2017-01-31',
                         414510: '2017-02-14',
                         414509: '2017-02-28',
                         414511: '2017-02-28',
                         507215: '2017-04-27',
                         507213: '2017-04-27',
                         507214: '2017-04-27',
                         507235: '2017-04-27',
                         562139: '2017-04-27',
                         672967: '2017-07-27',
                         672968: '2017-07-27',
                         672969: '2017-07-27',
                         910729: '2017-12-07',
                         990263: '2018-01-30',
                         990265: '2018-01-30',
                         990264: '2018-01-30',
                         121543: '2018-06-26',
                         255129: '2018-09-20'},
                'id': {379724: '110000078451',
                       379725: '110000078451',
                       414510: '110000078451',
                       414509: '110000078451',
                       414511: '110000078451',
                       507215: '110000078451',
                       507213: '110000078451',
                       507214: '110000078451',
                       507235: '110000078451',
                       562139: '110000078451',
                       672967: '110000078451',
                       672968: '110000078451',
                       672969: '110000078451',
                       910729: '110000078451',
                       990263: '110000078451',
                       990265: '110000078451',
                       990264: '110000078451',
                       121543: '110000078451',
                       255129: '110000078451'},
                'limit': {379724: 0,
                          379725: 1,
                          414510: 1,
                          414509: 0,
                          414511: 0,
                          507215: 0,
                          507213: 0,
                          507214: 1,
                          507235: 0,
                          562139: 0,
                          672967: 0,
                          672968: 0,
                          672969: 0,
                          910729: 0,
                          990263: 0,
                          990265: 0,
                          990264: 0,
                          121543: 0,
                          255129: 0}})  # closes the inner dict, the outer dict and the call
我需要计算在每个'id'分组中,'limit'的值从一个值变为另一个值的次数。
我想到的代码是:
# Asker's attempt: count 'limit' transitions per 'id' using a 2-row rolling window.
# NOTE(review): as pasted, the continuation lines begin with '.' without enclosing
# parentheses or trailing backslashes, so these statements do not parse as shown —
# presumably each chain was wrapped in parentheses in the original; confirm.
# The lambda flags a window whose first and last values differ AND whose first
# value is 1 (a 1 -> 0 step), yet the summed result is named 'count01'.
# With min_periods=1 the first row of a group can form a one-element window, where
# x[0] and x[-1] are the same element, so that window is never flagged.
count01 = df.groupby('id')['limit'].rolling(2,min_periods=1)
.apply(lambda x: ((x[0] != x[-1]) & (x[0] == 1)), raw=True)
.groupby('id').sum().astype(int).reset_index(name='count01')
# Same pattern, but flagging windows that start at 0 (a 0 -> 1 step); the summed
# result is named 'count10'.
count10 = df.groupby('id')['limit'].rolling(2,min_periods=1)
.apply(lambda x: ((x[0] != x[-1]) & (x[0] == 0)), raw=True)
.groupby('id').sum().astype(int).reset_index(name='count10')
# Join the two per-id counts into a single frame keyed on 'id'.
count_total = count01.merge(count10, on='id')
有时它给出正确的结果,有时却不能。我猜测apply可能把每组的第一个值算成了NaN,
从而影响了结果,但也可能不是这个原因。
结果应为:
id | count01 | count10
-------------------------------
110000078451| 2 | 2
谢谢!
编辑:我更新了示例,使其更符合真实数据。
答案 0 :(得分:1)
您可以先对数据框做shift,在id与上一行相同的行上创建一个记录转换(transition)的列,然后使用pivot_table来统计这些转换:
# Row-shifted copy: row i of df2 carries the values of row i-1 of df.
df2 = df.shift()
# force limit to type int in shifted df (back-fill first so the gap left by the
# shift is filled before the cast)
df2['limit'] = df2['limit'].bfill().astype(int)
# A transition row has the same id as the previous row but a different limit.
is_transition = (df.id == df2.id) & (df.limit != df2.limit)
# Label it '<previous limit><current limit>', e.g. '01' or '10'.
df.loc[is_transition, 'transition'] = df2.limit.astype(str) + df.limit.astype(str)
# Count labelled rows per id and per transition label; missing combinations become 0.
resul = df.pivot_table(values='date', index='id', columns='transition', aggfunc='count', fill_value=0)
结果为:
transition 01 10
id
111 2 1
22 0 1
您还可以改进输出的展示形式:
resul = resul.rename(columns=lambda x: 'count'+x).rename_axis('', axis=1).reset_index()
最终得到:
id count01 count10
0 111 2 1
1 22 0 1
答案 1 :(得分:0)
在count01中进行更改:
(x[0] == 1)) --> (x[0] == 0))
并在count10中进行更改:
(x[0] == 0)) --> (x[0] == 1))
答案 2 :(得分:0)
这应该有效。
import pandas as pd
def limit_change_counter(limits, _from, _to):
    """Count adjacent steps in *limits* that go from ``_from`` to ``_to``.

    Args:
        limits: Iterable of values in the order they occur (for example a
            pandas Series holding one group's 'limit' column).
        _from: Value the earlier element of a pair must equal.
        _to: Value the later element of a pair must equal.

    Returns:
        The number of consecutive pairs ``(previous, current)`` with
        ``previous == _from`` and ``current == _to``. Sequences with fewer
        than two elements yield 0.
    """
    # Materialise once so one-shot iterables can be paired with their successor.
    values = list(limits)
    return sum(
        1
        for previous, current in zip(values, values[1:])
        if previous == _from and current == _to
    )
# Sample frame from the question: 19 rows keyed by their original row labels,
# with ISO-formatted date strings, a single id value and a 0/1 'limit' flag.
df = pd.DataFrame.from_dict({'date': {379724: '2017-01-31',
379725: '2017-01-31',
414510: '2017-02-14',
414509: '2017-02-28',
414511: '2017-02-28',
507215: '2017-04-27',
507213: '2017-04-27',
507214: '2017-04-27',
507235: '2017-04-27',
562139: '2017-04-27',
672967: '2017-07-27',
672968: '2017-07-27',
672969: '2017-07-27',
910729: '2017-12-07',
990263: '2018-01-30',
990265: '2018-01-30',
990264: '2018-01-30',
121543: '2018-06-26',
255129: '2018-09-20'},
'id': {379724: '110000078451',
379725: '110000078451',
414510: '110000078451',
414509: '110000078451',
414511: '110000078451',
507215: '110000078451',
507213: '110000078451',
507214: '110000078451',
507235: '110000078451',
562139: '110000078451',
672967: '110000078451',
672968: '110000078451',
672969: '110000078451',
910729: '110000078451',
990263: '110000078451',
990265: '110000078451',
990264: '110000078451',
121543: '110000078451',
255129: '110000078451'},
'limit': {379724: 0,
379725: 1,
414510: 1,
414509: 0,
414511: 0,
507215: 0,
507213: 0,
507214: 1,
507235: 0,
562139: 0,
672967: 0,
672968: 0,
672969: 0,
910729: 0,
990263: 0,
990265: 0,
990264: 0,
121543: 0,
255129: 0}})
# Order rows chronologically (the dates are ISO strings, so string order is date
# order). Several rows share a date and the counts depend on their relative
# order, so use a stable sort: the default quicksort may reorder ties.
df.sort_values(by='date', kind='mergesort', inplace=True)
print(df)
# transform() broadcasts each group's scalar count back onto every row of the group;
# the extra positional arguments are forwarded as (_from, _to).
df['limit_changes_0_to_1'] = df.groupby(['id'])['limit'].transform(limit_change_counter, 0, 1)
df['limit_changes_1_to_0'] = df.groupby(['id'])['limit'].transform(limit_change_counter, 1, 0)
# Every row of an id now holds the same counts, so keep one row per id.
df.drop_duplicates(subset="id", keep="first", inplace=True)
print(df)