python groupby agg 获取 lag1

时间:2018-08-23 03:04:51

标签: python pandas aggregate

def lag1(x):
    return x[(len(x)-1)]
x=pd.Series([12,3,4,5,6])
lag1(x)
Out[65]: 6
dat.shape
Out[70]: (247619, 33)
d2=dat.groupby('PATID_CD').agg(lag1)
Traceback (most recent call last):

  File "<ipython-input-71-f514757a3da8>", line 1, in <module>
    d2=dat.groupby('PATID_CD').agg(lag1)

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py", line 4658, in aggregate
    return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py", line 4109, in aggregate
    result = self._aggregate_generic(arg, *args, **kwargs)

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py", line 4133, in _aggregate_generic
    return self._aggregate_item_by_item(func, *args, **kwargs)

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py", line 4162, in _aggregate_item_by_item
    colg.aggregate(func, *args, **kwargs), data)

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py", line 3497, in aggregate
    result = self._aggregate_named(func_or_funcs, *args, **kwargs)

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py", line 3627, in _aggregate_named
    output = func(group, *args, **kwargs)

  File "<ipython-input-64-be977293b7b9>", line 2, in lag1
    return x[(len(x)-1)]

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\series.py", line 766, in __getitem__
    result = self.index.get_value(self, key)

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 3103, in get_value
    tz=getattr(series.dtype, 'tz', None))

  File "pandas\_libs\index.pyx", line 106, in pandas._libs.index.IndexEngine.get_value

  File "pandas\_libs\index.pyx", line 114, in pandas._libs.index.IndexEngine.get_value

  File "pandas\_libs\index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc

  File "pandas\_libs\hashtable_class_helper.pxi", line 958, in pandas._libs.hashtable.Int64HashTable.get_item

  File "pandas\_libs\hashtable_class_helper.pxi", line 964, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 23

我不知道为什么我的函数不能正常工作:它抛出了 KeyError,提示该键不存在。这有点令人困惑。我的做法是否正确,还是有其他解决方案?

dat.groupby('PATID_CD').agg('mean')
Out[73]: 
           MONTH_LOOKBACK_NR      CCYYMM_CD    ...     ENG_SPOKEN  EVENT_FL
PATID_CD                                       ...                         
584                     12.0  201556.500000    ...            1.0       0.0
4277                    12.0  201556.500000    ...            1.0       0.0

我也尝试过:

dat.groupby('PATID_CD').agg(lambda x : x.iloc[-1,:])

这是一个好方法,但是我不能将此函数放入可以与其他函数进行计算的列表中:

def lag1(x):
    return x.iloc[-1,:]
d2=dat.groupby(dat['PATID_CD']).agg({'mean','max','min','std','skew', lambda x:len(x),kurtosis,lag1})
Traceback (most recent call last):

  File "<ipython-input-86-ac95a8297b5c>", line 1, in <module>

    d2=dat.groupby(dat['PATID_CD']).agg({'mean','max','min','std','skew', lambda x:len(x),kurtosis,lag1})

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py", line 4658, in aggregate
    return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py", line 4089, in aggregate
    result, how = self._aggregate(arg, _level=_level, *args, **kwargs)

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\base.py", line 551, in _aggregate
    _axis=_axis), None

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\base.py", line 596, in _aggregate_multiple_funcs
    results.append(colg.aggregate(arg))

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py", line 3485, in aggregate
    (_level or 0) + 1)

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py", line 3558, in _aggregate_multiple_funcs
    results[name] = obj.aggregate(func)

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py", line 3497, in aggregate
    result = self._aggregate_named(func_or_funcs, *args, **kwargs)

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py", line 3627, in _aggregate_named
    output = func(group, *args, **kwargs)

  File "<ipython-input-85-6bbffa1ca952>", line 2, in lag1
    return x.iloc[-1,:]

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\indexing.py", line 1472, in __getitem__
    return self._getitem_tuple(key)

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\indexing.py", line 2013, in _getitem_tuple
    self._has_valid_tuple(tup)

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\indexing.py", line 220, in _has_valid_tuple
    raise IndexingError('Too many indexers')  

IndexingError: Too many indexers

与此相同:

x=pd.Series([12,3,4,5,6])
lag1(x)
Traceback (most recent call last):

  File "<ipython-input-85-6bbffa1ca952>", line 5, in <module>
    lag1(x)

  File "<ipython-input-85-6bbffa1ca952>", line 2, in lag1
    return x.iloc[-1,:]

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\indexing.py", line 1472, in __getitem__
    return self._getitem_tuple(key)

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\indexing.py", line 2013, in _getitem_tuple
    self._has_valid_tuple(tup)

  File "D:\Users\shan xu\Anaconda3\lib\site-packages\pandas\core\indexing.py", line 220, in _has_valid_tuple
    raise IndexingError('Too many indexers')

IndexingError: Too many indexers

1 个答案:

答案 0 :(得分:0)

您还没有解释您想要完成什么,而且从代码中也看不出来。我在这里的声望(reputation)不足,无法将其作为评论添加,因此请考虑以下几点:

1)您的第一个示例是在带有隐式 `RangeIndex` 的 `Series` 上执行的,因此 `x[(5-1)] == 6`。您的第二个示例似乎是一个以 `PATID_CD` 为索引的 DataFrame。如果 `PATID_CD.dtype` 是例如 `object`,那么仅仅因为您将错误的参数类型(`int`)传递给了 pandas 的索引表达式(参见 pandas 索引文档),就会得到 `KeyError` 异常。

2)如果您只想提取组的最后一行,请通过类似这样的方式编写组

KeyError