Question

我有一个 DataFrame df，其中保存了每天的降水量，大致如下：

   Jan   Feb   Mar   Apr   May   Jun   Jul   
0   0.00  0.00  0.07  0.02  0.00  0.00   NaN  
1   0.80   NaN  0.00  0.00  0.03  0.00  0.00  
2   0.20  0.00   NaN  0.14  0.00  0.00  0.00 
3   0.00  0.00  0.00  0.01  0.00  0.00  0.00  
4    NaN   NaN  0.00  0.00  0.90  0.50  0.00  
5   0.01  0.00  0.00  0.12  0.17   NaN  0.77  
6   0.77   NaN   NaN  0.00  0.18   NaN  0.00  
7   0.00   NaN  0.04  0.00  0.00  0.00  0.11  
8   0.00  0.56  0.00  0.00  0.02  0.00  0.00  
9   0.00  0.00  0.04  0.00  0.00  0.00  0.00  
10  0.16  0.00  0.00  0.00  0.42  0.00  0.00  
11  0.00  0.08  0.00  0.00  0.78  0.00  0.00  
...

数据涵盖一年中每个月的每一天。我想把所有这些数据绘制在同一张连续的累积折线图上（即如果第1天和第3天下雨，那么第3天的数据点就是第1天和第3天的降雨量之和；如果第5天又下雨，则该点为第1、3、5天的降雨量之和，依此类推）。对这些值求和并绘制这样的图，最好的方法是什么？

Answer 1

您要找的似乎是 .stack()。不过，这个问题比 .stack() + .cumsum() 要复杂一些。原因在于，假设您有一个 31x12 的 DataFrame（行为日，列为月），您需要告诉 pandas 如何把每个月/日组合映射到一年中的某一天；否则，pandas 会认为一年有 372 天。下面的第一个函数正是用来完成这一映射的；其后的代码先创建一些示例数据，然后使用该函数得到按一年中的天数排列的累积和。

import datetime


def stack_daily(df, year='current'):
    """Build calendar dates for the non-NaN cells of a day-by-month table.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose index equals ``pd.RangeIndex(1, 32)`` (day of month)
        and whose columns equal ``pd.RangeIndex(1, 13)`` (month).
    year : int or 'current'
        Calendar year used to build the dates.  ``'current'`` (the
        default) resolves to today's year.

    Returns
    -------
    pd.Series
        One ``datetime64`` value per non-NaN cell of `df`, in row-major
        order (day 1 of every month, then day 2, ...) -- the same order in
        which ``df.stack()`` lists the cells.

    Raises
    ------
    ValueError
        If the index/columns do not have the required layout, or if a
        non-NaN cell sits on a day that does not exist in `year`
        (for example February 30), since no date can be assembled for it.
    """
    # Confirm index is 1-indexed and ends at 31
    if not np.array_equal(df.index, pd.RangeIndex(1, 32)):
        raise ValueError('`df` should have `pd.RangeIndex(1, 32)`')
    # Same logic for columns
    if not np.array_equal(df.columns, pd.RangeIndex(1, 13)):
        raise ValueError('`df` should have columns `pd.RangeIndex(1, 13)`')
    if year == 'current':
        year = datetime.date.today().year

    # Locate the non-NaN cells directly instead of unpacking the stacked
    # MultiIndex via `Index.get_values()`, which no longer exists in
    # current pandas.  `np.nonzero` walks the array in row-major order.
    day_pos, month_pos = np.nonzero(df.notna().to_numpy())

    # The labels were validated above as 1..31 and 1..12, so a label is
    # simply its zero-based position plus one.
    dates = {'year': [year] * len(day_pos),
             'month': month_pos + 1,
             'day': day_pos + 1}
    return pd.to_datetime(dates)


# Create random precipitation data
import numpy as np
import pandas as pd


# Fixed seed so the example is reproducible.
np.random.seed(123)
# Start from an all-NaN 31-day x 12-month grid ...
data = np.full((31, 12), np.nan)
# ... and fill a random subset of the cells with random values.
# Use the builtin `bool`: the `np.bool` alias is gone from recent NumPy.
mask = np.random.randint(0, 2, size=data.shape, dtype=bool)
vals = np.random.rand(*data.shape)
data[mask] = vals[mask]
# Blank out days 30 and 31 for every month.
data[29:] = np.nan
df = pd.DataFrame(data, index=pd.RangeIndex(1, 32),
                  columns=pd.RangeIndex(1, 13))

# Manipulate to get day-of-year index
idx = stack_daily(df)
doy = idx.dt.dayofyear

# `stack_daily` yields one date per non-NaN cell, so drop the NaN cells
# explicitly here: the values must line up one-to-one with `doy`, and
# relying on `stack()` to discard NaN implicitly is not portable across
# pandas versions.
total_precip = df.stack().dropna().reset_index(drop=True)
total_precip.index = doy
# Order by day of year before accumulating so the sum runs Jan 1 -> Dec 31.
total_precip = total_precip.sort_index().cumsum()

# %matplotlib inline
total_precip.plot()

更新

新函数：

import datetime


def stack_daily(df, year='current', sort=True, dropna=True):
    """Flatten a day-by-month table into a Series indexed by day of year.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose index equals ``pd.RangeIndex(1, 32)`` (day of month)
        and whose columns equal ``pd.RangeIndex(1, 13)`` (month).
    year : int or 'current'
        Calendar year that decides which day/month cells are real dates
        (e.g. whether February 29 exists).  ``'current'`` (the default)
        resolves to today's year.
    sort : bool
        If True (default), order the result by day of year.  If False the
        cells keep row-major order (day 1 of every month, then day 2, ...).
    dropna : bool
        If True (default), remove entries whose value is NaN.

    Returns
    -------
    pd.Series
        Values of `df` for the cells that are valid dates in `year`,
        indexed by integer day of year (1-based).

    Raises
    ------
    ValueError
        If the index or columns of `df` do not have the required layout.
    """
    # Confirm index is 1-indexed and ends at 31
    if not np.array_equal(df.index, pd.RangeIndex(1, 32)):
        raise ValueError('`df` should have `pd.RangeIndex(1, 32)`')
    # Same logic for columns
    if not np.array_equal(df.columns, pd.RangeIndex(1, 13)):
        raise ValueError('`df` should have columns `pd.RangeIndex(1, 13)`')
    if year == 'current':
        year = datetime.date.today().year

    # Every real calendar day of the requested year.
    true_dates = pd.date_range(start='{}-01-01'.format(year),
                               end='{}-12-31'.format(year))
    day_pos = np.asarray(true_dates.day) - 1
    month_pos = np.asarray(true_dates.month) - 1

    # Lookup grid with the same (day, month) layout as `df`: each real date
    # holds its day of year; cells such as Feb 30 stay 0.  This replaces
    # the 372-entry stacked index and the per-element membership test
    # against a list of dates.
    doy_grid = np.zeros(df.shape, dtype=np.int64)
    doy_grid[day_pos, month_pos] = np.asarray(true_dates.dayofyear)

    # Flatten row-major (the order `stack` would give) and keep only the
    # cells that correspond to real dates.
    is_real_date = (doy_grid > 0).ravel()
    stacked = pd.Series(df.to_numpy().ravel()[is_real_date],
                        index=doy_grid.ravel()[is_real_date])

    if sort:
        stacked.sort_index(inplace=True)
    if dropna:
        stacked.dropna(inplace=True)
    return stacked

示例：

# Create random precipitation data
# This gets you a DataFrame with 12 months on the columns and
#     31 days on the index.  Both are 1-indexed i.e. start at 1.
#     There is a mix of 0.00s, NaN, and other floats, mimicking
#     the data from your question.
import numpy as np
import pandas as pd

# Fixed seed so the example is reproducible.
np.random.seed(123)
# Start from an all-zero 31-day x 12-month grid.
data = np.zeros((31, 12))
# Two independent random masks: one picks the cells that receive a random
# value, the other picks the cells that become NaN.
# Use the builtin `bool`: the `np.bool` alias is gone from recent NumPy.
mask1 = np.random.randint(0, 2, size=data.shape, dtype=bool)
mask2 = np.random.randint(0, 2, size=data.shape, dtype=bool)
vals = np.random.rand(*data.shape)
data[mask1] = vals[mask1]
# Applied last, so NaN wins wherever both masks select the same cell.
data[mask2] = np.nan
df = pd.DataFrame(data, index=pd.RangeIndex(1, 32),
                  columns=pd.RangeIndex(1, 13))

# %matplotlib inline
# Running total of precipitation by day of year, drawn as a single line.
cumulative_precip = stack_daily(df).cumsum()
cumulative_precip.plot()

Python数据帧累积线图

1 个答案:

更新