import pandas as pd
import numpy as np

def ewma_cov_pairwise_pd(x, y, alpha=0.06):
    """Return the latest exponentially weighted covariance of two Series.

    Observations where either series is missing are discarded, both series
    are demeaned over the remaining observations, and the element-wise
    product is smoothed with ``Series.ewm(alpha=alpha).mean()``.

    Args:
        x: First pandas Series.
        y: Second pandas Series, aligned with ``x``.
        alpha: Smoothing factor passed to ``Series.ewm``.

    Returns:
        The last value of the smoothed product, or ``nan`` when the two
        series have no overlapping observations.
    """
    # Blank out each series wherever the other one is missing so that both
    # means are taken over the same set of observations.
    x = x.mask(y.isnull(), np.nan)
    y = y.mask(x.isnull(), np.nan)
    covariation = ((x - x.mean()) * (y - y.mean())).dropna()
    if covariation.empty:
        # No common observations: there is no last element to return.
        return np.nan
    return covariation.ewm(alpha=alpha).mean().iloc[-1]

def ewma_cov_pd(rets, alpha=0.06):
    """Build a symmetric covariance matrix by calling the pairwise routine.

    Only the upper triangle (including the diagonal) is computed; each value
    is mirrored into the lower triangle.

    Args:
        rets: DataFrame whose columns are the series to compare.
        alpha: Smoothing factor forwarded to ``ewma_cov_pairwise_pd``.

    Returns:
        DataFrame indexed and labelled by ``rets.columns``.
    """
    labels = rets.columns
    size = len(labels)
    matrix = np.zeros((size, size))
    for row in range(size):
        left = rets.iloc[:, row]
        for col in range(row, size):
            value = ewma_cov_pairwise_pd(left, rets.iloc[:, col], alpha=alpha)
            matrix[row, col] = value
            matrix[col, row] = value
    return pd.DataFrame(matrix, columns=labels, index=labels)



# Demo input: an n-by-n frame of normal draws (mean 0, std 1), fed to the
# double-loop pairwise implementation.
n = 100  # n is typically 2000
rets = pd.DataFrame(np.random.normal(loc=0, scale=1., size=(n, n)))
cov_pd = ewma_cov_pd(rets)



利用 Quang Hoang 提供的答案，一个能在更合理的时间内得到预期结果的潜在解决方案大致如下：

def ewma_cov_frame_qh(rets, alpha=0.06):
    """Compute an exponentially weighted covariance matrix in one matrix product.

    Each row is weighted by ``(1 - alpha) ** age``, where the last row has
    age 0. Columns are demeaned with their plain column means and the
    weighted cross-products are divided by the sum of the weights.

    Args:
        rets: DataFrame with one column per series.
        alpha: Decay parameter; row weights are powers of ``1 - alpha``.

    Returns:
        DataFrame indexed and labelled by ``rets.columns``.
    """
    # Reverse the exponents so the oldest row gets the smallest weight and
    # the newest row gets weight 1.
    weights = (1 - alpha) ** np.arange(len(rets))[::-1]
    normalized = (rets - rets.mean()).to_numpy()
    out = (weights * normalized.T) @ normalized / weights.sum()
    return pd.DataFrame(out, index=rets.columns, columns=rets.columns)

def ewma_cov_qh(rets, alpha=0.06):
    """Assemble a covariance matrix from windows that start where NaN counts change.

    For every index label at which the per-row count of missing values
    differs from the previous row, the rows from that label onward are taken,
    columns containing any NaN in that window are dropped, and
    ``ewma_cov_frame_qh`` is applied. Results are merged so that a cell is
    filled by the first window (in index order) that provides a value for it.

    Args:
        rets: DataFrame with one column per series.
        alpha: Decay parameter forwarded to ``ewma_cov_frame_qh``.

    Returns:
        DataFrame indexed and labelled by ``rets.columns``; cells that no
        window could fill remain NaN.
    """
    syms = rets.columns
    covar = pd.DataFrame(index=syms, columns=syms)
    # Count missing values per row once and reuse it for the difference.
    null_counts = rets.isnull().sum(axis=1)
    delta = null_counts.shift(1) - null_counts
    # The first delta is NaN (nothing precedes the first row); NaN != 0 is
    # True, so the first row is always kept as a window start.
    dates = delta.loc[delta != 0].index.tolist()
    for date in dates:
        frame = rets.loc[rets.index >= date].dropna(axis=1, how='any')
        cov = ewma_cov_frame_qh(frame, alpha=alpha).reindex(index=syms, columns=syms)
        # Only cells still missing are taken from this window.
        covar = covar.fillna(cov)
    return covar

# Run the window-based variant on the module-level `rets` frame.
cov_qh = ewma_cov_qh(rets)

这种做法没有满足“使用原生 Pandas / NumPy 函数计算底层协方差”的要求，而且计算时间取决于数据集中前导 NaN 的数量。



from concurrent.futures import ProcessPoolExecutor, as_completed
from functools import partial
def ewma_cov_mp_worker(date, rets, alpha=0.06):
    """Compute the covariance for the window of ``rets`` starting at ``date``.

    Rows with an index label below ``date`` are discarded, then every column
    that still contains a NaN is dropped before ``ewma_cov_frame_qh`` runs.
    The result is expanded back to the full set of columns of ``rets``.

    Args:
        date: Index label marking the first row of the window.
        rets: DataFrame with one column per series.
        alpha: Decay parameter forwarded to ``ewma_cov_frame_qh``.

    Returns:
        DataFrame indexed and labelled by ``rets.columns``; entries for
        dropped columns are NaN.
    """
    from_date = rets.index >= date
    complete = rets.loc[from_date].dropna(axis=1, how='any')
    window_cov = ewma_cov_frame_qh(complete, alpha=alpha)
    all_syms = rets.columns
    return window_cov.reindex(index=all_syms, columns=all_syms)

def ewma_cov_mp(rets, alpha=0.06, max_workers=6):
    """Assemble the covariance matrix with one worker task per window start.

    Window starts are the index labels at which the per-row count of missing
    values differs from the previous row. Each window is handed to
    ``ewma_cov_mp_worker`` in a process pool, and the per-window results are
    merged in index order so that a cell is filled by the first window that
    provides a value for it.

    Args:
        rets: DataFrame with one column per series.
        alpha: Decay parameter forwarded to the worker.
        max_workers: Size of the process pool.

    Returns:
        DataFrame indexed and labelled by ``rets.columns``; cells that no
        window could fill remain NaN.
    """
    covar = pd.DataFrame(index=rets.columns, columns=rets.columns)
    null_counts = rets.isnull().sum(axis=1)
    delta = null_counts.shift(1) - null_counts
    # The first delta is NaN and NaN != 0 is True, so the first row is
    # always a window start when there is any data at all.
    dates = delta.loc[delta != 0].index.tolist()
    if not dates:
        # Nothing to compute; avoid creating a process pool for no work.
        return covar

    func = partial(ewma_cov_mp_worker, rets=rets, alpha=alpha)

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        future_to_date = {executor.submit(func, date): date for date in dates}
        covs = {
            future_to_date[future]: future.result()
            for future in as_completed(future_to_date)
        }

    # Futures finish in arbitrary order; merge by the original date order so
    # that earlier windows take precedence.
    for date in dates:
        covar = covar.fillna(covs[date])

    return covar


def ewma(df, alpha=0.94):
    """Compute an exponentially weighted covariance matrix as a NumPy array.

    Each row is weighted by ``(1 - alpha) ** age``, where the last row has
    age 0. Columns are demeaned with their (NaN-skipping) column means and
    missing values are replaced by 0 after demeaning, so they add nothing to
    the weighted cross-products. The result is divided by the sum of all row
    weights.

    NOTE(review): the default ``alpha`` here is 0.94 while the other
    functions in this file default to 0.06 -- confirm which convention is
    intended.

    Args:
        df: DataFrame with one column per series.
        alpha: Decay parameter; row weights are powers of ``1 - alpha``.

    Returns:
        2-D ``numpy.ndarray`` with one row and column per column of ``df``.
    """
    weights = (1 - alpha) ** np.arange(len(df))[::-1]

    # fillna with 0 here
    normalized = (df - df.mean()).fillna(0).to_numpy()
    out = (weights * normalized.T) @ normalized / weights.sum()
    return out

 # verify
 # NOTE(review): `df` and `ewma_cov_pairwise` are not defined in the visible
 # part of this file (the closest names are `rets` and
 # `ewma_cov_pairwise_pd`) -- confirm before running. These lines also carry
 # a stray leading space, which Python rejects at module level.
 out = ewma(df)
 # Compare one off-diagonal entry of the matrix with the pairwise result.
 print(out[0,1] == ewma_cov_pairwise(df[0],df[1]) )
 # True

在我的系统上，对于 df.shape == (2000, 2000) 的数据，这只需约 150 ms，而您的代码运行几分钟也无法完成 :-)。