使用下面的pandas数据框,取自dict的字典:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
NaN = np.nan
dd ={'A': {'A': '1', 'B': '2', 'C': '3'},
'B': {'A': '4', 'B': '5', 'C': '6'},
'C': {'A': '7', 'B': '8', 'C': '9'}}
df_link_link = pd.DataFrame.from_dict(dd, orient='index')
我想形成一个新的pandas DataFrame,每行的行之间有Pearson相关的结果,不包括相同行之间的Pearson相关性(将A与其自身关联应该只是NaN
。这是拼写的在这里作为dicts的词典:
dict_congruent = {'A': {'A': NaN,
'B': pearsonr([NaN,2,3],[4,5,6]),
'C': pearsonr([NaN,2,3],[7,8,9])},
'B': {'A': pearsonr([4,NaN,6],[1,2,3]),
'B': NaN,
'C': pearsonr([4,NaN,6],[7,8,9])},
'C': {'A': pearsonr([7,8,NaN],[1,2,3]),
'B': pearsonr([7,8,NaN],[4,5,6]),
'C': NaN }}
其中NaN
只是numpy.nan
。有没有办法在pandas中执行此操作而无需迭代dicts的字典?我有大约7600万对,所以如果存在,非迭代方法会很棒。
答案 0 :(得分:8)
规范但不可行的解决方案
df.corr().mask(np.equal.outer(df.index.values, df.columns.values))
corr
的默认方法是pearson
。
<强> TL; DR 强>
转置您的数据以使用此
用弓包裹
非常广泛的数据
np.random.seed([3,1415])
m, n = 1000, 10000
df = pd.DataFrame(np.random.randn(m, n), columns=['s{}'.format(i) for i in range(n)])
魔术功能
def corr(df, step=100, mask_diagonal=False):
n = df.shape[0]
def corr_closure(df):
d = df.values
sums = d.sum(0, keepdims=True)
stds = d.std(0, keepdims=True)
def corr_(k=0, l=10):
d2 = d.T.dot(d[:, k:l])
sums2 = sums.T.dot(sums[:, k:l])
stds2 = stds.T.dot(stds[:, k:l])
return pd.DataFrame((d2 - sums2 / n) / stds2 / n,
df.columns, df.columns[k:l])
return corr_
c = corr_closure(df)
step = min(step, df.shape[1])
tups = zip(range(0, n, step), range(step, n + step, step))
corr_table = pd.concat([c(*t) for t in tups], axis=1)
if mask_diagonal:
np.fill_diagonal(corr_table.values, np.nan)
return corr_table
演示
ct = corr(df, mask_diagonal=True)
ct.iloc[:10, :10]
魔术解决方案
逻辑:
def corr_closure(df):
d = df.values # get underlying numpy array
sums = d.sum(0, keepdims=True) # pre calculate sums
stds = d.std(0, keepdims=True) # pre calculate standard deviations
n = d.shape[0] # grab number of rows
def corr(k=0, l=10):
d2 = d.T.dot(d[:, k:l]) # for this slice, run dot product
sums2 = sums.T.dot(sums[:, k:l]) # dot pre computed sums with slice
stds2 = stds.T.dot(stds[:, k:l]) # dot pre computed stds with slice
# calculate correlations with the information I have
return pd.DataFrame((d2 - sums2 / n) / stds2 / n,
df.columns, df.columns[k:l])
return corr