我在Windows 10和Spyder Python IDE上使用Python 2.7
我正在尝试计算从网络中任何一个节点到达任何其他节点的后验条件概率。网络由一个dataframe定义,其中每一行是fld1和fld2之间的一条有向连接(在图论中称为edge),而value是从fld1移动到fld2的概率。
为了计算这些概率,我需要遍历dataframe。我正在使用pandas中的iterrows,同时也在用一个while loop来捕获从一个节点到另一个节点的间接路径。
我的代码如下。我的问题是:我的代码是否正确——也就是说,我能否把pandas iterrows和while loop这样结合使用?
import pandas as pd
#from itertools import combinations
from itertools import permutations
df = pd.DataFrame({'fld1': ['apple', 'apple', 'bear','bear','car','car','car','dee','dee','eagle','eagle']
, 'fld2': ['bear', 'car', 'car','eagle','bear','dee','eagle','eagle','foo','dee','foo']
, 'value': [.3,.3,.2,.1,.3,.3,.2,.4,.1,.3,.2]})
## define global objects
#starter value holders
og_fld1_val = []
og_fld2_val = []
og_r_val = []
#df of already checked r_vals
dnc_df = pd.DataFrame(columns = ['fld1','fld2','distance'])
##df of all r_vals to find
flds = pd.Series(df.fld1.unique())
flds = pd.Series(flds.append(pd.Series(df.fld2.unique())).unique())
combos = []
for L in range(0, len(flds)+1):
for subset in permutations(flds, L):
if len(subset) == 2:
combos.append(subset)
rel_df = pd.DataFrame.from_records(data = combos, columns = ['fld1','fld2'])
####for all rows of df
#for each fld1-fld2 relationship in df
# aka (each edge in the network, starting with a-b)
for index, row in df.iterrows():
#take row 1 info for fld1 and fld2 seperately
og_fld1_val = df.fld1[index]
og_fld2_val = df.fld2[index]
og_r_val = df.value[index]
#add info to do not try again list
dnc_df.set_value(index, 'fld1', og_fld1_val)
dnc_df.set_value(index, 'fld2', og_fld2_val)
#variable value holders
#fld1_val = []
#fld2_val = []
#r_val = []
###fld1 has been established now for each path from fld1 outwards
for index, row in df.loc[df.fld1 == og_fld1_val].iterrows():
#see next connection that is not the terminal node
while og_fld2_val <> df.loc[df.fld1 == og_fld1_val].fld2[index]:
#capture relationship between previous node and next node
try:
r_val
except:
r_val = df.loc[df.fld1 == og_fld1_val].value[index]
else:
r_val = r_val * df.loc[df.fld1 == og_fld1_val].value[index]
#if r_val in globals():
# r_val = r_val * df.loc[df.fld1 == og_fld1_val].value[index]
#else:
# r_val = df.loc[df.fld1 == og_fld1_val].value[index]
if r_val < 0.001:
continue
我的目标是创建r_val
列,以便df
成为df2
。实际上,我的数据集很大(500K +行),这只是一个样本数据集。
# Expected result: the same edge list as df, plus the posterior r_val column.
_records = [
    ("apple", "bear",  0.3, 0.39),
    ("apple", "car",   0.3, 0.36),
    ("bear",  "car",   0.2, 0.2),
    ("bear",  "eagle", 0.1, 0.164),
    ("car",   "bear",  0.3, 0.3),
    ("car",   "dee",   0.3, 0.369),
    ("car",   "eagle", 0.2, 0.35),
    ("dee",   "eagle", 0.4, 0.4),
    ("dee",   "foo",   0.1, 0.18),
    ("eagle", "dee",   0.3, 0.3),
    ("eagle", "foo",   0.2, 0.23),
]
df2 = pd.DataFrame.from_records(_records, columns=["fld1", "fld2", "value", "r_val"])
答案 0(得分:1):
import pandas as pd

df = pd.DataFrame({'fld1': ['apple', 'apple', 'bear','bear','car','car','car','dee','dee','eagle','eagle']
                  , 'fld2': ['bear', 'car', 'car','eagle','bear','dee','eagle','eagle','foo','dee','foo']
                  , 'value': [.3,.3,.2,.1,.3,.3,.2,.4,.1,.3,.2]})

# Normalise each edge probability by its source node's total outgoing mass.
# groupby(...)["value"].transform("sum") broadcasts each group total back
# onto the original rows, so no set_index / drop / reset_index round trip
# is needed.  (The original assigned a whole DataFrame -- the result of
# df.groupby("fld1").sum() -- to a single column, which raises in modern
# pandas and would also try to "sum" the string column fld2.)
df["rval"] = df["value"] / df.groupby("fld1")["value"].transform("sum")
但是,如果你将转换似然/概率存储在n×n帧中会更容易。然后你可以这样做:
import pandas as pd
from numpy.random import rand

# Alternative layout: keep the transition likelihoods in an n x n frame
# (index = source node, columns = target node).
nodes = ("fld1", "fld2", "fld3")   # renamed: `vars` shadowed the builtin
n = len(nodes)
df = pd.DataFrame(rand(n, n), index=nodes, columns=nodes)

# df.sum(axis=0) gives the per-COLUMN totals, so this normalises every
# column to sum to 1.  To make each ROW sum to 1 instead, use
# df.div(df.sum(axis=1), axis=0).
dfprobs = df / df.sum(axis=0)
对于python图,我建议在igraph和networkx上查看。