我正在使用 Python 中的 rpy2 库,将 R 包应用于 pandas 数据框(DataFrame)。
我想把 R 包 scorecard 中的函数 woebin 应用到 pandas 数据框上,但是遇到了错误,不知道原因。
这是我的代码:
# python
import pandas as pd
import numpy as np
import rpy2
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.vectors import DataFrame
# R
# Load R packages through rpy2: `base` is R's base package, `score` is scorecard.
base = importr('base')
score = importr("scorecard")
# Create pandas df
df = pd.DataFrame( np.random.randn(5,4), # 5 rows, 4 columns of random normal values
columns = ["A","B","C","D"], # name of columns
index = ["Max", "Nathy", "Tom", "Joe", "Kathy"] )
df["C"] = [0,0,1,0,1] # overwrite column C with 0/1 values; passed as `y` to woebin below (original note: "BGI")
# Turn on the pandas <-> R conversion for the calls that follow.
pandas2ri.activate()
# Convert pandas to r
df_r = pandas2ri.py2ri(df)
df_r = base.as_data_frame(df_r)
# Show what type df_r has after the round trip through as.data.frame.
print(type(df_r))
# NOTE(review): conversion is switched off before df_r is handed to woebin, and
# the question reports the R error at that call -- confirm that the type printed
# above is still an R data.frame rather than a pandas object.
pandas2ri.deactivate()
# Bin predictors A and B against the 0/1 column C.
bins = score.woebin(df_r,
y = "C",
x = base.c("A","B") )
执行最后一条命令时出现以下错误:
Error in matrix(unlist(value, recursive = FALSE, use.names = FALSE), nrow = nr, :
'data' must be of a vector type, was 'NULL'
答案 0 :(得分:1)
下面是使用 pyper 的一种方法:
import pandas as pd
import numpy as np
from pyper import *
# Build the sample frame: random normal values with a 0/1 column C.
df = pd.DataFrame( np.random.randn(5,4), # 5 rows, 4 columns
columns = ["A","B","C","D"], # name of columns
index = ["Max", "Nathy", "Tom", "Joe", "Kathy"] )
df["C"] = [0,0,1,0,1] # overwrite column C with 0/1 values; used as `y` in woebin below
# Create the pyper R handle with pandas support enabled.
r=R(use_pandas=True)
# Send the pandas frame to R under the name df_r.
r.assign("df_r", df)
r("library(scorecard)")
# Run woebin on the R side; the result is stored in the R variable `bins`.
r('bins <- woebin(df_r, y = "C", c("A", "B"))')
# Retrieve the R variable `bins` into Python.
binsN = r.get('bins')
-检查输出
print(binsN)
#{'A': variable bin count count_distr good bad #\
#0 b'A' b'[-Inf,1.777599442)' 3 0.6 2 1
#1 b'A' b'[1.777599442, Inf)' 2 0.4 1 1
# badprob woe bin_iv total_iv breaks \
#0 0.333333 -0.287682 0.047947 0.115525 b'1.777599442'
#1 0.500000 0.405465 0.067578 0.115525 b'Inf'
# is_special_values
#0 False
#1 False , 'B': variable bin count #count_distr good bad \
#0 b'B' b'[-Inf,0.2711706509)' 3 0.6 2 1
#1 b'B' b'[0.2711706509, Inf)' 2 0.4 1 1
# badprob woe bin_iv total_iv breaks \
#0 0.333333 -0.287682 0.047947 0.115525 b'0.2711706509'
#1 0.500000 0.405465 0.067578 0.115525 b'Inf'
# is_special_values
#0 False
#1 False }
也可以在 R 中借助 reticulate 加载(source)Python 对象来完成。先创建一个 Python 脚本('pytmp.py'):
#pytmp.py
import pandas as pd
import numpy as np
# Build the sample frame that the R script later uses as `df`.
df = pd.DataFrame( np.random.randn(5,4), # 5 rows, 4 columns
columns = ["A","B","C","D"], # name of columns
index = ["Max", "Nathy", "Tom", "Joe", "Kathy"] )
df["C"] = [0,0,1,0,1] # overwrite column C with 0/1 values (original note: "BGI")
df
- 然后在 R 中运行:
library(reticulate)
library(scorecard)
# Point reticulate at a Python interpreter / virtualenv (paths are specific to the author's machine).
use_python("/usr/local/bin/python")
use_virtualenv("~/r-reticulate")
# Run pytmp.py; the `df` used below is the object created by that script.
source_python("pytmp.py")
# Bin predictors A and B against the 0/1 column C.
bins <- woebin(df, y = "C", x = c("A","B") )
# Print the result.
bins
#$A
# variable bin count count_distr good bad badprob woe bin_iv total_iv breaks is_special_values
#1: A [-Inf,0.895928754) 3 0.6 2 1 0.3333333 -0.2876821 0.04794701 0.1155245 0.895928754 FALSE
#2: A [0.895928754, Inf) 2 0.4 1 1 0.5000000 0.4054651 0.06757752 0.1155245 Inf FALSE
#$B
# variable bin count count_distr good bad badprob woe bin_iv total_iv breaks is_special_values
#1: B [-Inf,0.2356073663) 3 0.6 2 1 0.3333333 -0.2876821 0.04794701 0.1155245 0.2356073663 FALSE
#2: B [0.2356073663, Inf) 2 0.4 1 1 0.5000000 0.4054651 0.06757752 0.1155245 Inf FALSE
注意:我们没有设置种子,因此每次运行的值都不同