我试图比较R和Python中随机森林模型的结果。我用来比较模型性能的关键指标是AUC(ROC曲线下的面积),原因是AUC值反映了预测值(即预测概率)的分布。我确实发现R和Python之间的AUC值存在一些显著差异。关于R和Python之间的差异,我阅读了Stack Overflow上的一些相关问题和答案,但我认为我的问题与它们不同。
我试图在R和Python中保持一些关键的超参数相同。他们是:
ntree
设置为Python中的n_estimators
mtry
设置为Python中的max_features
nodesize
设置为Python中的min_samples_leaf
maxnodes
设置为Python中的max_leaf_nodes(在R中默认值为NULL,在Python中默认值为None)
。replace
设置为等于Python中的bootstrap
,即两者均为True
或两者均为False
。这是一个二分类问题,有86个预测变量。每个预测变量都是连续型或布尔型,在R中没有使用因子类型的预测变量。训练数据中有2008个观测,测试数据中有335个观测。两个数据集的响应率相同,均为79.7%。
结果如下:
R中的模型1结果
training_auc=0.9249080
,test_auc=0.6308934
R中的模型2结果
training_auc=0.9245665
,test_auc=0.6364838
Python中的模型1结果
training_auc=0.80515863
,test_auc=0.62194316
Python中的模型2结果
training_auc=0.86075733
,test_auc=0.61522362
您可以发现R和Python之间模型2(非自举采样)中AUC值的差异小于模型1(自举采样),特别是在训练数据的AUC中。
我的问题是:
为什么即使我在R和Python中设置了相同的超参数,训练数据上的AUC仍有如此大的差异?
我错过了任何重要参数吗?或者我的R或Python代码有错误吗?
如何在R中使用classwt
并在Python中使用class_weight
?
library(randomForest)
library(glmnet)
# Point the session at the folder that holds the exported CSV splits.
setwd("D:/Project Files2/Python Efficiency/test RF using another dataset")

# Predictor tables: the files carry no header row.
X_train01 <- read.csv("X_train_0.csv", header = FALSE)
X_test01 <- read.csv("X_test_0.csv", header = FALSE)

# Label tables: also headerless, so the column is named explicitly.
y_train01 <- read.csv("y_train_0.csv", header = FALSE)
y_test01 <- read.csv("y_test_0.csv", header = FALSE)
names(y_train01) <- "response"
names(y_test01) <- "response"
#define a function for RF
#' Fit a random forest and report training/test AUC
#'
#' Seeds the RNG, fits `randomForest()` on the training predictors with the
#' first column of `y_train01` converted to a factor, then scores both data
#' sets with `auc()` on the second column of the predicted probability matrix.
#'
#' @param X_train01,X_test01 Predictor tables for the training and test sets.
#' @param y_train01,y_test01 Label tables; only their first column is read.
#' @param ntree,mtry,nodesize,maxnodes,replace,classwt Forwarded unchanged to
#'   the `randomForest()` arguments of the same name.
#' @param rnd_seed Value handed to `set.seed()` before fitting.
#' @return The two AUC values in a vector named `training_auc`, `test_auc`.
run_quick_rf4 <- function(X_train01, y_train01, X_test01, y_test01, ntree,
                          mtry, nodesize, maxnodes = NULL, replace = TRUE,
                          classwt = NULL, rnd_seed = 12345) {
  set.seed(rnd_seed)

  # sampsize is pinned to the full row count for every tree, regardless of
  # whether rows are drawn with or without replacement.
  forest <- randomForest(
    x = X_train01,
    y = as.factor(y_train01[, 1]),
    ntree = ntree,
    mtry = mtry,
    nodesize = nodesize,
    maxnodes = maxnodes,
    replace = replace,
    sampsize = nrow(X_train01),
    classwt = classwt
  )

  # NOTE(review): auc() is not defined in this function; presumably it comes
  # from the attached glmnet package -- confirm.
  # Column 2 of the probability matrix is scored; this assumes it holds the
  # positive class -- confirm against the factor level order.
  prob_train <- predict(forest, X_train01, type = "prob")
  auc_train <- auc(y_train01[, 1], prob_train[, 2])

  prob_test <- predict(forest, X_test01, type = "prob")
  auc_test <- auc(y_test01[, 1], prob_test[, 2])

  scores <- c(auc_train, auc_test)
  names(scores) <- c("training_auc", "test_auc")
  scores
}
#>>>>>>>>>>>>>>>>>>>>>>>>>End of this function>>>>>>>>>>>>>>>>>>
# Fit the two forests with identical hyperparameters; only `replace` differs.
# Model 1: rows drawn with replacement.
run_quick_rf4(
  X_train01, y_train01, X_test01, y_test01,
  ntree = 500, mtry = 20, nodesize = 20,
  maxnodes = NULL, replace = TRUE, classwt = NULL, rnd_seed = 12345
)
# Model 2: rows drawn without replacement.
run_quick_rf4(
  X_train01, y_train01, X_test01, y_test01,
  ntree = 500, mtry = 20, nodesize = 20,
  maxnodes = NULL, replace = FALSE, classwt = NULL, rnd_seed = 12345
)
import numpy as np
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import datetime
import os
# Change the working directory so that the relative CSV file names built in
# run_quick_RF_final resolve inside this data folder.
# NOTE(review): the R script in this file calls setwd() on a different
# directory -- confirm both scripts read the same CSV files.
os.chdir("D:/yxue/Uplift_Gilenya/followup/Data")
# only specify sample weight
def run_quick_RF_final(file_counter, n_estimators, max_features, min_samples_leaf, max_leaf_nodes=None, class_weight={0: 1, 1: 1}, s_wt={0:1, 1:1}, bootstrap=True, random_seed=4568):
    """Fit a random forest on one CSV split and return its train/test AUC.

    Loads ``X_train_<n>.csv``, ``y_train_<n>.csv``, ``X_test_<n>.csv`` and
    ``y_test_<n>.csv`` (``<n>`` is ``file_counter``) from the current working
    directory, fits a gini-criterion ``RandomForestClassifier`` and scores
    both splits with ``roc_auc_score`` on probability column 1.

    Args:
        file_counter: Integer suffix used to build the four CSV file names.
        n_estimators, max_features, min_samples_leaf, max_leaf_nodes,
        class_weight, bootstrap: Passed to the ``RandomForestClassifier``
            parameters of the same name.
        s_wt: Mapping with keys ``0`` and ``1`` giving the per-sample weight
            applied to rows whose label equals that key, or ``None`` to fit
            without sample weights.
        random_seed: Passed as the classifier's ``random_state``.

    Returns:
        ``np.array([train_auc, test_auc])``.
    """
    # The dict defaults in the signature are only read, never mutated, so
    # sharing them across calls is harmless.
    X_train = np.loadtxt('X_train_%d.csv' % file_counter, delimiter=',', skiprows=0)
    y_train = np.loadtxt('y_train_%d.csv' % file_counter, delimiter=',', skiprows=0)
    X_test = np.loadtxt('X_test_%d.csv' % file_counter, delimiter=',', skiprows=0)
    y_test = np.loadtxt('y_test_%d.csv' % file_counter, delimiter=',', skiprows=0)

    rf_model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_features=max_features,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
        class_weight=class_weight,
        criterion='gini',
        bootstrap=bootstrap,
        random_state=random_seed,
    )

    # Identity check: `!= None` would be evaluated element-wise (and then fail
    # inside `if`) if a caller ever passed an array-like weight spec.
    if s_wt is not None:
        # Rows whose label is neither 0 nor 1 keep the initial weight of 1.0.
        sample_wt = np.ones((len(y_train),), dtype=np.float64)
        sample_wt[y_train == 0] = float(s_wt[0])
        sample_wt[y_train == 1] = float(s_wt[1])
    else:
        sample_wt = None

    rf_model.fit(X_train, y_train, sample_weight=sample_wt)

    # Column 1 of predict_proba is scored; presumably this is the label-1
    # class -- confirm against rf_model.classes_.
    pred_train = rf_model.predict_proba(X_train)
    train_auc = roc_auc_score(y_train, pred_train[:, 1])
    pred_test = rf_model.predict_proba(X_test)
    test_auc = roc_auc_score(y_test, pred_test[:, 1])

    return np.array([train_auc, test_auc])
# run random forest model. the parameter setting is same as in R
# Model 1: bootstrap=True
run_quick_RF_final(
    file_counter=0,
    n_estimators=500,
    max_features=20,
    min_samples_leaf=20,
    max_leaf_nodes=None,
    class_weight=None,
    s_wt=None,
    bootstrap=True,
    random_seed=4568,
)
# Model 2: bootstrap=False
run_quick_RF_final(
    file_counter=0,
    n_estimators=500,
    max_features=20,
    min_samples_leaf=20,
    max_leaf_nodes=None,
    class_weight=None,
    s_wt=None,
    bootstrap=False,
    random_seed=4568,
)