我已经阅读了关于此主题的堆栈交换的其他页面,但我仍然无法弄清楚为什么selectedSub <- subset(corList, (abs(cor) > 0.2 & j == 'Income'))
返回错误&lt; 0 rows&gt; (或0长度的row.names)。
# Url
library(RCurl)
urlfile <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
x <- getURL(urlfile, ssl.verifypeer = FALSE)
#Read csv
adults <- read.csv(textConnection(x), header = F)
head(adults,2)
# Change the header names
names(adults) = c('Age', 'Workclass', 'FinalWeight', 'Eductation', 'EducationNumber',
'MaritalStatus', 'Occupation', 'Relationship', 'Race',
'Sex', 'CapitalGain', 'CapitalLoss', 'HoursWeek',
'NativeCountry', 'Income')
# Change to binary
adults$Income <- ifelse(adults$Income ==' <= 50k', 0,1)
# Change all the factors to numbers
library(caret)
dmy <- dummyVars(" ~.", data = adults)
adultsTrsf <- data.frame(predict(dmy, newdata = adults))
dim(adultsTrsf)
head(adultsTrsf)
## Correlation matrix with p-values.
cor.prob <- function (X, dfr = nrow(X) - 2) {
R <- cor(X, use="pairwise.complete.obs")
above <- row(R) < col(R)
r2 <- R[above]^2
Fstat <- r2 * dfr/(1 - r2)
R[above] <- 1 - pf(Fstat, 1, dfr)
R[row(R) == col(R)] <- NA
R
}
## Use this to dump the cor.prob output to a 4 column matrix
## with row/column indices, correlation, and p-value.
flattenSquareMatrix <- function(m) {
if( (class(m) != "matrix") | (nrow(m) != ncol(m))) stop("Must be a square matrix.")
if(!identical(rownames(m), colnames(m))) stop("Row and column names must be equal.")
ut <- upper.tri(m)
data.frame(i = rownames(m)[row(m)[ut]],
j = rownames(m)[col(m)[ut]],
cor=t(m)[ut],
p=m[ut])
}
##
corMasterList <- flattenSquareMatrix (cor.prob(adultsTrsf))
# Order by absolute correlation value
corList <- corMasterList [order(-abs(corMasterList$cor)),]
##
head(corList)
##
selectedSub <- subset(corList, (abs(cor) > 0.2 & j == 'Income'))
##
selectedSub