< 0行> (或0长度的row.names)

时间:2016-09-22 00:34:01

标签: r matrix dataframe correlation data-science

我已经阅读了关于此主题的堆栈交换的其他页面,但我仍然无法弄清楚为什么selectedSub <- subset(corList, (abs(cor) > 0.2 & j == 'Income'))返回错误&lt; 0 rows&gt; (或0长度的row.names)。

# Url
library(RCurl)
urlfile <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

x <- getURL(urlfile, ssl.verifypeer = FALSE)

#Read csv
adults <- read.csv(textConnection(x), header = F)

head(adults,2)

# Change the header names
names(adults) = c('Age', 'Workclass', 'FinalWeight', 'Eductation',  'EducationNumber',
              'MaritalStatus', 'Occupation', 'Relationship', 'Race', 
              'Sex', 'CapitalGain', 'CapitalLoss', 'HoursWeek', 
              'NativeCountry', 'Income') 

# Change to binary
adults$Income <- ifelse(adults$Income ==' <= 50k', 0,1)

# Change all the factors to numbers
library(caret)
dmy <- dummyVars(" ~.", data = adults)
adultsTrsf <- data.frame(predict(dmy, newdata = adults))
dim(adultsTrsf)
head(adultsTrsf)

## Correlation matrix with p-values.
cor.prob <- function (X, dfr = nrow(X) - 2) {
R <- cor(X, use="pairwise.complete.obs")
above <- row(R) < col(R)
r2 <- R[above]^2
Fstat <- r2 * dfr/(1 - r2)
R[above] <- 1 - pf(Fstat, 1, dfr)
R[row(R) == col(R)] <- NA
R
}

## Use this to dump the cor.prob output to a 4 column matrix
## with row/column indices, correlation, and p-value.
flattenSquareMatrix <- function(m) {
if( (class(m) != "matrix") | (nrow(m) != ncol(m))) stop("Must be a square matrix.")
if(!identical(rownames(m), colnames(m))) stop("Row and column names must be equal.")
ut <- upper.tri(m)
data.frame(i = rownames(m)[row(m)[ut]],
         j = rownames(m)[col(m)[ut]],
         cor=t(m)[ut],
         p=m[ut])
}

##
corMasterList <- flattenSquareMatrix (cor.prob(adultsTrsf))

# Order by absolute correlation value
corList <- corMasterList [order(-abs(corMasterList$cor)),]

##
head(corList)

##
selectedSub <- subset(corList, (abs(cor) > 0.2 & j == 'Income'))

## 
selectedSub

0 个答案:

没有答案