我有一个包含20个变量(v1 - v20)的数据集。现在我想使用cor(...)计算v2与v10到v15中每个变量之间的相关性,以及v3与v10到v15中每个变量之间的相关性。最好的方法是什么?我是否必须像下面这样对每一对变量分别调用cor?
cor(v2, v10)
cor(v2, v11)
cor(v2, v12)
依此类推吗?
以下是实际数据集:
> dput(dataset)
structure(list(Number = 1:15, Question.1.1 = c(3L, 4L, 5L, 5L,
4L, 5L, 5L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L), Question.1.2 = c(1L,
2L, 1L, 1L, 4L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 1L, 1L, 1L), Question.2.1 = c(5L,
3L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), Question.2.2 = c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Question.3.1 = c(2L,
NA, 4L, 5L, 4L, 3L, 5L, 3L, 5L, 5L, 5L, 5L, 4L, 4L, 4L), Question.3.2 = c(2L,
NA, 1L, 1L, 2L, 2L, 1L, 4L, 3L, 1L, 1L, 1L, 2L, 2L, 1L), Question.4.1 = c(3L,
2L, 5L, 2L, 5L, 5L, 5L, 3L, 5L, 5L, 5L, 5L, 4L, 5L, 2L), Question.4.2 = c(2L,
2L, 1L, 2L, 2L, 1L, 2L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 4L), Question.5.1 = c(5L,
3L, 5L, 3L, 4L, 4L, 5L, 3L, 5L, 5L, 5L, 5L, 5L, 4L, 4L), Question.5.2 = c(2L,
2L, 1L, 1L, 3L, 2L, 1L, 3L, 4L, 1L, 1L, 1L, 1L, 1L, 1L), Question.6.1 = c(5L,
2L, 2L, 2L, 3L, 2L, 3L, 1L, 3L, 3L, 5L, 4L, 3L, 3L, 1L), Question.6.2 = c(2L,
3L, 2L, 1L, 2L, 3L, 3L, 3L, 3L, 2L, 1L, 1L, 2L, 2L, 1L), Question.7.1 = c(5L,
2L, 5L, 5L, 5L, 3L, 5L, 5L, 2L, 4L, 5L, 5L, 5L, 4L, 5L), Question.7.2 = c(1L,
4L, 1L, 1L, 2L, 2L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L), Question.8.1 = c(4L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), Question.8.2 = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Question.9.1 = c(5L,
3L, 5L, 4L, 4L, 5L, 5L, 5L, 4L, 5L, 5L, 5L, 5L, 4L, 3L), Question.9.2 = c(1L,
1L, 1L, 2L, 2L, 1L, 2L, 1L, 4L, 2L, 1L, 2L, 2L, 1L, 2L), AQ.1 = c(5L,
5L, 5L, 1L, 3L, 5L, 5L, 5L, 5L, 2L, 2L, 2L, 5L, 5L, 3L), AQ.2 = c(2L,
5L, 2L, 1L, 2L, 5L, 2L, 1L, 5L, 1L, 1L, 4L, 2L, 3L, 3L), Task.1 = c(5L,
2L, 5L, 1L, 4L, 5L, 5L, 4L, 4L, 5L, 5L, 4L, 5L, 5L, 5L), Task.2 = c(4L,
3L, 5L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), Task.3 = c(4L,
3L, 4L, 1L, 3L, 5L, 4L, 5L, 5L, 5L, 5L, 5L, 4L, 4L, 4L), Task.4 = c(5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), Task.5 = c(5L,
4L, 5L, 4L, 4L, 5L, 4L, 5L, 4L, 5L, 5L, 5L, 5L, 5L, 4L), GQ.1 = c(4L,
2L, 2L, 5L, 4L, 4L, 5L, 4L, 5L, 5L, 5L, 4L, 4L, 5L, 4L), GQ.2 = c(4L,
4L, 4L, 5L, 5L, 4L, 4L, 3L, 3L, 3L, 5L, 5L, 5L, 4L, 3L), GQ.3 = c(5L,
3L, 2L, 5L, 3L, 5L, 5L, 5L, 4L, 5L, 5L, 5L, 4L, 4L, 4L), GQ.4 = c(5L,
2L, 1L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 5L, 5L, 4L, 4L, 1L), GQ.5 = c(4L,
3L, 4L, 5L, 5L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 4L, 3L), GQ.6 = c(2L,
2L, 1L, 1L, 2L, 1L, 4L, 1L, 4L, 5L, 5L, 1L, 5L, 1L, 5L), GQ.7 = c(4L,
5L, 5L, 5L, 4L, 2L, 3L, 5L, 3L, 5L, 5L, 2L, 5L, 3L, 2L), GQ.8 = c(2L,
4L, 3L, 2L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L), GQ.9 = c(3L,
5L, 2L, 3L, 4L, 4L, 5L, 3L, 4L, 4L, 3L, 3L, 4L, 2L, 2L), GQ.10 = c(3L,
4L, 1L, 2L, 3L, 4L, 5L, 5L, 5L, 5L, 4L, 4L, 5L, 5L, 2L), Feature.1 = c(4L,
4L, 2L, 3L, 4L, 4L, 5L, 5L, 4L, 5L, 5L, 4L, 5L, 3L, 4L), Feature.2 = c(4L,
4L, 2L, 1L, 5L, 4L, 5L, 5L, 5L, 4L, 4L, 3L, 5L, 3L, 2L), Feature.3 = c(3L,
2L, 1L, 2L, 5L, 5L, 2L, 4L, 2L, 4L, 4L, 5L, 2L, 4L, 2L), Feature.4 = c(3L,
3L, 3L, 4L, 3L, 4L, 5L, 5L, 4L, 4L, 4L, 3L, 4L, 3L, 3L), Feature.5 = c(2L,
2L, 3L, 3L, 4L, 3L, 4L, 4L, 2L, 4L, 3L, 4L, 5L, 3L, 1L), Feature.6 = c(5L,
5L, 1L, 1L, 5L, 5L, 5L, 4L, 4L, 5L, 5L, 5L, 5L, 4L, 4L), Feature.7 = c(5L,
3L, 2L, 5L, 4L, 5L, 3L, 5L, 4L, 5L, 5L, 5L, 5L, 4L, 4L)), .Names = c("Number",
"Question.1.1", "Question.1.2", "Question.2.1", "Question.2.2",
"Question.3.1", "Question.3.2", "Question.4.1", "Question.4.2",
"Question.5.1", "Question.5.2", "Question.6.1", "Question.6.2",
"Question.7.1", "Question.7.2", "Question.8.1", "Question.8.2",
"Question.9.1", "Question.9.2", "AQ.1", "AQ.2", "Task.1", "Task.2",
"Task.3", "Task.4", "Task.5", "GQ.1", "GQ.2", "GQ.3", "GQ.4",
"GQ.5", "GQ.6", "GQ.7", "GQ.8", "GQ.9", "GQ.10", "Feature.1",
"Feature.2", "Feature.3", "Feature.4", "Feature.5", "Feature.6",
"Feature.7"), class = "data.frame", row.names = c(NA, -15L))
答案 0 :(得分:4)
我可能误解了这个问题......但为什么不直接在整个数据框上运行cor呢?
例如:
# Build a 15-row demo data frame with ten columns named q1..q10, each filled
# with integers drawn from 1:5 with replacement.
# `replace = TRUE` is written out in full: the short form `rep=1` only works
# through partial argument matching plus numeric-to-logical coercion.
# The columns are generated in order q1, q2, ..., q10, so the sequence of
# random draws is the same as with ten hand-written sample() calls.
data <- as.data.frame(
  sapply(paste0("q", 1:10),
         function(col) sample(1:5, 15, replace = TRUE),
         simplify = FALSE)
)
# cor() on a data frame returns the full pairwise correlation matrix.
print(cor(data))
你甚至可以将相关矩阵绘制成图像:
# Draw the correlation matrix as an image: both axes index the ten columns,
# and the colour scale is pinned to the range -1 to 1.
image(x = 1:10, y = 1:10, z = cor(data), zlim = c(-1, 1))
如果你只需要某些相关值,只需将cor的结果保存在一个变量中,然后取出你需要的部分。
例如,我们希望第2列与第5列到第10列的相关性:
# Store the correlation matrix once so individual entries can be looked up.
corrs <- cor(data)
# Correlations of column 2 with columns 5 through 10. The correlation matrix
# is symmetric, so corrs[5:10, 2] gives the same values.
print(corrs[2, seq(5, 10)])
答案 1 :(得分:1)
显式地对数据集取子集,并在该子集上运行相关性命令。假设您的变量顺序正确,请尝试以下方法:
# Select the variable of interest (listed first) together with columns 10 to
# 15, correlate that subset, and keep the first column of the result: the
# correlations of the first selected variable with every selected column.
# The closing parenthesis of cor() must come before the [, 1] index.
cor(dat[, c(2, 10:15)])[, 1]
cor(dat[, c(3, 10:15)])[, 1]
如果变量没有按顺序排列,您只需先对它们排序,或者用带引号的变量名来指定。例如:
# Same idea with columns picked by quoted name: correlate the subset, then
# keep the first column, i.e. the correlations of 'v3' with every selected
# column. The closing parenthesis of cor() must come before the [, 1] index.
cor(dat[, c('v3', 'v10', 'v11', 'v12', 'v13', 'v14', 'v15')])[, 1]
答案 2 :(得分:1)
使用subset命令:
# Restrict mtcars to the three columns of interest, then compute the
# pairwise correlation matrix of those columns.
dtf <- subset(x = mtcars, select = c(mpg, hp, wt))
cor(x = dtf)
mpg hp wt
mpg 1.0000000 -0.7761684 -0.8676594
hp -0.7761684 1.0000000 0.6587479
wt -0.8676594 0.6587479 1.0000000
或使用psych包中的corr.test函数:
# Load the psych package, which provides corr.test().
library(psych)
# Reports the correlation matrix together with the sample size and the
# probability value for each pair of columns in dtf.
corr.test(dtf)
Call:corr.test(x = dtf)
Correlation matrix
mpg hp wt
mpg 1.00 -0.78 -0.87
hp -0.78 1.00 0.66
wt -0.87 0.66 1.00
Sample Size
mpg hp wt
mpg 32 32 32
hp 32 32 32
wt 32 32 32
Probability value
mpg hp wt
mpg 0 0 0
hp 0 0 0
wt 0 0 0
答案 3 :(得分:1)
问题似乎是由于在整个数据框上运行cor导致的信息过载。我自己用得不多,但以ggplot闻名的Hadley Wickham所开发的plyr包似乎为分组和管理输出提供了一些优雅的解决方案。