我从mysql加载我的数据有两列:id,rt
id对应于实践中的许多rts(我设计糟糕的表格):
id rt
1 5129052 2
2 5129052 2
3 5129052 5
4 5129052 6
5 7125052 0
6 7125052 1
7 7125052 2
8 7125052 4
9 7125052 6
10 7125052 7
我想创建一个像下面那样的数据透视表。第一列只是行数,没什么特别的。
5129052 7125052
1 2 0
2 2 1
3 5 2
4 6 4
5 NA 6
6 NA 7
如果可行,也按升序对值进行排序。
谢谢!
答案 0 :(得分:5)
你的答案必须是一个矩阵吗?因为在这种情况下,矩阵对我来说并没有多大意义。看起来像列表将是一个更实际的解决方案,允许每个rt
的{{1}}向量的不同长度。例如:
id
答案 1 :(得分:2)
使用Jean V.Adams提供的内容,您可以使用以下内容创建数据框。
dat <- read.table(text= " id rt
1 5129052 2
2 5129052 2
3 5129052 5
4 5129052 6
5 7125052 0
6 7125052 1
7 7125052 2
8 7125052 4
9 7125052 6
10 7125052 7",header=TRUE,sep="")
tmp <- split(dat$rt,dat$id,sort)
res <- sapply(tmp,function(x) { c(x,rep(NA,maxl - length(x)))})
答案 2 :(得分:1)
你可以找到列表中的元素:
# Loading dataset
df <- structure(list(id = c(5129052L, 5129052L, 5129052L, 5129052L,
7125052L, 7125052L, 7125052L, 7125052L, 7125052L, 7125052L),
rt = c(2L, 2L, 5L, 6L, 0L, 1L, 2L, 4L, 6L, 7L)), .Names = c("id",
"rt"), class = "data.frame", row.names = c(NA, -10L))
# cbind the list output
do.call(cbind,split(df$rt, df$id))
#Output: the values of the first list are recycled instead of NAs
# 5129052 7125052
#[1,] 2 0
#[2,] 2 1
#[3,] 5 2
#[4,] 6 4
#[5,] 2 6
#[6,] 2 7
# A.N. Spiess wrote a cbind.na function http://rmazing.wordpress.com/2012/06/19/dont-fill-me-up/#comments
cbind.na <- function (..., deparse.level = 1)
{
na <- nargs() - (!missing(deparse.level))
deparse.level <- as.integer(deparse.level)
stopifnot(0 <= deparse.level, deparse.level <= 2)
argl <- list(...)
while (na > 0 && is.null(argl[[na]])) {
argl <- argl[-na]
na <- na - 1
}
if (na == 0)
return(NULL)
if (na == 1) {
if (isS4(..1))
return(cbind2(..1))
else return(matrix(...)) ##.Internal(cbind(deparse.level, ...)))
}
if (deparse.level) {
symarg <- as.list(sys.call()[-1L])[1L:na]
Nms <- function(i) {
if (is.null(r <- names(symarg[i])) || r == "") {
if (is.symbol(r <- symarg[[i]]) || deparse.level ==
2)
deparse(r)
}
else r
}
}
## deactivated, otherwise no fill in with two arguments
if (na == 0) {
r <- argl[[2]]
fix.na <- FALSE
}
else {
nrs <- unname(lapply(argl, nrow))
iV <- sapply(nrs, is.null)
fix.na <- identical(nrs[(na - 1):na], list(NULL, NULL))
## deactivated, otherwise data will be recycled
#if (fix.na) {
# nr <- max(if (all(iV)) sapply(argl, length) else unlist(nrs[!iV]))
# argl[[na]] <- cbind(rep(argl[[na]], length.out = nr),
# deparse.level = 0)
#}
if (deparse.level) {
if (fix.na)
fix.na <- !is.null(Nna <- Nms(na))
if (!is.null(nmi <- names(argl)))
iV <- iV & (nmi == "")
ii <- if (fix.na)
2:(na - 1)
else 2:na
if (any(iV[ii])) {
for (i in ii[iV[ii]]) if (!is.null(nmi <- Nms(i)))
names(argl)[i] <- nmi
}
}
## filling with NA's to maximum occuring nrows
nRow <- as.numeric(sapply(argl, function(x) NROW(x)))
maxRow <- max(nRow, na.rm = TRUE)
argl <- lapply(argl, function(x) if (is.null(nrow(x))) c(x, rep(NA, maxRow - length(x)))
else rbind.na(x, matrix(, maxRow - nrow(x), ncol(x))))
r <- do.call(cbind, c(argl[-1L], list(deparse.level = deparse.level)))
}
d2 <- dim(r)
r <- cbind2(argl[[1]], r)
if (deparse.level == 0)
return(r)
ism1 <- !is.null(d1 <- dim(..1)) && length(d1) == 2L
ism2 <- !is.null(d2) && length(d2) == 2L && !fix.na
if (ism1 && ism2)
return(r)
Ncol <- function(x) {
d <- dim(x)
if (length(d) == 2L)
d[2L]
else as.integer(length(x) > 0L)
}
nn1 <- !is.null(N1 <- if ((l1 <- Ncol(..1)) && !ism1) Nms(1))
nn2 <- !is.null(N2 <- if (na == 2 && Ncol(..2) && !ism2) Nms(2))
if (nn1 || nn2 || fix.na) {
if (is.null(colnames(r)))
colnames(r) <- rep.int("", ncol(r))
setN <- function(i, nams) colnames(r)[i] <<- if (is.null(nams))
""
else nams
if (nn1)
setN(1, N1)
if (nn2)
setN(1 + l1, N2)
if (fix.na)
setN(ncol(r), Nna)
}
r
}
# Which can be used to produce the desired output
do.call(cbind.na,split(df$rt, df$id))
# 5129052 7125052
#[1,] 2 0
#[2,] 2 1
#[3,] 5 2
#[4,] 6 4
#[5,] NA 6
#[6,] NA 7
答案 3 :(得分:1)
抱歉,但我发现这里的大部分答案都有点矫枉过正。这是另外两个建议。两者都依赖于创建一个辅助“id”,表示现有id的值的数量。
## Create a secondary "id"
df$id2 <- ave(as.character(df$id), df$id, FUN = seq_along)
df ## Your new "df"
id rt id2
1 5129052 2 1
2 5129052 2 2
3 5129052 5 3
4 5129052 6 4
5 7125052 0 1
6 7125052 1 2
7 7125052 2 3
8 7125052 4 4
9 7125052 6 5
10 7125052 7 6
reshape
如果需要,只需清理变量名称,就可以了。
reshape(df, direction = "wide", idvar = "id2", timevar = "id")
id2 rt.5129052 rt.7125052
1 1 2 0
2 2 2 1
3 3 5 2
4 4 6 4
9 5 NA 6
10 6 NA 7
更清晰的语法和更清晰的输出。
dcast
关于你的奖金问题?这些解决方案都会输出常规library(reshape2)
dcast(df, id2 ~ id, value.var="rt")
id2 5129052 7125052
1 1 2 0
2 2 2 1
3 3 5 2
4 4 6 4
5 5 NA 6
6 6 NA 7
,因此data.frame
可以直接使用它们。
答案 4 :(得分:0)
非常丑陋,但可行的解决方案:
> dput(df)
structure(list(id = c(5129052L, 5129052L, 5129052L, 5129052L,
7125052L, 7125052L, 7125052L, 7125052L, 7125052L, 7125052L),
rt = c(2L, 2L, 5L, 6L, 0L, 1L, 2L, 4L, 6L, 7L)), .Names = c("id",
"rt"), class = "data.frame", row.names = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10"))
> df
id rt
1 5129052 2
2 5129052 2
3 5129052 5
4 5129052 6
5 7125052 0
6 7125052 1
7 7125052 2
8 7125052 4
9 7125052 6
10 7125052 7
计算条目:
t1 = table(df$id)
> t1
5129052 7125052
4 6
初始化矩阵:
foo = matrix(NA,max(t1),length(t1))
填充矩阵:
for (x in names(t1)){foo[1:t1[x],x] = sort(df$rt[df$id==x])}
> foo
5129052 7125052
[1,] 2 0
[2,] 2 1
[3,] 5 2
[4,] 6 4
[5,] NA 6
[6,] NA 7