我正在尝试使用庞大的数据帧(180000 x 400)来计算另一个更小的数据帧。
我有以下数据框
# Example input: one row per location with its START/END coordinates and
# one column per code (1 = positive, -1 = negative, 0 = neither).
df1 <- data.frame(
  LOCAT = c(1, 2, 3, 4, 5, 6),
  START = c(120, 345, 765, 1045, 1347, 1879),
  END = c(150, 390, 802, 1120, 1436, 1935),
  CODE1 = c(1, 1, 0, 1, 0, 0),
  CODE2 = c(1, 0, 0, 0, -1, -1)
)
df1
LOCAT START END CODE1 CODE2
1 1 120 150 1 1
2 2 345 390 1 0
3 3 765 802 0 0
4 4 1045 1120 1 0
5 5 1347 1436 0 -1
6 6 1879 1935 0 -1
这是一个示例数据框。实际数据的行数多达 180000,列数超过 400。我需要针对每一个 CODE 列创建一个新的数据框:找出该列中连续为“1”或连续为“-1”的区段,并返回每个区段的位置(LOCAT 的起止范围)、大小(区段最后一行的 END 减去第一行的 START)以及取值(POS 或 NEG)。
例如,对于 CODE1,期望的结果是:
LOCAT SIZE VALUE
1 1 to 2 270 POS
2 4 to 4 75 POS
对于 CODE2 同理:
LOCAT SIZE VALUE
1 1 to 1 30 POS
2 5 to 6 588 NEG
不幸的是,我仍然没有弄清楚如何做到这一点。我一直在尝试使用几行代码来开发一个自动执行此操作的函数,但开始迷失或陷入循环,似乎没有任何作用。
任何帮助将不胜感激。 提前致谢
答案 0 :(得分:4)
下面的代码以您想要的确切格式为您提供答案,除了我将“LOCAT”列拆分为两个名为“Starts”和“Stops”的列。此代码适用于您的整个数据框,无需为每个CODE(CODE1,CODE2等)手动复制它。
它假定唯一的非CODE列的名称为“LOCAT”“START”和“END”。
# need package "plyr"
library("plyr")
# test2 reproduces the example data frame given in the question (df1).
test2 <- data.frame(
  "LOCAT" = 1:6,
  "START" = c(120, 345, 765, 1045, 1347, 1879),
  "END" = c(150, 390, 802, 1120, 1436, 1935), # row 3 is 802, as in the question
  "CODE1" = c(1, 1, 0, 1, 0, 0),
  "CODE2" = c(1, 0, 0, 0, -1, -1)
)

# Every column other than the three positional ones is treated as a code column.
codeNames <- setdiff(names(test2), c("LOCAT", "START", "END"))

# Stack the code columns into long format: one row per (location, code) pair.
# "CodeValue" holds the stacked values; "Code" holds the 1-based position of
# the originating column within codeNames (the default `times` of reshape()).
test3 <- reshape(
  test2,
  varying = codeNames,
  v.names = "CodeValue",
  timevar = "Code",
  direction = "long"
)

# Drop the "id" column that reshape() adds; drop = FALSE keeps a data frame
# even if only a single column were to remain.
test4 <- test3[, names(test3) != "id", drop = FALSE]
# sss: starts, stops and sizes of the runs of non-zero code values.
#
# Args:
#   x: data frame with columns "CodeValue", "START" and "END"; rows are taken
#      in the order given (the rows belonging to a single code).
#
# Returns:
#   A data frame with one row per run of consecutive identical CodeValue that
#   is neither 0 nor NA, with columns:
#     Value  - the repeated code value of the run
#     Starts - row index in x of the first row of the run
#     Stops  - row index in x of the last row of the run
#     Size   - END at the last row of the run minus START at its first row
#   A zero-row data frame is returned when x has no rows or no such runs.
sss <- function(x) {
  runs <- rle(x[, "CodeValue"]) # lengths and values of consecutive runs
  stops <- cumsum(runs$lengths) # index of the last row of each run
  # Derive the starts from the stops and run lengths instead of prepending a
  # literal 1, so that zero-row input yields zero runs rather than a
  # length-mismatch error in data.frame().
  starts <- stops - runs$lengths + 1
  all_runs <- data.frame(
    "Value" = runs$values,
    "Starts" = starts,
    "Stops" = stops
  )
  # Keep only real POS/NEG runs. rle() reports each NA as its own run, and
  # NA != 0 is NA, which would otherwise select an all-NA row.
  keep <- !is.na(all_runs[, "Value"]) & all_runs[, "Value"] != 0
  kept_runs <- all_runs[keep, , drop = FALSE]
  size <- x[kept_runs[, "Stops"], "END"] - x[kept_runs[, "Starts"], "START"]
  data.frame(kept_runs, "Size" = size)
}
# Run sss() once per code: ddply() (package "plyr") splits the long-format
# data by "Code", applies sss to each piece and row-binds the results.
answer0 <- ddply(test4, "Code", sss)
# Keep the numeric result in answer0; answer is the relabelled copy.
answer <- answer0
# Shift the values -1/0/1 to the indices 1/2/3 of the label vector so that
# -1 becomes "NEG" and 1 becomes "POS".
answer$Value <- c("NEG", NA, "POS")[answer0$Value + 2]
希望这有帮助,祝你好运!
答案 1 :(得分:2)
使用游程编码(`rle`)找出 CODE1 中取相同值的连续分组。
# Run-length encode CODE1: lengths and values of its consecutive runs.
rle_of_CODE1 <- rle(df1[["CODE1"]])
为方便起见,找到值为非零的点,以及相应块的长度。
# Flag the runs whose value is not zero ...
CODE1_is_nonzero <- rle_of_CODE1[["values"]] != 0
# ... and keep the lengths of exactly those runs.
n <- rle_of_CODE1[["lengths"]][CODE1_is_nonzero]
忽略 df1 中 CODE1 为零的部分。
# Keep only the rows of df1 whose CODE1 is non-zero.
df1_with_nonzero_CODE1 <- df1[which(df1$CODE1 != 0), , drop = FALSE]
根据 `rle` 找到的连续块定义分组。
# Number the non-zero runs 1, 2, ... and repeat each number once per row of
# its run, so every remaining row is tagged with the run it belongs to.
df1_with_nonzero_CODE1[["GROUP"]] <- rep.int(seq_along(n), n)
使用 `ddply` 获取每个分组的摘要统计信息。
# One summary row per GROUP: first and last LOCAT of the run, and its SIZE
# (largest END minus smallest START within the run).
summarised_by_CODE1 <- ddply(
  df1_with_nonzero_CODE1,
  "GROUP",
  summarise,
  MinOfLOCAT = min(LOCAT),
  MaxOfLOCAT = max(LOCAT),
  SIZE = max(END) - min(START)
)
# Label each non-zero run: "POS" when its value is 1, "NEG" otherwise.
summarised_by_CODE1$VALUE <- with(
  rle_of_CODE1,
  ifelse(values[CODE1_is_nonzero] == 1, "POS", "NEG")
)
summarised_by_CODE1
## GROUP MinOfLOCAT MaxOfLOCAT SIZE VALUE
## 1 1 1 2 270 POS
## 2 2 4 4 75 POS
现在对 CODE2 重复上述步骤。