我知道标题有点蹩脚,但我想不出更合适的叫法。我正在尝试根据 lon(经度列)中出现的值对一个大数据框进行子集化。我目前可以运行的子集脚本会在每次出现 -180(即缺失值,n/a)时创建一个子集,并包含一个或多个连续 -180 之前和之后的第一个非 -180 值。我的问题是,我希望每个子集由任意一段 -180 之前的 20 个经度和之后的 20 个经度组成。由于我的许多文件以 -180 开头并以 -180 结尾,这会导致错误。我不知道如何让脚本既能按 -180 划分子集,又能忽略出现在开头或末尾若干行的 -180。理想情况下,脚本只对前面有 20 个经度、后面也有 20 个经度的 -180 进行子集化。另外,我无法预先知道文件的开头和结尾会出现多少个 -180,这对我来说是最大的问题。下面是我的数据示例和我当前的子集代码。预先感谢您的帮助! 编辑:各行保持原有顺序、不以任何方式排序也非常重要,因为这是按时间顺序排列的数据。我的数据框有 4461 行和 7 列。下面是我数据框的一小部分样本。
# Sample of the data frame (dput output): 10 rows x 7 columns.
# Every `lon`/`lat` value in this sample is -180, the placeholder used for a
# missing position, and every `fixType` is level 39 ("unknown").
# Row names and `ID` are kept exactly as exported so row order is preserved.
# The level "valid +invehicle +endpoint" was previously split across two
# physical lines inside the string literal, which embedded a newline in the
# factor label; it is written on one line here.
cols <- structure(
  list(
    fixType = structure(
      c(39L, 39L, 39L, 39L, 39L, 39L, 39L, 39L, 39L, 39L),
      .Label = c(
        "firstfix +indoors +startpoint",
        "firstfix +indoors +startpoint +cluster_center",
        "firstfix +indoors +stationary",
        "firstfix +indoors +stationary +cluster_center",
        "firstfix +invehicle +startpoint",
        "firstfix +invehicle +startpoint +cluster_center",
        "firstfix +invehicle +stationary +cluster_center",
        "firstfix +outdoors +startpoint",
        "firstfix +outdoors +startpoint +cluster_center",
        "firstfix +outdoors +stationary",
        "firstfix +outdoors +stationary +cluster_center",
        "inserted +indoors +midpoint",
        "inserted +indoors +pausepoint",
        "inserted +indoors +stationary",
        "inserted +indoors +stationary +cluster_center",
        "inserted +invehicle +midpoint",
        "inserted +invehicle +pausepoint",
        "inserted +invehicle +stationary",
        "inserted +invehicle +stationary +cluster_center",
        "inserted +outdoors +midpoint",
        "inserted +outdoors +pausepoint",
        "inserted +outdoors +stationary",
        "inserted +outdoors +stationary +cluster_center",
        "lastfix +indoors +endpoint",
        "lastfix +indoors +endpoint +cluster_center",
        "lastfix +indoors +stationary",
        "lastfix +indoors +stationary +cluster_center",
        "lastfix +invehicle +endpoint",
        "lastfix +invehicle +endpoint +cluster_center",
        "lastfix +outdoors +endpoint",
        "lastfix +outdoors +endpoint +cluster_center",
        "lastfix +outdoors +stationary",
        "lastvalidfix +indoors +stationary",
        "lastvalidfix +indoors +stationary +cluster_center",
        "lastvalidfix +invehicle +stationary",
        "lastvalidfix +invehicle +stationary +cluster_center",
        "lastvalidfix +outdoors +stationary",
        "lastvalidfix +outdoors +stationary +cluster_center",
        "unknown",
        "valid +indoors +endpoint",
        "valid +indoors +endpoint +cluster_center",
        "valid +indoors +midpoint",
        "valid +indoors +pausepoint",
        "valid +indoors +pausepoint +cluster_center",
        "valid +indoors +startpoint",
        "valid +indoors +startpoint +cluster_center",
        "valid +indoors +stationary",
        "valid +indoors +stationary +cluster_center",
        "valid +invehicle +endpoint",
        "valid +invehicle +endpoint +cluster_center",
        "valid +invehicle +midpoint",
        "valid +invehicle +pausepoint",
        "valid +invehicle +startpoint",
        "valid +invehicle +startpoint +cluster_center",
        "valid +invehicle +stationary",
        "valid +invehicle +stationary +cluster_center",
        "valid +outdoors +endpoint",
        "valid +outdoors +endpoint +cluster_center",
        "valid +outdoors +midpoint",
        "valid +outdoors +pausepoint",
        "valid +outdoors +pausepoint +cluster_center",
        "valid +outdoors +startpoint",
        "valid +outdoors +startpoint +cluster_center",
        "valid +outdoors +stationary",
        "valid +outdoors +stationary +cluster_center"
      ),
      class = "factor"
    ),
    lon = c(-180, -180, -180, -180, -180, -180, -180, -180, -180, -180),
    lat = c(-180, -180, -180, -180, -180, -180, -180, -180, -180, -180),
    activityIntensity = c(2L, 2L, 1L, 2L, 2L, 2L, 0L, 2L, 1L, 0L),
    Impute = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
    ID = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 4352L, 4353L, 4354L),
    subsetNum = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)
  ),
  .Names = c(
    "fixType", "lon", "lat", "activityIntensity", "Impute", "ID", "subsetNum"
  ),
  row.names = c(
    4462L, 4463L, 4464L, 4465L, 4466L, 4467L, 4468L, 8813L, 8814L, 8815L
  ),
  class = "data.frame"
)
子集代码:
# Split `df` into one subset per run of missing longitudes.
#
# A row counts as "missing" when the first four characters of its `lon` value
# are "-180". Each subset spans from the row just before a run of missing
# values to the row just after the first valid value that follows it, so it
# holds the last signal before the gap, the gap itself, and the first two
# signals after it. Results are collected, in row order, in `subM.List`.
#
# `df` must already exist and contain the seven columns selected below.
set.seed(5)

# NOTE(review): length() of a data frame is its number of columns, not its
# number of rows; `n` is not used in this script. Kept unchanged in case later
# code relies on it -- confirm the intent.
n <- length(df) # make it equal to the length of whatever the input file is

impCols <- df[, c("fixType", "lon", "lat", "activityIntensity", "Impute", "ID", "subsetNum")]
test.df <- data.frame(impCols)
df <- test.df
obs <- nrow(df)

# Classify every row once, outside the loop. `%in%` never returns NA, so a
# missing `lon` is treated as "not -180" instead of making the `if` fail.
is.missing <- substr(as.character(df[["lon"]]), 1, 4) %in% "-180"

counter <- 1
subM.List <- list()
start.idx <- NA # NA means "not currently inside a run of missing values"

for (i in seq_len(obs)) {
  if (is.na(start.idx) && is.missing[i]) {
    # A run starts here; include the preceding row when there is one.
    start.idx <- max(1, i - 1)
  } else if (!is.na(start.idx) && !is.missing[i]) {
    # The run has ended. The plus one gives the first two instances of signal
    # instead of just the first; clamp so the range never runs past the last
    # row (which would otherwise append an all-NA row).
    end.idx <- min(obs, i + 1)
    subMat <- df[start.idx:end.idx, ]
    subM.List[[counter]] <- subMat
    start.idx <- NA
    counter <- counter + 1
  }
}
答案 0 :(得分:0)
# Build one subset of `df` per interior run of -180 longitudes, taking up to
# S rows before the run and up to S rows after it. Runs of -180 that sit at
# the very start or very end of the data are ignored. The result is the list
# `subsetsList`, named after the row range of each run.
#
# `df` must already exist and have a numeric `lon` column.
indicators <- df$lon == -180

# The first and last rows that are NOT -180 are the index boundaries.
valid.rows <- which(!indicators)
if (length(valid.rows) > 0) {
  indx.min <- min(valid.rows)
  indx.max <- max(valid.rows)
} else {
  # lon is nothing but -180: use the values min()/max() return for empty
  # input, without triggering their warnings. No run is then "in play".
  indx.min <- Inf
  indx.max <- -Inf
}

# "My problem is that I would like the subsets to be comprised of
#  the 20 longitudes before any -180s, and 20 after"

# `inPlay` are the -180 rows that are not at the extreme ends
inPlay <- which(indicators)
inPlay <- inPlay[inPlay > indx.min & inPlay < indx.max]

# Sample Size
S <- 20 # use a variable so you can change it as needed

# Consecutive -180 rows differ by exactly 1; any other gap ends a run.
diffPlay <- diff(inPlay)
run.breaks <- which(diffPlay != 1)

# Positions (within `inPlay`) of the first and last row of each run.
# NOTE: `start` and `stop` share their names with R functions; the names are
# kept so code that reads these variables keeps working.
if (length(inPlay) > 0) {
  stop <- c(run.breaks, length(inPlay))
  start <- c(1, run.breaks + 1)
} else {
  # No interior runs: leave both empty so nothing below is indexed with NA.
  stop <- integer(0)
  start <- integer(0)
}

# these are your row ranges of -180s. You can have a look if you'd like
rbind(inPlay[start], inPlay[stop])

# we are going to take the S rows before each "start"
# and the S rows after each "start" + "plus"
inPlayPlus <- inPlay[stop] - inPlay[start] # run length minus one
inPlayStart <- inPlay[start] # first row of each run

## The names given to `inPlayStart` become the names of the subsetted list
names(inPlayStart) <- ifelse(inPlayPlus > 0, paste0("Rows", inPlayStart, "_to_", inPlayStart+inPlayPlus), paste0("Row", inPlayStart))

subsetsList <-
  lapply(seq_along(inPlayStart), function(i) {
    # This can be one line. Broken up so you can see what's happening
    from <- max(indx.min, inPlayStart[[i]] - S) # notice, calling max on the min
    to <- min(indx.max, inPlayStart[[i]] + inPlayPlus[[i]] + S) # and min on the max
    cat("i is ", i, "\tPlus=", inPlayPlus[[i]], "\t(from, to) = (", from, ", ", to, ")\tDIFF=", to-from, "\n", sep="")
    # Drops rows start+1 .. start+plus of the run; the run's first -180 row
    # stays in the subset. NOTE(review): confirm keeping that row is intended.
    indx <- if (inPlayPlus[[i]] == 0) {
      from:to
    } else {
      setdiff(from:to, inPlayStart[[i]] + (1:inPlayPlus[[i]]))
    }
    df[indx, ]
  })

# lapply() over indices returns an unnamed list, so attach the names here.
names(subsetsList) <- names(inPlayStart)

## Have a look at the results
subsetsList