可以帮助调试一个函数。这是为了做
dat3 <- c(4,7,5,7,8,4,4,4,4,4,4,7,4,4,8,8,5,5,5,5)
myfunc(dat3, chunksize = 8)
## [1] 4 7 5 8 4 4 4 4 4 7 5 8 4 4 5 5 4
以sizer的块为单位对数据进行分区,并确保每个块中都存在每个级别。该功能适用于玩具示例
myfunc <- function(x, chunksize = 8) {
numChunks <- ceiling(length(x) / chunksize)
uniqx <- unique(x)
lastChunkSize <- chunksize * (1 - numChunks) + length(x)
## check to see if it is mathematically possible
if (length(uniqx) > chunksize)
stop('more factors than can fit in one chunk')
if (any(table(x) < numChunks))
stop('not enough of at least one factor to cover all chunks')
if (lastChunkSize < length(uniqx))
stop('last chunk will not have all factors')
## actually arrange things in one feasible permutation
allIndices <- sapply(uniqx, function(z) which(z == x))
## fill one of each unique x into chunks
chunks <- lapply(1:numChunks, function(i) sapply(allIndices, `[`, i))
remainder <- unlist(sapply(allIndices, tail, n = -3))
remainderCut <- split(remainder, ceiling(seq_along(remainder)/4))
## combine them all together, wary of empty lists
finalIndices <- sapply(1:numChunks,
function(i) {
if (i <= length(remainderCut))
c(chunks[[i]], remainderCut[[i]])
else
chunks[[i]]
})
save(finalIndices,file="finalIndices")
x[unlist(finalIndices)]
}
问题是我想从函数中获得重新排列的indix(所以这里所谓的最终指数)。问题是,对于具有更多观察结果的真实数据集(https://www.dropbox.com/s/n3wc5qxaoavr4ta/j.RData?dl=0),该函数不起作用。
数据作为因素 https://www.dropbox.com/s/0ue2xzv5e6h858q/t.RData?dl=0
我根据函数第一行中存在的9847 I级别更改chunkszie参数。问题是,当我从保存的文件中访问finalIndices时,我得到一个dim 137 60的矩阵。它没有为我的所有观察提供索引(接近600k)。有人能告诉我我做错了什么吗?我知道60是块的数量(nrows / chunksize),但137似乎不合适。
答案 0 :(得分:1)
行remainderCut <- split(remainder, ceiling(seq_along(remainder)/4))
被硬编码到玩具数据集中,只是为每个块添加了四个元素,这会对其他数据集产生错误的结果。
虽然可以通过修改代码来解决这个问题,但我提出了一个稍微不同的方法解决这个问题:
library(data.table)
generate.chunks <- function(dat3, chunksize = 8) {
# get number of unique values
freqs <- table(dat3)
# get chunk sizes
chunk.sizes <- rep(chunksize,length(dat3) %/% chunksize)
last.chunk.size <- length(dat3) %% chunksize
if (last.chunk.size > 0) chunk.sizes <- c(chunk.sizes,last.chunk.size)
# few checks
if (chunksize < length(freqs))
stop(sprintf('Chunk size is smaller than the number of factors: %i elements in a chunk, %i factors. Increase the chunk size',chunksize,length(freqs)))
if (chunk.sizes[length(chunk.sizes)] < length(freqs))
stop(sprintf('Last chunk size is smaller than the number of factors: %i elements in the chunk, %i factors. Use a different chunk size',chunksize,length(freqs)))
if (min(freqs) < length(chunk.sizes))
stop(sprintf('Not enough values in a factor to populate every chunk: %i < %i. Increase the chunk size',min(freqs),length(chunk.sizes)))
# make sure that each chunk has at least one factor
d.predefined <- data.frame(
chunk = rep(1:length(chunk.sizes),each=length(freqs)),
i = rep(1:length(freqs),length(chunk.sizes))
)
# randomly distribute the remaining values
d.sampled <- data.frame(
chunk = unlist(mapply(rep,1:length(chunk.sizes),chunk.sizes - length(freqs),SIMPLIFY=F)),
i = sample(unlist(mapply(rep,1:length(freqs),freqs - length(chunk.sizes))))
)
# put the predefined and sampled results together and split
d.result <- rbind(d.predefined,d.sampled)
# calculate indices
indices <- sapply(names(freqs),function(s) which(dat3==s))
dt <- as.data.table(d.result)
dt[,ind:=indices[[i]],by=i]
finalIndices <- split(dt$ind,dt$chunk)
save(finalIndices,file="finalIndices")
names(freqs)[d.result$i]
}