我有一个示例数据集和一个正在尝试编写的函数。目标是让函数为每个区域(Area)内的所有间隔(interval)生成值。也就是说,如果某个 StationID 有 2 个间隔,而同一区域中的另一个 StationID 有 3 个间隔,那么这两个 StationID 最终应具有相同的间隔集合,原本不存在的间隔其 Num_scaled 记为 0。
以下是一个区域的数据输出结果。
示例数据集
# Example data: one row per Area / StationID / Gear / interval observation,
# with the scaled count in Num_scaled. Area and Gear are factors; the other
# columns are plain numerics.
dataset <- data.frame(
  Area = factor(
    c(rep("A1", 5), rep("A2", 7), rep("A1", 3), rep("A2", 5)),
    levels = c("A1", "A2")
  ),
  StationID = c(
    1, 1, 2, 2, 2, 1, 1, 1, 2, 2,
    3, 3, 1, 2, 2, 1, 1, 2, 3, 3
  ),
  Gear = factor(
    rep(c("Survey", "Com"), times = c(12, 8)),
    levels = c("Com", "Survey")
  ),
  interval = c(
    2, 4, 2, 5, 7, 2, 4, 10, 11, 20,
    7, 20, 3, 3, 6, 7, 10, 4, 2, 3
  ),
  Num_scaled = c(
    1, 3, 2, 7, 5, 4, 4, 2, 3, 3,
    7, 20, 2, 1, 1, 3, 5, 6, 1, 2
  )
)
我尝试编写的函数
# Empty accumulator created at top level. Assignments to `combined` inside
# rep_func() create a function-local variable, so this global object itself
# is never modified by the function.
combined=data.frame()
# Attempt: for every Area, sum Num_scaled over each
# Area x StationID x Gear x interval combination, flatten the resulting table
# into a data frame, replace NA with 0 and return the result.
# This is the version that raises the 'names' error quoted below.
rep_func<-function(data){
# i takes the distinct Area values themselves (not positions 1, 2, ...).
for(i in unique(data$Area)){
# Keeps the rows where Area equals Area[i], then drops unused factor levels.
# NOTE(review): Area[i] indexes the Area column by an Area value rather than
# by a position, so this filter probably does not select the intended rows
# and looks like the origin of the error quoted below -- confirm.
tmp<-droplevels(subset(data,Area==Area[i]))
# Sum of Num_scaled for every combination of the four grouping variables.
data.1<-as.table(by(tmp$Num_scaled,list(tmp$Area,tmp$StationID,tmp$Gear,tmp$interval),sum))
# Flatten the table into a long data frame (grouping columns + value column).
data.2<-as.data.frame(ftable(data.1))
# The closing brace at the end of the next line ends the for loop, so the
# rbind() below runs once, after the loop, and only sees the last data.2.
names(data.2)<-c("Area","StationID","Gear","interval","Num_scaled")}
combined=rbind(combined,data.2)
# Combinations without data come back as NA; report them as 0.
combined[is.na(combined)] <- 0
return(combined)
}
all2<-rep_func(dataset)
我收到以下错误消息:
Error in names(data.2) <- c("Area", "StationID", "Gear", "interval", "Num_scaled") :
'names' attribute [5] must be the same length as the vector [3]
我理解错误消息的含义——data.2 只有 3 列,而 names() 却给了 5 个名称;但数据中本应有 5 个变量。当我用笨办法(对每个区域分别手工处理)时没有任何问题。我的真实数据集要大得多,所以希望用一个函数来代替。
R info:
R version 3.3.2 (2016-10-31)
Platform: i386-w64-mingw32/i386 (32-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
答案 0 :(得分:0)
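unique(data$Area) 是一个因子(factor)。在 for 循环中直接遍历它时,i 得到的是区域的取值而不是位置,因此 Area[i] 无法按预期选出该区域的行。请改为遍历 seq_along(unique(data$Area)),并用 unique(data$Area)[i](而不是 Area[i])来选取第 i 个区域,这样函数才能生成 A1 的间隔。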
如果只使用 base R,您可能需要的是:
# intervals
# Build the complete StationID x Gear x interval grid separately for each
# Area. Each Area is selected by its value (indexing the data by the loop
# counter would pick the Area of row i, not the i-th distinct Area), and
# unused factor levels are dropped so one Area's grid never contains padding
# rows that belong to another Area.
all2 <- lapply(
  as.character(unique(dataset[, 1])),
  function(area, x = dataset) {
    area_rows <- droplevels(x[x[, 1] == area, ])
    # by() sums Num_scaled per combination; combinations without data are NA
    d1 <- as.table(by(area_rows$Num_scaled, as.list(area_rows[, -5]), sum))
    d2 <- setNames(
      as.data.frame(d1),
      c("Area", "StationID", "Gear", "interval", "Num_scaled")
    )
    # combinations that were absent from the data get Num_scaled = 0
    d2[is.na(d2)] <- 0
    d2
  }
)
# binding the per-area data frames into one df
all3 <- do.call(rbind, all2)
# splitting it into the areas
all.lst <- split(all3, all3[, 1])
# yields one df for each area in a list
all.lst
# $A1
# Area StationID Gear interval Num_scaled
# 1 A1 1 Com 2 0
# 2 A1 2 Com 2 0
# 3 A1 1 Survey 2 1
# 4 A1 2 Survey 2 2
# 5 A1 1 Com 3 2
# 6 A1 2 Com 3 1
# 7 A1 1 Survey 3 0
# 8 A1 2 Survey 3 0
# 9 A1 1 Com 4 0
# 10 A1 2 Com 4 0
# 11 A1 1 Survey 4 3
# 12 A1 2 Survey 4 0
# 13 A1 1 Com 5 0
# 14 A1 2 Com 5 0
# 15 A1 1 Survey 5 0
# 16 A1 2 Survey 5 7
# 17 A1 1 Com 6 0
# 18 A1 2 Com 6 1
# 19 A1 1 Survey 6 0
# 20 A1 2 Survey 6 0
# 21 A1 1 Com 7 0
# 22 A1 2 Com 7 0
# 23 A1 1 Survey 7 0
# 24 A1 2 Survey 7 5
#
# $A2
# Area StationID Gear interval Num_scaled
# 25 A2 1 Com 2 0
# 26 A2 2 Com 2 0
# 27 A2 3 Com 2 1
# 28 A2 1 Survey 2 4
# 29 A2 2 Survey 2 0
# 30 A2 3 Survey 2 0
# 31 A2 1 Com 3 0
# 32 A2 2 Com 3 0
# 33 A2 3 Com 3 2
# 34 A2 1 Survey 3 0
# 35 A2 2 Survey 3 0
# 36 A2 3 Survey 3 0
# 37 A2 1 Com 4 0
# 38 A2 2 Com 4 6
# 39 A2 3 Com 4 0
# 40 A2 1 Survey 4 4
# 41 A2 2 Survey 4 0
# 42 A2 3 Survey 4 0
# 43 A2 1 Com 7 3
# 44 A2 2 Com 7 0
# 45 A2 3 Com 7 0
# 46 A2 1 Survey 7 0
# 47 A2 2 Survey 7 0
# 48 A2 3 Survey 7 7
# 49 A2 1 Com 10 5
# 50 A2 2 Com 10 0
# 51 A2 3 Com 10 0
# 52 A2 1 Survey 10 2
# 53 A2 2 Survey 10 0
# 54 A2 3 Survey 10 0
# 55 A2 1 Com 11 0
# 56 A2 2 Com 11 0
# 57 A2 3 Com 11 0
# 58 A2 1 Survey 11 0
# 59 A2 2 Survey 11 3
# 60 A2 3 Survey 11 0
# 61 A2 1 Com 20 0
# 62 A2 2 Com 20 0
# 63 A2 3 Com 20 0
# 64 A2 1 Survey 20 0
# 65 A2 2 Survey 20 3
# 66 A2 3 Survey 20 20
# access df for Area 1
all.lst[[1]]
答案 1 :(得分:0)
我相信这可以做你想要的:
library(tidyr)
library(dplyr)
# Factors as strings. Only factor columns are converted: StationID, interval
# and Num_scaled stay numeric, so intervals sort numerically (2 < 10) and the
# numeric fill value below matches the type of Num_scaled.
dataset[] <- lapply(
  dataset,
  function(col) if (is.factor(col)) as.character(col) else col
)
# complete() is applied within each Area group, so Area itself is not listed
# among the columns to expand. select() restores the column order used in the
# printed example.
df <- dataset %>%
  group_by(Area) %>%
  complete(StationID, interval, Gear, fill = list(Num_scaled = 0)) %>%
  ungroup() %>%
  select(StationID, interval, Gear, Area, Num_scaled)
# Show example output for Area A1
df %>%
  filter(Area == "A1") %>%
  as.data.frame()
输出:
StationID interval Gear Area Num_scaled
1 1 2 Com A1 0
2 1 2 Survey A1 1
3 1 3 Com A1 2
4 1 3 Survey A1 0
5 1 4 Com A1 0
6 1 4 Survey A1 3
7 1 5 Com A1 0
8 1 5 Survey A1 0
9 1 6 Com A1 0
10 1 6 Survey A1 0
11 1 7 Com A1 0
12 1 7 Survey A1 0
13 2 2 Com A1 0
14 2 2 Survey A1 2
15 2 3 Com A1 1
16 2 3 Survey A1 0
17 2 4 Com A1 0
18 2 4 Survey A1 0
19 2 5 Com A1 0
20 2 5 Survey A1 7
21 2 6 Com A1 1
22 2 6 Survey A1 0
23 2 7 Com A1 0
24 2 7 Survey A1 5
希望这有帮助!