Question

我有一个示例数据集和一个正在编写的函数。目标是让函数为每个区域（Area）内的所有间隔（interval）生成值。也就是说，如果某个 StationID 有 2 个间隔，而同一区域中的另一个 StationID 有 3 个间隔，那么这两个 StationID 最终应具有相同的间隔集合，其中原本不存在的间隔的 Num_scaled 取 0。

以下是一个区域的数据输出结果。

示例数据集

# Example data frame with 20 rows: factor columns Area (levels A1, A2) and
# Gear (levels Com, Survey), numeric columns StationID, interval and
# Num_scaled.
dataset <- data.frame(
  Area = factor(
    rep(c("A1", "A2", "A1", "A2"), times = c(5, 7, 3, 5)),
    levels = c("A1", "A2")
  ),
  StationID = c(
    1, 1, 2, 2, 2, 1, 1, 1, 2, 2,
    3, 3, 1, 2, 2, 1, 1, 2, 3, 3
  ),
  Gear = factor(
    rep(c("Survey", "Com"), times = c(12, 8)),
    levels = c("Com", "Survey")
  ),
  interval = c(
    2, 4, 2, 5, 7, 2, 4, 10, 11, 20,
    7, 20, 3, 3, 6, 7, 10, 4, 2, 3
  ),
  Num_scaled = c(
    1, 3, 2, 7, 5, 4, 4, 2, 3, 3,
    7, 20, 2, 1, 1, 3, 5, 6, 1, 2
  )
)

函数尝试

# Retained only so that scripts referring to `combined` keep working;
# rep_func() builds its own local result and does not read this global.
combined <- data.frame()

# Fill in the missing StationID x Gear x interval combinations per Area.
#
# For each Area in `data`, Num_scaled is summed over every combination of
# the StationID, Gear and interval values that occur in that Area.
# Combinations that never occur get Num_scaled = 0, so all stations in an
# Area end up with the same set of intervals.
#
# Args:
#   data: data frame with columns Area, StationID, Gear, interval and
#         Num_scaled.
#
# Returns:
#   A data frame with columns Area, StationID, Gear and interval (factors,
#   as produced by as.data.frame() on a table) and Num_scaled (numeric),
#   with the rows of all Areas stacked. Zero rows if `data` has no rows
#   with a non-missing Area.
rep_func <- function(data) {
  out_names <- c("Area", "StationID", "Gear", "interval", "Num_scaled")
  area_labels <- as.character(data$Area)
  areas <- unique(area_labels[!is.na(area_labels)])

  per_area <- lapply(areas, function(area) {
    # Compare against the area label itself. A `for` loop over a factor
    # yields character labels, so the earlier `Area == Area[i]` indexed the
    # factor by label, got NA, and produced an empty subset; that empty
    # subset is what triggered the "'names' attribute [5]" error.
    # which() also skips rows whose Area is NA.
    tmp <- droplevels(data[which(area_labels == area), , drop = FALSE])
    cell_sums <- tapply(
      tmp$Num_scaled,
      list(tmp$Area, tmp$StationID, tmp$Gear, tmp$interval),
      sum
    )
    # One row per cell of the 4-way table; cells with no data are NA here.
    area_df <- as.data.frame(as.table(cell_sums))
    names(area_df) <- out_names
    area_df
  })

  if (length(per_area) == 0) {
    return(data.frame(
      Area = factor(character(0)),
      StationID = factor(character(0)),
      Gear = factor(character(0)),
      interval = factor(character(0)),
      Num_scaled = numeric(0)
    ))
  }

  # Bind every area's table once. Previously the rbind sat after the loop's
  # closing brace, so at most the last area could have been kept.
  result <- do.call(rbind, per_area)
  # Only the value column can hold NA; leave the factor columns untouched.
  result$Num_scaled[is.na(result$Num_scaled)] <- 0
  rownames(result) <- NULL
  result
}
# Run rep_func() on the example dataset and store the result in all2.
all2 <- rep_func(data = dataset)

我收到以下错误消息：

Error in names(data.2) <- c("Area", "StationID", "Gear", "interval", "Num_scaled") : 
  'names' attribute [5] must be the same length as the vector [3]

我理解错误消息的含义——向量只有 3 个变量，而 names 赋值语句给出了 5 个名称；数据中本应有 5 个变量。如果用笨办法，即对每个区域分别手动处理，就不会出问题。但我的真实数据集要大得多，所以希望用一个函数来完成。

R info：

R version 3.3.2 (2016-10-31)
Platform: i386-w64-mingw32/i386 (32-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

Answer 1

对于 unique(data$Area)，需要注意 Area 是因子（factor）。为了让您的函数生成 A1 的间隔，请将其改为 seq_along(unique(data$Area))。

如果只使用基础 R（base R），您可能需要的是：

# intervals
# Build one completed table per Area: every StationID x Gear x interval
# combination observed in that Area, with Num_scaled summed per cell.
all2 <- lapply(seq_along(unique(dataset[, 1])), function(i, x = dataset) {
  # Select rows by the i-th distinct Area value. Comparing against x[i, 1]
  # (the Area of row i) picked "A1" twice, because rows 1 and 2 of the
  # example data are both A1, so A2 was never processed.
  area <- unique(x[, 1])[i]
  # droplevels() removes the other Area's factor level; otherwise it adds
  # spurious all-zero rows for that Area to this Area's grid.
  area_rows <- droplevels(x[x[, 1] == area, ])
  d1 <- as.table(by(area_rows$Num_scaled, as.list(area_rows[, -5]), sum))
  d2 <- setNames(as.data.frame(d1),
                 c("Area", "StationID", "Gear", "interval", "Num_scaled"))
  # Combinations with no observations come back as NA; report them as 0.
  d2[is.na(d2)] <- 0
  d2
})

# binding the two lists into a df
all3 <- unique(do.call(rbind, all2))

# splitting it into the areas
all.lst <- split(all3, all3[, 1])

# yields one df for each area in a list
all.lst
# $A1
#    Area StationID   Gear interval Num_scaled
# 1    A1         1    Com        2          0
# 2    A1         2    Com        2          0
# 3    A1         1 Survey        2          1
# 4    A1         2 Survey        2          2
# 5    A1         1    Com        3          2
# 6    A1         2    Com        3          1
# 7    A1         1 Survey        3          0
# 8    A1         2 Survey        3          0
# 9    A1         1    Com        4          0
# 10   A1         2    Com        4          0
# 11   A1         1 Survey        4          3
# 12   A1         2 Survey        4          0
# 13   A1         1    Com        5          0
# 14   A1         2    Com        5          0
# 15   A1         1 Survey        5          0
# 16   A1         2 Survey        5          7
# 17   A1         1    Com        6          0
# 18   A1         2    Com        6          1
# 19   A1         1 Survey        6          0
# 20   A1         2 Survey        6          0
# 21   A1         1    Com        7          0
# 22   A1         2    Com        7          0
# 23   A1         1 Survey        7          0
# 24   A1         2 Survey        7          5
#
# $A2
#    Area StationID   Gear interval Num_scaled
# 25   A2         1    Com        2          0
# 26   A2         2    Com        2          0
# 27   A2         3    Com        2          1
# 28   A2         1 Survey        2          4
# 29   A2         2 Survey        2          0
# 30   A2         3 Survey        2          0
# 31   A2         1    Com        3          0
# 32   A2         2    Com        3          0
# 33   A2         3    Com        3          2
# 34   A2         1 Survey        3          0
# 35   A2         2 Survey        3          0
# 36   A2         3 Survey        3          0
# 37   A2         1    Com        4          0
# 38   A2         2    Com        4          6
# 39   A2         3    Com        4          0
# 40   A2         1 Survey        4          4
# 41   A2         2 Survey        4          0
# 42   A2         3 Survey        4          0
# 43   A2         1    Com        7          3
# 44   A2         2    Com        7          0
# 45   A2         3    Com        7          0
# 46   A2         1 Survey        7          0
# 47   A2         2 Survey        7          0
# 48   A2         3 Survey        7          7
# 49   A2         1    Com       10          5
# 50   A2         2    Com       10          0
# 51   A2         3    Com       10          0
# 52   A2         1 Survey       10          2
# 53   A2         2 Survey       10          0
# 54   A2         3 Survey       10          0
# 55   A2         1    Com       11          0
# 56   A2         2    Com       11          0
# 57   A2         3    Com       11          0
# 58   A2         1 Survey       11          0
# 59   A2         2 Survey       11          3
# 60   A2         3 Survey       11          0
# 61   A2         1    Com       20          0
# 62   A2         2    Com       20          0
# 63   A2         3    Com       20          0
# 64   A2         1 Survey       20          0
# 65   A2         2 Survey       20          3
# 66   A2         3 Survey       20         20

# access df for Area 1
all.lst[[1]]

Answer 2

我相信这可以做你想要的：

library(tidyr)
library(dplyr)

# Convert every column of `dataset` (factor and numeric alike) to
# character, in place.
dataset[] <- lapply(dataset, as.character)

# Within each Area, expand to every StationID x interval x Gear
# combination; combinations absent from the data get Num_scaled = 0.
# NOTE(review): Area is both the grouping column and a complete() argument,
# and the numeric fill value 0 targets a column that is character at this
# point. Newer tidyr releases may reject one or both - confirm against the
# installed version.
df <- complete(
  group_by(dataset, Area),
  StationID, interval, Gear, Area,
  fill = list(Num_scaled = 0)
)

# Show example output for Area A1
as.data.frame(filter(df, Area == "A1"))

输出：

   StationID interval   Gear Area Num_scaled
1          1        2    Com   A1          0
2          1        2 Survey   A1          1
3          1        3    Com   A1          2
4          1        3 Survey   A1          0
5          1        4    Com   A1          0
6          1        4 Survey   A1          3
7          1        5    Com   A1          0
8          1        5 Survey   A1          0
9          1        6    Com   A1          0
10         1        6 Survey   A1          0
11         1        7    Com   A1          0
12         1        7 Survey   A1          0
13         2        2    Com   A1          0
14         2        2 Survey   A1          2
15         2        3    Com   A1          1
16         2        3 Survey   A1          0
17         2        4    Com   A1          0
18         2        4 Survey   A1          0
19         2        5    Com   A1          0
20         2        5 Survey   A1          7
21         2        6    Com   A1          1
22         2        6 Survey   A1          0
23         2        7    Com   A1          0
24         2        7 Survey   A1          5

希望这有帮助！

带循环的 R 函数：为唯一因子水平补全缺失值

2 个答案: