Question

我需要添加一个均值，方差和标准偏差列（每个主题），但我的数据有点复杂：

我有主题ID，日期＆amp;时间，一年中的几周，总体出勤人数和每周的出勤率。现在我需要的是另外3列，给我每周的平均访问量，出勤率和标准差。

为了更清楚，这是我的数据集的快照：

df <- c(Contact.ID, Date.Time, Week, Attendance, WeeklyAT)

Contact.ID     Date      Time    Week    Attendance  WeeklyAT  *Mean      *v    *sd
1   A       2012-10-06 18:54:48   40          3          2    *0.214   *0.335  *0.579
2   A       2012-10-08 20:50:18   40          3          2    *0.214   *0.335  *0.579  
3   A       2012-11-24 20:18:44   47          3          1    *0.214   *0.335  *0.579  
4   B       2012-11-15 16:58:15   46          4          1 
5   B       2013-01-09 10:57:02    2          4          3
6   B       2013-01-11 17:31:22    2          4          3
7   B       2013-01-14 18:37:00    2          4          3
8   C       2013-02-22 17:46:07    8          2          1
9   C       2013-02-27 11:21:00    9          2          1
10  D       2012-10-28 14:48:33   43          1          1

要计算平均出勤率，需要考虑的是，我正在查看的时间范围为14周，并且每周出勤率都会重复，因此需要与周数绑定。因此，要计算主题A和B的意思，例如它必须是：

meanA =（2 + 1 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0）/14=0.214

meanB =（1 + 3 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0）/14=0.286

（这里14周并不重要，但是对于方差和sd它确实如此：

方差A =Σ（x-μ）^ 2 = [（2-0.214）^ 2 +（1-0.214）^ 2 +（0-0.214）^ 2 +（0-0.214）^ 2 +（0- 0.214）^ 2 +（0-0.214）^ 2 +（0-0.214）^ 2 +（0-0.214）^ 2 +（0-0.214）^ 2 +（0-0.214）^ 2 +（0-0.214） ^ 2 +（0-0.214）^ 2 +（0-0.214）^ 2 +（0-0.214）^ 2] /（14-1）= 4.357 / 13 = 0.335

sdA =√varianceA=√0.335= 0.579

我无法弄清楚如何在代码中执行此操作。我尝试了ifelse个函数和一般var和mean，并尝试使用这些函数创建新列但未能按主题（Contact.ID）和我的n = 14定义它。< / p>

我非常感谢帮助。非常感谢！

Answer 1

这是一个非常粗略的解决方案（需要去）。我假设原始问题中的计算并不完全正确，如果我错了，你应该能够调整我的代码：

编辑1：更新代码 - 因为方差计算中使用的均值错误并添加了一些注释。

# Set to data.table
setDT(df)

# Number of weeks in our data
nweeks <- df[, uniqueN(Week)] # 7 

# Calculate mean number of visits per week
df[, Mean := .N / nweeks, by = .(Contact.ID)]

# Add the rank of the week, this variable is used in the loop below
df <- merge(df,
            df[!duplicated(Week), .(Week, num_week = rank(Week))])

# Set key for tha data.table... makes syntax simpler
setkey(df, Contact.ID, num_week)

# Initalize variance variable
df[, v := 0]

# For each id go through every week and fill in vector of number of visits
# attendance_vector based on which we will calculate variance for each id.
for (id in unique(df$Contact.ID)) {
  attendance_vector <- integer(nweeks)
  mean <- df[id, Mean][1] # mean for this id...
  for (week in unique(df$num_week)) {
    attendance_vector[week] <- 
      df[.(id, week)][1, ifelse(!is.na(WeeklyAT), WeeklyAT, 0)]
  }
  df[id, v := sum((attendance_vector - mean)^2) / (nweeks - 1L)]
  cat("for", id, "the weekly attendance was: \n")
  print(cbind(unique(df$Week), attendance_vector, mean))
}

# Standard deviation
df[, sd := sqrt(v), by = Contact.ID]

# Drop num_week variable
df[, num_week := NULL]
df

    Week Contact.ID       Date     Time Attendance WeeklyAT      Mean         v        sd
 1:   40          A 2012-10-06 18:54:48          3        2 0.4285714 0.6190476 0.7867958
 2:   40          A 2012-10-08 20:50:18          3        2 0.4285714 0.6190476 0.7867958
 3:   47          A 2012-11-24 20:18:44          3        1 0.4285714 0.6190476 0.7867958
 4:    2          B 2013-01-09 10:57:02          4        3 0.5714286 1.2857143 1.1338934
 5:    2          B 2013-01-11 17:31:22          4        3 0.5714286 1.2857143 1.1338934
 6:    2          B 2013-01-14 18:37:00          4        3 0.5714286 1.2857143 1.1338934
 7:   46          B 2012-11-15 16:58:15          4        1 0.5714286 1.2857143 1.1338934
 8:    8          C 2013-02-22 17:46:07          2        1 0.2857143 0.2380952 0.4879500
 9:    9          C 2013-02-27 11:21:00          2        1 0.2857143 0.2380952 0.4879500
10:   43          D 2012-10-28 14:48:33          1        1 0.1428571 0.1428571 0.3779645

Answer 2

数据

df <- structure(list(Contact.ID = 1:10, Date = c("A", "A", "A", "B", 
"B", "B", "B", "C", "C", "D"), Time = c("2012-10-06 18:54:48", 
"2012-10-08 20:50:18", "2012-11-24 20:18:44", "2012-11-15 16:58:15", 
"2013-01-09 10:57:02", "2013-01-11 17:31:22", "2013-01-14 18:37:00", 
"2013-02-22 17:46:07", "2013-02-27 11:21:00", "2012-10-28 14:48:33"
), Week = c(40L, 40L, 47L, 46L, 2L, 2L, 2L, 8L, 9L, 43L), Attendance = c(3L, 

3L, 3L, 4L, 4L, 4L, 4L, 2L, 2L, 1L), WeeklyAT = c(2L, 2L, 1L, 
1L, 3L, 3L, 3L, 1L, 1L, 1L)), .Names = c("Contact.ID", "Date", 
"Time", "Week", "Attendance", "WeeklyAT"), row.names = c(NA, 
-10L), class = c("data.table", "data.frame"))

tidyverse解决方案

library(tidyverse)
df1 <- df %>%
         group_by(Date) %>%
         nest(Week, WeeklyAT) %>%          # nest relevant data
         mutate(data = map(data, ~.x %>% filter(duplicated(Week)==F))) %>%    # filter out duplicated Weeks
         mutate(data = map(data, ~c(.x$WeeklyAT, rep(0, 14-length(.x$WeeklyAT))))) %>%      # make WeeklyAT into 14-element vector
         mutate(data = map(data, ~data.frame(Mean = mean(.x), sd = sd(.x), v = sd(.x)**2))) %>%    # calculate statistics and save as data frame
         unnest(data) %>%        # unnest results
         left_join(df, ., by="Date")       # combine with original data frame

输出

   Contact.ID Date                Time Week Attendance WeeklyAT       Mean
1           1    A 2012-10-06 18:54:48   40          3        2 0.21428571
2           2    A 2012-10-08 20:50:18   40          3        2 0.21428571
3           3    A 2012-11-24 20:18:44   47          3        1 0.21428571
4           4    B 2012-11-15 16:58:15   46          4        1 0.28571429
5           5    B 2013-01-09 10:57:02    2          4        3 0.28571429
6           6    B 2013-01-11 17:31:22    2          4        3 0.28571429
7           7    B 2013-01-14 18:37:00    2          4        3 0.28571429
8           8    C 2013-02-22 17:46:07    8          2        1 0.14285714
9           9    C 2013-02-27 11:21:00    9          2        1 0.14285714
10         10    D 2012-10-28 14:48:33   43          1        1 0.07142857
          sd          v
1  0.5789342 0.33516484
2  0.5789342 0.33516484
3  0.5789342 0.33516484
4  0.8254203 0.68131868
5  0.8254203 0.68131868
6  0.8254203 0.68131868
7  0.8254203 0.68131868
8  0.3631365 0.13186813
9  0.3631365 0.13186813
10 0.2672612 0.07142857

R：每个受试者的平均值，方差和标准偏差列

2 个答案:

数据

tidyverse解决方案

输出