通过基于组条件创建重复项来扩展data.frame(3)

时间:2017-05-29 17:00:23

标签: r dataframe duplicates rows

本问题承接此前的一个 SO 问题。

示例data.frame

# Sample data: one row per ID observation, with the Day it was recorded on,
# the per-day Count and the multi-day Count_group described in the text below.
df <- read.table(
  header = TRUE,
  text = 'ID  Day Count   Count_group
            18  1933    6   15
            33  1933    6   15
            37  1933    6   15
            18  1933    6   15
            16  1933    6   15
            11  1933    6   15
            111 1932    5   9
            34  1932    5   9
            60  1932    5   9
            88  1932    5   9
            18  1932    5   9
            33  1931    3   4
            13  1931    3   4
            56  1931    3   4
            23  1930    1   1
            6   1800    6   12
            37  1800    6   12
            98  1800    6   12
            52  1800    6   12
            18  1800    6   12
            76  1800    6   12
            55  1799    4   6
            6   1799    4   6
            52  1799    4   6
            133 1799    4   6
            112 1798    2   2
            677 1798    2   2
            778 888     4   8
            111 888     4   8
            88  888     4   8
            10  888     4   8
            37  887     2   4
            26  887     2   4
            8   886     1   2
            56  885     1   1
            22  120     2   6
            34  120     2   6
            88  119     1   6
            99  118     2   5
            12  118     2   5
            90  117     1   3
            22  115     2   2
            99  115     2   2'
)

Count 列显示每个 Day 的 ID 值总数,而 Count_group 列显示每个 Day、Day - 1、Day - 2、Day - 3 和 Day - 4 的 ID 值总数之和。

例如:Day 1933 的 Count_group = 15,因为 Count 6(1933)+ Count 5(1932)+ Count 3(1931)+ Count 1(1930)+ Count 0(1929)= 15。

我需要做的是为每个 Count_group 创建重复的观测行并将其添加进去,以便为每个 Count_group 展示其 Day、Day - 1、Day - 2、Day - 3 和 Day - 4 的观测行。

例如:Count_group = 15 由 Day 1933、1932、1931、1930(以及 df 中不存在的 1929)的 Count 值组成。所以这五天需要包含在 Count_group = 15 中。下一个是 Count_group = 9,由 1932、1931、1930、1929 和 1928 组成;依此类推……

期望的输出:

Count_group

(请注意,为了使它们更清晰,每个5天的不同组用空行分隔)

我有不同的data.frames,按n天分组,因此我想专门为每个代码调整代码(稍微改一下)。

谢谢

2 个答案:

答案 0 :(得分:1)

我附上了一种相当机械的方法,但我相信这是一个很好的起点。 我注意到在原始表中的条目

ID Day Count Count_group 18 1933 6 14

重复;为了清楚起见,我没有动过它。

方法的结构:

  1. 读取原始数据
  2. 为每个 Day 生成数据框列表
  3. 生成最终数据框,将第 2 步得到的列表折叠。

    1. 读取原始数据

    我们从

    开始
    # Original data used by this answer: one row per ID observation with its
    # Day, the per-day Count and the Count_group to be explained.
    df <- read.table(
      header = TRUE,
      text = 'ID  Day Count   Count_group
                    18  1933    6   14
                    33  1933    6   14
                    37  1933    6   14
                    18  1933    6   14
                    16  1933    6   14
                    11  1933    6   14
                    111 1932    5   9
                    34  1932    5   9
                    60  1932    5   9
                    88  1932    5   9
                    18  1932    5   9
                    33  1931    3   4
                    13  1931    3   4
                    56  1931    3   4
                    23  1930    1   1
                    6   1800    6   12
                    37  1800    6   12
                    98  1800    6   12
                    52  1800    6   12
                    18  1800    6   12
                    76  1800    6   12
                    55  1799    4   6
                    6   1799    4   6
                    52  1799    4   6
                    133 1799    4   6
                    112 1798    2   2
                    677 1798    2   2
                    778 888     4   7
                    111 888     4   7
                    88  888     4   7
                    10  888     4   7
                    37  887     2   4
                    26  887     2   4
                    8   886     1   2
                    56  885     1   1'
    )

    # distinct values of "Day", in increasing order
    ord_day <- sort(unique(df$Day))
    ord_day
     [1]  885  886  887  888 1798 1799 1800 1930 1931 1932 1933
    

    2. 为每个 Day 生成数据框列表

    对于 ord_day 中的每个元素,我们引入一个 data.frame,作为名为 df_new_aug 的列表的元素。这些数据框通过 for 循环为 ord_day 中除 ord_day[1] 和 ord_day[2] 之外的所有值定义,这两个值将单独处理。

    循环背后的思路:对于每个 i > 2 的 ord_day[i],我们检查 ord_day[i-1]、ord_day[i-2](或两者!)中的哪些天(通过变量 "Count")对 ord_day[i] 处的 "Count_Group" 值有贡献。

    因此,我们在 for 循环中引入 if else 语句(代码见下文)。

    3. 生成最终数据框,将第 2 步得到的列表折叠。

    # Generation of the list of data.frames, one element per day in ord_day
    # (days > 886 are handled in the loop, the first two days manually below)
    #-----------------------------------------------------------------

    # Distinct value(s) of `column` among the rows of `day`; the comparisons
    # below assume every day carries exactly one such value.
    day_value <- function(day, column) {
      unique(df[df$Day == day, column])
    }

    # Rows of df belonging to `days`, restricted to columns ID | Day | Count
    rows_of_days <- function(days) {
      df[df$Day %in% days, c("ID", "Day", "Count")]
    }

    # Append to `rows` the Count_group of the day ord_day[i]
    with_count_group <- function(rows, i) {
      data.frame(
        rows,
        Count_group = rep(day_value(ord_day[i], "Count_group"), nrow(rows))
      )
    }

    df_new <- vector("list", length(ord_day))
    df_new_aug <- vector("list", length(ord_day))

    # we exclude cases i = 1, 2: they are manually treated below.
    # seq_along()[-(1:2)] is empty when ord_day has fewer than three entries,
    # whereas 3:length(ord_day) would count down (3, 2) in that case.
    for (i in seq_along(ord_day)[-(1:2)]) {
      count_group <- day_value(ord_day[i], "Count_group")
      count_0 <- day_value(ord_day[i], "Count")
      count_1 <- day_value(ord_day[i - 1], "Count")
      count_2 <- day_value(ord_day[i - 2], "Count")

      if (count_group == count_0 + count_1 + count_2) {
        # "Count" at i, i-1 and i-2 all contribute to "Count_group" at i
        contributing <- ord_day[(i - 2):i]
      } else if (count_group == count_0 + count_1) {
        # only "Count" at i and i-1 contribute to "Count_group" at i
        contributing <- ord_day[(i - 1):i]
      } else {
        # only "Count" at i contributes to "Count_group" at i
        contributing <- ord_day[i]
      }

      # columns ID | Day | Count of the contributing days ...
      df_new[[i]] <- rows_of_days(contributing)
      # ... plus the Count_group of the day in ord_day[i]
      df_new_aug[[i]] <- with_count_group(df_new[[i]], i)
    }

    # for ord_day[2] = "886" (both "Count" at i = 2 and i = 1 contribute to
    # "Count_group" at i = 2)
    #-----------------------------------------------------------------
    if (length(ord_day) >= 2) {
      df_new[[2]] <- rows_of_days(ord_day[1:2])
      df_new_aug[[2]] <- with_count_group(df_new[[2]], 2)
    }

    # for ord_day[1] = "885" (only "Count" at i = 1 contributes to
    # "Count_group" at i = 1)
    #-----------------------------------------------------------------
    df_new[[1]] <- rows_of_days(ord_day[1])
    df_new_aug[[1]] <- with_count_group(df_new[[1]], 1)

    # produced list
    df_new_aug

    我们通过一个丑陋的循环来折叠列表 df_new_aug,但其他解决方案(例如 Reduce())也可以:

    merge()

    # merging the list (mechanically): final result
    # The index runs over every element after the first instead of a
    # hard-coded 1:10, so the loop follows the actual length of df_new_aug.
    # (do.call(rbind, df_new_aug) is the usual one-line alternative.)
    df_result <- df_new_aug[[1]]
    for (i in seq_along(df_new_aug)[-1]) {
      df_result <- rbind(df_result, df_new_aug[[i]])
    }

    最终得到 df_result,分析到此结束。

答案 1 :(得分:1)

我之前回答的一般化版本......

# first add grouping variables
days <- 5 # grouping no of days

# Change in Day between each row and the row before it. Using diff() keeps
# this vectorized and also works for a one-row df, where 2:nrow(df) would
# count down.
day_step <- diff(df$Day)

# individual days: the counter increases whenever Day differs from the
# previous row
df$smalldaygroup <- c(0, cumsum(day_step != 0))

# blocks of linked days: a new block starts when Day falls more than
# days - 1 below the previous row's Day
df$bigdaygroup <- c(0, cumsum(day_step < -(days - 1)))

# duplicate days in each big group
# For every block of linked days this returns a list with
#   df:  one data.frame per (base day, repeated day) pair that is kept
#   pat: the matching rows of `pattern` (base, rep, offset)
df2 <- lapply(split(df, df$bigdaygroup), function(x) {
  dayvec <- max(x$Day):min(x$Day) # possible days in range (descending)
  daylog <- dayvec[dayvec %in% x$Day] # actual days in range
  # every possible day is the base of `days` slots
  pattern <- data.frame(base = rep(dayvec, each = days))
  # day to repeat in each slot: base, base - 1, ..., base - days + 1
  # (vectorized form of counting the position of each row within its base)
  pattern$rep <- pattern$base - rep(seq_len(days) - 1, times = length(dayvec))
  # offsets (used later): distance in daylog between repeated day and base
  pattern$offset <- match(pattern$rep, daylog) - match(pattern$base, daylog)
  # remove invalid elements: both days must actually occur in x
  pattern <- pattern[(pattern$base %in% x$Day) & (pattern$rep %in% x$Day), ]
  # store pattern in list as offsets needed in next loop
  list(
    df = split(x, x$smalldaygroup)[match(pattern$rep, daylog)],
    pat = pattern
  )
})

# change the Count_group to previous value in added entries
# Each element of df2 is a list with `df` (list of data.frames) and `pat`
# (pattern rows carrying an `offset` column); the result keeps only the
# updated data.frames.
df2 <- lapply(df2, function(L) {
  # pointer to day to copy Count_group from (the same for every i)
  offset <- L$pat$offset
  lapply(seq_along(L$df), function(i) {
    x <- L$df[[i]]
    # element i - offset[i] is the entry whose Count_group is copied
    x$Count_group <- L$df[[i - offset[i]]]$Count_group[1]
    x
  })
})

# bind back together: flatten the two-level list, then stack all data.frames
df2 <- do.call(rbind, unlist(df2, recursive = FALSE))

# remove grouping variables (by name rather than by column position, so this
# does not depend on how many columns df had to begin with)
df2[c("smalldaygroup", "bigdaygroup")] <- NULL

head(df2, 30) # ignore rownames!

       ID  Day Count Count_group
01.1   18 1933     6          15
01.2   33 1933     6          15
01.3   37 1933     6          15
01.4   18 1933     6          15
01.5   16 1933     6          15
01.6   11 1933     6          15
02.7  111 1932     5          15
02.8   34 1932     5          15
02.9   60 1932     5          15
02.10  88 1932     5          15
02.11  18 1932     5          15
03.12  33 1931     3          15
03.13  13 1931     3          15
03.14  56 1931     3          15
04     23 1930     1          15
05.7  111 1932     5           9
05.8   34 1932     5           9
05.9   60 1932     5           9
05.10  88 1932     5           9
05.11  18 1932     5           9
06.12  33 1931     3           9
06.13  13 1931     3           9
06.14  56 1931     3           9
07     23 1930     1           9
08.12  33 1931     3           4
08.13  13 1931     3           4
08.14  56 1931     3           4
09     23 1930     1           4
010    23 1930     1           1
11.16   6 1800     6          12