根据R

时间:2016-07-19 19:13:09

标签: r dataframe

我是R的新手并且学习基本的东西。我在R中有一个数据框,其中包含controller_id,user_id,mth_id,col_val1等列,直到col_val100。

# Example data: one row per controller_id / user_id / mth_id combination with
# two value columns. Controller "Z" has no user_id (NA) in either month.
df <- data.frame(
  controller_id = rep(c("X", "Y", "Z"), times = c(6, 6, 2)),
  user_id = c(rep(c("A", "B", "C"), times = 2),
              rep(c("P", "Q", "R"), times = 2),
              NA, NA),
  mth_id = c(rep(rep(c("1393", "1398"), each = 3), times = 2),
             "1393", "1398"),
  col_val1 = c(5, 4, 6, 3, 1, 10, 12, 15, 18, 13, 19, 1, 5, 2),
  col_val2 = c(8, 12, 9, 2, 12, 5, 7, 9, 11, 4, 0, 7, 10, 5)
)

> df
   controller_id user_id mth_id col_val1 col_val2
1              X       A   1393        5        8
2              X       B   1393        4       12
3              X       C   1393        6        9
4              X       A   1398        3        2
5              X       B   1398        1       12
6              X       C   1398       10        5
7              Y       P   1393       12        7
8              Y       Q   1393       15        9
9              Y       R   1393       18       11
10             Y       P   1398       13        4
11             Y       Q   1398       19        0
12             Y       R   1398        1        7
13             Z    <NA>   1393        5       10
14             Z    <NA>   1398        2        5

我想要的是根据特定的user_id,mth_id计算每个controller_id的col_values的差异,并仅显示随着mth_id的增加而减少的col_values。

例如:对于controller_id = X,我们有3个user_id(A、B、C),对应两个不同的mth_id。代码应该为这3个user_id分别计算mth_id 1398和1393的col_val1之间的差异,如果该差异 < 0,那么我想要如下输出:

Col_val1 for controller_id 'X', user_id 'A' has decreased from 5 to 3

如果给定的controller_id没有关联的user_id,那么它应该计算controller_id本身之间的列值差异。

理想情况下,我希望将这些输出存储在列表/数据框中以供以后使用。此外,该代码需要处理数据框中约900列。

非常感谢任何帮助。

1 个答案:

答案 0 :(得分:1)

考虑一个基于分组累计求和的base R解决方案。要遍历所有列,请使用sapply()传入列名:

# Change of one value column relative to the same user's previous row.
#
# Reads the global data frame `df`. For every row it returns df[[col]] minus
# the value of df[[col]] in the closest EARLIER row (in row order) that has
# the same user_id.
#
# Args:
#   col: name (character scalar) of a numeric column of `df`.
#
# Returns:
#   A numeric vector with one element per row of `df`. The element is NA for
#   the first row of each user_id and for rows whose user_id is missing.
rowdiff <- function(col) {
  values <- df[[col]]
  stopifnot(is.numeric(values))
  user <- as.character(df[["user_id"]])
  n_rows <- length(values)

  # prev_row[i] is the index of the closest earlier row with the same user_id,
  # or NA when there is none. split() drops rows whose user_id is NA, so those
  # rows keep NA. Looking up the single previous row (instead of summing every
  # earlier value of the group) keeps the result correct when a user has more
  # than two rows.
  prev_row <- rep(NA_integer_, n_rows)
  for (rows in split(seq_len(n_rows), user)) {
    prev_row[rows[-1L]] <- rows[-length(rows)]
  }

  # Indexing with NA yields NA, which propagates through the subtraction.
  values - values[prev_row]
}


# Append one difference column per value column of `df`.
# The value columns are found by excluding the identifier columns by name
# rather than by position, so mth_id is never passed to rowdiff() whether or
# not `df` carries a controller_id column.
# simplify = FALSE keeps a named list of columns, so data.frame() builds one
# column per value column even when `df` has a single row.
diff_cols <- setdiff(names(df), c("controller_id", "user_id", "mth_id"))
finaldf <- cbind(
  df,
  data.frame(sapply(diff_cols, rowdiff, simplify = FALSE))
)

#   user_id mth_id col_val1 col_val2 col_val3 col_val1 col_val2 col_val3
# 1       A   1398        4        2       12       NA       NA       NA
# 2       B   1398        3        3       30       NA       NA       NA
# 3       C   1398        1        1       14       NA       NA       NA
# 4       A   1393        5        7        7        1        5       -5
# 5       B   1393        2        6       18       -1        3      -12
# 6       C   1393        7        0        9        6       -1       -5
# 7       D   1398        4        5       12       NA       NA       NA
# 8       D   1393        0        3       24       -4       -2       12

如果你需要输出文字形式的陈述:

# Describe, in words, how one value column changed relative to the previous
# row of the same controller_id / user_id pair.
#
# Reads the global data frame `df`. Rows are compared in row order: each row
# is compared with the closest EARLIER row that has the same controller_id
# and the same user_id.
#
# Args:
#   col: name (character scalar) of a numeric column of `df`.
#
# Returns:
#   A character vector with one element per row of `df`, e.g.
#   "col_val1 for controller_id 'X', user_id 'A' has decreased from 5 to 3".
#   The element is NA when no comparison is possible: the first row of a
#   controller_id / user_id pair, rows whose controller_id or user_id is
#   missing, or rows where either compared value is NA.
statements <- function(col) {
  values <- df[[col]]
  controller <- as.character(df[["controller_id"]])
  user <- as.character(df[["user_id"]])
  n_rows <- length(values)

  # One grouping key per row; the same key decides both "is there a previous
  # row" and "which previous value", so a user_id that appears under several
  # controllers is never compared across controllers.
  key <- paste(controller, user, sep = "\037")
  key[is.na(controller) | is.na(user)] <- NA

  # prev_row[i] is the index of the closest earlier row with the same key, or
  # NA when there is none. split() drops rows whose key is NA.
  prev_row <- rep(NA_integer_, n_rows)
  for (rows in split(seq_len(n_rows), key)) {
    prev_row[rows[-1L]] <- rows[-length(rows)]
  }

  previous <- values[prev_row]
  delta <- values - previous
  changeword <- ifelse(delta < 0, "decreased",
                       ifelse(delta > 0, "increased", "not changed"))

  # Only rows with a defined delta get a statement; everything else stays NA.
  out <- rep(NA_character_, n_rows)
  has_change <- !is.na(delta)
  out[has_change] <- paste0(
    col, " for controller_id '", controller, "', user_id '", user,
    "' has ", changeword, " from ", previous, " to ", values
  )[has_change]
  out
}
# Run statements() on every column of `df` from the fourth onward and append
# the resulting text columns to `df`.
finaldf <- cbind(
  df,
  data.frame(sapply(names(df[4:ncol(df)]), statements))
)

输出

                                                                  col_val1
1                                                                     <NA>
2                                                                     <NA>
3                                                                     <NA>
4    col_val1 for controller_id 'X', user_id 'A' has decreased from 5 to 3
5    col_val1 for controller_id 'X', user_id 'B' has decreased from 4 to 1
6   col_val1 for controller_id 'X', user_id 'C' has increased from 6 to 10
7                                                                     <NA>
8                                                                     <NA>
9                                                                     <NA>
10 col_val1 for controller_id 'Y', user_id 'P' has increased from 12 to 13
11 col_val1 for controller_id 'Y', user_id 'Q' has increased from 15 to 19
12  col_val1 for controller_id 'Y', user_id 'R' has decreased from 18 to 1
13                                                                    <NA>
14                                                                    <NA>
                                                                    col_val2
1                                                                       <NA>
2                                                                       <NA>
3                                                                       <NA>
4      col_val2 for controller_id 'X', user_id 'A' has decreased from 8 to 2
5  col_val2 for controller_id 'X', user_id 'B' has not changed from 12 to 12
6      col_val2 for controller_id 'X', user_id 'C' has decreased from 9 to 5
7                                                                       <NA>
8                                                                       <NA>
9                                                                       <NA>
10     col_val2 for controller_id 'Y', user_id 'P' has decreased from 7 to 4
11     col_val2 for controller_id 'Y', user_id 'Q' has decreased from 9 to 0
12    col_val2 for controller_id 'Y', user_id 'R' has decreased from 11 to 7
13                                                                      <NA>
14                                                                      <NA>