Question

我想使用R生成按某些列分组的唯一数字（序列号或随机数）。

下面提供了一个示例数据集

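fact_code  style_         item             buyer
1206       -23            LADIES TANK TOP  652
1206       -23            LADIES TANK TOP  652
1206       -23            LADIES TANK TOP  652
1214       593935_592435  SS T-SHIRT       254
1214       593935_592435  SS T-SHIRT       254
1214       593935_592435  SS T-SHIRT       254
7022       1572472        T-SHIRT          338
7022       1572472        T-SHIRT          338
7022       1572472        T-SHIRT          338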

使用上面的数据，我想创建一个变量，例如style_serial，如下所示：

fact_code  style_         item             buyer style_serial
1206       -23            LADIES TANK TOP  652   1
1206       -23            LADIES TANK TOP  652   1
1206       -23            LADIES TANK TOP  652   1
1214       593935_592435  SS T-SHIRT       254   2
1214       593935_592435  SS T-SHIRT       254   2
1214       593935_592435  SS T-SHIRT       254   2
7022       1572472        T-SHIRT          338   3
7022       1572472        T-SHIRT          338   3
7022       1572472        T-SHIRT          338   3

也就是说，创建一个变量，该变量按fact_code、style_、item和buyer列分组取唯一值。我已经使用dplyr包尝试了以下R代码：

df <- df %>% dplyr::group_by(fact_code, style_, buyer) %>% dplyr::mutate(style_serial = 1:n())

其中df是上述示例数据帧的名称。但这给了我意外的输出：

fact_code  style_         item             buyer style_serial
1206       -23            LADIES TANK TOP  652   1
1206       -23            LADIES TANK TOP  652   2
1206       -23            LADIES TANK TOP  652   3
1214       593935_592435  SS T-SHIRT       254   1
1214       593935_592435  SS T-SHIRT       254   2
1214       593935_592435  SS T-SHIRT       254   3
7022       1572472        T-SHIRT          338   1
7022       1572472        T-SHIRT          338   2
7022       1572472        T-SHIRT          338   3

我不介意style_serial是否是随机化的整数集，以便数据看起来像这样：

fact_code  style_         item             buyer style_serial
1206       -23            LADIES TANK TOP  652   10
1206       -23            LADIES TANK TOP  652   10
1206       -23            LADIES TANK TOP  652   10
1214       593935_592435  SS T-SHIRT       254   2
1214       593935_592435  SS T-SHIRT       254   2
1214       593935_592435  SS T-SHIRT       254   2
7022       1572472        T-SHIRT          338   100
7022       1572472        T-SHIRT          338   100
7022       1572472        T-SHIRT          338   100

要生成上表，请运行以下R代码：

但是，我无法获得所需的输出。

主要目的是创建一个变量style_serial，使其按若干列（在本例中为fact_code、style_、item和buyer）分组后，每个分组取一个唯一值。

任何帮助将不胜感激。

Answer 1

我们可以使用dplyr中的group_indices

library(dplyr)
# Give every (fact_code, style_, buyer) combination one random integer that is
# shared by all rows of that combination.
#
# Step 1 stores the group index in style_serial; step 2 replaces it with the
# element at that position of a random permutation, so distinct groups always
# receive distinct numbers.
#
# The permutation covers at least 1..6000 and grows with the number of groups:
# with a fixed pool of 6000, any group index above 6000 would index past the
# end of the permutation and silently yield NA.
df %>%
  mutate(
    style_serial = group_indices(., fact_code, style_, buyer),
    style_serial = sample(max(6000L, style_serial))[style_serial]
  )
# fact_code        style_            item buyer style_serial
#1      1206           -23 LADIES TANK TOP   652         5778
#2      1206           -23 LADIES TANK TOP   652         5778
#3      1206           -23 LADIES TANK TOP   652         5778
#4      1214 593935_592435      SS T-SHIRT   254          998
#5      1214 593935_592435      SS T-SHIRT   254          998
#6      1214 593935_592435      SS T-SHIRT   254          998
#7      7022       1572472         T-SHIRT   338         3018
#8      7022       1572472         T-SHIRT   338         3018
#9      7022       1572472         T-SHIRT   338         3018

注意：这里的编号是通过sample随机抽取的；如果不需要随机编号，请去掉sample部分

# Non-random variant: style_serial is simply the index of the
# (fact_code, style_, buyer) group each row belongs to, so rows with the same
# key combination share the same number.
mutate(df, style_serial = group_indices(df, fact_code, style_, buyer))

或使用base R

# Base R alternative: build one key string per row from the grouping columns,
# then number the keys in order of first appearance.
#
# Columns are selected by name (not by position) so the grouping matches the
# dplyr snippets and does not depend on column order. The ASCII unit separator
# is used as the delimiter because the default " " could merge different
# combinations into the same key (item and style values may contain spaces).
v1 <- do.call(
  paste,
  c(as.list(df[c("fact_code", "style_", "buyer")]), list(sep = "\x1f"))
)
# match() returns the position of each key among the unique keys, i.e. the
# serial number of its group.
df$style_serial <- match(v1, unique(v1))

数据

# Sample data: three groups of three identical rows each.
# fact_code and buyer are integer columns; style_ and item are character
# columns (stringsAsFactors = FALSE keeps them as strings on R < 4.0 too).
df <- data.frame(
  fact_code = rep(c(1206L, 1214L, 7022L), each = 3L),
  style_ = rep(c("-23", "593935_592435", "1572472"), each = 3L),
  item = rep(c("LADIES TANK TOP", "SS T-SHIRT", "T-SHIRT"), each = 3L),
  buyer = rep(c(652L, 254L, 338L), each = 3L),
  stringsAsFactors = FALSE
)

Answer 2

完全dplyr解决方案，创建一个查找表并将其连接到基本表上。

# Build a lookup table with one row per (fact_code, style_, buyer) combination
# and number those rows 1..k.
# summarise() with no arguments collapses each group to a single row that
# holds only the grouping columns.
# row_number() is namespace-qualified like every other dplyr call here, so the
# snippet does not fail when only the pipe (and not dplyr) is attached.
serial_df <- df %>%
  dplyr::group_by(fact_code, style_, buyer) %>%
  dplyr::summarise() %>%
  dplyr::ungroup() %>%
  dplyr::mutate(style_serial = dplyr::row_number())

# Attach the serial to every original row; with no `by`, the join uses the
# columns the two tables have in common (the three grouping columns).
dplyr::left_join(df, serial_df)
#> Joining, by = c("fact_code", "style_", "buyer")
#>   fact_code        style_            item buyer style_serial
#> 1      1206           -23 LADIES TANK TOP   652            1
#> 2      1206           -23 LADIES TANK TOP   652            1
#> 3      1206           -23 LADIES TANK TOP   652            1
#> 4      1214 593935_592435      SS T-SHIRT   254            2
#> 5      1214 593935_592435      SS T-SHIRT   254            2
#> 6      1214 593935_592435      SS T-SHIRT   254            2
#> 7      7022       1572472         T-SHIRT   338            3
#> 8      7022       1572472         T-SHIRT   338            3
#> 9      7022       1572472         T-SHIRT   338            3

如果您希望将其作为“单线”：

# Same approach as a single expression: the lookup table (one numbered row per
# fact_code / style_ / buyer combination) is built inline and joined back onto
# df through the columns both tables share.
# row_number() is namespace-qualified like the other dplyr calls so the
# snippet does not depend on dplyr being attached.
df <- df %>% dplyr::left_join(
  df %>%
    dplyr::group_by(fact_code, style_, buyer) %>%
    dplyr::summarise() %>%
    dplyr::ungroup() %>%
    dplyr::mutate(style_serial = dplyr::row_number())
  )
#> Joining, by = c("fact_code", "style_", "buyer")
#>   fact_code        style_            item buyer style_serial
#> 1      1206           -23 LADIES TANK TOP   652            1
#> 2      1206           -23 LADIES TANK TOP   652            1
#> 3      1206           -23 LADIES TANK TOP   652            1
#> 4      1214 593935_592435      SS T-SHIRT   254            2
#> 5      1214 593935_592435      SS T-SHIRT   254            2
#> 6      1214 593935_592435      SS T-SHIRT   254            2
#> 7      7022       1572472         T-SHIRT   338            3
#> 8      7022       1572472         T-SHIRT   338            3
#> 9      7022       1572472         T-SHIRT   338            3

由reprex package（v0.2.1）于2019-02-06创建

Answer 3

您可以使用data.table中的rleid，即

library(dplyr)
# rleid() assigns a run-length id: the counter increases every time the
# combination of the supplied columns changes from one row to the next.
# NOTE: this only yields one id per group when all rows of a group are
# adjacent (e.g. the data is sorted by the key columns); otherwise the same
# combination gets a new id each time it reappears.
# The result is stored as style_serial and grouped by fact_code, style_ and
# buyer so that it matches the other snippets in this file.
df %>%
  mutate(style_serial = data.table::rleid(fact_code, style_, buyer))

Answer 4

使用dplyr而没有其他软件包的方式：

# dplyr-only run counter: style_serial increases by one whenever the
# (fact_code, style_, buyer) key differs from the key of the previous row.
# NOTE: like any run-based id, this assumes rows of the same group are
# adjacent.
df %>%
  mutate(
    # Build the key once. The ASCII unit separator keeps neighbouring fields
    # apart, so e.g. (12, "3") and (1, "23") no longer collapse into the same
    # key as they would with paste0().
    .key = paste(fact_code, style_, buyer, sep = "\x1f"),
    # lag() is NA on the first row, so the comparison is NA there; coalesce()
    # turns that into 1 so the counter starts at 1.
    style_serial = cumsum(coalesce(as.numeric(.key != lag(.key)), 1)),
    # Drop the helper column again.
    .key = NULL
  )

仅使用data.table：

library(data.table)

# Convert df to a data.table by reference (no copy is made) ...
setDT(df)
# ... then add style_serial in place: .GRP is the running group counter for
# each (fact_code, style_, buyer) combination.
df[, style_serial := .GRP, by = c("fact_code", "style_", "buyer")]

两种情况下的输出：

   fact_code        style_          item buyer style_serial
1:      1206           -23 LADIESTANKTOP   652            1
2:      1206           -23 LADIESTANKTOP   652            1
3:      1206           -23 LADIESTANKTOP   652            1
4:      1214 593935_592435     SST-SHIRT   254            2
5:      1214 593935_592435     SST-SHIRT   254            2
6:      1214 593935_592435     SST-SHIRT   254            2
7:      7022       1572472       T-SHIRT   338            3
8:      7022       1572472       T-SHIRT   338            3
9:      7022       1572472       T-SHIRT   338            3

使用dplyr在R中使用group_by生成唯一/随机序列号

4 个答案:

数据