用不添加任何信息的列扩展数据框

时间:2018-10-26 09:11:03

标签: r dplyr tidyr

我有如下数据:

# Reproducible example data (dput-style literal): a 50-row tibble with columns
#   data        - Date column (note the name is `data`), day counts 16983..16989
#   media       - character: "Newspapers", "Magazines", "Outdoor", "Radio"
#   brand_short - character: "Brand A" .. "Brand F"
#   label       - character of the form "ntv0_<media>_<brand_short>"
#   var1, var2  - numeric measures
# Only some data/media/brand_short combinations are present: 50 rows versus
# 7 dates x 4 media x 6 brands = 168 possible combinations.
ex <- structure(list(data = structure(c(16983, 16983, 16983, 16983, 
16983, 16983, 16983, 16983, 16983, 16983, 16983, 16983, 16983, 
16983, 16983, 16983, 16983, 16983, 16983, 16984, 16984, 16984, 
16984, 16984, 16984, 16985, 16985, 16985, 16985, 16985, 16985, 
16986, 16986, 16986, 16986, 16986, 16986, 16987, 16987, 16987, 
16987, 16988, 16988, 16988, 16988, 16988, 16989, 16989, 16989, 
16989), class = "Date"), media = c("Newspapers", "Newspapers", 
"Newspapers", "Newspapers", "Magazines", "Magazines", "Magazines", 
"Outdoor", "Outdoor", "Outdoor", "Outdoor", "Outdoor", "Outdoor", 
"Radio", "Radio", "Radio", "Radio", "Radio", "Radio", "Radio", 
"Radio", "Radio", "Radio", "Radio", "Radio", "Radio", "Radio", 
"Radio", "Radio", "Radio", "Radio", "Newspapers", "Magazines", 
"Radio", "Radio", "Radio", "Radio", "Radio", "Radio", "Radio", 
"Radio", "Radio", "Radio", "Radio", "Radio", "Radio", "Newspapers", 
"Newspapers", "Magazines", "Radio"), brand_short = c("Brand A", 
"Brand B", "Brand C", "Brand D", "Brand E", "Brand C", "Brand D", 
"Brand A", "Brand E", "Brand B", "Brand F", "Brand C", "Brand D", 
"Brand A", "Brand E", "Brand B", "Brand F", "Brand C", "Brand D", 
"Brand A", "Brand E", "Brand B", "Brand F", "Brand C", "Brand D", 
"Brand A", "Brand E", "Brand B", "Brand F", "Brand C", "Brand D", 
"Brand C", "Brand E", "Brand A", "Brand E", "Brand F", "Brand C", 
"Brand A", "Brand E", "Brand F", "Brand C", "Brand A", "Brand E", 
"Brand B", "Brand F", "Brand C", "Brand A", "Brand C", "Brand A", 
"Brand A"), label = c("ntv0_Newspapers_Brand A", "ntv0_Newspapers_Brand B", 
"ntv0_Newspapers_Brand C", "ntv0_Newspapers_Brand D", "ntv0_Magazines_Brand E", 
"ntv0_Magazines_Brand C", "ntv0_Magazines_Brand D", "ntv0_Outdoor_Brand A", 
"ntv0_Outdoor_Brand E", "ntv0_Outdoor_Brand B", "ntv0_Outdoor_Brand F", 
"ntv0_Outdoor_Brand C", "ntv0_Outdoor_Brand D", "ntv0_Radio_Brand A", 
"ntv0_Radio_Brand E", "ntv0_Radio_Brand B", "ntv0_Radio_Brand F", 
"ntv0_Radio_Brand C", "ntv0_Radio_Brand D", "ntv0_Radio_Brand A", 
"ntv0_Radio_Brand E", "ntv0_Radio_Brand B", "ntv0_Radio_Brand F", 
"ntv0_Radio_Brand C", "ntv0_Radio_Brand D", "ntv0_Radio_Brand A", 
"ntv0_Radio_Brand E", "ntv0_Radio_Brand B", "ntv0_Radio_Brand F", 
"ntv0_Radio_Brand C", "ntv0_Radio_Brand D", "ntv0_Newspapers_Brand C", 
"ntv0_Magazines_Brand E", "ntv0_Radio_Brand A", "ntv0_Radio_Brand E", 
"ntv0_Radio_Brand F", "ntv0_Radio_Brand C", "ntv0_Radio_Brand A", 
"ntv0_Radio_Brand E", "ntv0_Radio_Brand F", "ntv0_Radio_Brand C", 
"ntv0_Radio_Brand A", "ntv0_Radio_Brand E", "ntv0_Radio_Brand B", 
"ntv0_Radio_Brand F", "ntv0_Radio_Brand C", "ntv0_Newspapers_Brand A", 
"ntv0_Newspapers_Brand C", "ntv0_Magazines_Brand A", "ntv0_Radio_Brand A"
), var1 = c(2, 3, 2, 1, 6, 1, 2, 0, 0, 0, 0, 0, 0, 451, 6, 468, 
921, 800, 258, 36, 14, 546, 1316, 749, 264, 36, 15, 608, 918, 
1097, 265, 1, 1, 25, 4, 3, 1129, 25, 4, 3, 981, 46, 5, 552, 4, 
989, 1, 1, 1, 65), var2 = c(9240, 41030, 146280, 45000, 792500, 
151900, 115483, 302125, 4034, 555326, 2661, 5806, 48268, 473789, 
60340, 237996, 484655, 415685, 338373, 6185, 39760, 149848, 315474, 
210416, 159881, 5577, 24150, 100646, 174513, 317148, 106100, 
98280, 164450, 3730, 17990, 160, 941389, 3726, 18060, 152, 765857, 
14152, 34790, 354024, 862, 872216, 6240, 16450, 1870, 20458)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -50L))

我想为 `data`、`media` 和 `brand_short` 的所有组合生成行,并在缺少数据的地方用 0 填充 `var1` 和 `var2`。我还想保留 `label` 变量,您可以注意到,该变量是由 `media` 和 `brand_short` 拼接而成的。我知道 `tidyr::complete` 函数正是为此设计的,但我写不出合适的调用。我尝试过的一个:

# Attempt 1 (from the question): cross `data` with the (media, brand_short,
# label) triples, wrapped together in nesting(); fill var1/var2 with 0.
ex %>%
  complete(
    data,
    nesting(media, brand_short, label),
    fill = list(var1 = 0, var2 = 0)
  )

它没有为所有品牌生成行;下面这个调用也不行,它会产生其他不需要的标签:

# Attempt 2 (from the question): cross `data` and `media` with the
# (brand_short, label) pairs wrapped in nesting(); fill var1/var2 with 0.
ex %>%
  complete(
    data,
    media,
    nesting(brand_short, label),
    fill = list(var1 = 0, var2 = 0)
  )

如何使我的数据框在每个日期都包含 `media` 和 `brand_short` 的所有组合(每个日期 24 行),同时不把 `label` 纳入组合?

1 个答案:

答案 0 :(得分:2)

您想得太复杂了。您不需要嵌套任何内容,因为您想要的是 `data`、`media` 和 `brand_short` 的所有可能组合。`label` 不应包含在 `complete` 调用中。要更新标签,请在之后使用 `mutate` 语句。

# Expand over data x media x brand_short with no nesting, zero-filling
# var1/var2 on the newly created rows, then rebuild `label` for every row from
# media and brand_short so the added rows get a label too.
ex %>%
  complete(data, media, brand_short, fill = list(var1 = 0, var2 = 0)) %>%
  mutate(label = paste("ntv0", media, brand_short, sep = "_"))
# A tibble: 168 x 6
   data       media      brand_short label                    var1   var2
   <date>     <chr>      <chr>       <chr>                   <dbl>  <dbl>
 1 2016-07-01 Magazines  Brand A     ntv0_Magazines_Brand A      0      0
 2 2016-07-01 Magazines  Brand B     ntv0_Magazines_Brand B      0      0
 3 2016-07-01 Magazines  Brand C     ntv0_Magazines_Brand C      1 151900
 4 2016-07-01 Magazines  Brand D     ntv0_Magazines_Brand D      2 115483
 5 2016-07-01 Magazines  Brand E     ntv0_Magazines_Brand E      6 792500
 6 2016-07-01 Magazines  Brand F     ntv0_Magazines_Brand F      0      0
 7 2016-07-01 Newspapers Brand A     ntv0_Newspapers_Brand A     2   9240
 8 2016-07-01 Newspapers Brand B     ntv0_Newspapers_Brand B     3  41030
 9 2016-07-01 Newspapers Brand C     ntv0_Newspapers_Brand C     2 146280
10 2016-07-01 Newspapers Brand D     ntv0_Newspapers_Brand D     1  45000
# ... with 158 more rows