dplyr:bind_rows()如何更改原始数据帧

时间:2017-08-31 11:02:05

标签: r dataframe dplyr

hth1是我已经拥有的数据框。

> hth1
Source: local data frame [13 x 14]
Groups: team [13]

    team   CSK    DC    DD    GL   KKR   KTK  KXIP    MI    PW   RCB  RPSG
   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1    CSK     0     8    11     0    11     2     9    10     4    10     0
2     DC     2     0     8     0     2     1     7     5     3     8     0
3     DD     5     3     0     0     7     2     8     5     2    10     2
4     GL     0     0     2     0     0     0     0     0     0     1     0
5    KKR     5     7    10     2     0     0     5    10     3    15     0
6    KTK     0     0     0     0     2     0     1     0     1     2     0
7   KXIP     8     3    10     2    14     0     0    11     2     6     1
8     MI    12     5    13     2     8     1     7     0     3    11     1
9     PW     2     1     4     0     2     0     4     3     0     1     0
10   RCB     9     3     7     2     3     0    12     8     4     0     1
11  RPSG     0     0     0     2     2     0     1     1     0     1     0
12    RR     8     2     7     0    14     1     7     6     2     7     0
13    SH     3     0     4     0     5     0     4     5     2     5     2
# ... with 2 more variables: RR <dbl>, SH <dbl>

为什么bind_rows()返回的数据帧与原始数据帧不同?

> h <- list(hth1)
> hth_b1 <- bind_rows(h)
> identical(hth1, hth_b1)
[1] FALSE
> class(hth_b1)
[1] "grouped_df" "tbl_df"     "tbl"        "data.frame"
> class(hth1)
[1] "grouped_df" "tbl_df"     "tbl"        "data.frame"
> setequal(hth1, hth_b1)
TRUE
> anti_join(hth1, hth_b1)
Joining, by = c("team", "CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI", "PW", "RCB", "RPSG", "RR", "SH")
Source: local data frame [0 x 14]
Groups: team [13]

# ... with 14 variables: team <chr>, CSK <dbl>, DC <dbl>, DD <dbl>, GL <dbl>,
#   KKR <dbl>, KTK <dbl>, KXIP <dbl>, MI <dbl>, PW <dbl>, RCB <dbl>,
#   RPSG <dbl>, RR <dbl>, SH <dbl>

我错过了什么?我被困在这里已经很久了。

更新1:

根据Benjamin的要求,我对两个数据帧都运行了dput()函数。这是输出。

> dput(hth_b1)
structure(list(team = c("CSK", "DC", "DD", "GL", "KKR", "KTK", 
"KXIP", "MI", "PW", "RCB", "RPSG", "RR", "SH"), CSK = c(0, 2, 
5, 0, 5, 0, 8, 12, 2, 9, 0, 8, 3), DC = c(8, 0, 3, 0, 7, 0, 3, 
5, 1, 3, 0, 2, 0), DD = c(11, 8, 0, 2, 10, 0, 10, 13, 4, 7, 0, 
7, 4), GL = c(0, 0, 0, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0), KKR = c(11, 
2, 7, 0, 0, 2, 14, 8, 2, 3, 2, 14, 5), KTK = c(2, 1, 2, 0, 0, 
0, 0, 1, 0, 0, 0, 1, 0), KXIP = c(9, 7, 8, 0, 5, 1, 0, 7, 4, 
12, 1, 7, 4), MI = c(10, 5, 5, 0, 10, 0, 11, 0, 3, 8, 1, 6, 5
), PW = c(4, 3, 2, 0, 3, 1, 2, 3, 0, 4, 0, 2, 2), RCB = c(10, 
8, 10, 1, 15, 2, 6, 11, 1, 0, 1, 7, 5), RPSG = c(0, 0, 2, 0, 
0, 0, 1, 1, 0, 1, 0, 0, 2), RR = c(9, 7, 9, 0, 1, 1, 8, 10, 3, 
9, 0, 0, 7), SH = c(3, 0, 4, 3, 4, 0, 4, 3, 0, 4, 0, 0, 0)), .Names = c("team", 
"CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI", "PW", "RCB", 
"RPSG", "RR", "SH"), row.names = c(NA, -13L), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), vars = list(team), indices = list(
    0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), group_sizes = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L, labels = structure(list(
    team = c("CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI", 
    "PW", "RCB", "RPSG", "RR", "SH")), row.names = c(NA, -13L
), class = "data.frame", vars = list(team), .Names = "team"))
> 
> dput(hth1)
structure(list(team = c("CSK", "DC", "DD", "GL", "KKR", "KTK", 
"KXIP", "MI", "PW", "RCB", "RPSG", "RR", "SH"), CSK = c(0, 2, 
5, 0, 5, 0, 8, 12, 2, 9, 0, 8, 3), DC = c(8, 0, 3, 0, 7, 0, 3, 
5, 1, 3, 0, 2, 0), DD = c(11, 8, 0, 2, 10, 0, 10, 13, 4, 7, 0, 
7, 4), GL = c(0, 0, 0, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0), KKR = c(11, 
2, 7, 0, 0, 2, 14, 8, 2, 3, 2, 14, 5), KTK = c(2, 1, 2, 0, 0, 
0, 0, 1, 0, 0, 0, 1, 0), KXIP = c(9, 7, 8, 0, 5, 1, 0, 7, 4, 
12, 1, 7, 4), MI = c(10, 5, 5, 0, 10, 0, 11, 0, 3, 8, 1, 6, 5
), PW = c(4, 3, 2, 0, 3, 1, 2, 3, 0, 4, 0, 2, 2), RCB = c(10, 
8, 10, 1, 15, 2, 6, 11, 1, 0, 1, 7, 5), RPSG = c(0, 0, 2, 0, 
0, 0, 1, 1, 0, 1, 0, 0, 2), RR = c(9, 7, 9, 0, 1, 1, 8, 10, 3, 
9, 0, 0, 7), SH = c(3, 0, 4, 3, 4, 0, 4, 3, 0, 4, 0, 0, 0)), .Names = c("team", 
"CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI", "PW", "RCB", 
"RPSG", "RR", "SH"), class = c("grouped_df", "tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -13L), vars = list(team), labels = structure(list(
    team = c("CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI", 
    "PW", "RCB", "RPSG", "RR", "SH")), class = "data.frame", row.names = c(NA, 
-13L), vars = list(team), drop = TRUE, .Names = "team"), indices = list(
    0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), drop = TRUE, group_sizes = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L)

两者的输出存在差异:hth1多了一个额外的属性 drop = TRUE。

我不明白为什么它不存在于另一个中。

2 个答案:

答案 0 :(得分:1)

可重现的例子:

library(tidyverse)    
test1 <- mtcars %>% group_by(cyl)    
test2 <- bind_rows(list(test1))

identical(test1, test2) #FALSE
all_equal(test1, test2) #TRUE

您可以分别检查两者的attributes,可以看到rownames不同:

rownames(test1)
 [1] "Mazda RX4"           "Mazda RX4 Wag"       "Datsun 710"         
 [4] "Hornet 4 Drive"      "Hornet Sportabout"   "Valiant"            
 [7] "Duster 360"          "Merc 240D"           "Merc 230"           
[10] "Merc 280"            "Merc 280C"           "Merc 450SE"         
[13] "Merc 450SL"          "Merc 450SLC"         "Cadillac Fleetwood" 
[16] "Lincoln Continental" "Chrysler Imperial"   "Fiat 128"           
[19] "Honda Civic"         "Toyota Corolla"      "Toyota Corona"      
[22] "Dodge Challenger"    "AMC Javelin"         "Camaro Z28"         
[25] "Pontiac Firebird"    "Fiat X1-9"           "Porsche 914-2"      
[28] "Lotus Europa"        "Ford Pantera L"      "Ferrari Dino"       
[31] "Maserati Bora"       "Volvo 142E"
rownames(test2)
 [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13"
[14] "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26"
[27] "27" "28" "29" "30" "31" "32"

永远不要指望tidyverse会保留你的rownames,它们随时可能被悄悄丢弃。

答案 1 :(得分:0)

原谅这个答案的格式,但看起来你的标签贴在一个对象上,而不是在另一个对象上。如果没有查看生成对象的代码,那么标签被附加或删除的地方就不是我能够知道的。我在下面的对象中加粗了差异。

注意:没有将其格式化为代码是有意为之。格式化为代码会使我无法用粗体文本标记结构中的差异。

dput(hth_b1) structure(list(team = c("CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI", "PW", "RCB", "RPSG", "RR", "SH"), CSK = c(0, 2, 5, 0, 5, 0, 8, 12, 2, 9, 0, 8, 3), DC = c(8, 0, 3, 0, 7, 0, 3, 5, 1, 3, 0, 2, 0), DD = c(11, 8, 0, 2, 10, 0, 10, 13, 4, 7, 0, 7, 4), GL = c(0, 0, 0, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0), KKR = c(11, 2, 7, 0, 0, 2, 14, 8, 2, 3, 2, 14, 5), KTK = c(2, 1, 2, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0), KXIP = c(9, 7, 8, 0, 5, 1, 0, 7, 4, 12, 1, 7, 4), MI = c(10, 5, 5, 0, 10, 0, 11, 0, 3, 8, 1, 6, 5), PW = c(4, 3, 2, 0, 3, 1, 2, 3, 0, 4, 0, 2, 2), RCB = c(10, 8, 10, 1, 15, 2, 6, 11, 1, 0, 1, 7, 5), RPSG = c(0, 0, 2, 0, 0, 0, 1, 1, 0, 1, 0, 0, 2), RR = c(9, 7, 9, 0, 1, 1, 8, 10, 3, 9, 0, 0, 7), SH = c(3, 0, 4, 3, 4, 0, 4, 3, 0, 4, 0, 0, 0)), .Names = c("team", "CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI", "PW", "RCB", "RPSG", "RR", "SH"), row.names = c(NA, -13L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), vars = list(team), indices = list(0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), group_sizes = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L, labels = structure(list(team = c("CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI", "PW", "RCB", "RPSG", "RR", "SH")), row.names = c(NA, -13L), class = "data.frame", vars = list(team), .Names = "team"))

dput(hth1) structure(list(team = c("CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI", "PW", "RCB", "RPSG", "RR", "SH"), CSK = c(0, 2, 5, 0, 5, 0, 8, 12, 2, 9, 0, 8, 3), DC = c(8, 0, 3, 0, 7, 0, 3, 5, 1, 3, 0, 2, 0), DD = c(11, 8, 0, 2, 10, 0, 10, 13, 4, 7, 0, 7, 4), GL = c(0, 0, 0, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0), KKR = c(11, 2, 7, 0, 0, 2, 14, 8, 2, 3, 2, 14, 5), KTK = c(2, 1, 2, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0), KXIP = c(9, 7, 8, 0, 5, 1, 0, 7, 4, 12, 1, 7, 4), MI = c(10, 5, 5, 0, 10, 0, 11, 0, 3, 8, 1, 6, 5), PW = c(4, 3, 2, 0, 3, 1, 2, 3, 0, 4, 0, 2, 2), RCB = c(10, 8, 10, 1, 15, 2, 6, 11, 1, 0, 1, 7, 5), RPSG = c(0, 0, 2, 0, 0, 0, 1, 1, 0, 1, 0, 0, 2), RR = c(9, 7, 9, 0, 1, 1, 8, 10, 3, 9, 0, 0, 7), SH = c(3, 0, 4, 3, 4, 0, 4, 3, 0, 4, 0, 0, 0)), .Names = c("team", "CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI", "PW", "RCB", "RPSG", "RR", "SH"), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA, -13L), vars = list(team), labels = structure(list(team = c("CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI", "PW", "RCB", "RPSG", "RR", "SH")), class = "data.frame", row.names = c(NA, -13L), vars = list(team), **drop = TRUE**, .Names = "team"), indices = list(0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), **drop = TRUE**, group_sizes = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L)

在下面的示例中,我将向mtcars数据框添加标签,然后通过bind_rows运行,您将看到标签不再存在。这就是我认为您的数据正在发生的事情。

library(Hmisc)
mtcars2 <- mtcars
label(mtcars2, self = FALSE) <- toupper(names(mtcars))

library(dplyr)

mtcars3 <- bind_rows(mtcars2)

identical(mtcars2, mtcars3)

label(mtcars3)