在一个数据框中查找字符串并将其替换为第二个数据框

时间:2015-07-02 12:38:57

标签: r

一般来说,我有两个数据框:

主要的一个:

structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3, 
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4, 
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8, 
19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 
8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4), 
    disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8, 
    167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7, 
    71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145, 
    301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95, 
    123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150, 
    150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9, 
    3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92, 
    3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76, 
    3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
    ), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 
    3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2, 
    1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14, 
    1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61, 
    19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6, 
    18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87, 
    17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
    ), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 
    0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1, 
    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 
    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c("a", "b", "c", 
    "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", 
    "p", "r", "s", "t", "u", "w", "z", "a1", "b1", "c1", "d1", 
    "e1", "f1", "g1", "h1", "i1"), carb = c("Mazda RX4", "Mazda RX4 Wag", 
    "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant", 
    "Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C", 
    "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", 
    "Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic", 
    "Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin", 
    "Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2", 
    "Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora", 
    "Volvo 142E")), .Names = c("mpg", "cyl", "disp", "hp", "drat", 
"wt", "qsec", "vs", "am", "gear", "carb"), row.names = c(NA, 
-32L), class = "data.frame")

表名:

    structure(list(car = structure(c(18L, 18L, 19L, 19L, 5L, 13L, 
14L, 31L, 7L, 21L, 20L, 22L, 23L, 24L, 25L, 26L, 2L, 15L, 4L, 
9L, 12L, 29L, 30L, 6L, 1L, 3L, 27L, 10L, 28L, 16L, 11L, 11L, 
8L, 17L, 32L, 32L), .Label = c("AMC Javelin", "Cadillac Fleetwood", 
"Camaro Z28", "Chrysler Imperial", "Datsun 710", "Dodge Challenger", 
"Duster 360", "Ferrari Dino", "Fiat 128", "Fiat X1-9", "Ford Pantera L", 
"Honda Civic", "Hornet 4 Drive", "Hornet Sportabout", "Lincoln Continental", 
"Lotus Europa", "Maserati Bora", "Mazda RX4", "Mazda RX4 Wag", 
"Merc 230", "Merc 240D", "Merc 280", "Merc 280C", "Merc 450SE", 
"Merc 450SL", "Merc 450SLC", "Pontiac Firebird", "Porsche 914-2", 
"Toyota Corolla", "Toyota Corona", "Valiant", "Volvo 142E"), class = "factor"), 
    owner = structure(c(15L, 25L, 9L, 8L, 4L, 7L, 31L, 22L, 17L, 
    6L, 12L, 32L, 12L, 19L, 1L, 3L, 11L, 20L, 26L, 27L, 24L, 
    29L, 16L, 1L, 2L, 13L, 9L, 26L, 23L, 10L, 18L, 14L, 30L, 
    28L, 5L, 21L), .Label = c("Amid", "Armin", "Crane", "Dietmar", 
    "Gared", "Gratea", "Hank", "Hannea", "Hans", "Heta", "Horse_with_no_name", 
    "Jeff", "Krea", "Marea", "Mark", "Mattheus", "Micha", "Miko", 
    "Myrcella", "Neil", "Nina", "Peter", "Rene", "Robert", "Steffan", 
    "Tim", "Timon", "Timothy", "Uwe", "Vincent", "Wolfram", "Yena"
    ), class = "factor")), .Names = c("car", "owner"), row.names = c(NA, 
-36L), class = "data.frame")

所以我想在我的主数据框中添加额外的列,其名称为owner。您可以在第二个数据中找到每辆汽车的owners

BUT:

正如您已经注意到,有些汽车可能拥有不同的车主。我不想在我的主数据框中创建额外的行,所以我想将两个/三个或更多的所有者放在同一行中,但是以逗号分隔。

3 个答案:

答案 0 :(得分:3)

我会让你的第二个数据帧成为一个数据表(我称之为dt2)然后你可以这样做:

dt2 <- dt2[ , list( 'owner' = toString(owner)) , by = car ]

然后将其加入到您的第一个数据表中。

编辑:我喜欢@akrun

添加“toString”

答案 1 :(得分:3)

尝试

library(data.table)
res <- setDT(df1, key='car')[df2][,c(.SD[1L],
           list(ownerN=toString(owner))), car][,owner:=NULL]
head(res,2)
#             car mpg cyl disp  hp drat    wt  qsec vs am gear carb
#1:     Mazda RX4  21   6  160 110  3.9 2.620 16.46  0  1    4    4
#2: Mazda RX4 Wag  21   6  160 110  3.9 2.875 17.02  0  1    4    4
#          ownerN
#1: Mark, Steffan
#2:  Hans, Hannea

或者选择paste'df2'中'car'的'owner'并加入'df1'

 dt2 <- setDT(df2)[, .(owner=toString(owner)), by= car]
 setkey(setDT(df1), car)[dt2]

数据

df1 <- cbind(mtcars, car=row.names(mtcars))
row.names(df1) <- NULL
df2[] <- lapply(df2, as.character)

答案 2 :(得分:2)

我不知道为什么你会这样做但是如果你想重新组合同一行中的所有者并用逗号分隔它们:

library(dplyr)
df2 %>% 
  group_by(car) %>% 
  summarise(owner = paste(owner, collapse = ", ")) %>% 
  inner_join(df1, by = c("car" = "carb"))

# Which gives for example :
        car         owner mpg cyl disp  hp drat   wt  qsec vs am gear
1 Mazda RX4 Mark, Steffan  21   6  160 110  3.9 2.62 16.46  0  1    4