Question

这是一个相当简单的任务，但我试图围绕如何使用数据框与键和值匹配值。我尝试过合并，但由于行数不同，我不确定这是否合适。

我是否可以编写一个for循环来遍历输入数据框中的每个键，如果它是查找表中的一个，则更改Product的值？

基本上，我的数据如下所示：

input_key <- c(9061,8680,1546,5376,9550,9909,3853,3732,9209) input_product <- c("Water", "Bread", NA, "Chips", "Chicken", NA, "Chocolate", "Donuts", "Juice") input <- as.data.frame(cbind(input_key, input_product))

我想用相应查找表中的Product值替换NA：

lookup_key <- c(1245,1546, 7764, 9909)
lookup_product <- c("Ice Cream","Soda", "Bacon","Cheese")
lookup_data <- as.dataframe(cbind(lookup_key, lookup_product))

最后，我希望最终的数据框看起来像这样：

output_key <- c(9061,8680,1546,5376,9550,9909,3853,3732,9209)
output_product <- c("Water", "Bread", "Soda", "Chips", "Chicken", "Cheese", Chocolate","Donuts", "Juice")
output_data <- as.data.frame(cbind(output_key, output_product))

Answer 1

非常累，所以这很笨拙，但它应该适用于所提供的数据（尽管你的输出样本是错误的）：

require(dplyr)

rbind(input[!is.na(input$input_product),],
    inner_join(lookup_data,input,by=c("lookup_key"="input_key")) %>% 
    select(lookup_key,lookup_product) %>%
    rename(input_product = lookup_product, input_key = lookup_key))

Answer 2

使用 data.table 包可以轻松完成此操作，如下所示：

# load sample data
input_data <- structure(list(
    input_key = 
        structure(c(6L, 5L, 1L, 4L, 8L, 9L, 
                    3L, 2L, 7L), 
                  .Label = c("1546", "3732", "3853", "5376", "8680", 
                             "9061", "9209", "9550", "9909"), class = "factor"), 
    input_product = structure(c(7L, 1L, NA, 3L, 2L, NA, 4L, 5L, 6L), 
                              .Label = c("Bread", "Chicken", "Chips", "Chocolate", 
                                         "Donuts", "Juice", "Water"), class = "factor")), 
    .Names = c("input_key", 
               "input_product"), 
    row.names = c(NA, -9L), class = "data.frame")

lookup_data <- structure(list(
    lookup_key = structure(1:4, 
                           .Label = c("1245", "1546", "7764", "9909"), class = "factor"), 
    lookup_product = structure(c(3L, 
                                 4L, 1L, 2L), .Label = c("Bacon", "Cheese", "Ice Cream", "Soda"
                                 ), class = "factor")), .Names = c("lookup_key", "lookup_product"
                                 ), row.names = c(NA, -4L), class = "data.frame")

# convert to data.table and add keys for merging
library(data.table)
input <- data.table(input_data, key = 'input_key')
lookup <- data.table(lookup_data, key = 'lookup_key')

# merge the data (can use merge method as well)
DT <- lookup[input]

# where the input_product is NA, replace with lookup
DT[is.na(input_product), input_product := lookup_product]
print(DT)

# you can now get rid of lookup_product column, if you like
DT[, lookup_product:= NULL]
print(DT)

以上的最终结果是：

> print(DT)
   lookup_key input_product
1:       1546          Soda
2:       3732        Donuts
3:       3853     Chocolate
4:       5376         Chips
5:       8680         Bread
6:       9061         Water
7:       9209         Juice
8:       9550       Chicken
9:       9909        Cheese

Answer 3

选项1 ：使用R-base函数：

矢量解决方案：

input$input_product[input$input_key %in% lookup_data$lookup_key == TRUE] <- 
    lookup_product[lookup_data$lookup_key %in% input$input_key == TRUE]

注意： ==TRUE是多余的，只是为了更好地理解而添加。

使用lapply功能：

idx <- input$input_key %in% lookup_data$lookup_key
lapply((1:nrow(input)),
    function(i) {
        if (idx[i] == TRUE) {
            jdx <- lookup_data$lookup_key %in% input$input_key[i]
            input$input_product[i] <<- lookup_data$lookup_product[jdx == TRUE]
        }
    }
)

注意：注意全局分配操作（<<）

使用for循环：

idx <- input$input_key %in% lookup_data$lookup_key
for (i in (1:nrow(input))) {
    if (idx[i] == TRUE) {
        jdx <- lookup_data$lookup_key %in% input$input_key[i]
        input$input_product[i] <- lookup_data$lookup_product[jdx == TRUE]
    }
}

注意：这里我们只需要一个简单的作业。

在上述情况下，您需要创建设置输入参数的数据框：stringsAsFactors为FALSE，例如：

input <- as.data.frame(cbind(input_key, input_product), stringsAsFactors = FALSE)
lookup_data <- as.data.frame(cbind(lookup_key, lookup_product), stringsAsFactors = FALSE)

然后你得到输出：

> input
  input_key input_product
1      9061         Water
2      8680         Bread
3      1546          Soda
4      5376         Chips
5      9550       Chicken
6      9909        Cheese
7      3853     Chocolate
8      3732        Donuts
9      9209         Juice
>

选项2 ：使用data.table包

我使用内部联接找到了优雅的解决方案：

require(data.table)
setkey(input,input_key)
setkey(lookup_data,lookup_key)
> setDT(input)[setDT(lookup_data), input_product := i.lookup_product, nomatch=0][]
 input_key input_product
1:      1546          Soda
2:      3732        Donuts
3:      3853     Chocolate
4:      5376         Chips
5:      8680         Bread
6:      9061         Water
7:      9209         Juice
8:      9550       Chicken
9:      9909        Cheese
>

data.table实际上对数据集操作非常强大。我们来解释背后的语法：

setDT：通过引用（不发生复制）将数据框转换为data.table，因为原始数据集不是data.table类，这是通往即时转换它们。请注意，现在没有必要使用属性stringsAsFactors，因为对于data.table，其默认值为FALSE。
input[lookup_data, nomatch=0]：使用data.table包创建内部联接的方式（请参阅此link）。这意味着拦截两个表。值为no match的{{1}}选项表示不会为该行的i返回任何行（在我们的示例中为0）。

这将是输出：

lookup_data

> setDT(input)[setDT(lookup_data), nomatch=0][] input_key input_product lookup_product 1: 1546 NA Soda 2: 9909 NA Cheese >：从外部分配列数据集，内部数据集的值。
input_product := i.lookup_product：打印结果（用于验证解决方案目的）

有关[]的更多信息，我建议您阅读包documentation，它附带了许多示例。在R中运行以下命令（加载data.table包之后）：

也是一个好主意

data.table

它提供了50多个示例（包文档中的相同内容）及其相应的结果，以了解此包的不同用途。

<强>性能

让我们在性能方面比较所有可能的替代方案。然后我们需要修改用于增加其大小的输入数据集：

example(data.table)

将所有不同的替代品包装到相应的给定函数中。我已经包括在内 @count

提出的

rep.num <- 1000
input_key <- rep(c(9061,8680,1546,5376,9550,9909,3853,3732,9209),rep.num)
input_product <- rep(c("Water", "Bread", NA, "Chips", "Chicken", NA, "Chocolate", 
    "Donuts", "Juice"),rep.num)
input <- as.data.frame(cbind(input_key, input_product), stringsAsFactors=F)

解决方案

dplyr

现在测试每个解决方案（仔细检查）。

复制输入数据集，因为vectSol <- function(input, lookup_data) { input$input_product[input$input_key %in% lookup_data$lookup_key == TRUE] <- lookup_product[lookup_data$lookup_key %in% input$input_key == TRUE] return(input) } lapplySol <- function(input, lookup_data) { idx <- input$input_key %in% lookup_data$lookup_key lapply((1:nrow(input)), function(i) { if (idx[i] == TRUE) { jdx <- lookup_data$lookup_key %in% input$input_key[i] input$input_product[i] <<- lookup_data$lookup_product[jdx == TRUE] } } ) return(input) } forSol <- function(input, lookup_data) { idx <- input$input_key %in% lookup_data$lookup_key for (i in (1:nrow(input))) { if (idx[i] == TRUE) { jdx <- lookup_data$lookup_key %in% input$input_key[i] input$input_product[i] <- lookup_data$lookup_product[jdx == TRUE] } } return(input) } dataTableSol <- function (input, lookup_data) { setkey(input,input_key) setkey(lookup_data,lookup_key) input[lookup_data, input_product := i.lookup_product, nomatch=0] return(input) } dplyrSol <- function(input, lookup_data) { rbind(input[!is.na(input$input_product),], inner_join(lookup_data,input,by=c("lookup_key"="input_key")) %>% select(lookup_key,lookup_product) %>% rename(input_product = lookup_product, input_key = lookup_key)) return(input) }通过引用操作。我们需要从头开始创建一个副本。

data.table

我们使用包input.copy <- setDT(as.data.frame(cbind(input_key, input_product), stringsAsFactors=F)) lookup_data.copy<- setDT(as.data.frame(cbind(lookup_key, lookup_product), stringsAsFactors=F)) input1.out <- vectSol(input, lookup_data) input2.out <- lapplySol(input, lookup_data) input3.out <- forSol(input, lookup_data) input4.out <- forSol(input, lookup_data) input5.out <- dataTableSol(copy(input.copy), lookup_data.copy)，因为compare无法比较数据框使用data.table对象，因为属性值，所以我们需要一个仅检查值的比较。

all.equal

现在让我们使用library(compare) OK <- all( all.equal(input1.out, input2.out) && all.equal(input1.out, input3.out) && all.equal(input1.out, input4.out) && compare(input1.out[order(input1.out$input_key),], input5.out, ignoreAttrs=T)$result ) try(if(!OK) stop("Result are not the same for all methods"))包来比较所有解决方案的时间性能

microbenchmark

结果如下：

library(microbenchmark)
op <- microbenchmark(
    VECT = {vectSol(input, lookup_data)},
    FOR = {forSol(input, lookup_data)},
    LAPPLY = {lapplySol(input, lookup_data)},
    DPLYR = {dplyrSol(input, lookup_data)},
    DATATABLE = {dataTableSol(input.copy, lookup_data.copy)},
    times=100L)
print(op)

此外，我们可以通过以下方式绘制解决方案：

Unit: milliseconds
      expr        min         lq       mean     median         uq        max neval cld
      VECT   1.005890   1.078983   1.384964   1.108162   1.282269   6.562040   100  a 
       FOR 416.268583 438.545475 476.551526 449.679426 476.032938 740.027018   100   b
    LAPPLY 428.456092 454.664204 492.918478 464.204607 501.168572 751.786224   100   b
     DPLYR  13.371847  14.919726  16.482236  16.105815  17.086174  23.537866   100  a 
 DATATABLE   1.699995   2.059205   2.427629   2.279371   2.489406   8.542219   100  a

此订单的最佳表现是：Vectorial，data.table，dplyr，for-loop，lapply。

如何迭代向量并替换R中的值

3 个答案: