Question

我有一个复杂的编码问题，希望您能提供帮助：

每个月我都必须使用更改表对查找表进行更改，该更改表概述了（1）要添加的新产品和（2）产品名称的更改。我正在尝试使此过程自动化，因为我正在处理40,000行以上的数据。这里有太多敏感数据无法显示，但是我在下面给出了一个有关数据和过程的小例子：

查找表

dput(LookupFile)
structure(list(ProductTag = structure(c(1L, 6L, 3L, 5L, 2L, 7L, 
4L), .Label = c("Confect", "Orange", "Pastries", "Root", "Sauces", 
"Spread", "White"), class = "factor"), Brand = structure(c(1L, 
1L, 1L, 2L, 2L, 3L, 4L), .Label = c("A", "B", "C", "D"), class = "factor"), 
    BrandID = c(340, 340, 340, 400, 400, 402, 403), Name = structure(c(3L, 
    4L, 1L, 7L, 2L, 5L, 6L), .Label = c("Cakes", "Carrots", "Choc", 
    "Honey", "Onions", "Potatoes", "Syrups"), class = "factor"), 
    NameID = c(200, 700, 100, 300, 1000, 1200, 1700)), class = "data.frame", row.names = c(NA, 
-7L))

变更表

dput(ChangesFiles)
structure(list(ProductTag = structure(c(1L, 3L, 2L), .Label = c("Breads", 
"Confect", "Flours"), class = "factor"), Brand = structure(c(1L, 
2L, 1L), .Label = c("A", "D"), class = "factor"), BrandID = c(340, 
403, 340), Name = structure(c(2L, 3L, 1L), .Label = c("Chocolate", 
"Gluten", "Staple"), class = "factor"), NameID = c(4000, 3400, 
200)), class = "data.frame", row.names = c(NA, -3L))

我知道rbind允许我将新数据附加到查找表上：

rbind(LookupFile,ChangesFiles)

但是，这里还多了一层复杂性：我还必须修改现有行（即产品名称）。在我给出的示例中，我想使用ChangesFiles中的记录来更改以下记录，即将Name = 'Choc'替换为'Chocolate'：

<ProductTag = 'Confect', Brand = 'A', BrandID = '340', Name = 'Choc', NameID = '200'>

这是一个小数据样本，因此我可以轻松地手动进行此更改。但是，我将如何进行大规模更改？

我希望它看起来像这样：

     ProductTag Brand BrandID   Name   NameID
1     Confect     A     340 Chocolate    200
2      Spread     A     340     Honey    700
3    Pastries     A     340     Cakes    100
4      Sauces     B     400    Syrups    300
5      Orange     B     400   Carrots   1000
6       White     C     402    Onions   1200
7        Root     D     403  Potatoes   1700
8      Breads     A     340    Gluten   4000
9      Flours     D     403    Staple   3400

Answer 1

这是一个Vlookup选项

> library(qdapTools)
> LookupFile$Name <- as.character(LookupFile$Name) # just for not dealing with factors
> replace <- as.character(LookupFile$ProductTag %l% ChangesFiles[,c(1,4)])  # here's a Vlookup
> ind <- !is.na(replace) # identifying value to replace
> LookupFile$Name[ind] <- replace[ind] # replacing
> rbind(LookupFile,ChangesFiles) # delivering your desired output
   ProductTag Brand BrandID      Name NameID
1     Confect     A     340 Chocolate    200
2      Spread     A     340     Honey    700
3    Pastries     A     340     Cakes    100
4      Sauces     B     400    Syrups    300
5      Orange     B     400   Carrots   1000
6       White     C     402    Onions   1200
7        Root     D     403  Potatoes   1700
8      Breads     A     340    Gluten   4000
9      Flours     D     403    Staple   3400
10    Confect     A     340 Chocolate    200

Answer 2

使用您的数据（将其重命名为Changes和Lookup）和tidyverse：

library(tidyverse)

bind_rows(
  anti_join(Changes, Lookup,  by = colnames(Changes)),   # changed names & added rows
  anti_join(Lookup,  Changes, by = colnames(Lookup)[-4]) # Lookup rows without changes
  ) %>%
  arrange(BrandID, NameID)

#   ProductTag Brand BrandID      Name NameID
# 1   Pastries     A     340     Cakes    100
# 2    Confect     A     340 Chocolate    200
# 3     Breads     A     340    Gluten   4000
# 4     Spread     A     340     Honey    700
# 5     Orange     B     400   Carrots   1000
# 6     Sauces     B     400    Syrups    300
# 7      White     C     402    Onions   1200
# 8       Root     D     403  Potatoes   1700
# 9     Flours     D     403    Staple   3400

Answer 3

仅从LookupFile中获取ChangesFiles中未定义的行，而从ChangesFiles中获取所有内容。

key <- c("BrandID", "NameID") #Assuming that BrandID and NameID need to be unique
rbind(LookupFile[!interaction(LookupFile[key]) %in% interaction(ChangesFiles[key]),], ChangesFiles)

   ProductTag Brand BrandID      Name NameID
2      Spread     A     340     Honey    700
3    Pastries     A     340     Cakes    100
4      Sauces     B     400    Syrups    300
5      Orange     B     400   Carrots   1000
6       White     C     402    Onions   1200
7        Root     D     403  Potatoes   1700
1      Breads     A     340    Gluten   4000
21     Flours     D     403    Staple   3400
31    Confect     A     340 Chocolate    200

如果还有要删除的内容，并且在ChangesFiles中用NA标记，则可以使用：

rbind(LookupFile[!interaction(LookupFile[key]) %in% interaction(ChangesFiles[key]),], ChangesFiles[complete.cases(ChangesFiles),])

Answer 4

data.table来帮忙！只需一行代码即可。

使用更新联接仅更新名称列。

library( data.table )
setDT(LookupFile)[ setDT(ChangesFiles), Name := i.Name, on = .(ProductTag, Brand, BrandID)][]

#    ProductTag Brand BrandID      Name NameID
# 1:    Confect     A     340 Chocolate    200
# 2:     Spread     A     340     Honey    700
# 3:   Pastries     A     340     Cakes    100
# 4:     Sauces     B     400    Syrups    300
# 5:     Orange     B     400   Carrots   1000
# 6:      White     C     402    Onions   1200
# 7:       Root     D     403  Potatoes   1700

setDT()用于将data.frame转换为data.table类。如果您的数据已经是data.table类，则

LookupFile[ChangesFiles, Name := i.Name, on = .(ProductTag, Brand, BrandID)][]

也可以。

在R中同时合并表和更改值

4 个答案: