如何在数据框中搜索部分匹配并对另一列求和，然后返回每个匹配项的相应总和

时间：2016-04-15 14:22:02

标签： r dataframe

这是对上一个问题的跟进。

我有一个名为 df2 的数据框，其中包含变量 AllcustomerName 和 sum.of.FY.Total。“AllcustomerName”列包含所有客户的名称。我还有一个单独的列表（NeedClientInfo），其中包含我需要其信息的客户的名称。我的代码应当在数据框的“AllcustomerName”列中查找与 NeedClientInfo 列表中的字符串部分或完全匹配的所有观测，并对每个匹配项，将“sum.of.FY.Total”列中对应的数值求和。最终，我想返回每个客户的总和，以及每个客户所匹配到的字符串列表。

主数据框

dput(df2)
structure(list(Transaction.ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 
42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 
58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 
74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 
90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 
105, 106, 107, 108), AllcustomerName = structure(c(44L, 8L, 40L, 
45L, 14L, 42L, 19L, 29L, 21L, 12L, 6L, 23L, 3L, 34L, 10L, 27L, 
25L, 36L, 38L, 54L, 1L, 53L, 17L, 51L, 47L, 32L, 57L, 43L, 7L, 
39L, 45L, 13L, 41L, 18L, 28L, 20L, 11L, 5L, 22L, 2L, 33L, 9L, 
26L, 24L, 35L, 37L, 54L, 4L, 52L, 15L, 49L, 46L, 30L, 55L, 43L, 
7L, 39L, 45L, 13L, 41L, 18L, 28L, 20L, 11L, 5L, 22L, 2L, 33L, 
9L, 26L, 24L, 35L, 37L, 54L, 4L, 52L, 15L, 49L, 46L, 30L, 55L, 
43L, 7L, 39L, 45L, 13L, 41L, 18L, 28L, 20L, 11L, 5L, 22L, 2L, 
33L, 9L, 26L, 24L, 35L, 37L, 54L, 4L, 52L, 16L, 50L, 48L, 31L, 
56L), .Label = c("Bank of America ( BOA ) Op", "BMW", "BMW Motorsport", 
"BOA", "Chevy", "Chevy Inc.", "Coca Cola", "Coca Cola Ltd.", 
"Ferrari", "Ferrari Marketing", "Ford", "Ford Holdings", "Gap", 
"Gap llc.", "Giant", "Giant Corp", "Giant Foodstores", "Gucci", 
"Gucci Partners", "Hermes", "Hermes Marketing", "Honda", "Honda Parnters", 
"John Deere", "John Deere Operations", "Lamborghini", "Lamborghini Accounting", 
"Louis Vuittons", "Louis Vuittons HR", "McDonalds", "McDonalds Corp", 
"McDonalds UK", "Mercedes Benz", "Mercedes Benz inc.", "NBA", 
"NBA Analysis", "NFA ", "NFA Recruitment", "Nike", "Nike Finance", 
"North Face", "North Face LTD.", "Pepsi", "Pepsi Holdings", "Ralph Lauren", 
"Range Rover", "Range Rover Ad", "Range Rover Corp", "Safeway", 
"Safeway Corp", "Safeway Holdings", "Suntrust", "Suntrust Bank", 
"VFC Corp", "Wendys", "Wendys 2", "Wendys SNG"), class = "factor"), 
    sum.of.FY.Total = c(4916487, 4663357, 909996, 4471305, 1720676, 
    7034137, 4017939, 4524008, 5547914, 1980483, 1203141, 132121, 
    3594657, 1598713, 2182312, 4779278, 6512046, 8136679, 5655455, 
    2159191, 9360006, 156573, 9140869, 7536559, 9130948, 4669661, 
    6194570, 4272497, 4550240, 3548889, 9750697, 2088667, 4534458, 
    5476184, 4883956, 6779033, 3579352, 1011840, 4310802, 2235088, 
    3643387, 757483, 6935256, 2990853, 53131, 5585040, 7877862, 
    3665289, 1367070, 8753575, 3524916, 5680361, 5069410, 4627819, 
    4653707, 9991615, 5618644, 5070332, 772305, 1207882, 491771, 
    9741735, 3865162, 4068133, 4462921, 9125132, 9212654, 4210293, 
    8604194, 2994089, 7387356, 3862073, 2008803, 9893430, 6231332, 
    2676382, 1596216, 6576172, 8960161, 8323238, 3122570, 4532453, 
    2713177, 661403, 9725618, 2986872, 5799460, 3136023, 6345920, 
    231895, 7786946, 2341224, 4352162, 2654683, 4706294, 8396018, 
    3144172, 2100856, 2828535, 6487379, 7779024, 8635949, 441559, 
    7928063, 3935233, 3902695, 6505559, 1194013)), .Names = c("Transaction.ID", 
"AllcustomerName", "sum.of.FY.Total"), row.names = c(NA, -108L
), class = "data.frame")

客户列表

dput(NeedClientInfo)
list("Pepsi", "Coca Cola", "Nike", "Ralph Lauren", "Gap", "North Face", 
    "Gucci", "Louis Vuittons", "Hermes", "Ford", "Chevy", "Honda", 
    "BMW", "Mercedes Benz", "Ferrari", "Lamborghini", "John Deere", 
    "NBA", "NFA ", "VFC Corp", "BOA", "Suntrust", "Giant", "Safeway", 
    "Range Rover", "McDonalds", "Wendys")

library(data.table)
# Total sum.of.FY.Total per customer, keeping only the rows whose
# AllcustomerName is exactly equal to one of the names in NeedClientInfo.
# This is exact matching only: "Pepsi" selects "Pepsi" but not
# "Pepsi Holdings".
#
# Identifiers are spelled exactly as in the dput() output (R is
# case-sensitive): the columns are AllcustomerName and sum.of.FY.Total, and
# the list of names is NeedClientInfo.
#
# AllcustomerName is a factor, while %chin% is documented for character
# vectors, so it is converted with as.character() before matching.
# Note: setDT() converts df2 to a data.table by reference.
setDT(df2)[
  as.character(AllcustomerName) %chin% unlist(NeedClientInfo),
  .(Sum = sum(sum.of.FY.Total)),
  by = AllcustomerName
]

这是此问题的代码的最新版本。然而，它不考虑部分匹配。

2 个答案:

答案 0 :(得分：0)

在您的 data.table 的 i 选择中使用 %like% 代替 %chin%，这样应该可以进行部分匹配。有关 %like% 如何运作的详细信息，可以在 data.table 文档的第 44 页找到。注意 %like% 只接受单个模式；若要同时匹配多个名称，可以先用 paste(unlist(NeedClientInfo), collapse = "|") 将它们合并为一个正则表达式。

答案 1 :(得分：0)

这是您需要的吗？

# For each requested client name, find the rows of df2 whose AllcustomerName
# contains that name (partial match), then total sum.of.FY.Total over them.
#
# fixed = TRUE makes grep() treat each name as a literal substring rather
# than a regular expression, so characters such as "." or "(" in a client
# name cannot change what is matched.
l2 <- lapply(NeedClientInfo, function(client) {
  grep(client, df2$AllcustomerName, fixed = TRUE)
})

# l3 is a list of totals in the same order as NeedClientInfo; a client with
# no matching rows gets sum() of an empty selection, i.e. 0.
l3 <- lapply(l2, function(rows) sum(df2$sum.of.FY.Total[rows]))
names(l3) <- unlist(NeedClientInfo)
head(l3)
#$Pepsi
#[1] 18375144
#$`Coca Cola`
#[1] 21918389
#$Nike
#[1] 10738932
#$`Ralph Lauren`
#[1] 29017952
#$Gap
#[1] 7568520
#$`North Face`
#[1] 18575937