每个用户最常保留的尺码

时间:2015-03-02 15:12:29

标签: r

遗憾的是,新的一个月又带来了一些我无法独自解决的新问题——我想列出每个用户最常退回的尺码 [returnShipment = 1]。如果有 2 个或更多尺码的出现次数相同,则应写入“-”。

我已经这样尝试过,但仍然卡在如何解决它上 ;)

# Asker's attempt at a per-customer "most frequent size" lookup.
# NOTE(review): tapply() takes (X, INDEX, FUN, ...), so `returnShipment` sits in
# the FUN position here and the anonymous function is passed as an extra
# argument -- this call does not appear to run as written.
tmp <- with(DB, tapply(size, customerID, returnShipment, function(x) {
  # Count how often each size occurs in this group.
  tbl <- table(x)
  # NOTE(review): `returnShipment` is not an argument of this function, and the
  # DB defined further down has an `ItemReturned` column, not `returnShipment`.
  most <- which(tbl == max(tbl) & returnShipment == "No")
  # More than one size at the maximum count is reported as '-'.
  if (length(most) > 1) return('-') else return(names(tbl)[most])
}))
# Index the per-customer result with each row's customerID value.
DB$mostKeptSize <- tmp[DB$customerID]

预期结果(已更新为统计退回的商品而非保留的商品 [ItemReturned == "Yes"]):

# Expected per-row result: "-" marks a tie between returned sizes and
# "retained all items" marks a customer who returned nothing.
mostreturnedSize <- c(
  "-", "retained all items", "42", "-", "-",
  "42", "retained all items", "retained all items", "-", "-"
)

(用户 1:退回尺码“m”一次,退回尺码“xxl”一次
用户 2:没有退回任何商品
用户 3:退回尺码“42”一次)

数据:

# Example orders, one row per ordered item.
# `size` is stored as text: the original literal mixed strings and numbers, and
# c() coerces such a mix to character.
DB <- data.frame(
  orderID = as.numeric(1:10),
  orderDate = rep(c("1.1.14", "2.1.14"), times = c(4, 6)),
  itemID = c(2, 3, 2, 5, 12, 4, 2, 3, 1, 5),
  size = c("m", "l", "42", "xxl", "m", "42", "39", "m", "m", "44"),
  customerID = c(1, 2, 3, 1, 1, 3, 2, 2, 1, 1),
  ItemReturned = rep(c("No", "Yes", "No"), times = c(2, 3, 5))
)

希望你能告诉我什么是错的,或者告诉我另一种解决问题的可能性。

这就是我到目前为止:

# Customer x size contingency table over the rows with returnShipment == 0.
# NOTE(review): `mydata` and `returnShipment` are not defined in the data shown
# on this page (which uses `DB` and `ItemReturned`) -- confirm the intended names.
tab = table(mydata[mydata$returnShipment == 0, c("customerID", "size")])
# For each table row, the column index of the largest count; ties resolve to the
# first such column.
maxes = max.col(tab, "first")
# "-" where more than one size reaches the row maximum, otherwise the name of
# the top size. The result is then indexed by each row's customerID.
# NOTE(review): if customerID is numeric this index is positional, which only
# lines up when the table rows correspond to IDs 1..n -- verify.
ifelse(rowSums(tab == tab[cbind(seq_len(nrow(tab)), maxes)]) > 1, 
       "-", 
       colnames(tab)[maxes])[mydata$customerID]

4 个答案:

答案 0 :(得分:2)

我对这些编辑感到有些困惑,但您所描述的内容似乎是通过原始答案中的以下修改来实现的:

# Store customerID as a factor (the answer's guess at the intended type); the
# factor's levels then define the rows of the table built below.
DB$customerID <- factor(DB$customerID)

# Customer x size counts over returned items only.
tab <- table(DB[DB$ItemReturned == "Yes", c("customerID", "size")])

# Column index of each customer's largest count (first column wins ties) and
# the count found at that position.
maxes <- max.col(tab, ties.method = "first")
row_max <- tab[cbind(seq_len(nrow(tab)), maxes)]

# A customer is tied when more than one size reaches the row maximum.
is_tied <- rowSums(tab == row_max) > 1
ans <- ifelse(is_tied, "-", colnames(tab)[maxes])

# An all-zero row means the customer returned nothing.
ans[rowSums(tab) == 0] <- "retained all items"

# Expand the per-customer labels back to one value per order row.
unname(ans[DB$customerID])
#[1] "-"                  "retained all items" "42"                
# [4] "-"                  "-"                  "42"                
# [7] "retained all items" "retained all items" "-"                 
#[10] "-"

答案 1 :(得分:1)

这就是我想出来的......

# Example orders used by this answer, one row per ordered item.
# `size` is stored as text: the original literal mixed strings and numbers, and
# c() coerces such a mix to character.
DB <- data.frame(
  orderID = as.numeric(seq_len(10)),
  orderDate = c(rep("1.1.14", 4), rep("2.1.14", 6)),
  itemID = c(2, 3, 2, 5, 12, 4, 2, 3, 1, 5),
  size = c("m", "l", "42", "xxl", "m", "42", "39", "m", "m", "44"),
  customerID = c(1, 2, 3, 1, 1, 3, 2, 2, 1, 1),
  ItemReturned = c(
    rep("No", 3), rep("Yes", 2), "No", "Yes", rep("No", 3)
  )
)
# Most frequently kept size for one customer.
#
# Args:
#   ID:   a customerID value to look up.
#   data: data frame with `customerID`, `size` and `ItemReturned` columns.
#         Defaults to the global `DB`, which the earlier version read directly.
# Returns:
#   A length-one character string: the size kept most often; several sizes
#   joined by "-" when they tie for the top count; "returned all" when the
#   customer has no rows with ItemReturned == "No".
theMost <- function(ID, data = DB) {
  # which() drops NA comparisons, the same way subset() treats them.
  kept <- which(data$customerID == ID & data$ItemReturned == "No")
  # as.character() discards unused factor levels, so sizes this customer never
  # kept cannot appear in the table with a zero count.
  tbl <- table(as.character(data$size[kept]))
  # Without this guard max() on an empty table warns and returns -Inf.
  if (length(tbl) == 0L) {
    return("returned all")
  }
  paste(names(tbl)[tbl == max(tbl)], collapse = "-")
}
# Run theMost() once per distinct customer; the result is an unnamed list in
# order of first appearance of each customerID.
customer_ids <- unique(DB$customerID)
lapply(customer_ids, theMost)

答案 2 :(得分:0)

你可以尝试

# One data frame per customer.
by_customer <- split(DB, DB$customerID)

# Add a `mostKeptsize` column to each customer's rows.
labelled <- lapply(by_customer, function(x) {
  # Counts of the sizes this customer kept; factor() drops unused levels.
  kept_counts <- table(factor(x$size[x$ItemReturned == "No"]))
  # Number of sizes that share the top count.
  n_top <- sum(kept_counts == max(kept_counts))
  x$mostKeptsize <- if (n_top > 1) "-" else names(which.max(kept_counts))
  x
})

# Put the rows back in their original order.
res <- unsplit(labelled, DB$customerID)
res$mostKeptsize
#[1] "m"  "-"  "42" "m"  "m"  "42" "-"  "-"  "m"  "m" 

更新

基于新数据

 # Label every row with its customer's most frequently kept size.
 res <- unsplit(
   lapply(split(DB, DB$customerID), function(x) {
     is_kept <- x$ItemReturned == "No"
     # Customers who kept nothing get a fixed label on all of their rows.
     if (!any(is_kept)) {
       x$mostKeptsize <- rep("returned all", nrow(x))
       return(x)
     }
     # Counts of kept sizes; factor() drops unused levels.
     counts <- table(factor(x$size[is_kept]))
     x$mostKeptsize <- if (sum(counts == max(counts)) > 1) {
       "-"
     } else {
       names(which.max(counts))
     }
     x
   }),
   DB$customerID
 )

 res$mostKeptsize
 #[1] "m"            "-"            "returned all" "m"            "m"           
 #[6] "returned all" "-"            "-"            "m"            "m"          

答案 3 :(得分:0)

我使用 dplyr 来让代码更易读。函数 getMostFrequent 首先剔除已退回的商品,然后按 size 变量分组,统计每组的数量,生成一个新列来标记该尺码的数量是否为最大值,只筛选出这些行,并且只保留 size 变量。然后进行检查:如果有多行,则返回“-”,否则以字符形式返回最常见的尺码(以便将来更容易进行其他操作)。

现在说说如何应用这个函数:我首先使用 split() 把原始数据转换为一个列表,每个 customerID 对应一个元素。这样做的好处是列表元素以 customerID 命名,因此顺序并不重要。然后我使用 lapply 把函数应用于列表的每个元素,它返回一个带有命名元素的列表。接着我把结果提取到一个包含 customerID 和最常见尺码的 data.frame 中。如果你确实希望得到一个与原始数据逐行对应的向量,可以使用 dplyr 中的 left_join 把该变量按 customerID 合并回原始数据。

library(dplyr)

# Most frequently kept size within one customer's rows.
#
# Args:
#   x: data frame with `size` and `ItemReturned` columns.
# Returns:
#   A length-one character value: the size with the highest count among rows
#   where ItemReturned == "No"; "-" when several sizes share that count;
#   "returned all" when no such rows exist.
getMostFrequent <- function(x) {
  kept <- x %>%
    filter(ItemReturned == "No")
  # Without this guard max() on zero rows warns and the function returns
  # character(0), which unlist() later drops from the result.
  if (nrow(kept) == 0) {
    return("returned all")
  }
  data <- kept %>%
    group_by(size) %>%
    summarise(Count = n()) %>%
    # Flag the size(s) whose count equals the overall maximum.
    mutate(mostFrequent = Count == max(Count)) %>%
    filter(mostFrequent) %>%
    select(size)
  if (nrow(data) > 1) {
    "-"
  } else {
    # as.character() so a factor `size` comes back as plain text.
    as.character(data$size)
  }
}

# One result per customer; split() names the list elements by customerID.
j <- lapply(split(DB, DB$customerID), getMostFrequent)

# Collect the named list into a two-column summary table.
data.frame(customerID = names(j), mostCommonSize = unlist(j))

  customerID mostCommonSize
1          1              m
2          2              -
3          3             42