不幸的是,新的一个月又带来了一个我无法独自解决的新问题 - 我想列出每个用户退回次数最多的尺码[returnShipment = 1]。如果有2个或更多尺码的出现次数相同,则应写入“-”
我已经像下面这样尝试过,但仍然卡在如何解决这个问题上 ;)
# Asker's attempt: per customer, find the most frequent size and store it on
# every row of DB.
# NOTE(review): tapply() takes (X, INDEX, FUN, ...). Here `returnShipment` sits
# in the FUN position and the anonymous function is passed as an extra
# argument, so this call does not run as intended.
tmp <- with(DB, tapply(size, customerID, returnShipment, function(x) {
# Frequency table of the sizes in this group.
tbl <- table(x)
# NOTE(review): `returnShipment` is not a column of the sample DB shown in this
# file (that one has `ItemReturned`), and it is not subset to the current
# group, so it cannot line up with `tbl`. TODO confirm intent.
most <- which(tbl == max(tbl) & returnShipment == "No")
# Several sizes tied for the maximum -> "-"; otherwise the name of that size.
if (length(most) > 1) return('-') else return(names(tbl)[most])
}))
# Look up each row's per-customer result via its customerID.
DB$mostKeptSize <- tmp[DB$customerID]
预期结果(已更新:统计的是退回的商品,而不是保留的商品 [ItemReturned == "Yes"]):
# Expected per-row result: the size each row's customer returned most often,
# "-" on a tie, "retained all items" when the customer returned nothing.
mostreturnedSize <- c(
  "-", "retained all items", "42", "-", "-",
  "42", "retained all items", "retained all items", "-", "-"
)
(用户1:退回尺码“m”一次、尺码“xxl”一次
用户2:没有退回任何商品
用户3:退回尺码“42”一次)
数据:
# Sample order data: one row per ordered item.
DB <- data.frame(
  orderID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  orderDate = c(rep("1.1.14", 4), rep("2.1.14", 6)),
  itemID = c(2, 3, 2, 5, 12, 4, 2, 3, 1, 5),
  # c() coerces mixed strings and numbers to character, so every size is
  # written as a string here.
  size = c("m", "l", "42", "xxl", "m", "42", "39", "m", "m", "44"),
  customerID = c(1, 2, 3, 1, 1, 3, 2, 2, 1, 1),
  ItemReturned = c("No", "No", rep("Yes", 3), rep("No", 5))
)
希望你能告诉我什么是错的,或者告诉我另一种解决问题的可能性。
这是我目前写出的代码:
# Count sizes per customer over the rows where returnShipment is 0
# (rows = customerID, columns = size).
tab <- table(mydata[mydata$returnShipment == 0, c("customerID", "size")])
# Column index of each row's largest count; ties go to the first column.
maxes <- max.col(tab, ties.method = "first")
# A row whose maximum count occurs in more than one column is a tie -> "-";
# otherwise use the name of the winning size. The per-customer result is then
# indexed by each row's customerID.
ifelse(
  rowSums(tab == tab[cbind(seq_len(nrow(tab)), maxes)]) > 1,
  "-",
  colnames(tab)[maxes]
)[mydata$customerID]
答案 0 :(得分:2)
我对这些编辑感到有些困惑,但您描述的需求似乎可以通过对原始答案做如下修改来实现:
# Original author's note: customerID should presumably be a factor.
DB$customerID <- factor(DB$customerID)
# Count returned items per customer and size (rows = customerID,
# columns = size).
tab <- table(DB[DB$ItemReturned == "Yes", c("customerID", "size")])
# Column index of each row's largest count; ties go to the first column.
maxes <- max.col(tab, ties.method = "first")
# A row whose maximum count appears in more than one column is a tie -> "-";
# otherwise take the name of the winning size.
ans <- ifelse(
  rowSums(tab == tab[cbind(seq_len(nrow(tab)), maxes)]) > 1,
  "-",
  colnames(tab)[maxes]
)
# Customers with no returned items at all get a dedicated label.
ans[rowSums(tab) == 0] <- "retained all items"
# Expand the per-customer result to one value per row of DB.
unname(ans[DB$customerID])
#[1] "-" "retained all items" "42"
# [4] "-" "-" "42"
# [7] "retained all items" "retained all items" "-"
#[10] "-"
答案 1 :(得分:1)
这就是我想出来的......
# Sample order data used by this answer: one row per ordered item.
DB <- data.frame(
  orderID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  orderDate = c(rep("1.1.14", 4), rep("2.1.14", 6)),
  itemID = c(2, 3, 2, 5, 12, 4, 2, 3, 1, 5),
  # c() coerces mixed strings and numbers to character, so every size is
  # written as a string here.
  size = c("m", "l", "42", "xxl", "m", "42", "39", "m", "m", "44"),
  customerID = c(1, 2, 3, 1, 1, 3, 2, 2, 1, 1),
  ItemReturned = c("No", "No", "No", "Yes", "Yes", "No", "Yes", "No", "No", "No")
)

#' Most frequently kept size(s) for one customer
#'
#' @param ID Customer ID to look up in `data$customerID`.
#' @param data Data frame with columns `customerID`, `size` and
#'   `ItemReturned`; defaults to the global `DB`, as the original did.
#' @return A single string: the size kept most often (rows where
#'   `ItemReturned == "No"`). Tied sizes are joined with "-". Returns ""
#'   when the customer kept no items.
theMost <- function(ID, data = DB) {
  is_kept <- data$customerID == ID & data$ItemReturned == "No"
  # which() drops NA comparisons, as subset() did in the original.
  kept_sizes <- as.character(data$size[which(is_kept)])
  tbl <- table(kept_sizes)
  # Guard: max() on an empty table warns and yields -Inf. Return the same
  # empty string the original produced, but without the warning.
  if (length(tbl) == 0L) {
    return("")
  }
  paste(names(tbl)[tbl == max(tbl)], collapse = "-")
}

# One result per distinct customer, in order of first appearance.
lapply(unique(DB$customerID), theMost)
答案 2 :(得分:0)
你可以尝试
# Process each customer's rows separately, then restore the original row order.
res <- unsplit(
  lapply(split(DB, DB$customerID), function(grp) {
    # Counts of the sizes this customer kept; factor() rebuilds the levels
    # from the kept values only.
    kept_counts <- table(factor(grp$size[grp$ItemReturned == "No"]))
    n_top <- sum(kept_counts == max(kept_counts))
    if (n_top > 1) {
      # Several sizes share the maximum count.
      grp$mostKeptsize <- "-"
    } else {
      grp$mostKeptsize <- names(which.max(kept_counts))
    }
    grp
  }),
  DB$customerID
)
res$mostKeptsize
#[1] "m" "-" "42" "m" "m" "42" "-" "-" "m" "m"
基于新数据
# Process each customer's rows separately, then restore the original row order.
res <- unsplit(
  lapply(split(DB, DB$customerID), function(grp) {
    is_kept <- grp$ItemReturned == "No"
    # Customers who kept nothing get a dedicated label on every row.
    if (!any(is_kept)) {
      grp$mostKeptsize <- rep("returned all", nrow(grp))
      return(grp)
    }
    # Counts of the sizes this customer kept; factor() rebuilds the levels
    # from the kept values only.
    kept_counts <- table(factor(grp$size[is_kept]))
    has_tie <- sum(kept_counts == max(kept_counts)) > 1
    grp$mostKeptsize <- if (has_tie) "-" else names(which.max(kept_counts))
    grp
  }),
  DB$customerID
)
res$mostKeptsize
#[1] "m" "-" "returned all" "m" "m"
#[6] "returned all" "-" "-" "m" "m"
答案 3 :(得分:0)
我使用 dplyr 来尝试让代码更易读。函数 getMostFrequent 首先去掉已退回的商品,然后按 size 变量分组,统计每个尺码的数量,生成一个新列用于标记该尺码的数量是否为最大值,只保留这些行,并且只保留 size 变量。然后进行检查 - 如果结果多于一行,则返回 "-",否则以字符(character)形式返回出现次数最多的尺码(以便将来更容易进行其他操作)。
现在 - 如何应用这个函数:我首先使用 split() 把原始数据转换为一个列表,每个 customerID 对应一个元素。这样做的好处是列表元素以 customerID 命名 - 顺序并不重要。我使用 lapply 将函数应用于列表的每个元素,它返回一个带有命名元素的列表。然后我把结果整理成一个包含 customerID 和最常见尺码的 data.frame。如果你确实希望得到一个与原始数据逐行对应的向量,可以使用 dplyr 中的 left_join 把该变量合并回原始数据。
library(dplyr)
#' Most frequently kept size within one customer's orders
#'
#' @param x Data frame with columns `ItemReturned` and `size`.
#' @return A length-one character vector: the size with the highest count
#'   among rows where `ItemReturned == "No"`, "-" when several sizes tie for
#'   the highest count, or `NA_character_` when there are no such rows.
getMostFrequent <- function(x) {
  kept <- x %>%
    filter(ItemReturned == "No")
  # Guard: with no kept rows max() would warn and the function would return
  # character(0), which silently drops this customer in a later unlist().
  if (nrow(kept) == 0) {
    return(NA_character_)
  }
  top <- kept %>%
    group_by(size) %>%
    summarise(Count = n()) %>%
    # Keep only the size(s) whose count equals the maximum.
    filter(Count == max(Count))
  if (nrow(top) > 1) {
    "-"
  } else {
    as.character(top$size)
  }
}
# Split the orders into one data frame per customer (list names are the
# customer IDs) and apply getMostFrequent to each piece.
j <- lapply(split(DB, DB$customerID), getMostFrequent)
# One row per customer: the ID taken from the list names, plus its result.
data.frame(customerID = names(j), mostCommonSize = unlist(j))
#   customerID mostCommonSize
# 1          1              m
# 2          2              -
# 3          3             42