我是R的新手,所以如果这个问题很简单,先说声抱歉。我有一个包含四列的数据集(transaction_date、product_group、unit_segment_label、amount)。我想按product_group和unit_segment_label的每种组合分别拟合线性模型:也就是说,我希望为unit_segment_label = 0且product_group = "Credit Card"(信用卡)的客户生成一组预测,并与unit_segment_label = 1且product_group = "Credit Card"的客户的预测分开。如果我一开始就把数据子集化为我想要的那一种组合,代码可以正常工作;但理想情况下,我希望用同一个代码块跑完所有组合,而不必为每种组合单独写一个子集(见下文)。
# Fit a calendar-effects linear model for ONE product group / customer segment
# and forecast a daily amount through one year past the last observed month.
temp <- fetch(input_ds)

# This filters the data to the product and customer segment
x <- subset(temp, temp$product_group %in% "Credit Card" &
              temp$unit_segment_label == 0)

# This creates a single variable based on the product group used.
v <- x$product_group[1]
# This creates a single variable based on the customer segment used.
u <- 0

# Parse the date once and derive every calendar feature from the parsed value,
# so the training features are built exactly like the forecast features below.
x$date <- as.Date(x$transaction_date)
x$year <- as.numeric(format(x$date, "%Y"))
x$month <- format(x$date, "M%m")
x$dow <- format(x$date, "DW%u")

x.lm <- lm(amount ~ year + month + dow, data = x)
x$fitted <- predict(x.lm, x)

# Last day of the month that contains the newest observation. Step from the
# first of that month: stepping by month from the 29th-31st can overflow into
# the month after next (e.g. Jan 31 -> early March).
month_start <- as.Date(format(max(x$date), "%Y-%m-01"))
last_day <- seq(from = month_start, by = "month", length.out = 2)[2] - 1

# One row per day from the first observation to one year past last_day.
new_values <- data.frame(
  date = seq(from = min(x$date), to = last_day + (365 * 1), by = "day")
)
VariablePG <- "product_group"
new_values[, VariablePG] <- v
VariableCS <- "unit_segment_label"
new_values[, VariableCS] <- u
new_values$year <- as.numeric(format(new_values$date, "%Y"))
new_values$month <- format(new_values$date, "M%m")
new_values$dow <- format(new_values$date, "DW%u")
new_values$forecasts <- predict(x.lm, new_values)

# Left-join the observed amounts onto the forecast calendar; days without an
# observation keep amount = NA.
out_cols <- c("date", "product_group", "unit_segment_label", "amount",
              "forecasts")
CC <- merge(new_values, x,
            by = c("date", "product_group", "unit_segment_label"),
            all.x = TRUE)[, out_cols, drop = FALSE]
我以为可以把它写成一个函数,再配合ddply使用,但结果并不像我期望的那样;当然,也可能是我对ddply的理解本身就有偏差。下面是我尝试用ddply编写的代码。
# Pull the source data, then keep only the rows for customer segment 0.
temp <- fetch(input_ds)
x <- subset(temp, subset = temp$unit_segment_label == 0)
#' Fit a calendar-effects model to one piece of data and forecast a year ahead.
#'
#' Written to be used as the function argument of `ddply()`, which passes each
#' piece of the split data frame as the first argument. Regresses `amount` on
#' year, month and day-of-week and predicts a value for every day from the
#' first observed date through 365 days past the end of the month containing
#' the last observed date.
#'
#' The piece must contain every calendar month and weekday that occurs in the
#' forecast window; otherwise `predict()` stops with a "new levels" error.
#'
#' @param temp Data frame piece with columns `transaction_date`,
#'   `product_group`, `unit_segment_label` and `amount`.
#' @param product_group Unused; kept so the signature stays unchanged. The
#'   product group labels are always taken from `temp`.
#' @return Data frame with columns `date`, `product_group`,
#'   `unit_segment_label`, `amount` (NA on days with no observation) and
#'   `forecasts`.
zero_tO_20_unit_function <- function(temp, product_group) {
  required <- c("transaction_date", "product_group", "unit_segment_label",
                "amount")
  missing_cols <- setdiff(required, names(temp))
  if (length(missing_cols) > 0) {
    stop("zero_tO_20_unit_function: missing column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  # Model the piece that was handed in. Reading a global data frame here would
  # make every group return the same fit.
  y <- temp
  unit_segment_label <- y$unit_segment_label[1]
  groups <- data.frame(product_group = unique(y$product_group))

  # Derive all calendar features from the parsed date so training and forecast
  # rows are encoded identically.
  y$date <- as.Date(y$transaction_date)
  y$year <- as.numeric(format(y$date, "%Y"))
  y$month <- format(y$date, "M%m")
  y$dow <- format(y$date, "DW%u")
  y.lm <- lm(amount ~ year + month + dow, data = y)

  # End of the month holding the newest observation; start from the first of
  # that month so stepping by month cannot overflow (e.g. Jan 31 -> March).
  month_start <- as.Date(format(max(y$date), "%Y-%m-01"))
  last_day <- seq(from = month_start, by = "month", length.out = 2)[2] - 1

  new_values_temp <- data.frame(
    date = seq(from = min(y$date), to = last_day + (365 * 1), by = "day"),
    unit_segment_label = unit_segment_label
  )
  # No shared columns: by = NULL yields every date for every product group.
  new_values <- merge(new_values_temp, groups, by = NULL)
  new_values$year <- as.numeric(format(new_values$date, "%Y"))
  new_values$month <- format(new_values$date, "M%m")
  new_values$dow <- format(new_values$date, "DW%u")
  new_values$forecasts <- unname(predict(y.lm, new_values))

  # all.x keeps the forecast-only days; all.y would drop them and leave only
  # the dates that were already observed.
  keys <- c("date", "product_group", "unit_segment_label")
  observed <- y[, c(keys, "amount"), drop = FALSE]
  merged <- merge(new_values, observed, by = keys, all.x = TRUE)
  merged[, c(keys, "amount", "forecasts"), drop = FALSE]
}
# Split the segment-0 subset `x` (not the unfiltered `temp`) by product group,
# so the customer-segment filter applied above is actually honoured.
all_add_on <- ddply(x, .(product_group), zero_tO_20_unit_function)
我确信一定有办法先拆分数据、对每个子集应用算法、再把结果合并起来,只是我遗漏了什么。非常感谢任何帮助和见解。
谢谢!
布赖恩
答案 0 :(得分:0)
可以考虑使用`by`:该函数按一个或多个因子对数据框进行切分,把每个子集传给指定的函数,并返回由各次调用结果(此处为数据框)组成的列表。另外,还可以用`transform`和`within`来简洁地给列赋值。
#' Fit a calendar-effects linear model to one group and forecast ahead.
#'
#' Regresses `amount` on year, month and day-of-week, then predicts a value
#' for every day from the first observed date through `horizon_days` days past
#' the end of the month containing the last observed date.
#'
#' The group must contain every calendar month and weekday that occurs in the
#' forecast window; otherwise `predict()` stops with a "new levels" error.
#'
#' @param x Data frame for a single product group / customer segment with
#'   columns `transaction_date`, `product_group`, `unit_segment_label` and
#'   `amount`.
#' @param horizon_days Number of days to forecast beyond the end of the last
#'   observed month. Defaults to 365.
#' @return Data frame with columns `date`, `product_group`,
#'   `unit_segment_label`, `amount` (NA on days with no observation) and
#'   `forecasts`.
run_model <- function(x, horizon_days = 365) {
  required <- c("transaction_date", "product_group", "unit_segment_label",
                "amount")
  missing_cols <- setdiff(required, names(x))
  if (length(missing_cols) > 0) {
    stop("run_model: missing column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  v <- x$product_group[[1]]       # product group used
  u <- x$unit_segment_label[[1]]  # customer segment used

  # Parse the date once and build every calendar feature from it, so the
  # training rows are encoded exactly like the forecast rows below.
  x$date <- as.Date(x$transaction_date)
  x$year <- as.numeric(format(x$date, "%Y"))
  x$month <- format(x$date, "M%m")
  x$dow <- format(x$date, "DW%u")
  x.lm <- lm(amount ~ year + month + dow, data = x)

  # End of the month holding the newest observation; start from the first of
  # that month so stepping by month cannot overflow (e.g. Jan 31 -> March).
  month_start <- as.Date(format(max(x$date), "%Y-%m-01"))
  last_day <- seq(from = month_start, by = "month", length.out = 2)[2] - 1

  new_values <- data.frame(
    date = seq(from = min(x$date), to = last_day + horizon_days, by = "day"),
    product_group = v,
    unit_segment_label = u
  )
  new_values$year <- as.numeric(format(new_values$date, "%Y"))
  new_values$month <- format(new_values$date, "M%m")
  new_values$dow <- format(new_values$date, "DW%u")
  # Predict only after year/month/dow exist on new_values; the model needs
  # those columns in the new data.
  new_values$forecasts <- unname(predict(x.lm, new_values))

  # Left-join observed amounts onto the forecast calendar.
  keys <- c("date", "product_group", "unit_segment_label")
  observed <- x[, c(keys, "amount"), drop = FALSE]
  merged <- merge(new_values, observed, by = keys, all.x = TRUE)
  merged[, c(keys, "amount", "forecasts"), drop = FALSE]
}
# Load the data, then fit and forecast once per combination of product group
# and customer segment.
temp <- fetch(input_ds)
group_keys <- temp[, c("product_group", "unit_segment_label")]
df_list <- by(data = temp, INDICES = group_keys, FUN = run_model)
# Stack the per-group data frames into one final data frame.
finaldf <- do.call(what = rbind, args = df_list)