所以我每个月都有一个很长的功能。我的目标是使用vertica能够运行用R编写的函数来创建Vertica UDF。我希望这可以通过我的公司自动化来实现。数据仓库。我已经在互联网上寻找一个类似于我但却找不到的例子。我的函数将两个数据帧作为输入,一个数据帧作为输出。以下是功能和工厂功能代码。
任何帮助将不胜感激。 谢谢, 本
TV_Attribution_Function <- function(MAP.data, tv_data) {
# Inputs are two queries
# MAP.data = visits data
# tv_data = Data with TV spots
MAP.data$Date_time <- as.POSIXct(as.character(MAP.data$Date_time), format="%F %R")
tv_data$IMPRESSIONS <- as.numeric(tv_data$IMPRESSIONS)
tv_data$Date_time <- as.POSIXct(as.character(tv_data$Date_time), format="%F %R")
missing <- tv_data[tv_data$IMPRESSIONS <= 0,]
tv_data[which(tv_data$IMPRESSIONS == 0),'IMPRESSIONS'] <- 1
#replace nas w/ 0
tv_data[is.na(tv_data)] <- 0
MAP.data[is.na(MAP.data)] <- 0
for(i in c(1:8,10:12)){
if(class(tv_data[,i])[1] == 'character'){tv_data[,i] <- as.factor(tv_data[,i])}
}
tv_data$Feed <- factor(tv_data$Feed, levels = c("",unique(tv_data$Feed)))
tv_data$SPOT_LENGTH <- as.integer(tv_data$SPOT_LENGTH)
for(i in 2:9){
if(class(MAP.data[,i])[1] == 'character'){MAP.data[,i] <- as.factor(MAP.data[,i])}
if(typeof(MAP.data[,i])[1] == 'double'){MAP.data[,i] <- as.integer(MAP.data[,i])}
}
visits_data <- MAP.data
span = 0.2
sd_x = 1.0
span_b = 0.35
sd_b = 1.0
minutes_gap = 5
days <- unique(visits_data$Day_Only, na.rm=TRUE)
final_outcome <- data.frame()
for (i in 1:length(days)){
outcome_day <- subset(visits_data, Day_Only == days[i])
outcome_2 <- outcome_day$Visits_Count
outcome_2[which(is.na(outcome_2))] <- 0
t <- 1:nrow(outcome_day)
output_set <- cbind(data.frame(outcome_day$Date_time),data.frame(outcome_day$Day_Only), data.frame(outcome_day$Visits_Minute))
v.lo <- loess(outcome_2 ~ t, span = span)
v.sd <- sd(v.lo$residuals)
baseline <- v.lo$fitted + sd_x * v.sd
direct_response <- outcome_2 - baseline
direct_response[direct_response < 0] <- 0
direct_response <- data.frame(direct_response)
colnames(direct_response) <- c('mapped_visits')
output_set <- cbind(output_set, direct_response)
final_outcome <- rbind(final_outcome, output_set)
}
colnames(final_outcome)[1] <- c("Date_time")
temp_imps <- aggregate(IMPRESSIONS ~ Date_time, data = tv_data, FUN = sum)
t_imps_plus <- merge(final_outcome, temp_imps, by = 'Date_time', all.x = TRUE, all.y = FALSE)
t_imps_plus$lagged_imps <- rep(0, nrow(t_imps_plus))
for (i in 1:nrow(t_imps_plus)){
t_imps_plus$lagged_imps[i] <- sum(t_imps_plus$IMPRESSIONS[max(1,i-minutes_gap+1):i], na.rm = TRUE)
}
tv_data$mapped_visits <- rep(0,nrow(tv_data))
for (n in (1:nrow(tv_data))){
num <- match(tv_data[n,1], t_imps_plus$Date_time)
if (is.na(num) == FALSE) {
tv_data$mapped_visits[n] <- tv_data$IMPRESSIONS[n] * minutes_gap * sum(t_imps_plus$mapped_visits[num:(num+minutes_gap-1)] / t_imps_plus$lagged_imps[num:(num+minutes_gap-1)])
} else {tv_data$mapped_visits[n] <- NA}
}
#Next for visits that resulted in a seeker signup
final_outcome <- data.frame()
# Loop for each day to create baseline visits and detect spikes
for(i in 1:length(days)){
outcome_day <- subset(visits_data, Day_Only == days[i])
outcome_2 <- outcome_day$new_seekers
outcome_2[which(is.na(outcome_2))] <- 0
t <- c(1:length(outcome_2))
output_set <- cbind(data.frame(outcome_day$Date_time),data.frame(outcome_day$Day_Only), data.frame(outcome_day$Visits_Minute))
v.lo <- loess(outcome_2 ~ t, span = span_b)
v.sd <- sd(v.lo$residuals)
baseline <- v.lo$fitted + sd_b * v.sd
#shouldnt direct response be outcome_2 - baseline, not fitted???
direct_response <- outcome_2 - v.lo$fitted
direct_response[direct_response < 0] <- 0
direct_response <- data.frame(direct_response)
colnames(direct_response) <- c('mapped_ns_visits')
output_set <- cbind(output_set, direct_response)
final_outcome <- rbind(final_outcome, output_set)
}
colnames(final_outcome)[1] <- c("Date_time")
temp_imps <- aggregate(IMPRESSIONS ~ Date_time, data = tv_data, FUN = sum)
#basically left join
t_imps_plus <- merge(final_outcome, temp_imps, by = 'Date_time', all.x = TRUE, all.y = FALSE)
t_imps_plus$lagged_imps <- rep(0, nrow(t_imps_plus))
for (i in 1:nrow(t_imps_plus)){
t_imps_plus$lagged_imps[i] <- sum(t_imps_plus$IMPRESSIONS[max(1,i-minutes_gap+1):i], na.rm = TRUE)
}
tv_data$mapped_ns_visits <- rep(0,nrow(tv_data))
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], t_imps_plus$Date_time)
if (is.na(num) == FALSE) {
tv_data$mapped_ns_visits[n] <- sum(t_imps_plus$mapped_ns_visits[num:(num+minutes_gap-1)]) * tv_data$IMPRESSIONS[n]*minutes_gap / sum(t_imps_plus$lagged_imps[num:(num+minutes_gap-1)])
} else {n}
}
final_outcome <- data.frame()
# Loop for each day to create baseline visits and detect spikes
for (i in 1:length(days)){
outcome_day <- subset(visits_data, Day_Only == days[i])
outcome_2 <- outcome_day$new_sitters
outcome_2[which(is.na(outcome_2))] <- 0
t <- c(1:nrow(outcome_day))
output_set <- cbind(data.frame(outcome_day$Date_time),data.frame(outcome_day$Day_Only), data.frame(outcome_day$Visits_Minute))
v.lo <- loess(outcome_2 ~ t, span = span_b)
v.sd <- sd(v.lo$residuals)
baseline <- v.lo$fitted + sd_b * v.sd
direct_response <- outcome_2 - baseline
direct_response[direct_response < 0] <- 0
direct_response <- data.frame(direct_response)
colnames(direct_response) <- c('mapped_np_visits')
output_set <- cbind(output_set, direct_response)
final_outcome <- rbind(final_outcome, output_set)
}
colnames(final_outcome)[1] <- c("Date_time")
#Fill in total impressions including lags
temp_imps <- aggregate(IMPRESSIONS ~ Date_time, data = tv_data, FUN = sum)
t_imps_plus <- merge(final_outcome, temp_imps, by = 'Date_time', all.x = TRUE, all.y = FALSE)
t_imps_plus$lagged_imps <- rep(0, nrow(t_imps_plus))
for (i in 1:nrow(t_imps_plus)){
t_imps_plus$lagged_imps[i] <- sum(t_imps_plus$IMPRESSIONS[max(1,i-minutes_gap+1):i], na.rm = TRUE)
}
#Add mapped visits data to the tv_data table and account for any overlapping spots using ratio of impressions
tv_data$mapped_np_visits <- rep(0,nrow(tv_data))
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], t_imps_plus$Date_time) # mapped new members when spot aired
if (is.na(num) == FALSE) {
tv_data$mapped_np_visits[n] <- tv_data$IMPRESSIONS[n] * minutes_gap *
sum(t_imps_plus$mapped_np_visits[num:(num+minutes_gap-1)] / t_imps_plus$lagged_imps[num:(num+minutes_gap-1)])
} else {n}
}
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], visits_data$Date_time)
if(is.na(num) == FALSE){
temp <- visits_data$new_seekers[num:(num+minutes_gap-1)]
tv_data$total_ns_visits[n] <- sum(temp, na.rm = T)
} else if(is.na(num) == TRUE){tv_data$total_ns_visits[n] <- 0}
}
## Add total new provider visits ####
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], visits_data$Date_time)
if(is.na(num) == FALSE){
temp <- visits_data$new_sitters[num:(num+minutes_gap-1)]
tv_data$total_np_visits[n] <- sum(temp, na.rm = T)
} else if(is.na(num) == TRUE){tv_data$total_np_visits[n] <- 0}
}
### add total day1 prems #####
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], visits_data$Date_time)
if(is.na(num) == FALSE){
temp <- visits_data$day1_premiums[num:(num+minutes_gap-1)]
tv_data$day1_premiums[n] <- sum(temp, na.rm = T)
} else if(is.na(num) == TRUE){tv_data$day1_premiums[n] <- 0}
}
# add week1 prems ###########
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], visits_data$Date_time)
if(is.na(num) == FALSE){
temp <- visits_data$week1_premiums[num:(num+minutes_gap-1)]
tv_data$week1_premiums[n] <- sum(temp, na.rm = T)
} else if(is.na(num) == TRUE){tv_data$week1_premiums[n] <- 0}
}
tv_data$attr_premiums <- tv_data$week1_premiums * tv_data$mapped_ns_visits / tv_data$total_ns_visits
return(tv_data)
}
# Factory Function
TV_Attribution_Function_Factory <- function() {
list (
name = TV_Attribution_Function
,udxtype=c("scalar")
,intype = c("any")
,outtype = c("any")
)
}