尝试在Vertica中创建R UDF

时间:2016-04-11 18:02:55

标签: r vertica udf

所以我每个月都有一个很长的功能。我的目标是使用vertica能够运行用R编写的函数来创建Vertica UDF。我希望这可以通过我的公司自动化来实现。数据仓库。我已经在互联网上寻找一个类似于我但却找不到的例子。我的函数将两个数据帧作为输入,一个数据帧作为输出。以下是功能和工厂功能代码。

任何帮助将不胜感激。 谢谢, 本

TV_Attribution_Function <- function(MAP.data, tv_data) {
    # Inputs are two queries
    # MAP.data = visits data
    # tv_data = Data with TV spots
  MAP.data$Date_time <- as.POSIXct(as.character(MAP.data$Date_time), format="%F %R")
  tv_data$IMPRESSIONS <- as.numeric(tv_data$IMPRESSIONS)
  tv_data$Date_time <- as.POSIXct(as.character(tv_data$Date_time), format="%F %R")

  missing <- tv_data[tv_data$IMPRESSIONS <= 0,]


  tv_data[which(tv_data$IMPRESSIONS == 0),'IMPRESSIONS'] <- 1

  #replace nas w/ 0
  tv_data[is.na(tv_data)] <- 0
  MAP.data[is.na(MAP.data)] <- 0



  for(i in c(1:8,10:12)){
    if(class(tv_data[,i])[1] == 'character'){tv_data[,i] <- as.factor(tv_data[,i])}
  }
  tv_data$Feed <- factor(tv_data$Feed, levels = c("",unique(tv_data$Feed)))
  tv_data$SPOT_LENGTH <- as.integer(tv_data$SPOT_LENGTH)

  for(i in 2:9){
    if(class(MAP.data[,i])[1] == 'character'){MAP.data[,i] <- as.factor(MAP.data[,i])}
    if(typeof(MAP.data[,i])[1] == 'double'){MAP.data[,i] <- as.integer(MAP.data[,i])}


  }

  visits_data <- MAP.data

  span = 0.2
  sd_x = 1.0
  span_b = 0.35
  sd_b = 1.0
  minutes_gap = 5

  days <- unique(visits_data$Day_Only, na.rm=TRUE)
  final_outcome <- data.frame()

  for (i in 1:length(days)){
    outcome_day <- subset(visits_data, Day_Only == days[i])
    outcome_2 <- outcome_day$Visits_Count
    outcome_2[which(is.na(outcome_2))] <- 0
    t <- 1:nrow(outcome_day)
    output_set <- cbind(data.frame(outcome_day$Date_time),data.frame(outcome_day$Day_Only), data.frame(outcome_day$Visits_Minute))
    v.lo <- loess(outcome_2 ~ t, span = span)
    v.sd <- sd(v.lo$residuals)
    baseline <- v.lo$fitted + sd_x * v.sd
    direct_response <- outcome_2 - baseline
    direct_response[direct_response < 0] <- 0
    direct_response <- data.frame(direct_response)
    colnames(direct_response) <- c('mapped_visits')
    output_set <- cbind(output_set, direct_response)
    final_outcome <- rbind(final_outcome, output_set)
  }
  colnames(final_outcome)[1] <- c("Date_time")



  temp_imps <- aggregate(IMPRESSIONS ~ Date_time, data = tv_data, FUN = sum)
  t_imps_plus <- merge(final_outcome, temp_imps, by = 'Date_time', all.x = TRUE, all.y = FALSE)
  t_imps_plus$lagged_imps <- rep(0, nrow(t_imps_plus))

  for (i in 1:nrow(t_imps_plus)){
    t_imps_plus$lagged_imps[i] <- sum(t_imps_plus$IMPRESSIONS[max(1,i-minutes_gap+1):i], na.rm = TRUE)
  }

  tv_data$mapped_visits <- rep(0,nrow(tv_data))

  for (n in (1:nrow(tv_data))){

    num <- match(tv_data[n,1], t_imps_plus$Date_time)
    if (is.na(num) == FALSE) {   
      tv_data$mapped_visits[n] <- tv_data$IMPRESSIONS[n] * minutes_gap * sum(t_imps_plus$mapped_visits[num:(num+minutes_gap-1)] / t_imps_plus$lagged_imps[num:(num+minutes_gap-1)])     
    } else {tv_data$mapped_visits[n] <- NA}  
  }


  #Next for visits that resulted in a seeker signup

  final_outcome <- data.frame()

  # Loop for each day to create baseline visits and detect spikes
  for(i in 1:length(days)){
    outcome_day <- subset(visits_data, Day_Only == days[i])
    outcome_2 <- outcome_day$new_seekers
    outcome_2[which(is.na(outcome_2))] <- 0
    t <- c(1:length(outcome_2))
    output_set <- cbind(data.frame(outcome_day$Date_time),data.frame(outcome_day$Day_Only), data.frame(outcome_day$Visits_Minute))
    v.lo <- loess(outcome_2 ~ t, span = span_b)
    v.sd <- sd(v.lo$residuals)
    baseline <- v.lo$fitted + sd_b * v.sd
    #shouldnt direct response be outcome_2 - baseline, not fitted???
    direct_response <- outcome_2 - v.lo$fitted
    direct_response[direct_response < 0] <- 0
    direct_response <- data.frame(direct_response)
    colnames(direct_response) <- c('mapped_ns_visits')
    output_set <- cbind(output_set, direct_response)
    final_outcome <- rbind(final_outcome, output_set)
  }
  colnames(final_outcome)[1] <- c("Date_time")


  temp_imps <- aggregate(IMPRESSIONS ~ Date_time, data = tv_data, FUN = sum)
  #basically left join
  t_imps_plus <- merge(final_outcome, temp_imps, by = 'Date_time', all.x = TRUE, all.y = FALSE)
  t_imps_plus$lagged_imps <- rep(0, nrow(t_imps_plus)) 
  for (i in 1:nrow(t_imps_plus)){
    t_imps_plus$lagged_imps[i] <- sum(t_imps_plus$IMPRESSIONS[max(1,i-minutes_gap+1):i], na.rm = TRUE)
  }


  tv_data$mapped_ns_visits <- rep(0,nrow(tv_data))
  for (n in 1:nrow(tv_data)){
    num <- match(tv_data[n,1], t_imps_plus$Date_time)
    if (is.na(num) == FALSE) {
      tv_data$mapped_ns_visits[n] <- sum(t_imps_plus$mapped_ns_visits[num:(num+minutes_gap-1)]) * tv_data$IMPRESSIONS[n]*minutes_gap / sum(t_imps_plus$lagged_imps[num:(num+minutes_gap-1)])     
    } else {n}
  }



  final_outcome <- data.frame()

  # Loop for each day to create baseline visits and detect spikes
  for (i in 1:length(days)){
    outcome_day <- subset(visits_data, Day_Only == days[i])
    outcome_2 <- outcome_day$new_sitters
    outcome_2[which(is.na(outcome_2))] <- 0
    t <- c(1:nrow(outcome_day))
    output_set <- cbind(data.frame(outcome_day$Date_time),data.frame(outcome_day$Day_Only), data.frame(outcome_day$Visits_Minute))
    v.lo <- loess(outcome_2 ~ t, span = span_b)
    v.sd <- sd(v.lo$residuals)
    baseline <- v.lo$fitted + sd_b * v.sd
    direct_response <- outcome_2 - baseline
    direct_response[direct_response < 0] <- 0
    direct_response <- data.frame(direct_response)
    colnames(direct_response) <- c('mapped_np_visits')
    output_set <- cbind(output_set, direct_response)
    final_outcome <- rbind(final_outcome, output_set)
  }
  colnames(final_outcome)[1] <- c("Date_time")

  #Fill in total impressions including lags 
  temp_imps <- aggregate(IMPRESSIONS ~ Date_time, data = tv_data, FUN = sum)
  t_imps_plus <- merge(final_outcome, temp_imps, by = 'Date_time', all.x = TRUE, all.y = FALSE)
  t_imps_plus$lagged_imps <- rep(0, nrow(t_imps_plus))
  for (i in 1:nrow(t_imps_plus)){
    t_imps_plus$lagged_imps[i] <- sum(t_imps_plus$IMPRESSIONS[max(1,i-minutes_gap+1):i], na.rm = TRUE)
  }

  #Add mapped visits data to the tv_data table and account for any overlapping spots using ratio of impressions
  tv_data$mapped_np_visits <- rep(0,nrow(tv_data))  
  for (n in 1:nrow(tv_data)){
    num <- match(tv_data[n,1], t_imps_plus$Date_time)  # mapped new members when spot aired
    if (is.na(num) == FALSE) {
      tv_data$mapped_np_visits[n] <- tv_data$IMPRESSIONS[n] * minutes_gap *
        sum(t_imps_plus$mapped_np_visits[num:(num+minutes_gap-1)] / t_imps_plus$lagged_imps[num:(num+minutes_gap-1)])     
    } else {n}
  }

  for (n in 1:nrow(tv_data)){
    num <- match(tv_data[n,1], visits_data$Date_time)

    if(is.na(num) == FALSE){
      temp <- visits_data$new_seekers[num:(num+minutes_gap-1)]
      tv_data$total_ns_visits[n] <- sum(temp, na.rm = T)

    } else if(is.na(num) == TRUE){tv_data$total_ns_visits[n] <- 0}  
  }

  ## Add total new provider visits ####

  for (n in 1:nrow(tv_data)){
    num <- match(tv_data[n,1], visits_data$Date_time)

    if(is.na(num) == FALSE){
      temp <- visits_data$new_sitters[num:(num+minutes_gap-1)]
      tv_data$total_np_visits[n] <- sum(temp, na.rm = T)

    } else if(is.na(num) == TRUE){tv_data$total_np_visits[n] <- 0}
  }


  ### add total day1 prems #####

  for (n in 1:nrow(tv_data)){
    num <- match(tv_data[n,1], visits_data$Date_time)

    if(is.na(num) == FALSE){
      temp <- visits_data$day1_premiums[num:(num+minutes_gap-1)]
      tv_data$day1_premiums[n] <- sum(temp, na.rm = T)

    } else if(is.na(num) == TRUE){tv_data$day1_premiums[n] <- 0}

  }

  # add week1 prems ###########

  for (n in 1:nrow(tv_data)){
    num <- match(tv_data[n,1], visits_data$Date_time)

    if(is.na(num) == FALSE){
      temp <- visits_data$week1_premiums[num:(num+minutes_gap-1)]
      tv_data$week1_premiums[n] <- sum(temp, na.rm = T)

    } else if(is.na(num) == TRUE){tv_data$week1_premiums[n] <- 0}

  }

  tv_data$attr_premiums <- tv_data$week1_premiums * tv_data$mapped_ns_visits / tv_data$total_ns_visits
  return(tv_data)
}


# Factory Function
TV_Attribution_Function_Factory <- function() {
list (
name = TV_Attribution_Function
,udxtype=c("scalar")
,intype = c("any")
,outtype = c("any")
)
}

0 个答案:

没有答案