Question

我使用R Tools for Visual Studio指南将R模型集成到存储过程中。语法如下：

ALTER PROCEDURE [dbo].[spRegressionPeak]
AS
BEGIN


EXEC sp_execute_external_script @language = N'R'
, @script = N'
# @InputDataSet: input data frame, result of SQL query execution
# @OutputDataSet: data frame to pass back to SQL

# Test code
#library(RODBC)
# channel <- odbcDriverConnect(dbConnection)
# InputDataSet <- sqlQuery(channel,iconv(paste(readLines(''~/visual studio 2017/prod360/regressionpeak.query.sql'', encoding = ''UTF-8'', warn = FALSE), collapse=''\n''), from = ''UTF-8'', to = ''ASCII'', sub = '''') )
# odbcClose(channel)

#'' Regression Peaks
#''
#'' Runs polynomial regressions on a data table with one model for each
#'' user ID - independent variable pair. Note that independent variables
#'' are identified as all columns matching the following pattern: the
#'' letter "c" followed by a one-or-more digit number. The dependent
#'' variable is identified by its name "dv". The user ID is identified by
#'' its name "id". Also note that the regressors are the means of the
#'' original observations, grouped by \code{code}.
#''
#'' @param x Table to run regressions on
#'' @param c_means Code means table
#'' @importFrom rlang .data
#''
#'' @return Summary table where each distinct \code{code} value is
#''   represented by one row with columns for the respective standard
#''   deviations of each independent variable.
#'' @export
regression_peak <- function(x, c_means) {

df <-
dplyr::select(x, .data$id, .data$code, .data$dv) %>%
dplyr::left_join(c_means, by = "code")

id <- unique(df$id)
iv <- names(df)[stringr::str_detect(names(df), "c\\d+")]
grid <- tidyr::crossing(id, iv)

peaks <- purrr::map2_df(grid$id, grid$iv, function(i_id, i_iv) {

    x <-
    dplyr::filter(df, .data$id == i_id) %>%
    dplyr::select_at(dplyr::vars(.data$dv, i_iv)) %>%
    dplyr::rename(iv = !!i_iv)

    fit <- stats::lm(dv ~ iv + I(iv ^ 2), data = x)
    coef_a <- stats::coef(fit)["iv"]
    coef_b <- stats::coef(fit)["I(iv^2)"]

    extr <-
    tibble::tibble(
    type = c(2, 1, 1),
    iv = c(unname(-coef_a / (2 * coef_b)), min(x$iv), max(x$iv))
  ) %>%
    dplyr::mutate(y = stats::predict(fit, newdata = ., type = "response"))

    t_max <- extr$iv[extr$type == 2]

    tibble::tibble(
  id = i_id,
  iv = i_iv,
  max = dplyr::case_when(
    min(x$iv) < t_max & t_max < max(x$iv) ~ extr$iv[which.max(extr$y)],
    TRUE ~ extr$iv[which.max(extr$y[extr$type == 1])]
  ) # should be x value
)

})

tidyr::spread(peaks, .data$iv, .data$max) %>%
dplyr::select(.data$id, iv) %>%
dplyr::filter_at(dplyr::vars(dplyr::matches("c\\d+")), 
dplyr::any_vars(!is.na(.)))

}

OutputDataSet <- InputDataSet
'
, @input_data_1 = N'-- Place SQL query retrieving data for the R stored procedure here
DECLARE @cols AS NVARCHAR(MAX),
    @query  AS NVARCHAR(MAX),
    @StudyID int,
    @sStudyID VARCHAR(50)

Select a.StudyId, a.RespID, p.ProductNumber, p.ProductSequence, 
CONVERT(varchar(50),a.DateAdded,101) as StudyDate,
CONVERT(VARCHAR(15),CAST((a.DateAdded)AS TIME),100) as 
StudyTime,DATENAME(dw,a.DateAdded) as [DayOfWeek],
p.A_Value as A,p.B_Value as B,p.C_Value as C,p.D_Value AS D,p.E_Value AS 
E,p.F_Value AS F, q.QuestionNumber  
from answers a
inner join Products p on a.ProductID = p.ProductID 
inner join Questions q on a.QuestionID = q.QuestionID
where a.StudyID = @sStudyID'
--- Edit this line to handle the output data frame.
WITH RESULT SETS (([StudyID] int, [RespID] int, [ProductNumber] int, 
[ProductSequence] int, [StudyDate] date, [StudyTime] time, [DayOfWeek] 
VARCHAR(10),[QuestionNumber] int, [A] int, [B] int, [C] int, [D] int, [E] 
int, [F] INT));
END;

当我在SQL Server Management Studio中执行存储过程时，返回值= 0并且不输出任何数据。我不确定变量是否被正确声明，因为我在执行存储过程时没有提示它们。

如何修改存储过程以返回预期数据？我可以通过提供适当的学习ID来从ASP.NET调用它吗？

Answer 1

您的代码很难阅读，但对我而言，您似乎并没有声明要获取的参数。以下是如何执行此操作的示例：

DECLARE @out_val float;

exec sp_execute_external_script
  @language = N'R',
  @script = N'
    iris_dataset <- iris

    setosa <- iris[iris$Species == "setosa",]

    menSepWidth <- mean(setosa$Sepal.Width)

    iris_dataset$Sepal.Length <- iris_dataset$Sepal.Length * multiplier
    OutputDataSet <- data.frame(iris_dataset$Sepal.Length)    

  ',
  @params = N'@multiplier float, @menSepWidth float OUTPUT',
  @multiplier = 5,
  @menSepWidth = @out_val OUTPUT
  WITH RESULT SETS ((SepalLength float));

SELECT @out_val AS MeanSepWidth

当您使用sp_execute_external_script时，请查看this blog post我在哪里讨论如何处理参数等。

希望这有帮助！

如何用R和TSQL声明存储过程中的输入变量？

1 个答案: