Question

我的一个学生把他的R脚本连同作业一起发给了我，他显然是想把几次连续函数调用的相关输出收集到一个data.frame中。这种做法可行，但我认为这是一个非常难看的解决方案。下面是一个可运行的例子：

-- Overview: builds an inline dummy data set, derives for every row the
-- order_id of the preceding row, and returns 1 + the running sum of those
-- values per (cookie, channel).

--Create table with dummy data.
-- CTE `c`: 12 literal rows for two cookies ('hxaxlw79u' and 'hxaxlw79u2').
-- order_id is NULL except on the rows that carry 1 or 2, and the timestamps
-- are written as string literals.
-- NOTE(review): the last two 'hxaxlw79u2' rows share the same
-- order_timestamp, so ORDER BY order_timestamp alone does not fix their
-- relative order -- confirm whether a tie-breaker column is needed.
with c(cookie, channel, order_id, order_timestamp) AS(
values
('hxaxlw79u', 'price_comparison', Null, '2016-03-10 10:24:55'),
('hxaxlw79u', 'price_comparison', Null, '2016-03-10 10:24:56'),
('hxaxlw79u', 'price_comparison', Null, '2016-03-10 10:24:57'),
('hxaxlw79u', 'price_comparison', 1, '2016-03-10 10:24:58'),
('hxaxlw79u', 'price_comparison', Null, '2016-03-10 10:24:59'),
('hxaxlw79u', 'price_comparison', Null, '2016-03-10 10:25:00'),
('hxaxlw79u', 'price_comparison', Null, '2016-03-10 10:25:01'),
('hxaxlw79u', 'price_comparison', 2, '2016-03-10 10:25:02'),
('hxaxlw79u2', 'price_comparison', Null, '2016-03-10 10:25:00'),
('hxaxlw79u2', 'price_comparison', 1, '2016-03-10 10:25:01'),
('hxaxlw79u2', 'price_comparison', Null, '2016-03-10 10:25:02'),
('hxaxlw79u2', 'price_comparison', 2, '2016-03-10 10:25:02')

),

--Get a lagged table.
-- CTE `Data`: every column of `c` plus `lag`, which is the previous row's
-- order_id (NULL replaced by 0 via COALESCE) within the same
-- (cookie, channel) partition ordered by order_timestamp. The third LAG
-- argument supplies 0 when there is no previous row.
Data AS
(
SELECT      c.cookie, c.channel, c.order_id, c.order_timestamp, 
        LAG(COALESCE(c.order_id, 0), 1, 0) OVER (PARTITION BY c.cookie, c.channel   ORDER BY c.order_timestamp) as lag

FROM c
)

--Get the result
-- `result` is 1 plus the windowed SUM of `lag` over the same partition and
-- ordering as above.
-- NOTE(review): no explicit frame clause is given; this presumably relies on
-- the dialect's default frame for an ordered window (a running total that
-- treats rows with equal order_timestamp as peers) -- confirm for the target
-- database.
SELECT      d.cookie, d.channel, d.order_id, d.order_timestamp,
        1+ SUM(d.lag) OVER(PARTITION BY d.cookie, d.channel ORDER BY d.order_timestamp) as result


FROM        data d

我想这个data.frame是用来把相关结果“复制粘贴”到Excel中的，这样就不必手动逐个浏览每个CrossTable的输出……我想就如何避免这种麻烦提一些建议，但是鉴于这种数据结构，我还没有找到一个简洁的解决方案。

有什么建议吗？

Answer 1

由于data.frame本质上是一个向量列表，您可以使用lapply依次将函数应用于每个列，并将列表转换回data.frame。这将避免一些代码重复，并避免累积“rbinding”。

library("dplyr") # Using dplyr for bind_rows
library("gmodels")

# Dummy data: one binary reference column and three categorical columns,
# each holding 20 random draws taken with replacement. No seed is set, so
# the values differ from run to run.
# `replace = TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned, which would silently change the sampling.
ref <- sample(c(0, 1), size = 20, replace = TRUE)
cat1 <- sample(c("ab", "cd", "ef"), size = 20, replace = TRUE)
cat2 <- sample(c("mm", "nn"), size = 20, replace = TRUE)
cat3 <- sample(c("low", "mid", "high"), size = 20, replace = TRUE)
mytable <- data.frame(ref, cat1, cat2, cat3)

# Summarise how one categorical column relates to a reference vector.
#
# Cross-tabulates `ref` against `x` with CrossTable() (chi-squared test
# enabled) and returns one row per column of the cross table, i.e. one row
# per category found in `x`.
#
# Args:
#   x:   Vector of category labels, the same length as `ref`.
#   ref: Reference vector to compare against. Defaults to `mytable$ref`, so
#        existing calls such as lapply(mytable[-1], results_fun) behave as
#        before; pass it explicitly to use the function on other data.
#
# Returns:
#   A data.frame with automatic row names and the columns
#     a         - category label (column name of the cross table)
#     b         - second row of the column-proportion table; for a 0/1
#                 `ref` this is the share of ref == 1 within the category
#     statistic - chi-squared test statistic (repeated on every row)
#     p.value   - chi-squared test p-value (repeated on every row)
results_fun <- function(x, ref = mytable$ref) {
  xtab <- CrossTable(ref, x, chisq = TRUE, prop.r = FALSE,
                     prop.chisq = FALSE, prop.t = FALSE)
  col_props <- xtab$prop.col
  data.frame(
    # Labels come from the table's column names, so they do not depend on
    # the names surviving when a single row is extracted.
    a = colnames(col_props),
    b = unname(col_props[2, ]),
    # Named explicitly: these are the column names the result already had,
    # previously produced only implicitly from one-element list arguments.
    statistic = unname(xtab$chisq$statistic),
    p.value = xtab$chisq$p.value,
    row.names = NULL
  )
}

# Run results_fun() on every column except the reference column `ref`,
# keeping one data frame per column, then stack them into a single table.
mylist <- lapply(mytable[names(mytable) != "ref"], FUN = results_fun)
myresults <- dplyr::bind_rows(mylist)

结果如下：

Source: local data frame [8 x 4]

      a         b statistic   p.value
  (chr)     (dbl)     (dbl)     (dbl)
1    ab 0.5000000 0.3428571 0.8424604
2    cd 0.4000000 0.3428571 0.8424604
3    ef 0.5714286 0.3428571 0.8424604
4    mm 0.4444444 0.2020202 0.6530951
5    nn 0.5454545 0.2020202 0.6530951
6  high 0.3333333 1.8666667 0.3932407
7   low 0.6666667 1.8666667 0.3932407
8   mid 0.6000000 1.8666667 0.3932407

R：将连续函数调用的输出写入data.frame

1 个答案: