Question

我们有以这种格式的CSV文件：

id1, id2, id3, id4, id5
23,24,23,25,23
25,46,23,756,34
23,54,73,83,74
...

我们准备了一个R脚本，可以读取数据，处理数据（进行预测和预测），然后将结果写入文件。问题是，目前它只需要CSV中的1列/ id。我们最近在CSV中引入了多个列（如上所示）。我们希望脚本读取每个列，处理它，然后将预测存储在输出CSV中，然后对其他列执行相同操作。我们的脚本是：

library("forecast")
data = read.csv("data.csv")
seasonal_per <- msts(data,seasonal.periods=c(24,168))
best_model <- tbats(seasonal_per)
fcst <- forecast.tbats(best_model,h=24,level=90)
dfForec <- print(fcst)
result <- cbind(0:23,dfForec[, 1])
write.csv(result, file="out.csv")

由此产生的＆＃39; out.csv＆＃39;包含：

"","V1","V2"
"1",0,5080.64143403622
"2",1,5024.80341301731
"3",2,4697.62476220884
"4",3,4419.10506083084
"5",4,4262.78237536907
"6",5,4187.62903442766
"7",6,4349.19557668607
"8",7,4484.10807151227
"9",8,4247.8575479654
"10",9,3851.37930582024
"11",10,3575.95149262212
"12",11,3494.94340348126
"13",12,3501.35397669752
"14",13,3445.56274629188
"15",14,3362.23686727733
"16",15,3365.56431618894
"17",16,3573.96314478735
"18",17,3945.43879134651
"19",18,4278.44501871782
"20",19,4499.11200729996
"21",20,4574.2023320236
"22",21,4555.22528793877
"23",22,4550.89877322609
"24",23,4517.26727161547

（基于1个id的24个预测）

新脚本将逐个读取每列（不引用硬编码列名称），处理，预测，然后以此格式存储结果：

"id","V1","V2"
"id1",0,5080.64143403622
"id1",1,5024.80341301731
"id1",2,4697.62476220884
"id1",3,4419.10506083084
"id1",4,4262.78237536907
"id1",5,4187.62903442766
"id1",6,4349.19557668607
"id1",7,4484.10807151227
"id1",8,4247.8575479654
"id1",9,3851.37930582024
"id1",10,3575.95149262212
"id1",11,3494.94340348126
"id1",12,3501.35397669752
"id1",13,3445.56274629188
"id1",14,3362.23686727733
"id1",15,3365.56431618894
"id1",16,3573.96314478735
"id1",17,3945.43879134651
"id1",18,4278.44501871782
"id1",19,4499.11200729996
"id1",20,4574.2023320236
"id1",21,4555.22528793877
"id1",22,4550.89877322609
"id1",23,4517.26727161547
"id2",0,5080.64143403622    <-- id2 predictions begin
"id2",1,5024.80341301731    <-- id2 predictions begin

更新：按照@Parfait的建议尝试了代码，但它出现以下错误：

Error in file(file, ifelse(append, "a", "w")) : 
  cannot open the connection
In addition: There were 26 warnings (use warnings() to see them)

此外，finaldf数据帧为空：

> finaldf
      <NA> V1 V2
 [1,] NULL 0  1 
 [2,] NULL 0  1 
 [3,] NULL 0  1 
 [4,] NULL 0  1 
 [5,] NULL 0  1 
 [6,] NULL 0  1 
 [7,] NULL 0  1 
 [8,] NULL 0  1 
 [9,] NULL 0  1 
[10,] NULL 0  1 
[11,] NULL 0  1 
[12,] NULL 0  1 
[13,] NULL 0  1 
[14,] NULL 0  1 
[15,] NULL 0  1 
[16,] NULL 0  1 
[17,] NULL 0  1 
[18,] NULL 0  1 
[19,] NULL 0  1 
[20,] NULL 0  1 
[21,] NULL 0  1 
[22,] NULL 0  1 
[23,] NULL 0  1 
[24,] NULL 0  1

Answer 1

考虑使用lapply()和seq_along()来检索每列的数字索引。此类索引将用于检索列的数据和名称。总之，这将产生预测值的数据帧列表。迭代完成后，运行do.call()以堆叠列表的各个数据帧：

data <- read.csv("data.csv")
data[is.na(data)] <- 0

dfList <- lapply(seq_along(data), function(i) {
    seasonal_per <- msts(data[, i], seasonal.periods=c(24,168))
    best_model <- tbats(seasonal_per)
    fcst <- forecast.tbats(best_model, h=24, level=90)
    dfForec <- print(fcst)
    result <- cbind(0:23, dfForec[, 1])
    result$id <- names(data)[i]
    names(result)[1:2] <- c("V1", "V2")

    return(result[c("id", "V1", "V2")])
})

finaldf <- do.call(rbind, dfList)
write.csv(finaldf, file = "out.csv", row.names = FALSE)

Answer 2

以下是使用lapply包中的ldply和plyr的方法：

# Make replicable example with fake data and function
d <- as.data.frame(replicate(5, rnorm(100)))
names(d) <- paste0("id", 1:5)
get_numbers <- function(x) {
  data.frame(V1 = 1:10, V2 = sample(x, 10))
}

out <- lapply(d, get_numbers)
out <- plyr::ldply(out)
out    
#    .id V1           V2
# 1  id1  1 -0.462111424
# 2  id1  2  0.431549655
# 3  id1  3  1.360865990
# ..snip..
# 9  id1  9 -0.110209805
# 10 id1 10 -0.335578166
# 11 id2  1 -0.246840243
# 12 id2  2 -0.881482663
# 13 id2  3  0.352842837
# ..snip..

要为您的任务修改此示例，您可能希望将处理步骤包装到一个函数中，例如

process <- function(x) {
  seasonal_per <- msts(x, seasonal.periods=c(24,168))
  best_model <- tbats(seasonal_per)
  fcst <- forecast.tbats(best_model,h=24,level=90)
  dfForec <- print(fcst)
  result <- cbind(0:23, dfForec[, 1])
}

然后做，例如lapply(d, process)。

如何从R中的CSV读取特定列并将结果写入文件行？

2 个答案: