在csv文件中应用diff()函数,然后导出到另一个csv文件

时间:2015-11-13 19:38:24

标签: r csv

我能够从每个单独的csv文件中逐列读取数据,然后使用diff()函数计算相邻时间步之间的差值。计算完成后,再把结果写入另一个csv文件。

我期待更高级的方式来完成这项任务。有什么好主意吗?

我的数据集:

    timestamp   c1.1    c1.2    c1.3    c1.4    c1.5    c1.6    c1.7    c1.8    c1.9    c1.10   c1.11   c1.12
1   2015-08-13 18:30:00 1970.75 1973.81 1965.77 1953.62 2035.94 1959.28 1985.22 1962.63 1955.05 1952.25 2010.46 1996.06
2   2015-08-13 21:30:00 1968.87 1972.71 1963.52 1952.29 2035.03 1958.61 1984.38 1960.87 1953.44 1951.83 2008.73 1994.43
3   2015-08-14 00:30:00 1969.74 1972.39 1964.06 1952.44 2035.59 1958.40 1983.44 1961.65 1953.45 1951.61 2009.37 1994.27
4   2015-08-14 03:30:00 1970.31 1972.57 1963.91 1952.33 2035.62 1958.41 1983.63 1961.71 1953.21 1951.67 2009.35 1994.42
5   2015-08-14 06:30:00 1970.75 1971.79 1964.28 1952.28 2035.66 1958.26 1983.39 1961.54 1953.40 1951.70 2009.40 1994.05

这是我的程序:

library(data.table)

# Read the input series. In the sample data the table has a `timestamp`
# column followed by the measurement columns (c1.1 ... c1.12).
fpath_ch1 <- file.path("ch1_1h.csv")
data_ch1 <- fread(fpath_ch1, header = TRUE, sep = ",")

# Step-to-step difference of one series with a simple QA/QC filter:
# any jump whose magnitude is `limit` or more is replaced by NA.
# Returns a vector one element shorter than `x`.
qaqc_diff <- function(x, limit = 5) {
  step <- diff(x)
  ifelse(step <= -limit | step >= limit, NA, step)
}

# Apply the same computation to every measurement column instead of
# repeating it by hand for each one.
value_cols <- setdiff(names(data_ch1), "timestamp")
qaqc <- lapply(value_cols, function(col) qaqc_diff(data_ch1[[col]]))
names(qaqc) <- value_cols

# diff() drops one observation, so the first timestamp is dropped to keep
# each difference aligned with the time step it ends on.
# NOTE(review): the original only placed c1.1-c1.3 in the output although it
# computed all twelve series; all measurement columns are exported here.
dfrm <- data.frame(
  timestamp = data_ch1[["timestamp"]][-1],
  qaqc
)

write.table(dfrm, file = "z_st.csv", sep = ",", row.names = FALSE,
            col.names = TRUE)

1 个答案:

答案 0 :(得分:2)

我们可以遍历除第一列('timestamp')之外的所有列(即`lapply(data_ch1[-1], ..)`),对每一列求`diff`,再根据逻辑条件用`ifelse`把符合条件的元素替换为`NA`,最后用`cbind`把结果与数据集的第一列(去掉第一行)合并。

library(data.table)

# Convert the data.table to a plain data.frame by reference, so that the
# list-style `data_ch1[-1]` below means "all columns except the first".
setDF(data_ch1)

# For every measurement column: take the step-to-step difference and
# replace jumps of magnitude 5 or more with NA.
qaqc <- lapply(data_ch1[-1], function(x) {
  x1 <- diff(x)
  ifelse(x1 <= -5 | x1 >= 5, NA, x1)
})

# diff() shortens each series by one, so drop the first timestamp as well;
# naming the argument keeps the column called `timestamp` in the result.
dfrm <- cbind(timestamp = data_ch1[-1, 1], data.frame(qaqc))

更新

注意到数据是用fread读取的,因此它是一个data.table,我们可以改用data.table语法。我们在.SDcols中指定要处理的列,用lapply(.SD, ..)遍历这些列,计算差值(diff);如果差值小于或等于-5,或者大于或等于5,就把它替换为NA。然后用原始'timestamp'列去掉第一行后的值创建'timestamp'列。

# data.table version. `data_ch1` must still be a data.table (as returned by
# fread); if it was converted with setDF(), call setDT(data_ch1) first.
DT <- data_ch1[, lapply(.SD, function(x) {
  x1 <- diff(x)
  # NA^TRUE is NA and NA^FALSE is 1, so multiplying by this factor blanks
  # out every difference <= -5 or >= 5 and leaves the others unchanged.
  NA^(x1 <= -5 | x1 >= 5) * x1
}), .SDcols = 2:ncol(data_ch1)
][, timestamp := data_ch1[[1]][-1]]  # diff() drops one row; so does [-1]

# `timestamp` was appended as the last column; move it to the front.
DT[, c(ncol(DT), seq_len(ncol(DT) - 1L)), with = FALSE]
 #         timestamp  c1.1  c1.2  c1.3  c1.4  c1.5  c1.6  c1.7  c1.8  c1.9 c1.10 c1.11 c1.12
#1: 2015-08-13 21:30:00 -1.88 -1.10 -2.25 -1.33 -0.91 -0.67 -0.84 -1.76 -1.61 -0.42 -1.73 -1.63
#2: 2015-08-14 00:30:00  0.87 -0.32  0.54  0.15  0.56 -0.21 -0.94  0.78  0.01 -0.22  0.64 -0.16
#3: 2015-08-14 03:30:00  0.57  0.18 -0.15 -0.11  0.03  0.01  0.19  0.06 -0.24  0.06 -0.02  0.15
#4: 2015-08-14 06:30:00  0.44 -0.78  0.37 -0.05  0.04 -0.15 -0.24 -0.17  0.19  0.03  0.05 -0.37