R-数据框中的匹配列

时间:2018-11-08 19:39:40

标签: r dataframe

我有这个NDVI时间序列数据集,其中第一列是日期,接下来的三列是三种不同ID(59231、158157、282302)的NDVI数据

    Date X59231 X158157 X282302
1  13149     NA   0.398      NA
2  13157  0.344   0.267   0.327
3  13165     NA   0.431      NA
.  .....  .....   .....   .....  

这是 dput() 的输出:

structure(list(Date = c(13149L, 13157L, 13165L, 13173L, 13181L, 
13189L, 13197L, 13205L, 13213L, 13221L, 13229L, 13237L, 13245L, 
13253L, 13261L, 13269L, 13277L, 13285L, 13293L, 13301L, 13309L, 
13317L, 13325L, 13333L, 13341L, 13349L, 13357L, 13365L, 13373L, 
13381L, 13389L, 13397L, 13405L, 13413L, 13421L, 13429L, 13437L, 
13445L, 13453L, 13461L, 13469L, 13477L, 13485L, 13493L, 13501L, 
13509L), X59231 = c(NA, 0.344, NA, 0.398, NA, 0.587, NA, NA, 
0.451, 0.597, 0.593, 0.556, 0.559, 0.375, 0.374, 0.386, 0.425, 
0.383, 0.349, 0.315, 0.282, 0.323, 0.315, 0.359, 0.292, 0.271, 
0.297, 0.307, 0.322, 0.344, 0.297, 0.285, 0.273, 0.282, 0.281, 
0.304, 0.314, NA, 0.391, 0.601, 0.65, NA, 0.653, 0.666, 0.519, 
0.625), X158157 = c(0.398, 0.267, 0.431, NA, 0.36, 0.434, 0.434, 
0.465, 0.447, 0.521, 0.539, 0.563, 0.595, 0.541, 0.553, 0.381, 
0.533, 0.505, 0.551, NA, 0.546, 0.535, 0.523, 0.501, 0.508, 0.51, 
0.506, 0.51, 0.514, 0.526, 0.555, 0.545, 0.53, 0.539, 0.531, 
0.53, NA, 0.585, 0.597, 0.32, 0.569, 0.601, NA, NA, 0.52, 0.532
), X282302 = c(NA, 0.327, NA, 0.282, 0.26, 0.293, 0.25, 0.288, 
0.336, 0.299, 0.29, 0.28, NA, 0.305, 0.319, NA, 0.255, 0.292, 
0.294, NA, NA, 0.367, 0.331, 0.344, 0.283, 0.284, 0.291, 0.273, 
0.239, 0.285, 0.249, 0.285, 0.247, 0.288, 0.276, NA, 0.317, 0.375, 
0.38, 0.417, 0.374, 0.491, NA, NA, NA, 0.471)), class = "data.frame", row.names = c(NA, 
-46L))

我运行以下代码来平滑时间序列(消除噪声),并找到每个ID的NDVI时间序列的多个最大值和最小值。

# NOTE: the former `rm(list = ls())` was dropped. Wiping the caller's global
# environment is a destructive side effect, and every variable this script
# relies on is (re)assigned below anyway. Use a fresh R session for a clean run.

# Read the NDVI table: the first column holds the dates, every remaining
# column holds the series for one ID.
df <- read.csv("Data.csv", header = TRUE)
date_col <- df[[1]]

# Column bookkeeping: total number of columns, and the number of data columns
# once the leading date column is excluded.
num_cols <- ncol(df)
num_Dcols <- num_cols - 1


# Column-bind any number of vectors / matrices / data frames of differing
# heights into a single matrix.
#
# Each argument is first coerced with as.matrix() (so a plain vector becomes a
# one-column matrix); shorter inputs are then padded with NA rows at the bottom
# until every piece has as many rows as the tallest input.
#
# Args:
#   ...: objects accepted by as.matrix().
# Returns:
#   A matrix with max(nrow) rows whose columns are the inputs' columns in
#   order, or NULL when called with no arguments.
cbind.fill <- function(...) {
  pieces <- lapply(list(...), as.matrix)
  if (length(pieces) == 0L) {
    return(NULL)
  }
  # vapply() (unlike sapply()) always returns an integer vector here.
  target_rows <- max(vapply(pieces, nrow, integer(1)))
  padded <- lapply(pieces, function(x) {
    rbind(x, matrix(NA, target_rows - nrow(x), ncol(x)))
  })
  do.call(cbind, padded)
}

# Start from a data frame with zero rows and zero columns; result columns are
# appended to it later.
finalDF <- data.frame(matrix(nrow = 0, ncol = 0))

# Accumulator for the output column names (NULL until the first name is added)
CNames <- NULL

# Indices of the interior local maxima of `signal`: positions where the sign
# of the first difference switches from +1 to -1.
# Defined once here instead of being re-created on every loop iteration.
ts_max <- function(signal) {
  which(diff(sign(diff(signal))) == -2) + 1
}

# Indices of the interior local minima of `signal` (local maxima of -signal).
ts_min <- function(signal) {
  which(diff(sign(diff(-signal))) == -2) + 1
}

# Process every data column (column i + 1 of `df`) in turn. seq_len() yields
# an empty sequence when there are no data columns, whereas 1:0 would iterate
# over c(1, 0).
for (i in seq_len(num_Dcols)) {
  # Date column plus the i-th series, keeping only rows without NA.
  df_sub <- df[, c(1, i + 1)]
  df_removeNA <- na.omit(df_sub)

  df_date <- df_removeNA[, 1]
  df_data <- df_removeNA[, 2]

  # Append the NA-free dates and values of this series to the result.
  finalDF <- cbind.fill(finalDF, df_date)
  finalDF <- cbind.fill(finalDF, df_data)

  # Decompose the series with STL (frequency 4, periodic seasonal window) and
  # use the trend component as the smoothed signal.
  stl_1 <- stl(ts(df_data, frequency = 4), "periodic")
  trend_1 <- as.numeric(stl_1$time.series[, "trend"])

  # Positions (within the NA-free series) of the trend's turning points.
  max_1 <- ts_max(trend_1)
  min_1 <- ts_min(trend_1)

  # Append the raw values at those positions. cbind.fill() pads at the bottom,
  # so these values end up packed at the top of their columns rather than
  # aligned with the dates of the rows they came from.
  finalDF <- cbind.fill(finalDF, df_data[max_1])
  finalDF <- cbind.fill(finalDF, df_data[min_1])

  # Record the four column names produced for this series.
  series_name <- colnames(df_sub)[2]
  CNames <- c(
    CNames,
    colnames(df_sub)[1],
    series_name,
    paste0(series_name, "_Max"),
    paste0(series_name, "_Min")
  )

  # Plot the smoothed series with maxima (red) and minima (blue) marked.
  plot(df_date, trend_1, type = "l")
  abline(v = df_date[max_1], col = "red")
  abline(v = df_date[min_1], col = "blue")
}

# Label the assembled columns with the names collected in CNames
colnames(finalDF) <- CNames

# Save the result to disk as a CSV file
write.csv(finalDF, file = "finalDF_smooth.csv")

下图显示了NDVI时间序列数据第一列的所有最大值和最小值。(原文此处为图片:enter image description here)我想弄清楚的是,如何在原始(或新的)数据框中,在每个ID列旁边添加两个新列,用来存储最大值和最小值。最大值和最小值需要放在与其对应日期匹配的单元格中。换句话说,每个ID列都需要两个副本列,插入在该ID列旁边,其中除最大值和最小值以外的所有值均替换为NA;最大值和最小值都是由上面的平滑代码计算得到的。例如,我需要的最终数据框如下所示:

 Date  59231   59231_Max   59231_Min  158157   158157_Max   158157_Min  282302    282302_Max    282302_Min
13149     NA          NA          NA   0.398           NA           NA      NA            NA            NA
13157  0.344          NA          NA   0.267           NA           NA   0.327            NA            NA
13165     NA          NA          NA   0.431           NA           NA      NA            NA            NA
13173  0.398          NA          NA      NA           NA           NA   0.282            NA            NA
13181     NA          NA          NA   0.360           NA           NA   0.260            NA            NA
13189  0.587          NA          NA   0.434           NA           NA   0.293            NA         0.293
13197     NA          NA          NA   0.434           NA           NA    0.25            NA            NA
13205     NA          NA          NA   0.465           NA           NA   0.288            NA            NA
13213  0.451          NA          NA   0.447           NA           NA   0.336            NA            NA
13221  0.597          NA          NA   0.521           NA           NA   0.299         0.299            NA
  ...    ...          ..          ..     ...           ..           ..     ...           ...            ..

这就是现在的样子。

 Date  59231   59231_Max   59231_Min     Date  158157   158157_Max   158157_Min    Date  282302    282302_Max    282302_Min
13157  0.344       0.593       0.386    13149   0.398        0.595        0.533   13157   0.327         0.299         0.293
13173  0.398       0.425       0.282    13157   0.267        0.546        0.508   13173   0.282         0.331         0.255
13189  0.587       0.315       0.297    13165   0.431        0.545        0.539   13181   0.260            NA         0.285
13213  0.451       0.322       0.273    13181   0.360        0.530        0.320   13189   0.293            NA            NA
13221  0.597       0.653          NA    13189   0.434           NA           NA   13197   0.250            NA            NA
13229  0.593          NA          NA    13197   0.434           NA           NA   13205   0.288            NA            NA
13237  0.556          NA          NA    13205   0.465           NA           NA   13213   0.336            NA            NA
13245  0.559          NA          NA    13213   0.447           NA           NA   13221   0.299            NA            NA
13253  0.375          NA          NA    13221   0.521           NA           NA   13229   0.290            NA            NA
13261  0.374          NA          NA    13229   0.539           NA           NA   13237   0.280            NA            NA
.....    ...          ..          ..    .....   .....           ..           ..   .....   .....           ...            ..

注意:我必须在每次循环中剔除NA,因此代码生成的CSV文件中,每个ID都有一个各自独立的、经过筛选的日期列。而我只需要一个日期列,就像上面理想表格中那样。

在我的代码中,我开始创建一个新的数据框,并在每个循环后追加每一列,但是我不知道如何在正确的单元格中匹配最大值和最小值。现在,所有的最大值和最小值都堆积在其列的顶部。有任何想法吗?谢谢。

1 个答案:

答案 0 :(得分:0)

这个怎么样?它将添加最小和最大列。

df
# Row-wise maximum / minimum across the three ID columns (columns 2 to 4).
# pmax()/pmin() work directly on the columns, so the data frame is not coerced
# to a matrix as with apply(), and a row whose values are all NA yields NA
# instead of -Inf / Inf plus a warning.
id_cols <- unname(as.list(df[2:4]))
df$max <- do.call(pmax, c(id_cols, list(na.rm = TRUE)))
df$min <- do.call(pmin, c(id_cols, list(na.rm = TRUE)))
head(df)

结果如下:

     ID X59231 X158157 X282302   max   min
1 13149     NA   0.398      NA 0.398 0.398
2 13157  0.344   0.267   0.327 0.344 0.267
3 13165     NA   0.431      NA 0.431 0.431
4 13173  0.398      NA   0.282 0.398 0.282
5 13181     NA   0.360   0.260 0.360 0.260
6 13189  0.587   0.434   0.293 0.587 0.293

我已根据您提供的说明添加了此内容。您可以忽略以上内容:

这将产生您想要的。我只为第一列做过,但是您可以更改变量以获取其他列。

library(dplyr)

# Flag the overall minimum and maximum of X59231: each new column repeats the
# extreme value on the row(s) where it occurs and is NA everywhere else.
df2 <- as_tibble(df) %>%
  mutate(
    X59231_min = ifelse(X59231 == min(X59231, na.rm = TRUE), X59231, NA),
    X59231_max = ifelse(X59231 == max(X59231, na.rm = TRUE), X59231, NA)
  )

所以:

# Show only the row(s) where the minimum of X59231 was flagged
filter(df2, !is.na(X59231_min))

结果为:

# A tibble: 1 x 6
     ID X59231 X158157 X282302 X59231_min X59231_max
  <int>  <dbl>   <dbl>   <dbl>      <dbl>      <dbl>
1 13349  0.271    0.51   0.284      0.271         NA

并且:

# Show only the row(s) where the maximum of X59231 was flagged
filter(df2, !is.na(X59231_max))

显示:

# A tibble: 1 x 6
     ID X59231 X158157 X282302 X59231_min X59231_max
  <int>  <dbl>   <dbl>   <dbl>      <dbl>      <dbl>
1 13493  0.666      NA      NA         NA      0.666

您应该能够在其他列中做到这一点。