根据df1的一个变量(df1 $ var1)和df2的一个变量在df1中创建一个变量,该变量可以根据df1 $ var1进行更改。

时间:2019-05-17 16:20:19

标签: r dplyr tidyverse lubridate

我有数据框df1,该数据框总结了随时间变化的鱼类深度。 df1$Site告诉您鱼的位置,df1$Ind告诉您个人,df1$Depth告诉您特定df1$Datetime处鱼的深度。

另一方面,我有df2,它汇总了从海面到39米深度、每三小时记录一次的海流强度,每8米为一个深度区间(m0-7、m8-15、m16-23、m24-31、m32-39)。例如:

df1<-data.frame(Datetime=c("2016-08-01 15:34:07","2016-08-01 16:25:16","2016-08-01 17:29:16","2016-08-01 18:33:16","2016-08-01 20:54:16","2016-08-01 22:48:16"),Site=c("BD","HG","BD","BD","BD","BD"),Ind=c(16,17,19,16,17,16), Depth=c(5.3,24,36.4,42,NA,22.1))
df1$Datetime<-as.POSIXct(df1$Datetime, format="%Y-%m-%d %H:%M:%S",tz="UTC")

> df1
             Datetime Site Ind Depth
1 2016-08-01 15:34:07   BD  16   5.3
2 2016-08-01 16:25:16   HG  17  24.0
3 2016-08-01 17:29:16   BD  19  36.4
4 2016-08-01 18:33:16   BD  16  42.0
5 2016-08-01 20:54:16   BD  17    NA
6 2016-08-01 22:48:16   BD  16  22.1

df2<-data.frame(Datetime=c("2016-08-01 12:00:00","2016-08-01 15:00:00","2016-08-01 18:00:00","2016-08-01 21:00:00","2016-08-02 00:00:00"), Site=c("BD","BD","BD","BD","BD"),var1=c(2.75,4,6.75,2.25,4.3),var2=c(3,4,4.75,3,2.1),var3=c(2.75,4,5.75,2.25,1.4),var4=c(3.25,3,6.5,2.75,3.4),var5=c(3,4,4.75,3,1.7))
df2$Datetime<-as.POSIXct(df2$Datetime, format="%Y-%m-%d %H:%M:%S",tz="UTC")
colnames(df2)<-c("Datetime","Site","m0-7","m8-15","m16-23","m24-31","m32-39")

> df2
             Datetime Site m0-7 m8-15 m16-23 m24-31 m32-39
1 2016-08-01 12:00:00   BD 2.75  3.00   2.75   3.25   3.00
2 2016-08-01 15:00:00   BD 4.00  4.00   4.00   3.00   4.00
3 2016-08-01 18:00:00   BD 6.75  4.75   5.75   6.50   4.75
4 2016-08-01 21:00:00   BD 2.25  3.00   2.25   2.75   3.00
5 2016-08-02 00:00:00   BD 4.30  2.10   1.40   3.40   1.70

我想在df1中创建一个名为df1$Current.Int的新列,根据df2中的海流记录,给出每条鱼在相应时间和位置所处的海流强度。

我想得到这个:

> df1
             Datetime Site Ind Depth Current.Int
1 2016-08-01 15:34:07   BD  16   5.3        4.00
2 2016-08-01 16:25:16   HG  17  24.0          NA # Currents of this site are not included in df2
3 2016-08-01 17:29:16   BD  19  36.4        4.75
4 2016-08-01 18:33:16   BD  16  42.0        4.75
5 2016-08-01 20:54:16   BD  17    NA          NA
6 2016-08-01 22:48:16   BD  16  22.1        1.40

需要说明的是,由于海流每三小时记录一次,df2$Datetime中的每个时刻代表其前后各一个半小时的时段。也就是说,df2中21:00:00处的海流强度反映的是19:30:00到22:30:00之间的海流,其余时刻依此类推。

有人知道怎么做吗?

3 个答案:

答案 0 :(得分:1)

日期不匹配,因此示例中的日期已更改。使用这种方法,您可以准确检查匹配的工作方式并确保匹配符合您的要求。

# Fish observations: one row per record with site, individual and depth.
fish_times <- c(
  "2016-08-18 15:34:07", "2016-08-18 16:25:16", "2016-08-18 17:29:16",
  "2016-08-18 18:33:16", "2016-08-18 20:54:16", "2016-08-18 22:48:16"
)
df1 <- data.frame(
  Datetime = fish_times,
  Site = c("BD", "HG", "BD", "BD", "BD", "BD"),
  Ind = c(16, 17, 19, 16, 17, 16),
  Depth = c(5.3, 24, 36.4, 42, NA, 22.1)
)
# Parse the timestamps as UTC date-times.
df1$Datetime <- as.POSIXct(df1$Datetime, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")

# Current intensity recorded every three hours at site BD, one column per
# depth band.
current_times <- c(
  "2016-08-18 12:00:00", "2016-08-18 15:00:00", "2016-08-18 18:00:00",
  "2016-08-18 21:00:00", "2016-08-19 00:00:00"
)
df2 <- data.frame(
  Datetime = current_times,
  Site = rep("BD", 5),
  var1 = c(2.75, 4, 6.75, 2.25, 4.3),
  var2 = c(3, 4, 4.75, 3, 2.1),
  var3 = c(2.75, 4, 5.75, 2.25, 1.4),
  var4 = c(3.25, 3, 6.5, 2.75, 3.4),
  var5 = c(3, 4, 4.75, 3, 1.7)
)
df2$Datetime <- as.POSIXct(df2$Datetime, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
# Give the five value columns their depth-band names.
names(df2)[3:7] <- c("m0-7", "m8-15", "m16-23", "m24-31", "m32-39")

library(dplyr)
library(lubridate)

# Round each timestamp to the nearest 3-hour mark and translate the depth into
# the name of the matching depth-band column, so both can serve as join keys.
df1 <- df1 %>%
  mutate(
    Datetime_rounded = round_date(Datetime, "3 hour"),
    # Half-open bands: below 8, [8, 16), [16, 24), [24, 32), [32, 40).
    # A missing depth, or a depth of 40 and more, gives NA.
    # cut() + as.character() always yields a character vector, whereas nested
    # ifelse() returns a logical vector when every Depth is NA, which would
    # not match the character key used in the join.
    Depth_ind = as.character(cut(
      Depth,
      breaks = c(-Inf, 8, 16, 24, 32, 40),
      labels = c("m0-7", "m8-15", "m16-23", "m24-31", "m32-39"),
      right = FALSE
    ))
  )

# Reshape df2 from wide to long: columns 3 to 7 (the depth bands) are stacked
# into a Depth_ind / Intensity pair of columns.
df2 <- tidyr::gather(df2, key = "Depth_ind", value = "Intensity", 3:7)

# Look up the intensity for every fish record by rounded time, site and depth
# band; records without a match keep NA in Intensity.
left_join(
  df1,
  df2,
  by = c(Datetime_rounded = "Datetime", Site = "Site", Depth_ind = "Depth_ind")
)

             Datetime Site Ind Depth    Datetime_rounded Depth_ind Intensity
1 2016-08-18 15:34:07   BD  16   5.3 2016-08-18 15:00:00      m0-7      4.00
2 2016-08-18 16:25:16   HG  17  24.0 2016-08-18 15:00:00    m24-31        NA
3 2016-08-18 17:29:16   BD  19  36.4 2016-08-18 18:00:00    m32-39      4.75
4 2016-08-18 18:33:16   BD  16  42.0 2016-08-18 18:00:00      <NA>        NA
5 2016-08-18 20:54:16   BD  17    NA 2016-08-18 21:00:00      <NA>        NA
6 2016-08-18 22:48:16   BD  16  22.1 2016-08-19 00:00:00    m16-23      1.40

# EDIT ----
## As per the request, the width of the final depth range can be adjusted as you wish, e.g. to a max depth of 60 m.

# Round each timestamp to the nearest 3-hour mark and translate the depth into
# the name of the matching depth-band column, so both can serve as join keys.
df1 <- df1 %>%
  mutate(
    Datetime_rounded = round_date(Datetime, "3 hour"),
    # Same half-open bands as the columns of the current table, except that
    # the deepest band is widened to [32, 60) so deeper fish still get the
    # "m32-39" value. A missing depth, or a depth of 60 and more, gives NA.
    # cut() + as.character() always yields a character vector, whereas nested
    # ifelse() returns a logical vector when every Depth is NA, which would
    # not match the character key used in the join.
    Depth_ind = as.character(cut(
      Depth,
      breaks = c(-Inf, 8, 16, 24, 32, 60),
      labels = c("m0-7", "m8-15", "m16-23", "m24-31", "m32-39"),
      right = FALSE
    ))
  )

答案 1 :(得分:1)

这可以直接用一条SQL语句完成。我们按指定的条件将df1与df2做左连接,并按df1的行分组。在每个分组上计算max(b.Datetime),就会选出df2中对应的那一行。(如果a.Datetime和a.Site不能唯一确定df1的行,则改为按a.rowid分组。)最后,我们用[-1]删除该列。

由于问题中df1和df2的数据没有相互对应的日期,因此我们使用了文末“注意”部分给出的数据。

library(sqldf)

# For every df1 row, left-join all df2 rows of the same Site whose Datetime is
# not later than the fish record, then return one row per (Datetime, Site)
# group of df1.
# Selecting max(b.Datetime) is what makes the other b.* values come from the
# df2 row holding that latest timestamp.
# NOTE(review): this relies on the database engine's handling of bare columns
# next to max(); presumably sqldf's default SQLite backend — confirm.
# The CASE chooses the depth-band column by the band's upper limit. A missing
# Depth satisfies none of the comparisons, so it falls through to the ELSE
# branch (m32-39), exactly like any depth above 31.
# [-1] drops the helper max(b.Datetime) column from the result.
sqldf("select max(b.Datetime), a.*,
  case when a.Depth <= 7 then b.[m0-7]
       when a.Depth <= 15 then b.[m8-15]
       when a.Depth <= 23 then b.[m16-23]
       when a.Depth <= 31 then b.[m24-31]
       else b.[m32-39]
  end as [Current.Int]
  from df1 a
  left join df2 b on a.Site = b.Site and a.Datetime >= b.Datetime
  group by a.Datetime, a.Site")[-1]

结果如下:

             Datetime Site Ind Depth Current.Int
1 2016-08-01 15:34:07   BD  16   5.3        4.00
2 2016-08-01 16:25:16   HG  17  24.0          NA
3 2016-08-01 17:29:16   BD  19  36.4        4.00
4 2016-08-01 18:33:16   BD  16  42.0        4.75
5 2016-08-01 20:54:16   BD  17    NA        4.75
6 2016-08-01 22:48:16   BD  16  22.1        2.25

注意

这是使用的输入,与问题中的输入相同,除了:

  1. UTC时区已被消除。如果要保留UTC时区,请使用Sys.setenv(TZ='UTC')将会话时区更改为UTC。处理时区的另一种可能性是在Datetime列中使用字符串而不是POSIXct。在这种情况下,您首先不会遇到时区问题。

  2. 添加了最后一行以改进示例,因为日期不匹配。

这里是使用的输入。

# Fish observations. No tz is passed to as.POSIXct(), so the timestamps are
# interpreted in the session time zone.
df1 <- data.frame(
  Datetime = c(
    "2016-08-01 15:34:07", "2016-08-01 16:25:16", "2016-08-01 17:29:16",
    "2016-08-01 18:33:16", "2016-08-01 20:54:16", "2016-08-01 22:48:16"
  ),
  Site = c("BD", "HG", "BD", "BD", "BD", "BD"),
  Ind = c(16, 17, 19, 16, 17, 16),
  Depth = c(5.3, 24, 36.4, 42, NA, 22.1)
)
df1$Datetime <- as.POSIXct(df1$Datetime, format = "%Y-%m-%d %H:%M:%S")

# Current intensity every three hours, one column per depth band.
df2 <- data.frame(
  Datetime = c(
    "2016-08-18 12:00:00", "2016-08-18 15:00:00", "2016-08-18 18:00:00",
    "2016-08-18 21:00:00", "2016-08-19 00:00:00"
  ),
  Site = c("BD", "BD", "BD", "BD", "BD"),
  var1 = c(2.75, 4, 6.75, 2.25, 4.3),
  var2 = c(3, 4, 4.75, 3, 2.1),
  var3 = c(2.75, 4, 5.75, 2.25, 1.4),
  var4 = c(3.25, 3, 6.5, 2.75, 3.4),
  var5 = c(3, 4, 4.75, 3, 1.7)
)
df2$Datetime <- as.POSIXct(df2$Datetime, format = "%Y-%m-%d %H:%M:%S")
colnames(df2) <- c("Datetime", "Site", "m0-7", "m8-15", "m16-23", "m24-31", "m32-39")

# Shift df2 back by 17 days (2016-08-18 -> 2016-08-01) so that its dates
# overlap those of df1. The date and the time of day are formatted explicitly:
# extracting the time with sub() from as.character() of a POSIXct is fragile,
# because since R 4.3.0 as.character() omits the time part of midnight values.
# Shifting by whole days also keeps the last record on the following day
# (2016-08-02 00:00:00) instead of folding it back onto 2016-08-01 00:00:00.
df2$Datetime <- as.POSIXct(
  paste(
    as.Date(format(df2$Datetime, "%Y-%m-%d")) - 17,
    format(df2$Datetime, "%H:%M:%S")
  ),
  format = "%Y-%m-%d %H:%M:%S"
)

答案 2 :(得分:0)

只要您的数据不是很大,就不必走条件连接的道路。取而代之的是,首先仅使用站点加入,然后再过滤掉多余的观察结果。它不是特别有效,但它可能比转向sqldf更容易。

请注意,我对您提供的数据进行了一些更改,以使日期匹配。

library(tidyverse)  

# Fish observations; timestamps are parsed as UTC while the frame is built.
df1 <- data.frame(
  Datetime = as.POSIXct(
    c("2016-08-01 15:34:07", "2016-08-01 16:25:16", "2016-08-01 17:29:16",
      "2016-08-01 18:33:16", "2016-08-01 20:54:16", "2016-08-01 22:48:16"),
    format = "%Y-%m-%d %H:%M:%S",
    tz = "UTC"
  ),
  Site = c("BD", "HG", "BD", "BD", "BD", "BD"),
  Ind = c(16, 17, 19, 16, 17, 16),
  Depth = c(5.3, 24, 36.4, 42, NA, 22.1),
  stringsAsFactors = FALSE
)

# Current intensity every three hours. The columns get their final names
# directly; check.names = FALSE keeps the hyphenated depth-band names intact.
df2 <- data.frame(
  Datetime_CI = as.POSIXct(
    c("2016-08-01 12:00:00", "2016-08-01 15:00:00", "2016-08-01 18:00:00",
      "2016-08-01 21:00:00", "2016-08-02 00:00:00"),
    format = "%Y-%m-%d %H:%M:%S",
    tz = "UTC"
  ),
  Site = rep("BD", 5),
  `m0-7` = c(2.75, 4, 6.75, 2.25, 4.3),
  `m8-15` = c(3, 4, 4.75, 3, 2.1),
  `m16-23` = c(2.75, 4, 5.75, 2.25, 1.4),
  `m24-31` = c(3.25, 3, 6.5, 2.75, 3.4),
  `m32-39` = c(3, 4, 4.75, 3, 1.7),
  check.names = FALSE,
  stringsAsFactors = FALSE
)



# Tidy df2 into long format: one row per timestamp, site and depth band, with
# numeric band limits (minDepth, maxDepth) and a single Current.Int column.
df2 <- df2 %>% 
  gather(-Datetime_CI, -Site, key = Depth, value = Current.Int) %>% 
  separate(Depth, c("minDepth", "maxDepth"), sep = "-") %>% 
  mutate(
    # Drop the leading "m" of the lower limit ("m16" -> 16).
    minDepth = as.numeric(str_sub(minDepth, 2)),
    # separate() returns character columns. Without this conversion
    # Depth <= maxDepth is a string comparison (36.4 <= "7" is TRUE), so fish
    # are matched to the wrong depth bands.
    maxDepth = as.numeric(maxDepth)
  )

# Join df1 and df2 on Site alone, then discard the pairings that do not fit.
# Fish records with no remaining pairing (site absent from df2, missing depth,
# or a depth outside every band, e.g. 7.5 or 42) are dropped from the result.
df1 %>% 
  inner_join(df2, by = "Site") %>% 
  filter(
    # the fish depth must lie inside the band
    Depth >= minDepth,
    Depth <= maxDepth,
    # keep only current records taken before the fish observation
    Datetime > Datetime_CI
  ) %>% 
  # of those, keep the most recent current record for each fish observation
  group_by(Datetime, Site, Ind, Depth) %>% 
  filter(Datetime_CI == max(Datetime_CI)) %>% 
  ungroup()

#> # A tibble: 3 x 8
#>   Datetime            Site    Ind Depth Datetime_CI         minDepth maxDepth Current.Int
#>   <dttm>              <chr> <dbl> <dbl> <dttm>                 <dbl>    <dbl>       <dbl>
#> 1 2016-08-01 15:34:07 BD       16   5.3 2016-08-01 15:00:00        0        7        4   
#> 2 2016-08-01 17:29:16 BD       19  36.4 2016-08-01 15:00:00       32       39        4   
#> 3 2016-08-01 22:48:16 BD       16  22.1 2016-08-01 21:00:00       16       23        2.25