如何在 R 中计算两个采样间隔不均匀的数据帧之间的比率?

时间:2019-05-15 11:53:15

标签: r dataframe ggplot2

我有两个数据集,每个数据集都在若干时间点测量了某个值。我想用折线图可视化这两个数据集之间的比率。但是,两者的测量时间点并不完全重合,而且有些实验包含的数据点比其他实验多。

数据

# Two example series sampled at different, unevenly spaced points.
# V1 is plotted on the x axis and V2 on the y axis below.
df_1 <- data.frame(
  V1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  V2 = c(1, 4, 7, 8, 9.5, 17, 41, 27, 26, 10)
)
df_2 <- data.frame(
  V1 = c(0.1, 0.5, 2, 3, 4.4, 5.3, 6, 7, 8, 10, 12, 15, 16, 20),
  V2 = c(0.1, 2, 4, 6, 7.5, 6.3, 9, 19, 22, 23, 24, 51, 31, 23)
)

library(tidyverse)

# Overlay both series in a single line chart: df_1 in black, df_2 in red.
ggplot() +
  geom_line(mapping = aes(x = V1, y = V2), data = df_1, colour = "black") +
  geom_line(mapping = aes(x = V1, y = V2), data = df_2, colour = "red")

接下来,我想绘制两条曲线之间的比率。但是,由于测量的数据点没有均匀分布(并且在不同的时间测量),因此我无法简单地将每个时间点的值相除并绘制结果。

问题

如何计算R中两个不均匀间隔的数据帧(可以是不同长度)之间的比率?

其他示例

我有两个(示例)数据集。第三个数据集是预期的输出(df_3)。绿线表示预期的输出:df_2 / df_1之间的比率。

# Small worked example: df_3 holds the expected df_2 / df_1 ratio at the
# time points of df_2.
df_1 <- data.frame(time = 1:5, value = 1:5)
df_2 <- data.frame(
  time = c(1.5, 2, 3, 4),
  value = c(2, 2, 2.5, 3.5)
)
df_3 <- data.frame(
  time = c(1.5, 2, 3, 4),
  value = c(1.33, 1, 0.83, 0.875)
)

# Draw each data set as points plus a connecting line:
# df_1 in black, df_2 in red and the expected ratio (df_3) in green.
ggplot() +
  geom_point(aes(x = time, y = value), data = df_1, colour = "black", size = 3) +
  geom_point(aes(x = time, y = value), data = df_2, colour = "red", size = 3) +
  geom_line(aes(x = time, y = value), data = df_1, colour = "black") +
  geom_line(aes(x = time, y = value), data = df_2, colour = "red") +
  geom_point(aes(x = time, y = value), data = df_3, colour = "green", size = 3) +
  geom_line(aes(x = time, y = value), data = df_3, colour = "green")

1 个答案:

答案 0 :(得分:1)

您可以使用 dplyr 软件包进行完全连接(full join)。请注意,我使用的数据帧与您的相同,只是列名改为 time 和 value。

对比率进行线性插值

# Same measurements as in the question, with explicit column names.
df_1 <- data.frame(
  time = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  value = c(1, 4, 7, 8, 9.5, 17, 41, 27, 26, 10)
)
df_2 <- data.frame(
  time = c(0.1, 0.5, 2, 3, 4.4, 5.3, 6, 7, 8, 10, 12, 15, 16, 20),
  value = c(0.1, 2, 4, 6, 7.5, 6.3, 9, 19, 22, 23, 24, 51, 31, 23)
)

library(dplyr)
# Put both series on one shared, sorted time axis; a row gets NA for the
# series that was not measured at that time.
full_join(df_1, df_2, by = "time", suffix = c("_1", "_2")) %>%
  arrange(time) %>%
  mutate(
    # Defined only where both series have a value at the same time.
    ratio = value_1 / value_2,
    # Fill the remaining rows by linear interpolation over time; rule = 2
    # repeats the nearest known ratio outside the range of known ratios.
    ratio = approx(x = time, y = ratio, xout = time, rule = 2)$y
  )

   time value_1 value_2     ratio
1   0.1      NA     0.1 1.0000000
2   0.5      NA     2.0 1.0000000
3   1.0     1.0      NA 1.0000000
4   2.0     4.0     4.0 1.0000000
5   3.0     7.0     6.0 1.1666667
6   4.0     8.0      NA 1.4074074
7   4.4      NA     7.5 1.5037037
8   5.0     9.5      NA 1.6481481
9   5.3      NA     6.3 1.7203704
10  6.0    17.0     9.0 1.8888889
11  7.0    41.0    19.0 2.1578947
12  8.0    27.0    22.0 1.2272727
13  9.0    26.0      NA 0.8310277
14 10.0    10.0    23.0 0.4347826
15 12.0      NA    24.0 0.4347826
16 15.0      NA    51.0 0.4347826
17 16.0      NA    31.0 0.4347826
18 20.0      NA    23.0 0.4347826

在 df_2 的时间点上对 value_1 进行线性插值

# Interpolated data frame 1.
# df_1 is linearly interpolated at every time point of df_2, so each row of
# df_2 gets a matching value_1. With rule = 1, time points outside the range
# of df_1$time yield NA. The ratio value_2 / value_1 is calculated next.

int_df_1 <- setNames(
  as.data.frame(
    approx(x = df_1$time, y = df_1$value, xout = df_2$time, rule = 1)
  ),
  c("time", "value")
)

# Full join on time once more, then divide the df_2 values (value_2) by the
# interpolated df_1 values (value_1).

full_join(int_df_1, df_2, by = "time", suffix = c("_1", "_2")) %>%
  arrange(time) %>%
  mutate(ratio = value_2 / value_1)