在R中针对一种特殊情况取子集

时间:2018-05-22 12:25:32

标签: r

我有一个包含多个数字列的数据集

x = 
a b c d
1 2 3 2
2 3 5 1
1 3 4 6
. . . .
. . . .

我有另一个列表,其中一些规则是根据每个变量的范围定义的

y = 
y[[1]]
y[[1]][[1]]
1
2
y[[1]][[2]]
5
6
y[[2]]
y[[2]][[1]]
2
3
y[[3]]
y[[3]][[1]]
5
6
y[[3]][[2]]
8
9
y[[3]][[3]]
10
11
y[[4]]    
y[[4]][[1]]
12
15

我希望针对各变量取值范围的每一种组合,分别创建对应的数据框,从而得到所有可能的数据框:

dataframe1 = variable 1 is from 1:2,
variable 2 is from 2:3, 
variable 3 is from 5:6 and 
variable 4 is from 12:15 is one data frame

dataframe2 = variable 1 is from 5:6, 
variable 2 is from 2:3, 
variable 3 is from 5:6 and 
variable 4 is from 12:15 is one data frame 

依此类推......

示例数据

# Sample data from the question (dput-style output): a data frame with
# 20 rows and four unnamed numeric columns. The answer refers to these
# columns as var1..var4, in this order.
structure(list(c(2441.321994, 2441.295962, 2441.269929, 2441.243897, 
2441.217865, 2441.191832, 2441.1658, 2441.139767, 2441.113735, 
2441.087702, 2441.06167, 2441.035637, 2441.009605, 2440.983572, 
2440.95754, 2440.931507, 2440.905475, 2440.879443, 2440.85341, 
2440.827378), c(998.6875404, 998.6989679, 998.7103955, 998.721823, 
998.7332505, 998.744678, 998.7561055, 998.767533, 998.7789605, 
998.7903881, 998.8018156, 998.8132431, 998.8246706, 998.8360981, 
998.8475256, 998.8589531, 998.8703806, 998.8818082, 998.8932357, 
998.9046632), c(4272184.479, 4272207.767, 4272231.055, 4272254.344, 
4272277.632, 4272300.92, 4272324.209, 4272347.497, 4272370.786, 
4272394.074, 4272417.362, 4272440.651, 4272463.939, 4272487.227, 
4272510.516, 4272533.804, 4272557.093, 4272580.381, 4272603.669, 
4272626.958), c(560.8931785, 562.3330729, 563.0583984, 562.3908637, 
563.3387891, 562.7498197, 561.9703322, 561.9606988, 563.5061892, 
563.2322049, 562.7148307, 561.6984375, 561.3954731, 560.8977865, 
560.8382398, 562.8975675, 563.9937645, 563.540068, 562.5878996, 
561.2869919)), row.names = c(NA, 20L), class = "data.frame")

样本列表

# Range rules from the question: one inner list per column (in column
# order); every entry is a pair of lower and upper bound.
list(
  list(c(2400, 2430), c(2435, 2530), c(2600, 2700)),  # column 1
  list(c(900, 950), c(960, 1000)),                    # column 2
  list(c(4272184, 4272417)),                          # column 3
  list(c(560, 561), 562:563)                          # column 4
)

在此示例中,所需输出为12个数据框,其中:

DF1 = var1 between 2400 2430, var2 between 900 950, var3 between 4272184 4272417, var4 between 560 561
DF2 = var1 between 2435 2530, var2 between 900 950, var3 between 4272184 4272417, var4 between 560 561
DF3 = var1 between 2600 2700, var2 between 900 950, var3 between 4272184 4272417, var4 between 560 561
DF4 = var1 between 2400 2430, var2 between 960 1000, var3 between 4272184 4272417, var4 between 560 561
DF5 = var1 between 2435 2530, var2 between 960 1000, var3 between 4272184 4272417, var4 between 560 561
DF6 = var1 between 2600 2700, var2 between 960 1000, var3 between 4272184 4272417, var4 between 560 561
DF7 = var1 between 2400 2430, var2 between 900 950, var3 between 4272184 4272417, var4 between 562 563
DF8 = var1 between 2435 2530, var2 between 900 950, var3 between 4272184 4272417, var4 between 562 563
DF9 = var1 between 2600 2700, var2 between 900 950, var3 between 4272184 4272417, var4 between 562 563
DF10 = var1 between 2400 2430, var2 between 960 1000, var3 between 4272184 4272417, var4 between 562 563
DF11 = var1 between 2435 2530, var2 between 960 1000, var3 between 4272184 4272417, var4 between 562 563
DF12 = var1 between 2600 2700, var2 between 960 1000, var3 between 4272184 4272417, var4 between 562 563

1 个答案:

答案 0 :(得分:1)

使用 data.table 包的一种可能的解决方案:

# Assign every row of 'df' to the range group it falls into and split 'df'
# into one data.table per group. Expects 'l' (the list of ranges) and 'df'
# (with the columns var1 and var2) to exist already.

# load the package
library(data.table)

# create a reference data.table from the list:
#  - lapply(l, unlist) flattens each element of 'l' into a plain vector and
#    as.data.table() turns these vectors into the columns V1 and V2
#    NOTE(review): with the 'l' of this example V2 is shorter than V1, so this
#    presumably relies on V2 being recycled to the length of V1 -- confirm
#  - grp := rowid(V2) numbers the repeated occurrences of each V2 value; that
#    counter serves as the group id
#  - dcast() reshapes to one row per grp with the columns V1_1, V1_2, V2_1 and
#    V2_2, which the join below uses as lower and upper bounds
ref <- as.data.table(lapply(l, unlist))[, grp := rowid(V2)
                                        ][, dcast(.SD, grp ~ rowid(grp),
                                                  value.var = c('V1','V2'))]


# convert 'df' to a data.table (setDT() converts by reference, hence no
# assignment)
setDT(df)

# use the join capability of data.table to add a grouping variable
# from 'ref' to those rows that fit the criteria:
# non-equi join of 'ref' with all rows of 'df' (.SD); a row fits when
# V1_1 < var1 < V1_2 and V2_1 < var2 < V2_2 (both bounds are strict).
# Only var1 and var2 are checked here; var3 and var4 are not part of the join.
# NOTE(review): assumes every row of 'df' matches at most one row of 'ref',
# otherwise the join returns more values than 'df' has rows -- confirm
df[, grp := ref[.SD
                , on = .(V1_1 < var1, V1_2 > var1, V2_1 < var2, V2_2 > var2)
                , grp]]


# filter out the rows where no grp was assigned (grp is NA for those rows)
df2 <- df[!is.na(grp)]

# split the result into a list with one element per grp value; the list is
# not stored in a variable here
split(df2, df2$grp)

给出:

$`1`
        var1     var2    var3     var4 grp
 1: 2421.322 948.6875 4272184 560.8932   1
 2: 2421.270 948.7104 4272231 563.0584   1
 3: 2421.218 948.7333 4272278 563.3388   1
 4: 2421.166 948.7561 4272324 561.9703   1
 5: 2421.114 948.7790 4272371 563.5062   1
 6: 2421.062 948.8018 4272417 562.7148   1
 7: 2421.010 948.8247 4272464 561.3955   1
 8: 2420.958 948.8475 4272511 560.8382   1
 9: 2420.905 948.8704 4272557 563.9938   1
10: 2420.853 948.8932 4272604 562.5879   1

$`2`
        var1     var2    var3     var4 grp
 1: 2441.322 998.6875 4272184 560.8932   2
 2: 2441.296 998.6990 4272208 562.3331   2
 3: 2441.270 998.7104 4272231 563.0584   2
 4: 2441.244 998.7218 4272254 562.3909   2
 5: 2441.218 998.7333 4272278 563.3388   2
 6: 2441.192 998.7447 4272301 562.7498   2
 7: 2441.166 998.7561 4272324 561.9703   2
 8: 2441.140 998.7675 4272347 561.9607   2
 9: 2441.114 998.7790 4272371 563.5062   2
10: 2441.088 998.7904 4272394 563.2322   2
11: 2441.062 998.8018 4272417 562.7148   2
12: 2441.036 998.8132 4272441 561.6984   2
13: 2441.010 998.8247 4272464 561.3955   2
14: 2440.984 998.8361 4272487 560.8978   2
15: 2440.958 998.8475 4272511 560.8382   2
16: 2440.932 998.8590 4272534 562.8976   2
17: 2440.905 998.8704 4272557 563.9938   2
18: 2440.879 998.8818 4272580 563.5401   2
19: 2440.853 998.8932 4272604 562.5879   2
20: 2440.827 998.9047 4272627 561.2870   2

使用 list2env 函数,您可以把该列表拆开,并根据需要将其中的各个数据表放入全局环境(Global environment)中。

使用过的数据:

# Example input used by the answer: a data frame with 60 rows and the four
# numeric columns var1..var4. var3 and var4 contain the same 20 values three
# times in a row; var1 and var2 differ between the three 20-row parts.
# '.Names' repeats the names that are already given inside list().
df <- structure(list(var1 = c(2421.321994, 2421.295962, 2421.269929, 2421.243897, 2421.217865, 2421.191832, 2421.1658, 2421.139767, 2421.113735, 2421.087702, 2421.06167, 2421.035637, 2421.009605, 2420.983572, 2420.95754, 2420.931507, 2420.905475, 2420.879443, 2420.85341, 2420.827378, 2441.321994, 2441.295962, 2441.269929, 2441.243897, 2441.217865, 2441.191832, 2441.1658, 2441.139767, 2441.113735, 2441.087702, 2441.06167, 2441.035637, 2441.009605, 2440.983572, 2440.95754, 2440.931507, 2440.905475, 2440.879443, 2440.85341, 2440.827378, 2461.321994, 2461.295962, 2461.269929, 2461.243897, 2461.217865, 2461.191832, 2461.1658, 2461.139767, 2461.113735, 2461.087702, 2461.06167, 2461.035637, 2461.009605, 2460.983572, 2460.95754, 2460.931507, 2460.905475, 2460.879443, 2460.85341, 2460.827378),
                     var2 = c(948.6875404, 898.6989679, 948.7103955, 898.721823, 948.7332505, 898.744678, 948.7561055, 898.767533, 948.7789605, 898.7903881, 948.8018156, 898.8132431, 948.8246706, 898.8360981, 948.8475256, 898.8589531, 948.8703806, 898.8818082, 948.8932357, 898.9046632, 998.6875404, 998.6989679, 998.7103955, 998.721823, 998.7332505, 998.744678, 998.7561055, 998.767533, 998.7789605, 998.7903881, 998.8018156, 998.8132431, 998.8246706, 998.8360981, 998.8475256, 998.8589531, 998.8703806, 998.8818082, 998.8932357, 998.9046632, 1048.6875404, 1098.6989679, 1048.7103955, 1098.721823, 1048.7332505, 1098.744678, 1048.7561055, 1098.767533, 1048.7789605, 1098.7903881, 1048.8018156, 1098.8132431, 1048.8246706, 1098.8360981, 1048.8475256, 1098.8589531, 1048.8703806, 1098.8818082, 1048.8932357, 1098.9046632),
                     var3 = c(4272184.479, 4272207.767, 4272231.055, 4272254.344, 4272277.632, 4272300.92, 4272324.209, 4272347.497, 4272370.786, 4272394.074, 4272417.362, 4272440.651, 4272463.939, 4272487.227, 4272510.516, 4272533.804, 4272557.093, 4272580.381, 4272603.669, 4272626.958, 4272184.479, 4272207.767, 4272231.055, 4272254.344, 4272277.632, 4272300.92, 4272324.209, 4272347.497, 4272370.786, 4272394.074, 4272417.362, 4272440.651, 4272463.939, 4272487.227, 4272510.516, 4272533.804, 4272557.093, 4272580.381, 4272603.669, 4272626.958, 4272184.479, 4272207.767, 4272231.055, 4272254.344, 4272277.632, 4272300.92, 4272324.209, 4272347.497, 4272370.786, 4272394.074, 4272417.362, 4272440.651, 4272463.939, 4272487.227, 4272510.516, 4272533.804, 4272557.093, 4272580.381, 4272603.669, 4272626.958),
                     var4 = c(560.8931785, 562.3330729, 563.0583984, 562.3908637, 563.3387891, 562.7498197, 561.9703322, 561.9606988, 563.5061892, 563.2322049, 562.7148307, 561.6984375, 561.3954731, 560.8977865, 560.8382398, 562.8975675, 563.9937645, 563.540068, 562.5878996, 561.2869919, 560.8931785, 562.3330729, 563.0583984, 562.3908637, 563.3387891, 562.7498197, 561.9703322, 561.9606988, 563.5061892, 563.2322049, 562.7148307, 561.6984375, 561.3954731, 560.8977865, 560.8382398, 562.8975675, 563.9937645, 563.540068, 562.5878996, 561.2869919, 560.8931785, 562.3330729, 563.0583984, 562.3908637, 563.3387891, 562.7498197, 561.9703322, 561.9606988, 563.5061892, 563.2322049, 562.7148307, 561.6984375, 561.3954731, 560.8977865, 560.8382398, 562.8975675, 563.9937645, 563.540068, 562.5878996, 561.2869919)),
                .Names = c("var1", "var2", "var3", "var4"), row.names = c(NA, -60L), class = "data.frame")

# Range rules used by the answer: the first element holds two
# c(lower, upper) ranges, the second element is a single c(lower, upper)
# range given as a bare vector.
l <- list(
  list(
    c(2400, 2430),
    c(2435, 2530)
  ),
  c(900, 1000)
)

df 是在 OP 原始数据的基础上按如下方式构建的:

# Name the columns var1, var2, ... and stack three copies of the data on top
# of each other: one with the offsets subtracted, the unchanged original and
# one with the offsets added. The offsets are applied column by column:
# 20 for the first column, 50 and 100 alternating for the second one, and 0
# for the third and fourth column.
df <- setNames(df, paste0('var', seq_along(df)))
df <- local({
  offsets <- list(20, c(50,100), 0, 0)
  rbind(df - offsets, df, df + offsets)
})