基于R中的前三个字母进行子集

时间:2016-04-03 02:06:25

标签: r variables subset

***Subset***

嘿,我想根据列名的前三个字母(即城市标识,例如DFW、ELP)来构建此数据集的子集。我该如何在R中解决这个问题?

2 个答案:

答案 0 :(得分:0)

在进行子集化时,您可以对列名使用grep,并用^来表示字符串的开头:

# Keep every column whose name starts with "Petal"; `^` anchors the pattern
# to the beginning of the name. Indexing with a single subscript (no comma)
# always returns a data.frame, even when only one column matches, whereas
# `iris[, idx]` would collapse a single match to a bare vector.
head(iris[grep("^Petal", names(iris))])
#   Petal.Length Petal.Width
# 1          1.4         0.2
# 2          1.4         0.2
# 3          1.3         0.2
# 4          1.5         0.2
# 5          1.4         0.2
# 6          1.7         0.4

或者,dplyr中有一个很好的select辅助函数starts_with,可用于同样的目的:

# dplyr alternative: starts_with() is a select() helper that keeps the
# columns whose names begin with the given prefix.
library(dplyr)
iris %>%
  select(starts_with("Sepal")) %>%
  head()
#   Sepal.Length Sepal.Width
# 1          5.1         3.5
# 2          4.9         3.0
# 3          4.7         3.2
# 4          4.6         3.1
# 5          5.0         3.6
# 6          5.4         3.9

答案 1 :(得分:0)

我想OP希望基于列名中的前三个字母对数据集取子集。我们可以使用substr来创建分组向量:

grp <- substr(colnames(df1), 1, 3)

或者使用sub:

# Capture the first three characters of each column name and discard the rest.
grp <- sub(pattern = "^(.{3}).*", replacement = "\\1", x = colnames(df1))

另一种选择是删除从_开始的子字符串:

grp <- sub("_.*", "", colnames(df1))

基于“grp”,我们可以将列名拆分为list,并对数据集进行子集化:

lst <- lapply(split(colnames(df1), grp), function(nm) df1[nm])
lst
#$DFW
#  DFW_rain DFW_snow DFW_fog DFW_tstorm DFW_hail
#1        0        0       0          0        0
#2        1        0       0          0        0
#3        1        0       0          0        0
#4        0        0       0          0        0
#
#$ELP
#  ELP_tmax ELP_tmean ELP_tmin ELP_dewmax ELP_dewmean ELP_dewmin ELP_hummax ELP_hummean ELP_hummin ELP_prsmax ELP_prsmean ELP_prsmin ELP_windmax ELP_windmean ELP_winddir
#1       64        55       46         42          34         23         73          47         22      29.89       29.83      29.75          29           12         243
#2       57        46       35         32          26         21         70          48         26      29.96       29.86      29.72          39           13         227
#3       46        40       34         25          13          3         59          36         17      30.43       30.17      29.92          37           18         306
#4       45        36       27         17          11          7         63          39         15      30.51       30.39      30.23          14            8          15
#  ELP_precip ELP_cloud ELP_rain ELP_snow ELP_fog ELP_tstorm ELP_hail
#1          0         3        0        0       0          0        0
#2          0         1        0        0       0          0        0
#3          0         1        0        0       0          0        0
#4          0         0        0        0       0          0        0

“lst”是由data.frame组成的list。最好在list内执行所有操作,而不是在全局环境中创建多个对象。

但是,如果我们需要单独的data.frame对象(不推荐)

# Prefix each element name with "df", then copy the list elements into the
# global environment as separate objects (dfDFW, dfELP, ...).
list2env(
  x = setNames(object = lst, nm = paste0("df", names(lst))),
  envir = globalenv()
)
dfDFW
#   DFW_rain DFW_snow DFW_fog DFW_tstorm DFW_hail
#1        0        0       0          0        0
#2        1        0       0          0        0
#3        1        0       0          0        0
#4        0        0       0          0        0

如果我们只需要提取特定的列而不使用split,可以用grep配合'grp'中的值。例如:

 # Extract the columns belonging to one prefix. The pattern is anchored with
 # `^` so only names that begin with the prefix are selected; an unanchored
 # pattern would also pick up columns that contain the prefix anywhere else
 # in their name.
 Un1 <- unique(grp)
 df1[grep(paste0("^", Un1[1]), colnames(df1))]
 #     DFW_rain DFW_snow DFW_fog DFW_tstorm DFW_hail
 #1        0        0       0          0        0
 #2        1        0       0          0        0
 #3        1        0       0          0        0
 #4        0        0       0          0        0

数据

# Example data: 4 rows and 27 numeric columns. Every column name is a
# three-letter prefix (DFW or ELP), an underscore, and a variable name.
df1 <- data.frame(
  DFW_rain     = c(0, 1, 1, 0),
  DFW_snow     = rep(0, 4),
  DFW_fog      = rep(0, 4),
  DFW_tstorm   = rep(0, 4),
  DFW_hail     = rep(0, 4),
  ELP_tmax     = c(64, 57, 46, 45),
  ELP_tmean    = c(55, 46, 40, 36),
  ELP_tmin     = c(46, 35, 34, 27),
  ELP_dewmax   = c(42, 32, 25, 17),
  ELP_dewmean  = c(34, 26, 13, 11),
  ELP_dewmin   = c(23, 21, 3, 7),
  ELP_hummax   = c(73, 70, 59, 63),
  ELP_hummean  = c(47, 48, 36, 39),
  ELP_hummin   = c(22, 26, 17, 15),
  ELP_prsmax   = c(29.89, 29.96, 30.43, 30.51),
  ELP_prsmean  = c(29.83, 29.86, 30.17, 30.39),
  ELP_prsmin   = c(29.75, 29.72, 29.92, 30.23),
  ELP_windmax  = c(29, 39, 37, 14),
  ELP_windmean = c(12, 13, 18, 8),
  ELP_winddir  = c(243, 227, 306, 15),
  ELP_precip   = rep(0, 4),
  ELP_cloud    = c(3, 1, 1, 0),
  ELP_rain     = rep(0, 4),
  ELP_snow     = rep(0, 4),
  ELP_fog      = rep(0, 4),
  ELP_tstorm   = rep(0, 4),
  ELP_hail     = rep(0, 4)
)