嘿,我想根据列名的前三个字母(城市标识)来提取此数据集的子集。我如何在 R 中解决这个问题?
答案 0 :(得分:0)
在进行子集化时,您可以对列名使用 grep,并用 ^ 来指示字符串的开头:
# Keep only the columns whose names start with "Petal".
# drop = FALSE guarantees a data.frame even when a single column matches;
# without it `[, j]` would silently collapse the result to a plain vector.
head(iris[, grep("^Petal", names(iris)), drop = FALSE])
# Petal.Length Petal.Width
# 1 1.4 0.2
# 2 1.4 0.2
# 3 1.3 0.2
# 4 1.5 0.2
# 5 1.4 0.2
# 6 1.7 0.4
或者,dplyr 为 select 提供了一个很好的辅助函数 starts_with,用于同样的目的:
library(dplyr)

# Select the columns whose names start with "Sepal", then show the first rows.
iris %>%
  select(starts_with("Sepal")) %>%
  head()
# Sepal.Length Sepal.Width
# 1 5.1 3.5
# 2 4.9 3.0
# 3 4.7 3.2
# 4 4.6 3.1
# 5 5.0 3.6
# 6 5.4 3.9
答案 1 :(得分:0)
我想 OP 希望基于列名中的前三个字母对数据集进行子集化。我们可以使用 substr 或 sub 来创建分组向量:
grp <- substr(colnames(df1), 1, 3)
或
grp <- sub("^(.{3}).*", "\\1", colnames(df1))
另一种选择是删除从 _ 开始的子字符串:
grp <- sub("_.*", "", colnames(df1))
基于“grp”,我们可以将列名拆分为 list,并据此对数据集进行子集化:
lst <- lapply(split(colnames(df1), grp), function(nm) df1[nm])
# Print the list: one data.frame per group in grp.
# The commented lines below are the console output.
lst
#$DFW
# DFW_rain DFW_snow DFW_fog DFW_tstorm DFW_hail
#1 0 0 0 0 0
#2 1 0 0 0 0
#3 1 0 0 0 0
#4 0 0 0 0 0
#$ELP
# ELP_tmax ELP_tmean ELP_tmin ELP_dewmax ELP_dewmean ELP_dewmin ELP_hummax ELP_hummean ELP_hummin ELP_prsmax ELP_prsmean ELP_prsmin ELP_windmax ELP_windmean ELP_winddir
#1 64 55 46 42 34 23 73 47 22 29.89 29.83 29.75 29 12 243
#2 57 46 35 32 26 21 70 48 26 29.96 29.86 29.72 39 13 227
#3 46 40 34 25 13 3 59 36 17 30.43 30.17 29.92 37 18 306
#4 45 36 27 17 11 7 63 39 15 30.51 30.39 30.23 14 8 15
# ELP_precip ELP_cloud ELP_rain ELP_snow ELP_fog ELP_tstorm ELP_hail
#1 0 3 0 0 0 0 0
#2 0 1 0 0 0 0 0
#3 0 1 0 0 0 0 0
#4 0 0 0 0 0 0 0
上面的 lapply 调用即按“grp”对数据集进行了子集化。
“lst”是由 data.frame 组成的 list。最好在 list 内执行所有操作,而不是在全局环境中创建多个对象。
但是,如果我们需要单独的data.frame对象(不推荐)
# Copy every element of lst into the global environment as its own object,
# named "df" followed by the element's name (e.g. dfDFW).
list2env(
  x = setNames(object = lst, nm = paste0("df", names(lst))),
  envir = globalenv()
)
dfDFW
# DFW_rain DFW_snow DFW_fog DFW_tstorm DFW_hail
#1 0 0 0 0 0
#2 1 0 0 0 0
#3 1 0 0 0 0
#4 0 0 0 0 0
如果我们只需要提取特定的列,则可以不使用 split,而是用“grp”中的值进行 grep。例如:
# Extract the columns of a single group without splitting the whole data set.
# The prefix is anchored with "^" so it only matches at the start of a column
# name; an unanchored pattern would also pick up columns that merely contain
# the same letters somewhere later in their name.
Un1 <- unique(grp)
df1[grep(paste0("^", Un1[1]), colnames(df1))]
# DFW_rain DFW_snow DFW_fog DFW_tstorm DFW_hail
#1 0 0 0 0 0
#2 1 0 0 0 0
#3 1 0 0 0 0
#4 0 0 0 0 0
# Reproducible sample data: 4 rows, 27 numeric columns whose names carry a
# three-letter prefix (DFW_ or ELP_) followed by the measurement name.
df1 <- data.frame(
  DFW_rain     = c(0, 1, 1, 0),
  DFW_snow     = c(0, 0, 0, 0),
  DFW_fog      = c(0, 0, 0, 0),
  DFW_tstorm   = c(0, 0, 0, 0),
  DFW_hail     = c(0, 0, 0, 0),
  ELP_tmax     = c(64, 57, 46, 45),
  ELP_tmean    = c(55, 46, 40, 36),
  ELP_tmin     = c(46, 35, 34, 27),
  ELP_dewmax   = c(42, 32, 25, 17),
  ELP_dewmean  = c(34, 26, 13, 11),
  ELP_dewmin   = c(23, 21, 3, 7),
  ELP_hummax   = c(73, 70, 59, 63),
  ELP_hummean  = c(47, 48, 36, 39),
  ELP_hummin   = c(22, 26, 17, 15),
  ELP_prsmax   = c(29.89, 29.96, 30.43, 30.51),
  ELP_prsmean  = c(29.83, 29.86, 30.17, 30.39),
  ELP_prsmin   = c(29.75, 29.72, 29.92, 30.23),
  ELP_windmax  = c(29, 39, 37, 14),
  ELP_windmean = c(12, 13, 18, 8),
  ELP_winddir  = c(243, 227, 306, 15),
  ELP_precip   = c(0, 0, 0, 0),
  ELP_cloud    = c(3, 1, 1, 0),
  ELP_rain     = c(0, 0, 0, 0),
  ELP_snow     = c(0, 0, 0, 0),
  ELP_fog      = c(0, 0, 0, 0),
  ELP_tstorm   = c(0, 0, 0, 0),
  ELP_hail     = c(0, 0, 0, 0)
)