我有一个约 300,000 行、60 列的大型数据集。目前，如果我想按某个变量的唯一值取子集，我会先用 unique() 函数生成一个包含该变量所有唯一值的 data.frame，然后将它与主数据框进行匹配，以便从主文件中取回相关数据。
然而这个过程有点繁琐，所以我想知道是否有更快捷的方法来完成同样的事情？例如，是否有某个函数可以在选取唯一值的同时，保留与这些值相关联的其他列数据？
例如：我想创建一个新的数据框，其中只包含唯一的 SurveyID_Block 及其对应的岛屿代码（IslandCode）和总丰度（totalAbun）。
structure(list(SurveyID_Block = c("62003713_2", "62003087_2",
"62003713_2", "62003713_2", "62003713_1", "62003713_2", "62003713_1",
"62003713_2", "62003713_2", "62003087_1", "62003713_1", "62003713_1",
"62003713_2", "62003713_2", "62003713_1", "62003087_1", "62003087_2",
"62003713_2", "62003713_2", "62003713_2", "62003087_2", "62003713_2",
"62003713_1", "62003713_1", "62003713_1", "62003713_1", "62003713_2",
"62003713_1", "62003713_2", "62003087_1", "62003713_2", "62003087_1",
"62003713_1", "62003087_2", "62003087_2", "62003713_2", "62003713_1",
"62003087_1", "62003713_1", "62003713_1", "62003713_1", "62003087_2",
"62003087_2", "62003713_2", "62003713_2", "62003713_2", "62003713_1",
"62003087_1", "62003713_2", "62003087_2", "62003713_1", "62003713_1",
"62003713_2", "62003713_1", "62003713_2", "62003087_2", "62003087_2",
"62003087_1", "62003087_1", "62003713_1", "62003087_1", "62003087_1",
"62003087_2", "62003087_2", "62003713_2", "62003713_1", "62003713_2",
"62003713_2", "62003713_2", "62003713_1", "62003713_2", "62003087_1",
"62003713_1", "62003713_1", "62003087_1", "62003087_1", "62003713_1",
"62003087_2", "62003087_1", "62003087_2", "62003087_2", "62003087_1",
"62003087_1", "62003087_1", "62003713_2", "62003087_2", "62003713_2",
"62003087_2", "62003713_1", "62003713_1", "62003087_2", "62003087_1",
"62003087_1", "62003087_1", "62003713_2", "62003713_2", "62003087_1",
"62003713_1", "62003087_1", "62003087_2"), IslandCode = c(1391L,
1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L,
1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L,
1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L,
1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L,
1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L,
1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L,
1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L,
1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L,
1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L,
1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L,
1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L, 1391L
), totalAbun = c(667L, 174L, 667L, 667L, 715L, 667L, 715L, 667L,
667L, 1365L, 715L, 715L, 667L, 667L, 715L, 1365L, 174L, 667L,
667L, 667L, 174L, 667L, 715L, 715L, 715L, 715L, 667L, 715L, 667L,
1365L, 667L, 1365L, 715L, 174L, 174L, 667L, 715L, 1365L, 715L,
715L, 715L, 174L, 174L, 667L, 667L, 667L, 715L, 1365L, 667L,
174L, 715L, 715L, 667L, 715L, 667L, 174L, 174L, 1365L, 1365L,
715L, 1365L, 1365L, 174L, 174L, 667L, 715L, 667L, 667L, 667L,
715L, 667L, 1365L, 715L, 715L, 1365L, 1365L, 715L, 174L, 1365L,
174L, 174L, 1365L, 1365L, 1365L, 667L, 174L, 667L, 174L, 715L,
715L, 174L, 1365L, 1365L, 1365L, 667L, 667L, 1365L, 715L, 1365L,
174L)), .Names = c("SurveyID_Block", "IslandCode", "totalAbun"
), row.names = c(NA, 100L), class = "data.frame")
答案 0 :(得分:1)
我们可以按 'SurveyID_Block' 用 split() 拆分数据集，得到一个由 data.frame 组成的 list。最好将这些数据集保存在 list 中，而不是在全局环境中创建多个单独的 data.frame 对象。
# Split df1 into a named list of data frames, one element per distinct
# value of the SurveyID_Block column.
lst <- split(x = df1, f = df1$SurveyID_Block)
但是，如果我们确实需要为每个分组创建单独的数据集对象，可以使用 list2env：
# Rename the list elements to dfN1, dfN2, ... and copy each one into the
# global environment as its own object.
list2env(
  setNames(lst, paste0("dfN", seq_along(lst))),
  envir = .GlobalEnv
)