在R中有效地设置非常大的数据帧

时间:2015-02-27 14:59:25

标签: r dataframe large-data bigdata

所以我有一个16列和大约1700万行的数据框。

我首先想在数据框上做一些ddply,然后查看不同列之间的相关性。实现这一目标的最佳和最有效的方法是什么?我目前的方法花了太长时间:

数据框为all_df,列名为A, B, C, ..., N, O, P

# Per-group mean and standard deviation of columns B..G, grouped by A.
# Each ddply() call makes its own pass over all_df; this is the slow approach
# the question is asking how to improve.
# String arguments use plain ASCII double quotes: typographic quotes are not
# valid R string delimiters and make the script fail to parse.
avB <- ddply(all_df, c("A"), summarise, NB_av=mean(B), NB_sd=sd(B))
avC <- ddply(all_df, c("A"), summarise, NC_av=mean(C), NC_sd=sd(C))
avD <- ddply(all_df, c("A"), summarise, ND_av=mean(D), ND_sd=sd(D))
avE <- ddply(all_df, c("A"), summarise, NE_av=mean(E), NE_sd=sd(E))
avF <- ddply(all_df, c("A"), summarise, NF_av=mean(F), NF_sd=sd(F))
avG <- ddply(all_df, c("A"), summarise, NG_av=mean(G), NG_sd=sd(G))

# Combine the six per-column summaries into one wide table keyed on A.
# Resulting column layout: A, NB_av, NB_sd, NC_av, NC_sd, ..., NG_av, NG_sd.
summary_df <- avB
summary_df <- merge(summary_df, avC, by=c("A"))
summary_df <- merge(summary_df, avD, by=c("A"))
summary_df <- merge(summary_df, avE, by=c("A"))
summary_df <- merge(summary_df, avF, by=c("A"))
summary_df <- merge(summary_df, avG, by=c("A"))

# Quick look at the correlation: columns 2, 4, ..., 12 are the six *_av
# (mean) columns, so this draws a scatterplot matrix of the group means.
plot((summary_df[,c(2,4,6,8,10,12)]), gap=0)

所以,实际上,我现在只能退而求其次,在MySQL中完成很多这类计算(平均值,标准偏差等),然后在R中进行最终的相关性分析。但是,我觉得这种做法并不优雅。

为什么我使用数据框而不是数据表?因为我正在读取R中的MySQL表,并且语法dbGetQuery(con,"select * from mysql_table")返回一个数据帧。

2 个答案:

答案 0 :(得分:5)

你可以尝试

library(dplyr)
 # Group all_df by A, then compute the mean and sd of every column in the
 # range B through G. Written as a nested call instead of a pipe chain.
 summarise_each(group_by(all_df, A), funs(mean, sd), B:G)

或另一个选项是data.table

library(data.table)
# Convert all_df to a data.table in place, then compute mean and sd of the
# selected columns for each group of A. Every group contributes two rows
# (mean first, then sd); the `var` column, filled by recycling
# c("mean", "sd"), labels which row is which.
# LETTERS[2:7] selects B through G, the same range as the B:G selection used
# in the dplyr form (LETTERS[2:6] would stop at F).
setDT(all_df)[, lapply(.SD, function(x) c(mean(x), sd(x))), by = A,
              .SDcols = LETTERS[2:7]][, var := c("mean", "sd")][]

注意:第一种形式的结果是宽格式,而第二种形式的结果中'mean'和'sd'以交替行的形式出现。

基准

 # Benchmark input: the sample rows replicated 1e5 times.
 # seq_len() is used instead of 1:nrow() so an empty table yields no indices.
 all_df1 <- all_df[rep(seq_len(nrow(all_df)), 1e5), ]

 # dplyr: mean and sd of B..G per group of A.
 # NOTE(review): the timings shown are the figures recorded in the source
 # text; re-run on your own machine for current numbers.
 system.time(all_df1 %>% group_by(A) %>% summarise_each(funs(mean, sd), B:G))
 #  user  system elapsed 
 # 0.189   0.000   0.189 

 DT1 <- as.data.table(all_df1)
 # data.table: same B..G columns (LETTERS[2:7]) so both timings measure the
 # same amount of work.
 system.time(DT1[, lapply(.SD, function(x) c(mean(x), sd(x))),
                 by = A, .SDcols = LETTERS[2:7]][, var := c("mean", "sd")][])
 #  user  system elapsed 
 #0.232   0.002   0.235 

数据

# Reproducible sample data: 20 rows, one grouping column drawn from the
# letters a-c, followed by 15 integer columns drawn from 1:20.
set.seed(25)
m1 <- matrix(sample(1:20, 15*20, replace=TRUE), ncol=15)
set.seed(353)
all_df <- data.frame(sample(letters[1:3], 20, replace=TRUE), m1)
# Name the 16 columns A..P. The assignment must target all_df itself; no
# object called `d1` is created anywhere in this snippet.
colnames(all_df) <- LETTERS[seq_len(ncol(all_df))]

答案 1 :(得分:0)

非常感谢akrun!

我在你的答案基础上写了一个完整的例子,也借鉴了 http://www.carlboettiger.info/2012/02/12/elegant-fast-data-manipulation-with-data-table.html 。它还展示了如何访问由lapply产生的对象中的特定元素。

# Build a very large data frame: 26 x 26 combinations of (x, y), each with
# `grpsize` rows, plus two columns of uniform random values.
grpsize <- ceiling(1e7 / 26^2)

all_df1 <- data.frame(
  # each upper-case letter fills one contiguous run of 26 * grpsize rows
  x = rep(LETTERS, each = 26 * grpsize),
  # the lower-case alphabet (grpsize rows per letter) repeated for each x
  y = rep(letters, each = grpsize, times = 26),
  v = runif(grpsize * 26^2),
  v2 = runif(grpsize * 26^2),
  stringsAsFactors = FALSE
)

# Group the data frame by x and y; for each group report the number of rows
# and the mean of v.
sumalldf <- ddply(
  all_df1, c("x", "y"), summarise,
  ntotalldf = length(x),
  nmeanalldf = mean(v)
)

# Convert the data frame to a data.table. Two constructors are shown.
# NOTE(review): the author labels data.table() the "more efficient way" and
# as.data.table() the "less efficient way" -- confirm against the data.table
# documentation before relying on that ordering.
DT1 <- data.table(all_df1)
DT2 <- as.data.table(all_df1)

# Key DT1 on the x and y columns (to key on x only, pass just "x").
setkeyv(DT1, c("x", "y"))

# With the key set, rows can be looked up by key value, e.g. DT1["A"].
# Attempting that lookup without a key raises an error asking for one.

# Peek at the first and last rows of DT1.
for (peek in list(head, tail)) print(peek(DT1))

# Group DT1 by (x, y) and compute the mean and standard deviation of every
# other column. .SD is the per-group subset of the data without the grouping
# columns, so the function runs once per remaining column in each group.
# Each group yields two rows (mean first, then sd); the recycled `var`
# column labels them.
# keyby groups exactly like `by` and also sets the key of the result to the
# grouping columns, so the keyed lookups on sumalldt further down do not
# depend on the key being carried over implicitly from DT1.
sumalldt <- DT1[, lapply(.SD, function(col) c(mean(col), sd(col))),
                keyby = list(x, y)][, var := c("mean", "sd")][]

# Inspect the summary table holding mean and sd per (x, y) group.
print(head(sumalldt))
print(tail(sumalldt))

# Show the key of sumalldt (the keyby columns) and confirm that it is set.
print("some key, attributes etc for sumalldt")
print(key(sumalldt))
print(haskey(sumalldt))

# Keyed lookups on sumalldt.
#
# All rows for x = B:
#   sumalldt["B"]
#
# All rows for y = r (the author notes the first form is more efficient):
#   sumalldt[list(unique(x), 'r')]
#   sumalldt[y == "r"]

# Rows for the single group x = B, y = r.
print("values for x=B, y=r")
print(sumalldt[list("B", "r")])

# From that group keep only the row labelled "mean" and print the v and v2
# values.
print("only the mean")
print(sumalldt[list("B", "r")][var == "mean", v])
print(sumalldt[list("B", "r")][var == "mean", v2])