我的原始数据集约有2,000,000行,下面的代码中附带了一份可重现的小样例数据。由于数据量很大,我的for循环效率很低,运行需要很长时间。我想知道是否有一种更高效的方法来完成同样的操作。
#----Reproducible data example--------------------#
# First data set: one column of words per sentence. Shorter sentences are
# padded with NA so that every column has the same length.
words1 <- c(
  "How", "did", "Quebec", "nationalists", "see", "their", "province",
  "as", "a", "nation", "in", "the", "1960s"
)
words2 <- c("Why", "does", "volicty", "effect", "time", "?", rep(NA, 7))
words3 <- c("How", "do", "I", "wash", "a", "car", rep(NA, 7))
# NOTE: this vector is named `library`. Calls such as library(pkg) keep
# working because R skips non-function objects when it resolves a call.
library <- c(
  "The", "the", "How", "see", "as", "a", "for", "then", "than", "example"
)
embedding1 <- c(.5, .6, .7, .8, .9, .3, .46, .48, .53, .42)
embedding2 <- c(.1, .5, .4, .8, .9, .3, .98, .73, .48, .56)
# data.frame() already names the columns after the vectors passed in.
df <- data.frame(words1, words2, words3)

#--------Second data set: lookup table word -> mean embedding-------#
df2 <- data.frame(library, embedding1, embedding2)
df2$meanembedding <- rowMeans(
  df2[c("embedding1", "embedding2")],
  na.rm = TRUE
)
# Keep only the key and the averaged value; select by name, not position.
df2 <- df2[, c("library", "meanembedding")]

#-----Find columns--------#
l <- ncol(df)
names <- names(df)
head(names)
classes <- vapply(df, function(col) class(col)[1L], character(1))
head(classes)

#------Combine and match library to training data------#
require(gridExtra) # kept from the original; not used by the code shown here

# For every column of `df`, look up the mean embedding of each word.
# `List` holds one single-column data.frame per column of `df`, named after
# that column, with one row per row of `df` in the same order.
#
# match() is used instead of merge(): merge() does not guarantee that the
# rows of its input keep their order (even with sort = FALSE), so its result
# cannot be lined up with the original words. match() keeps the order, uses
# the first occurrence of a word in `df2$library`, and yields NA for words
# that are not in the lookup table.
List <- lapply(names, function(name) {
  hit <- match(df[[name]], df2$library)
  out <- data.frame(df2$meanembedding[hit])
  names(out) <- name
  out
})
答案 0 :(得分:1)
更好的方法是使用 `lapply`:
# For every column of `df`, merge its words with the lookup table `df2`
# (key column `library`) and return the looked-up values as a data.frame
# whose first column is named after the source column. The results are
# collected in a list, one element per column of `df`.
myList2 <- lapply(names(df), function(x) {
  # drop = FALSE keeps the single column as a data.frame for merge().
  left <- df[, x, drop = FALSE]
  # merge() does not guarantee that the rows of its input keep their order,
  # even with sort = FALSE, so remember each word's original position and
  # restore it after the merge.
  left$.row_id <- seq_len(nrow(left))
  y <- merge(
    x = left,
    y = df2,
    by.x = x,
    by.y = "library",
    all.x = TRUE,
    sort = FALSE
  )
  # Drop the key and the helper column; keep everything that came from df2.
  keep <- setdiff(colnames(y), c(x, ".row_id"))
  y <- y[order(y$.row_id), keep, drop = FALSE]
  rownames(y) <- NULL
  names(y) <- x
  y
})
我们遍历向量 `names(df)`,用 `[` 配合 `drop = FALSE` 取子集(以防止单列 data.frame 被简化为向量),然后进行合并并覆盖列名。输出是一个列表。
附注:@RuiBarradas 指出,如果使用 `df[x]` 而不是 `df[, x]`,则从技术上讲不需要 `drop = FALSE`。但是我认为,在需要同时对行和列取子集的情况下,了解 `drop = FALSE` 选项很有帮助。
答案 1 :(得分:0)
在对大量数据进行连接(join)时,可以试试 data.table……
library(data.table)
# Convert both tables so the lookups below run as data.table joins.
dt <- as.data.table(df)
dt2 <- as.data.table(df2)
# For every word column of `dt`, join the lookup table `dt2` onto it and keep
# only the embedding column. The result is a list of one-column data.tables.
lapply(names(dt), function(col) {
  # `on` accepts a named character vector: the name is the key column in
  # `dt2`, the value is the matching column in `dt`. Building it with
  # setNames() avoids eval(parse(text = ...)), which breaks on column names
  # that contain quotes.
  dt2[dt, on = setNames(col, "library")][, "meanembedding"]
})
# [[1]]
# meanembedding
# 1: 0.55
# 2: NA
# 3: NA
# 4: NA
# 5: 0.80
# 6: NA
# 7: NA
# 8: 0.90
# 9: 0.30
# 10: NA
# 11: NA
# 12: 0.55
# 13: NA
#
# [[2]]
# meanembedding
# 1: NA
# 2: NA
# 3: NA
# 4: NA
# 5: NA
# 6: NA
# 7: NA
# 8: NA
# 9: NA
# 10: NA
# 11: NA
# 12: NA
# 13: NA
#
# [[3]]
# meanembedding
# 1: 0.55
# 2: NA
# 3: NA
# 4: NA
# 5: 0.30
# 6: NA
# 7: NA
# 8: NA
# 9: NA
# 10: NA
# 11: NA
# 12: NA
# 13: NA