我正在使用RTextTools中的create_matrix()创建一个DocumentTermMatrix,并在此基础上创建container和model。这是针对非常大的数据集的。
我为每个类别(因子水平)执行此操作。因此,对于每个类别,都必须依次创建矩阵、容器和模型。当我在一台机器(例如16核/64 GB内存)上运行下面的代码时,它只在一个内核上运行,并且使用的内存不到10%。
是否可以加快此过程?也许可以使用doParallel和foreach?任何信息都会有所帮助。
#import the required libraries
library("RTextTools")
library("hash")
library(tm)
# For every output folder: build a one-vs-rest training set per category of
# `Data`, train one SVM model per category with RTextTools, and save the
# document-term matrices and the models into that folder.
# NOTE(review): `Data`, `folderaddress` and `combinedmodelsaveaddress` are not
# defined in this snippet and must exist before the loop runs. `Data` is not
# reloaded per folder here, so every folder receives models trained on the
# same data -- confirm that this is intended.
for (n in seq_along(folderaddress)) {
  # Initialize the variables
  traindata <- list()
  matrix <- list()
  container <- list()
  models <- list()
  trainingdata <- list()
  results <- list()
  classifiermodeldiv <- 0.80
  # Minimum number of records a category needs before a model is trained for it
  minrecords <- 200

  # Create the directory to place the models and the output files
  pradd <- paste0(combinedmodelsaveaddress[n], "SelftestClassifierModels")
  if (!file.exists(pradd)) {
    dir.create(file.path(pradd))
  }

  Data$CATEGORY <- as.factor(Data$CATEGORY)

  # Split the data by category and keep only column 5 of each piece
  # NOTE(review): column 5 is presumably the text column -- confirm.
  X <- split(Data, Data$CATEGORY)
  data <- lapply(seq_along(X), function(x) as.data.frame(X[[x]])[, 5])
  names(data) <- levels(Data$CATEGORY)
  # Also publishes one global variable per category (kept from the original).
  list2env(data, envir = .GlobalEnv)
  files <- as.matrix(names(data))
  fileno <- as.integer(length(files))
  print(fileno)

  # For every category: a two-column character matrix (text, category label)
  for (i in seq_len(fileno)) {
    filename <- as.character(files[i, 1])
    data1 <- as.matrix(as.data.frame(data[i]))
    # Strip any ".xxx" suffix from the category name
    filenamechanged <- gsub("\\.[[:alnum:]]+", "", filename)
    type <- matrix(
      data = as.character(filenamechanged),
      nrow = nrow(data1),
      ncol = 1
    )
    data1 <- cbind(data1, type)
    traindata[[i]] <- data1
    print(i)
  }

  for (i in seq_len(fileno)) {
    # Texts of the category that is being modelled
    trainingdata1 <- as.data.frame(traindata[[i]][, 1])
    # NOTE(review): not read anywhere in the visible code; kept as in the
    # original.
    uniquetraintweet <- hash()

    # Do not create a model for a category with fewer than `minrecords`
    # records: a model trained on so little data will not be accurate.
    # The check runs before the label lookup below so that an empty category
    # is skipped instead of failing with "subscript out of bounds".
    if (nrow(trainingdata1) < minrecords) {
      matrix[[i]] <- NULL
      next
    }
    typetrain1 <- matrix(
      data = as.character(traindata[[i]][1, 2]),
      ncol = 1,
      nrow = nrow(trainingdata1)
    )

    # Texts of all the other categories, bound together in a single call
    # instead of growing a data frame inside a loop. The leading zero-row
    # matrix keeps the result well-defined when there is only one category.
    otherframes <- lapply(setdiff(seq_len(fileno), i), function(j) {
      trainingdata2dummy <- as.data.frame(traindata[[j]][, 1])
      colnames(trainingdata2dummy) <- "feedbacks"
      trainingdata2dummy
    })
    trainingdata2 <- do.call(
      rbind,
      c(list(matrix(data = "", nrow = 0, ncol = 1)), otherframes)
    )

    # Consider one category as training set and label the rest as "ZZOther"
    typetrain2 <- matrix(data = "ZZOther", nrow = nrow(trainingdata2), ncol = 1)
    colnames(trainingdata1) <- "feedbacks"
    colnames(typetrain1) <- "type"
    colnames(typetrain2) <- "type"
    type <- rbind(typetrain1, typetrain2)
    trainingdata[[i]] <- cbind(rbind(trainingdata1, trainingdata2), type)
    # Shuffle the rows
    trainingdata[[i]] <- trainingdata[[i]][sample(nrow(trainingdata[[i]])), ]

    # Input the training set and other set to the classifier
    ndocs <- nrow(trainingdata[[i]])
    mindoc <- max(1, floor(min(0.001 * ndocs, 3)))

    # Create Matrix
    matrix[[i]] <- create_matrix(
      trainingdata[[i]][, 1],
      language = "english",
      removeNumbers = FALSE,
      stemWords = FALSE,
      weighting = weightTf,
      minWordLength = 3,
      minDocFreq = mindoc,
      maxDocFreq = floor(0.5 * ndocs)
    )
    print(i)

    # Create Container (every row is used for training)
    container[[i]] <- create_container(
      matrix[[i]],
      trainingdata[[i]][, 2],
      trainSize = seq_len(ndocs),
      virgin = FALSE
    )
    print(i)

    # Create Models
    models[[i]] <- train_models(container[[i]], algorithms = c("SVM"))
    print(i)
  }

  save(matrix, file = paste0(pradd, "/Matrix"))
  save(models, file = paste0(pradd, "/Models"))
}
答案 0 :(得分:4)
以下是并行处理RTextTools的示例。我根据此处(here)链接中的信息创建了一个虚拟函数。
函数myFun遵循上述链接中的介绍——最后,它将分析/摘要写入一个csv文件(未指定目录,因此写入当前工作目录)。然后直接使用base R自带的parallel软件包来并行运行myFun。
library(parallel)
library(RTextTools)
# I. A dummy function
# Uses RTextTools
#
# Trains a single RTextTools classifier on the USCongress data set and writes
# the per-document summary to "<trainMethod>_DocumentSummary.csv".
#
# trainMethod: name of the algorithm handed to train_model(), e.g. "SVM".
# Returns whatever write.csv() returns; the function is called for its side
# effect of writing the csv file.
myFun <- function (trainMethod) {
  # Loaded inside the function so that it is also attached on cluster workers
  library(RTextTools)
  data(USCongress)

  # Document-term matrix of the bill texts
  dtm <- create_matrix(
    USCongress$text,
    language = "english",
    removeNumbers = TRUE,
    stemWords = TRUE,
    removeSparseTerms = .998
  )

  # Rows 1-4000 are used for training, rows 4001-4449 for testing
  corpus <- create_container(
    dtm,
    USCongress$major,
    trainSize = 1:4000,
    testSize = 4001:4449,
    virgin = FALSE
  )

  # Fit the requested model and classify the test rows
  fitted <- train_model(corpus, trainMethod)
  predicted <- classify_model(corpus, fitted)

  # Analytics
  report <- create_analytics(corpus, cbind(predicted))
  summary(report)

  # Saving
  outFile <- paste(trainMethod, 'DocumentSummary.csv', sep = '_')
  write.csv(report@document_summary, outFile)
}
# II. Parallel Processing
#
# Runs myFun() once per training method on a local cluster of worker
# processes.

# 1. Vector for parallelization & number of cores available
trainMethods <- c('SVM', 'GLMNET', 'MAXENT', 'SLDA', 'BOOSTING')
# detectCores() may return NA, and "cores - 1" is 0 on a single-core machine;
# makeCluster() needs at least one worker. More workers than tasks would only
# sit idle, so the count is also capped at the number of methods.
available_cores <- detectCores()
if (is.na(available_cores)) {
  available_cores <- 1L
}
num_cores <- max(1L, min(available_cores - 1L, length(trainMethods)))

# 2. Start a cluster
cl <- makeCluster(num_cores)

tryCatch(
  {
    # 3. Export Variables needed to the cluster
    # specifying exactly which variables should be exported
    clusterExport(cl, varlist = c('myFun', 'trainMethods'))

    # 4. do in parallel
    parLapply(
      cl,
      seq_along(trainMethods),
      function (n) myFun(trainMethod = trainMethods[n])
    )
  },
  # stop the cluster, even when one of the workers signalled an error
  finally = stopCluster(cl)
)
在您的情况下,您必须将代码转换为函数myFun(n, ...),其中n是seq_along(folderaddress)的元素,并且当然要在parLapply中用seq_along(folderaddress)代替seq_along(trainMethods)。
当然有机会通过并行化来增强代码。问题在于没有样本数据,任何建议的改进只是猜测。