Question

R的新手...努力在10,000线上产生结果;数据模型实际上有大约1M行。有没有比Loop更好的选择？阅读有关矢量化的信息并试图点击，但没有成功。

数据集包含一列自由格式文本和与文本关联的类别。我需要将文本解析为不同的单词，然后对能够以一定精确度预测类别的单词的频率进行统计。我通过read.table读取数据并创建一个名为data的data.frame。

函数尝试解析Text，并计算每个单词的出现次数：

data <- data.frame(category = c("cat1","cat2","cat3", "cat4"), 
                   text = c("The quick brown fox", 
                            "Jumps over the fence", 
                            "The quick car hit a fence",
                            "Jumps brown"))

parsefunc <- function(data){
    finalframe <- data.frame()
    for (i in 1:nrow(data)){
    description <- strsplit(as.character(data[i,2]), " ")[[1]]
    category <- rep(data[i,1], length(description))
    worddataframe <- data.frame(description, category)
    finalframe <- rbind(finalframe, worddataframe)
    }
m1<- ddply(finalframe, c("description","category"), nrow)
m2<- ddply(m1, 'description', transform, totalcount = sum(nrow), percenttotal = nrow/sum(nrow))
m3 <- m2[(m2$totalcount>10) & (m2$percenttotal>0.8), ]
m3
}

Answer 1

这将得到你的最后一帧并做一些接近你的m1,2和3部分的事情。您必须对其进行编辑才能完全按照您的要求进行操作。我使用了更长的40k行数据集来确保它的表现正常：

# long data set
data <- data.frame(Category = rep(paste0('cat',1:4),10000),
                Text = rep(c('The quick brown fox','Jumps over the fence','The quick car hit a fence','Jumps brown cars'),10000),stringsAsFactors = F)

# split into words
wordbag <- strsplit(data$Text,split = ' ')

# find appropriate category for each word
categoryvar <- rep(data$Category,lapply(wordbag,length))

# stick them in a data frame and aggregate
newdf <- data.frame(category = categoryvar,word = tolower(unlist(wordbag)))
agg <- aggregate(list(wordcount = rep(1,nrow(newdf))),list(category = newdf$category,word =newdf$word),sum)

# find total count in entire data set and put in data set
wordagg <- aggregate(list(totalwordcount = rep(1,nrow(newdf))),list(word =newdf$word),sum)
agg <- merge(x = agg,y = wordagg,by = 'word')

# find percentages and do whatever else you need
agg$percentageofword <- agg$wordcount/agg$totalwordcount

表演循环功能差 - 选项？

1 个答案: