重写原帖。我希望消除一个plyr依赖。
我试图将tapply拼接到我的代码和lapply中。 tapply适用于一个变量(性别)而不是2(性别,成人)。滑动响应响应不会通过分组变量返回单词列表,它只返回一个大单词列表,其中分组变量位于顶部(因此对于人来说,它返回一个单词列表而不是每个人的一个单词列表)。
我为这篇文章的长度道歉但没有包括我正在研究的实际功能似乎并没有给你们提供帮助我的洞察力。
我将尝试在答案中包含我尝试使用您的建议更改功能,而不是在此处减少已经膨胀的帖子。除非对主要问题有帮助,否则请不要评论额外的用户定义的功能。它们正在进行中,只是为了向您展示问题所在。
与PLYR的正确输出:http://pastebin.com/mr9FvjpF
数据帧
DATA<-structure(list(person = structure(c(4L, 1L, 5L, 4L, 1L, 3L, 1L,
4L, 3L, 2L, 1L), .Label = c("greg", "researcher", "sally", "sam",
"teacher"), class = "factor"), sex = structure(c(2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L), .Label = c("f", "m"), class = "factor"),
adult = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L), state = structure(c(2L,
7L, 9L, 11L, 5L, 4L, 8L, 3L, 10L, 1L, 6L), .Label = c("Shall we move on? Good then.",
"Computer is fun. Not too fun.", "I distrust you.",
"How can we be certain?", "I am telling the truth!", "Im hungry. Lets eat. You already?",
"No its not, its ****.", "There is no way.", "What should we do?",
"What are you talking about?", "You liar, it stinks!"
), class = "factor"), code = structure(c(1L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 2L, 3L), .Label = c("K1", "K10", "K11",
"K2", "K3", "K4", "K5", "K6", "K7", "K8", "K9"), class = "factor")), .Names = c("person",
"sex", "adult", "state", "code"), row.names = c(NA, -11L), class = "data.frame")
#=====================
依赖于用户定义的工具
Trim<-function (x) gsub("^\\s+|\\s+$", "", x)
bracketX<-function(text, bracket='all'){
switch(bracket,
square=sapply(text, function(x)gsub("\\[.+?\\]", "", x)),
round=sapply(text, function(x)gsub("\\(.+?\\)", "", x)),
curly=sapply(text, function(x)gsub("\\{.+?\\}", "", x)),
all={P1<-sapply(text, function(x)gsub("\\[.+?\\]", "", x))
P1<-sapply(P1, function(x)gsub("\\(.+?\\)", "", x))
sapply(P1, function(x)gsub("\\{.+?\\}", "", x))})
}
words <- function(x){as.vector(unlist(strsplit(x, " ")))}
word.split <- function(x) lapply(x, words)
strip <- function(x){
sentence <- gsub('[[:punct:]]', '', as.character(x))
sentence <- gsub('[[:cntrl:]]', '', sentence)
sentence <- gsub('\\d+', '', sentence)
Trim(tolower(sentence))
}
#=====================
利益的功能
textLISTER <- function(dataframe = DFwcweb, text.var = "dialogue", group.vars = "person") {
require(plyr)
DF <- dataframe
DF$words <- Trim(as.character(bracketX(dataframe[, text.var])))
DF$words <- as.vector(word.split(strip(DF$words)))
#I'd like to get ride of the plyr dependency in the line below
dlply(DF, c(group.vars), summarise, words = as.vector(unlist(DF$words)))
}
#=====================
当前代码与一个或多个分组变量一起工作。
textLISTER(DATA, 'state', 'person')
textLISTER(DATA, 'state', c('sex','adult'))
答案 0 :(得分:3)
怎么样
d1 <- dlply(DF, .(sex, adult), summarise, words=as.vector(unlist(dia2word)))
d2 <- dlply(DF, .(person), summarise, words=as.vector(unlist(dia2word)))
ff <- function(x) {
u <- unlist(x)
data.frame(words=u,
row.names=seq(length(u)),
stringsAsFactors=FALSE)
}
d1B <- with(DF,lapply(split(dia2word,list(adult,sex)),ff))
all.equal(d1,d1B,check.attributes=FALSE) ## TRUE
d2B <- with(DF,lapply(split(dia2word,person),ff))
all.equal(d2,d2B,check.attributes=FALSE) ## TRUE
编辑:我没有密切关注您的代码,但似乎您的问题可能指定要隔离为字符串的组件。这是一个可能在代码中工作得更好的变体。
target <- "dia2word"
categ <- c("adult","sex")
d1C <- lapply(split(DF[[target]],lapply(categ,getElement,object=DF)),ff)
all.equal(d1,d1B,d1C,check.attributes=FALSE)
categ <- "person"
d2C <- lapply(split(DF[[target]],lapply(categ,getElement,object=DF)),ff)
all.equal(d2,d2B,d2C,check.attributes=FALSE)
答案 1 :(得分:2)
tapply
应该让你到那儿。
> tapply(DF$dia2word, DF[, c('sex', 'adult')], function(x) as.vector(unlist(x)))
adult
sex 0 1
f Character,10 Character,7
m Character,35 Character,4
如果你想模仿1d命名列表,那么它只需要更多的格式化......
答案 2 :(得分:0)
不是答案,而是尝试将建议纳入答案
尝试使用lapply建议
textLISTER<-function(dataframe, text.var, group.vars){
#require(plyr)
DF<-dataframe
DF$dia2word<-Trim(as.character(bracketX(dataframe[,text.var])))
DF$dia2word<-as.vector(word.split(strip(DF$dia2word)))
#dlply(DF, c(group.vars), summarise, words=as.vector(unlist(dia2word)))
ff <- function(x) {
u <- unlist(x)
data.frame(words=u,
row.names=seq(length(u)),
stringsAsFactors=FALSE)
}
with(DF,lapply(split(dia2word,list(group.vars)),ff))
}
#================================================================
#THE TEST
textLISTER(DATA, 'state', 'person')
textLISTER(DATA, 'state', c('sex','adult'))
尝试使用tapply建议
textLISTER <- function(dataframe, text.var, group.vars) {
#require(plyr)
DF <- dataframe
DF$dia2word <- Trim(as.character(bracketX(dataframe[, text.var])))
DF$dia2word <- as.vector(word.split(strip(DF$dia2word)))
#dlply(DF, c(group.vars), summarise,
# words=as.vector(unlist(dia2word)))
tapply(DF$dia2word, DF[, c(group.vars)], function(x) as.vector(unlist(x)))
}
#================================================================
#THE TEST
textLISTER(DATA, 'state', 'person')
textLISTER(DATA, 'state', c('sex','adult'))
答案 3 :(得分:0)
这是使用Ben Bolker的建议。发布此内容以完成主题。
textLISTER <- function(dataframe, text.var, group.vars) {
reducer <- function(x) gsub(" +", " ", x)
DF <- dataframe
DF$dia2word <- Trim(as.character(bracketX(dataframe[, text.var])))
DF$dia2word <- as.vector(word.split(reducer(strip(DF$dia2word))))
ff <- function(x) {
u <- unlist(x)
data.frame(words = u, row.names = seq(length(u)), stringsAsFactors = FALSE)
}
lapply(split(DF[["dia2word"]], lapply(group.vars, getElement,
object = DF)), ff)
}
谢谢大家通过一个明显臃肿的帖子来支持我。我讨厌这样做,但在我看来,它是捕捉正在发生的事情的唯一方法。