我有一个由单词组成的列表。
> head(splitWords2)
[[1]]
[1] "Some" "additional" "information" "that" "we" "would" "need" "to" "replicate" "the"
[11] "experiment" "is" "how" "much" "vinegar" "should" "be" "placed" "in" "each"
[21] "identical" "container" "or" "what" "tool" "use" "measure" "mass" "of" "four"
[31] "different" "samples" "and" "distilled" "water" "rinse" "after" "taking" "them" "out"
[[2]]
[1] "After" "reading" "the" "expirement" "I" "realized" "that" "additional" "information" "you"
[11] "need" "to" "replicate" "expireiment" "is" "one" "amant" "of" "vinegar" "poured"
[21] "in" "each" "container" "two" "label" "containers" "before" "start" "yar" "and"
[31] "three" "write" "a" "conclusion" "make" "sure" "results" "are" "accurate"
我还有一个单词向量,我想统计其中每个单词在列表各个元素中分别出现的次数,而不是在整个列表中出现的总次数。
我认为可以将 stringr 包中的 str_count() 函数与某个 *ply() 函数结合起来实现,但我无法使其正常工作。
# Asker's attempt (does not run as written): sapply() is called without a FUN
# argument, and unlist() flattens the list, so the per-element grouping is lost.
numWorder1 <- sapply(ifelse(str_count(unlist(splitWords2), ignore.case("we" ) )> 0, 1, 0))
其中 "we" 只是示例,最终会被替换为待统计单词向量中的各个单词。
我的理想输出是:
lineNum count
1 0
2 1
3 1
4 0
... ...
有什么建议吗?
答案 0 :(得分:6)
对于一个特定的词:
# Toy input: a named list in which each element holds the words of one line.
words <- list(
  a = c("a", "b", "c", "a", "a", "b"),
  b = c("w", "w", "q", "a")
)
$a
[1] "a" "b" "c" "a" "a" "b"
$b
[1] "w" "w" "q" "a"
# One row per list element; seq_along() stays correct even for an empty list,
# where 1:length(words) would yield c(1, 0).
wt <- data.frame(lineNum = seq_along(words))
# Count the elements equal to the word itself. str_count(x, "a") is a regex
# substring match, so it would also count the "a" inside words such as "and".
# vapply() guarantees exactly one integer per list element.
wt$count <- vapply(
  words,
  function(x) sum(x == "a", na.rm = TRUE),
  integer(1)
)
lineNum count
1 1 3
2 2 1
如果向量 w 包含您要统计的单词:
# Words to look up in every list element.
w <- c("a", "q", "e")
# For each target word, build a lineNum/count table over the elements of
# `words`. Exact comparison (x == z) counts whole words only; a regex substring
# match via str_count(x, z) would also hit longer words that contain z.
allwords <- lapply(w, function(z) {
  data.frame(
    lineNum = seq_along(words),
    count = vapply(words, function(x) sum(x == z, na.rm = TRUE), integer(1))
  )
})
# Label each table with the word it describes.
names(allwords) <- w
$a
lineNum count
a 1 3
b 2 1
$q
lineNum count
a 1 0
b 2 1
$e
lineNum count
a 1 0
b 2 0
答案 1 :(得分:3)
这样的事情:
# Sample input: an unnamed list with one character vector of words per line.
wordlist <- list(c("the", "and", "it"), c("we", "and", "it"))
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later on the first missing call.
library(plyr)
library(stringr)
# One output row per list element and one column per word position; each cell
# is the number of regex matches of "we" within that word.
ldply(wordlist, function(x) str_count(x, "we"))
V1 V2 V3
1 0 0 0
2 1 0 0
答案 2 :(得分:2)
library(qdap)
# Create a fake data set like yours:
words <- list(
  first = c("a", "b", "c", "a", "a", "bc", "dBs"),
  second = c("w", "w", "q", "a")
)
## termco functions require a sentence-like structure in a data frame, so
## convert: paste each element's words into one string, one row per element.
words2 <- list2df(lapply(words, paste, collapse = " "), "wl", "list")[2:1]
## Trailing and leading spaces are important in match terms:
## a term padded with both a leading and a trailing space matches exactly
## that term.
termco(text.var = words2$wl, grouping.var = words2$list, match.list = c(" a "))
termco(words2$wl, words2$list, match.list = c(" b ", " a "))
## With no trailing space, " b" matches any word that starts with b
## (b + any chunk).
termco(words2$wl, words2$list, match.list = c(" b", " a "))
## No trailing/leading spaces means: find any word containing the chunk b.
termco(words2$wl, words2$list, match.list = c("b", " a "))
# Ignore case. Spell out TRUE: T is an ordinary variable that can be reassigned.
termco(words2$wl, words2$list, match.list = c("b", " a "), ignore.case = TRUE)
## Last use yields:
##
## list word.count term(b) term( a )
## 1 first 7 3(42.86) 2(28.57)
## 2 second 4 0 1(25)
## Also: termco2mat() turns the termco result into a matrix of raw counts;
## per the output shown below, match terms are the rows and groups the columns.
with(words2, termco2mat(termco(wl, list, match.list=c("b", " a "))))
## Which yields raw.score(percentage):
##
## first second
## b 2 0
## a 2 1
请注意,termco 创建的对象实际上是一个由 data.frame 组成的列表,其中包含:
raw = 原始频率计数(数值);prop = 计数比例(数值);rnp = 原始计数与比例的合并(字符)。
使用Scott的例子:
# Sample input: two named lines, each a character vector of words.
words <- list(
  first = c("the", "and", "it", "we're"),
  second = c("we", "and", "it")
)
# Collapse every line into a single space-separated string (list names are
# kept) and pair it with the name of the line it came from.
words2 <- data.frame(
  list = names(words),
  wl = vapply(words, paste, character(1), collapse = " ")
)
# Count " we " (space on both sides) and " we" (leading space only) per line.
termco(
  text.var = words2$wl,
  grouping.var = words2$list,
  match.list = c(" we ", " we")
)
# Same query, this time passing short.term = FALSE.
termco(
  text.var = words2$wl,
  grouping.var = words2$list,
  match.list = c(" we ", " we"),
  short.term = FALSE
)
答案 3 :(得分:0)
为简单起见,您可以始终坚持使用基础包中的grep ...
# Sample input: a named list with one character vector per line.
LinesList <- list("1" = letters[1:10], "2" = rep(letters[1:3], 3))
# Count exact occurrences of "a" within each element. grep() applied to the
# list itself returns the indices of the elements that match, not how often
# the word occurs, and padding those indices with `length<-` misaligns them
# with the line numbers. vapply() returns exactly one integer per element
# (0 when the word is absent), so no padding is needed.
CountsA <- vapply(
  LinesList,
  function(x) sum(x == "a", na.rm = TRUE),
  integer(1)
)
data.frame(lineNum = names(LinesList), count = CountsA)