我有一个词元(token)及其对应标签(tag)的列表,想找出与介词最邻近的所有单词(即名词,以及描述性词语[形容词和动词])。例如,在下面的数据集中,如何选出 Glistening、oranges 以及 rectangular、angular 和 table(因为这些名词和描述词与介词 on 最邻近)?
# Tokens of the example sentence, in reading order
token <- c(
  "Glistening", "oranges", "on", "rectangular", "angular", "table",
  "are", "delectable", "."
)
# Tag for each token, in the same order as `token`
tag <- c("VBG", "NNS", "IN", "JJ", "JJ", "NN", "VBP", "JJ", "SENT")
# Two-column character matrix with columns "token" and "tag"
data <- cbind(token = token, tag = tag)
token tag
[1,] "Glistening" "VBG"
[2,] "oranges" "NNS"
[3,] "on" "IN"
[4,] "rectangular" "JJ"
[5,] "angular" "JJ"
[6,] "table" "NN"
[7,] "are" "VBP"
[8,] "delectable" "JJ"
[9,] "." "SENT"
答案 0 :(得分:0)
基础 R 方法,假设要选取每个介词前后各两行的单词(注意:这里使用的 data 是下文 "# Data" 部分定义的 data.frame,而不是问题中用 cbind 生成的矩阵):
# Select the rows of `data` that lie within `window` rows of a preposition,
# excluding the preposition rows themselves.
# Expects `data` to be a data.frame with a `token` column (see the "# Data"
# section of this answer).

# Vector of prepositions
prepo <- c("on", "by")
# Number of neighbouring rows to keep on each side of a preposition
window <- 2L
# Rows whose token is exactly one of the prepositions; `%in%` avoids the
# substring hits a regex such as "on|by" would produce (e.g. "iron")
matches <- which(data$token %in% prepo)
# Every row index from `match - window` to `match + window`, for all matches;
# an empty `matches` simply yields an empty vector
candidates <- as.vector(outer(matches, seq(-window, window), "+"))
# Discard indices that fall before the first or after the last row
candidates <- candidates[candidates >= 1L & candidates <= nrow(data)]
# Final unique rows to be selected: drop the preposition rows by value
# (negative positional indexing would remove the wrong elements in general)
selectedrows <- setdiff(sort(unique(candidates)), matches)
# subset dataframe
data[selectedrows, ]
# Output
token tag
1 Glistening VBG
2 oranges NNS
4 rectangular JJ
5 angular JJ
6 table NN
8 delectable JJ
9 hello SENT
# Data
# Example tokens containing two prepositions ("on" and "by")
token <- c(
  "Glistening", "oranges", "on", "rectangular", "angular", "table",
  "by", "delectable", "hello"
)
# Tag for each token, in the same order as `token`
tag <- c("VBG", "NNS", "IN", "JJ", "JJ", "NN", "VBP", "JJ", "SENT")
# Data frame with one row per token and columns `token` and `tag`
data <- data.frame(token = token, tag = tag)