Question

我有一个词元（token）及其词性标签（tag）的列表，想找出所有最靠近介词的单词（即名词以及描述性词语［形容词和动词］）。例如，在下面的数据集中，如何选出 Glistening、oranges 以及 rectangular、angular 和 table（因为这些名词和描述词最靠近介词 on）？

 # Example sentence, one token per element, with its part-of-speech tags.
 token <- c(
   "Glistening", "oranges", "on", "rectangular", "angular",
   "table", "are", "delectable", "."
 )
 tag <- c(
   "VBG", "NNS", "IN", "JJ", "JJ",
   "NN", "VBP", "JJ", "SENT"
 )
 # Bind the two vectors column-wise into a character matrix (token, tag).
 data <- cbind(token = token, tag = tag)
       token         tag   
  [1,] "Glistening"  "VBG" 
  [2,] "oranges"     "NNS" 
  [3,] "on"          "IN"  
  [4,] "rectangular" "JJ"  
  [5,] "angular"     "JJ"  
  [6,] "table"       "NN"  
  [7,] "are"         "VBP" 
  [8,] "delectable"  "JJ"  
  [9,] "."           "SENT"

Answer 1

基础 R 方法，基于以下假设：

以介词为中心的窗口大小为 ±2（即介词前 2 个词和后 2 个词）
你有一个介词列表

# Select the rows lying within `window_size` positions of each preposition,
# excluding the preposition rows themselves.
# Expects `data` to be a data frame with a `token` column.

# Vectorize the seq function so that `from` and `to` accept vectors
seqvector <- Vectorize(seq.default, vectorize.args = c("from", "to"))

# Vector of prepositions
prepo <- c("on", "by")

# Number of rows to look at before and after each preposition
window_size <- 2

# Find the matches. Whole tokens are compared with %in% rather than a regex,
# so tokens that merely contain a preposition (e.g. "onion") are not matched.
matches <- which(data$token %in% prepo)

# Candidate rows: every position inside a window around a match.
# unlist() flattens both the matrix returned when there are matches and the
# empty list returned when there are none.
candidates <- unique(as.integer(unlist(
  seqvector(from = matches - window_size, to = matches + window_size, by = 1)
)))

# Final unique rows to be selected: drop the preposition rows by value
# (negative indexing would drop by position instead) and discard window
# positions that fall outside the data.
selectedrows <- setdiff(candidates, matches)
selectedrows <- selectedrows[selectedrows >= 1 & selectedrows <= nrow(data)]

# subset dataframe
data[selectedrows, ]

# Output

       token  tag
1  Glistening  VBG
2     oranges  NNS
4 rectangular   JJ
5     angular   JJ
6       table   NN
8  delectable   JJ
9       hello SENT

# Data
# Sample tokens (contains two prepositions: "on" and "by").
token <- c(
  "Glistening", "oranges", "on",
  "rectangular", "angular", "table",
  "by", "delectable", "hello"
)
# Part-of-speech tag for each token, in the same order.
tag <- c(
  "VBG", "NNS", "IN",
  "JJ", "JJ", "NN",
  "VBP", "JJ", "SENT"
)
# Two-column data frame: one row per token.
data <- data.frame(token = token, tag = tag)

选择最接近介词的单词（POS标记）

1 个答案: