I am trying to find the sentences (the 3 sentences before and the 3 sentences after) if and only if word1 and word2 occur close to each other. If the words are not within roughly 30 words of each other, nothing should be found. If they are within 30 words of each other, the 3 sentences before and the 3 sentences after should be returned.
So far I have managed to: detect the rows where the first word occurs, detect the rows where the second word occurs, extract the 3 sentences either side of a word, and compute the distance between two words in a text (see the code below).
However, I am not sure how to combine these steps to achieve the goal above. I found the pieces below useful, but they are not quite what I am looking for.
Any help would be appreciated. Below is the combined data frame of the 2 articles used in the examples.
Thanks
#Dataframe
date <- c("2018-01-03", "2018-06-10")
text <- c("Bumblebees are found mainly in northern temperate regions, though
there are a few native South American species and New Zealand has some
naturalised species that were introduced around 100 years ago to pollinate
red clover. They range much further north than honey bees, and colonies can
be found on Ellesmere Island in northern Canada, only 880 km from the north
pole!", "With the recent popularity of using bumblebees in glasshouse
pollination they will probably be found in most parts of the world before long,
especially Bombus terrestris which seems to be the most popular species sold
for this purpose. Recently there have been proposals to introduce bumblebees
into Australia to pollinate crops in glasshouses. Now, though I dearly love
bumblebees, I do think that this might not be a very good idea. No matter
what security measures are taken, mated queens WILL escape eventually and
that will probably lead to their establishment in the wild. And yet another
non-native invasion of a country that has suffered more than most from such
things. This invasion may or may not be benign, but isn't it better to err
on the side of caution? Apparently there are already colonies of Bombus
terrestris on Tasmania, so I suppose it is now only a matter of time before
they reach the mainland.")
library(dplyr)
library(stringr)    # str_detect(), str_split(), str_replace_all()
library(purrr)      # map_chr()
library(lubridate)
tblcombo<-tibble(date,text)
View(tblcombo)
word1<-c("invasion")
#First word occurrence: keep the rows whose text contains word_1
one_word <- function(df, word_1){
  text1 <- df %>%
    mutate(comp_det = str_detect(tolower(text), tolower(word_1))) %>%
    filter(comp_det == TRUE) %>%
    mutate(first_word = word_1) %>%
    dplyr::select(date, text, first_word, comp_det)
  return(text1)
}
firstword<-one_word(tblcombo,word1)
View(firstword)
#Second word occurrence: keep the rows that also contain word2
word2<-c("colonies")
second_word <- function(one_word, word2){
  word_second <- one_word %>%
    mutate(cat_det = str_detect(tolower(text), tolower(word2))) %>%
    filter(cat_det == TRUE) %>%
    mutate(second_word = word2) %>%
    dplyr::select(date, text, first_word, comp_det, second_word)
  return(word_second)
}
secword<-second_word(firstword,word2)
View(secword)
#Extract the 3 sentences before and the 3 sentences after the word occurs
extract_sentences_3_either_side_word <- function(text, word){
  # Strip common abbreviations so they are not treated as sentence endings
  rem <- c("Mr. ", "Wm. ", "St. ", "Co. ", "Sc. ", "Dr. ", "Ms. ", "No. ", "Us. ", "By. ", "Cs. ", "a.m.")
  rem <- paste("(", paste(rem, collapse = "|"), ")", sep = "")
  text <- str_replace_all(text, rem, "")
  # Split into sentences and find which ones contain the word
  split_text <- unlist(str_split(text, "\\. "))
  inds <- which(str_detect(tolower(split_text), tolower(word)))
  # Paste together the window of 3 sentences either side of a match
  p33 <- function(ind, split_text){
    if ((ind - 3) < 1) {
      ind <- 4
    }
    paste(na.omit(split_text[(ind - 3):(ind + 3)]), collapse = ". ")
  }
  s33 <- paste(map_chr(inds, ~ p33(.x, split_text)), collapse = " - NEXT - ")
  return(s33)
}
extract_sentences_3_either_side_word(tblcombo$text,word1)
#Distance (in words) between the first occurrences of term1 and term2
distance <- function(string, term1, term2) {
  words <- tolower(string)
  words <- gsub('[[:punct:] ]+', ' ', words)  # replace punctuation with spaces
  words <- strsplit(words, "\\s")[[1]]
  indices <- 1:length(words)
  names(indices) <- words
  # indexing a named vector returns the position of the first occurrence of each term
  abs(indices[tolower(term1)] - indices[tolower(term2)])
}
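As a quick sanity check, the function can be called directly on the second article, where both of the sample words occur (output omitted here):
distance(tblcombo$text[2], "invasion", "colonies")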
#Applying to table secword: TRUE when the two words are within 50 words of each other
data <- secword %>%
  mutate(dist_words = pmap_dbl(list(text, first_word, second_word), distance) < 50)
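One idea, sketched below and not fully tested, is to gate the sentence extraction on that distance: compute the distance per row, and only call extract_sentences_3_either_side_word() when the two words are close enough. The column names dist_words and context are just illustrative, the window here is taken around first_word (taking it around second_word would work the same way), and the 50-word cut-off simply mirrors the line above.
# Sketch: keep the 3-sentences-either-side window only when the words are close enough
result <- secword %>%
  mutate(
    dist_words = pmap_dbl(list(text, first_word, second_word), distance),
    context = ifelse(dist_words < 50,
                     map2_chr(text, first_word, extract_sentences_3_either_side_word),
                     NA_character_)   # no output when the words are too far apart
  )
result$context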