I am trying to split a string on multiple conditions and, for each piece, store the condition it was split on.
I have been trying to use the stringr::str_split function, but I cannot pass multiple split conditions to it.
For example, if I have the following string:
data = "Julie (title) : This is the text Julie has: said. Extra sentence one. Extra sentence 2 and so on. Rt Hon Ellen: This is the text Ellen has said in response to Julie. TITLE OF SECTION Julie: More words from Julie."
and the split conditions:
names = c("Julie:", "Ellen:")
I would like output like this:
data.frame(Names = c("Julie:", "Ellen:", "Julie:"),
           text = c(" This is the text Julie has: said. Extra sentence one. Extra sentence 2 and so on. ",
                    "This is the text Ellen has said in response to Julie.",
                    "More words from Julie."))
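(For reference, str_split() treats its pattern argument as a regular expression, so several split conditions can be collapsed into a single alternation. A minimal sketch on a simplified, hypothetical input — it assumes every speaker marker appears literally as "Name:", which the real data above does not always do — might look like this:

library(stringr)

txt   <- "Julie: This is the text. Extra sentence one. Ellen: A reply to Julie. Julie: More words from Julie."
names <- c("Julie:", "Ellen:")

pattern  <- str_c(names, collapse = "|")        # "Julie:|Ellen:"
speakers <- str_extract_all(txt, pattern)[[1]]  # which marker matched, in order
pieces   <- str_split(txt, pattern)[[1]]        # text between the markers

data.frame(Names = speakers,
           text  = trimws(pieces[-1]),          # drop the empty text before the first marker
           stringsAsFactors = FALSE)

This breaks on markers such as "Julie (title) :" or "Rt Hon Ellen:", which is what the answers below deal with.)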
Answer 0 (score: 0)
I saw your comment on coatless's answer and created some sample data that should reflect what you described. One approach is as follows: I first create a data frame, split the string into sentences with unnest_tokens(), split each sentence into the speaker and the text with separate(), and finally fill the NAs with the preceding person's name. I hope this helps.
library(tidyverse)
library(tidytext)
library(zoo)

so <- tibble(text = "Ana: I went to school today. I learned text mining. Bob: That is great! Ana: I know what to do: practice.")

# One sentence per row, then split each sentence into speaker and text at the
# first ": "; sentences without a speaker get NA, filled in by na.locf() below.
unnest_tokens(so, output = sentence,
              input = text,
              token = "sentences") %>%
  separate(col = sentence, into = c("person", "sentence"), sep = ": ",
           extra = "merge", fill = "left") %>%
  mutate(person = na.locf(person))
# A tibble: 4 x 2
#   person sentence
#   <chr>  <chr>
# 1 ana    i went to school today.
# 2 ana    i learned text mining.
# 3 bob    that is great!
# 4 ana    i know what to do: practice.
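The fill = "left" argument is what handles sentences that do not start with a speaker: separate() then leaves person as NA, and na.locf() copies the previous speaker down. A small check (reusing the libraries loaded above; the input is just one sentence taken from the sample):

separate(tibble(sentence = "i learned text mining."),
         col = sentence, into = c("person", "sentence"),
         sep = ": ", extra = "merge", fill = "left")
# person is NA here; na.locf() in the pipeline above then fills it with "ana"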
Answer 1 (score: 0)
An inefficient base R solution:
# Store a vector of the names:
text_names <- c("Julie", "Ellen")
# Create a dataframe of the patterns:
pattern_search <- data.frame(name_search = c(paste0(text_names, ":"),
                                             paste0(text_names, " :"),
                                             paste0(text_names, ".* :")),
                             stringsAsFactors = F)
# Split the text into sentences:
split_text <- data.frame(sentences = trimws(unlist(strsplit(df$Text, "[.]")), "both"), stringsAsFactors = F)
# Extract the names, store them in a vector:
names_in_order <- gsub("[[:punct:]]|\\s+.*",
                       "",
                       regmatches(grep(paste0(pattern_search$name_search, collapse = "|"),
                                       split_text$sentences, value = T),
                                  regexpr(paste0(pattern_search$name_search, collapse = "|"),
                                          grep(paste0(pattern_search$name_search, collapse = "|"),
                                               split_text$sentences, value = T))))
# Store a logical vector denoting which elements the names should go in:
split_text$who_said_this <- grepl(paste0(pattern_search$name_search, collapse = "|"),
                                  split_text$sentences)
# Replace all occurrences of TRUE with the elements of the vector of names:
split_text$who_said_this[which(split_text$who_said_this == TRUE)] <- names_in_order
# Replace FALSE with NA values:
split_text$who_said_this[which(split_text$who_said_this == "FALSE")] <- NA
# Store a vector whose values denote the number of times dialogue changes between the names:
split_text$speech_group_no <- ave(split_text$who_said_this,
                                  split_text$who_said_this,
                                  FUN = seq.int)
# Apply a function to fill NA values with the non-NA value above it:
split_text <- data.frame(lapply(split_text, function(x){na.omit(x)[cumsum(!is.na(x))]}),
                         stringsAsFactors = F)
# Row-wise concatenate the dataframe by group:
split_text <- aggregate(list(sentences = c(split_text$sentences)),
                        list(speech_group_no = paste0(split_text$who_said_this, " - ", split_text$speech_group_no)),
                        paste0,
                        sep = ". ")
# Flatten list vector into a character vector and clean up punctuation:
split_text$sentences <- gsub(" [,] ", " ", sapply(split_text$sentences, toString))
# Order the dialogue:
split_text <- split_text[match(split_text$speech_group_no,
                               paste(names_in_order, ave(names_in_order, names_in_order, FUN = seq.int), sep = " - ")),]
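The lapply() step above is a base R version of last-observation-carried-forward (the same job zoo::na.locf() does in the other answer). On a small, purely illustrative vector it works like this:

x <- c("Julie", NA, NA, "Ellen", NA)
na.omit(x)[cumsum(!is.na(x))]
# [1] "Julie" "Julie" "Julie" "Ellen" "Ellen"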
Data:
df <- structure(
  list(Text = "Julie (title) : This is the text Julie has: said. Extra sentence one. Extra sentence 2 and so on. Rt Hon Ellen: This is the text Ellen has said in response to Julie. TITLE OF SECTION Julie: More words from Julie."),
  class = "data.frame",
  row.names = c(NA, -1L)
)