# Example input for the question: five sentences spread over two element_id
# values, laid out one column per argument.
posts_sentences <- data.frame(
  element_id = rep(c(1, 2), times = c(2, 3)),
  sentence_id = c(1, 2, 1, 2, 3),
  sentence = c(
    "You know, when I grew up, I grew up in a very religious family, I had the same sought of troubles people have, I was excelling in alot of ways, but because there was alot of trouble at home, we were always moving around",
    "Im at breaking point.I have no one to talk to about this and if I’m honest I think I’m too scared to tell anyone because if I do then it becomes real.I dont know what to do.",
    "I feel like I’m going to explode.",
    "I have so many thoughts and feelings inside and I don't know who to tell and I was going to tell my friend about it but I'm not sure.",
    "I keep saying omg!it's too much"
  ),
  sentence_wc = c(60, 30, 7, 20, 7),
  stringsAsFactors = FALSE
)
我已经在这个问题上花了好几天,但仍然不知道该怎么做。我尝试过 unnest_tokens、str_split / extract,以及 dplyr 的 filter、mutate 等各种组合,也在 Google / SO 上搜索过。有谁知道实现这个目标的最佳方法吗?首选 dplyr,但只要有效,我对任何方案都持开放态度。如果需要任何澄清,请随时提问!
# Desired result for the question: the same text as the example input, broken
# into thirteen shorter sentences.
expected_output <- data.frame(
  element_id = rep(c(1, 2), times = c(7, 6)),
  sentence_id = as.numeric(c(seq_len(7), seq_len(6))),
  sentence = c(
    "You know, when I grew up",
    "I grew up in a very religious family",
    "I had the same sought of troubles people have",
    "I was excelling in alot of ways, but because there was alot of trouble at home, we were always moving around",
    "Im at breaking point.",
    "I have no one to talk to about this and if I’m honest I think I’m too scared to tell anyone because if I do then it becomes real.",
    "I dont know what to do.",
    "I feel like I’m going to explode.",
    "I have so many thoughts and feelings inside and",
    "I don't know who to tell and",
    "I was going to tell my friend about it but I'm not sure.",
    "I keep saying omg!",
    "it's too much"
  ),
  sentence_wc = c(6, 8, 8, 21, 4, 27, 6, 7, 9, 7, 13, 4, 3),
  stringsAsFactors = FALSE
)
答案 0 :(得分:3)
#' Cut a string once, right after its first `max.words` words.
#'
#' @param str A single character string.
#' @param max.words Maximum number of words allowed in the left-hand piece.
#' @param ... Ignored; accepted so callers can forward extra arguments.
#' @return A character vector of length two: `c(str, NA)` when `str` has at
#'   most `max.words` words, otherwise `c(left, right)` where `left` ends at
#'   the last character of word number `max.words` and `right` starts at the
#'   first character of the following word.
split_too_long <- function(str, max.words=15L, ...) {
  # One row per word; column 1 is the start index, column 2 the end index.
  cuts <- stringi::stri_locate_all_words(str)[[1L]]

  # return one of these
  if (nrow(cuts) <= max.words) {
    c(str, NA_character_)
  } else {
    left <- substr(str, 1L, cuts[max.words, 2L])
    right <- substr(str, cuts[max.words + 1L, 1L], nchar(str))
    c(left, right)
  }
}
#' Repeatedly cut a string into pieces of at most `max.words` words.
#'
#' @param not_done The part of the string that still has to be examined.
#' @param done Pieces already cut off, in order (`NULL` on the first call).
#' @param ... Forwarded to `split_too_long()` (e.g. `max.words`).
#' @return A character vector of all pieces, in their original order.
recursive_split <- function(not_done, done=NULL, ...) {
  left_right <- split_too_long(not_done, ...)

  # return one of these
  if (is.na(left_right[2L])) {
    # Nothing left over: the remaining text fits, so it is the final piece.
    c(done, left_right[1L])
  } else {
    # Keep the left piece and continue with whatever is left over.
    recursive_split(left_right[2L], done=c(done, left_right[1L]), ...)
  }
}
#' Join sentences, split them on punctuation, then cap each piece's length.
#'
#' @param sentences Character vector of sentences.
#' @param regex Regular expression used by `strsplit()` to find split points.
#' @param ... Forwarded to `recursive_split()` (e.g. `max.words`).
#' @return A character vector of pieces (may contain empty strings where two
#'   separators were adjacent).
collapse_split <- function(sentences, regex="[.;:] ?", ...) {
  # Glue everything into one string so all split points are handled uniformly.
  sentences <- paste(sentences, collapse=". ")
  sentences <- unlist(strsplit(sentences, split=regex))

  # return
  unlist(lapply(sentences, recursive_split, done=NULL, ...))
}
#' Re-split the sentences of one group into shorter sentences.
#'
#' @param grouped_df Data frame holding the rows of a single `element_id`
#'   group; its `sentence` and `element_id` columns are read.
#' @param ... Forwarded to `collapse_split()` (e.g. `max.words`, `regex`).
#' @return A data frame with columns `sentence`, `sentence_wc`, `sentence_id`
#'   and `element_id`, one row per non-empty piece.
group_fun <- function(grouped_df, ...) {
  # initialize new data frame with new number of rows
  new_df <- data.frame(sentence=collapse_split(grouped_df$sentence, ...),
                       stringsAsFactors=FALSE)

  # count words
  new_df$sentence_wc <- stringi::stri_count_words(new_df$sentence)

  # Drop empty pieces BEFORE numbering, so sentence_id has no gaps.
  new_df <- dplyr::filter(new_df, sentence_wc > 0L)

  # add sentence_id (seq_len is safe when no rows are left)
  new_df$sentence_id <- seq_len(nrow(new_df))

  # element_id must be equal because it is a grouping variable,
  # so take the first one and repeat it in the output
  new_df$element_id <- rep(grouped_df$element_id[1L], nrow(new_df))

  # return
  new_df
}
# Apply group_fun() to every element_id group of the example data.
out <- do(
  group_by(posts_sentences, element_id),
  group_fun(., max.words=5L, regex="[.;:!] ?")
)
答案 1 :(得分:3)
依次用三个正则表达式把过长的句子拆分为多行:"[\\.\\?\\!] ?" 匹配 `.`、`?`、`!` 中的任意一个及其后的可选空格;", ?(?=[:upper:])" 匹配逗号和可选空格,且后面紧跟大写字母;"and ?(?=[:upper:])" 匹配 and 和可选空格,且后面紧跟大写字母。
它正确返回与预期输出相同的分割句子。 row_number
步骤中,这可能会使它更快,尽管我没有在大型数据集上对其进行分析。programming with dplyr
# NOTE: the answer text "。我建议您查看" ("I suggest you take a look at") was
# fused onto the first line of this snippet during extraction; it is prose
# belonging to the preceding paragraph, not code.

# Example input used by this answer.
posts_sentences <- data.frame(
  "element_id" = c(1, 1, 2, 2, 2), "sentence_id" = c(1, 2, 1, 2, 3),
  "sentence" = c("You know, when I grew up, I grew up in a very religious family, I had the same sought of troubles people have, I was excelling in alot of ways, but because there was alot of trouble at home, we were always moving around", "Im at breaking point.I have no one to talk to about this and if I’m honest I think I’m too scared to tell anyone because if I do then it becomes real.I dont know what to do.", "I feel like I’m going to explode.", "I have so many thoughts and feelings inside and I don't know who to tell and I was going to tell my friend about it but I'm not sure.", "I keep saying omg!it's too much"),
  "sentence_wc" = c(60, 30, 7, 20, 7), stringsAsFactors = FALSE
)
# Split the over-long sentences of `df` on `regexp`.
#
# df:         data frame with a `sentence` column.
# regexp:     separator pattern handed to separate_rows().
# max_length: rows whose word count exceeds this are split; others are kept.
#
# Returns the rows bound back together with a refreshed `wc` column and
# zero-word rows removed.
# NOTE(review): count_words() is not defined in the visible snippet; it must
# be supplied by the surrounding script or an attached package.
split_too_long <- function(df, regexp, max_length) {
  df %>%
    mutate(wc = count_words(sentence)) %>%
    # One single-row tibble per input row, so each row can be split on its own.
    pmap(function(...) tibble(...)) %>%
    # Only rows over the limit are split; the rest pass through untouched.
    map_if(
      .p = ~ .$wc > max_length,
      .f = ~ separate_rows(., sentence, sep = regexp)
    ) %>%
    bind_rows() %>%
    mutate(wc = count_words(sentence)) %>%
    filter(wc != 0)
}
# Collapse each element's sentences into one string, split it in three passes
# (sentence punctuation, comma before a capital, "and" before a capital), then
# trim and renumber the pieces within each element.
posts_sentences %>%
  group_by(element_id) %>%
  summarise(sentence = str_c(sentence, collapse = ".")) %>%
  ungroup() %>%
  split_too_long("[\\.\\?\\!] ?", 15) %>%
  split_too_long(", ?(?=[:upper:])", 15) %>%
  split_too_long("and ?(?=[:upper:])", 15) %>%
  group_by(element_id) %>%
  mutate(
    sentence = str_trim(sentence),
    sentence_id = row_number()
  ) %>%
  select(element_id, sentence_id, sentence, wc)
#> # A tibble: 13 x 4
#> # Groups: element_id [2]
#> element_id sentence_id sentence wc
#> <dbl> <int> <chr> <int>
#> 1 1 1 You know, when I grew up 6
#> 2 1 2 I grew up in a very religious family 8
#> 3 1 3 I had the same sought of troubles people ~ 9
#> 4 1 4 I was excelling in alot of ways, but beca~ 21
#> 5 1 5 Im at breaking point 4
#> 6 1 6 I have no one to talk to about this and i~ 29
#> 7 1 7 I dont know what to do 6
#> 8 2 1 I feel like I’m going to explode 7
#> 9 2 2 I have so many thoughts and feelings insi~ 8
#> 10 2 3 I don't know who to tell 6
#> 11 2 4 I was going to tell my friend about it bu~ 13
#> 12 2 5 I keep saying omg 4
#> 13 2 6 it's too much 3
由reprex package(v0.2.0)创建于2018-05-21。
答案 2 :(得分:1)
# Example input used by this answer.
posts_sentences <- data.frame("element_id" = c(1, 1, 2, 2, 2), "sentence_id" = c(1, 2, 1, 2, 3),
  "sentence" = c("You know, when I grew up, I grew up in a very religious family, I had the same sought of troubles people have, I was excelling in alot of ways, but because there was alot of trouble at home, we were always moving around", "Im at breaking point.I have no one to talk to about this and if I’m honest I think I’m too scared to tell anyone because if I do then it becomes real.I dont know what to do.", "I feel like I’m going to explode.", "I have so many thoughts and feelings inside and I don't know who to tell and I was going to tell my friend about it but I'm not sure.", "I keep saying omg!it's too much"),
  "sentence_wc" = c(60, 30, 7, 20, 7), stringsAsFactors=FALSE)

# Empty data frame that collects the split sentences.
new_posts_sentences <- data.frame(element_id = as.numeric(),
                                  sentence_id = as.numeric(),
                                  sentence = character(),
                                  sentence_wc = as.numeric(), stringsAsFactors=FALSE)

limit_words <- 15 # 15 for this data set
countSentences <- 0 # number of input sentences processed so far

for (sentence in posts_sentences[,3]) {
  vector <- character()
  Velement_id <- posts_sentences$element_id[countSentences + 1]
  vector <- c(vector, sentence) # start with the unsplit sentence
  vector <- vector[!vector %in% ''] # remove empty elements from vector

  ## Pass 1: if the sentence is over the limit, split it wherever a colon or
  ## comma is followed by whitespace and an uppercase letter.
  if (lengths(gregexpr("[A-z]\\W+", sentence)) > limit_words) {
    vector <- vector[!vector %in% sentence]
    split_points <- unlist(gregexpr("[:,:]\\s[A-Z]", sentence)) # character positions
    sentences_1 <- substring(sentence, c(1, split_points + 2), c(split_points - 1, nchar(sentence)))

    for (sentence in sentences_1) {
      vector <- c(vector, sentence)
      vector <- vector[!vector %in% '']

      ## Pass 2: a piece that is still over the limit is split at every
      ## comma, colon or period.
      if (lengths(gregexpr("[A-z]\\W+", sentence)) > limit_words) {
        vector <- vector[!vector %in% sentence]
        split_points <- unlist(gregexpr("[:,:]|[:.:]", sentence))
        sentences_2 <- substring(sentence, c(1, split_points + 1), c(split_points - 1, nchar(sentence)))

        for (sentence in sentences_2) {
          vector <- c(vector, sentence)
          vector <- vector[!vector %in% '']

          ## Pass 3: a piece that is still over the limit is split in front
          ## of every capital letter.
          if (lengths(gregexpr("[A-z]\\W+", sentence)) > limit_words) {
            vector <- vector[!vector %in% sentence]
            split_points <- unlist(gregexpr("[A-Z]", sentence))
            sentences_3 <- substring(sentence, c(1, split_points), c(split_points - 1, nchar(sentence)))
            vector <- c(vector, sentences_3)
            vector <- vector[!vector %in% '']
          }
        }
      }
    }
  }

  ## Build a data frame for the pieces of this original sentence.
  element_id <- rep(Velement_id, length(vector))
  sentence_id <- seq_along(vector) # safe even if no piece is left
  sentence_wc <- character()
  for (element in vector) {
    sentence_wc <- c(sentence_wc, (lengths(gregexpr("[A-z]\\W+", element))))
  }
  sentenceDataFrame <- data.frame(element_id, sentence_id, vector, sentence_wc)

  ## Append it to the final data frame.
  new_posts_sentences <- rbind(new_posts_sentences, sentenceDataFrame)
  countSentences <- countSentences + 1
}
element_id sentence_id vector sentence_wc
1 1 1 You know, when I grew up 5
2 1 2 I grew up in a very religious family 7
3 1 3 I had the same sought of troubles people have 8
4 1 4 I was excelling in alot of ways 6
5 1 5 but because there was alot of trouble at home 8
6 1 6 we were always moving around 4
7 1 1 Im at breaking point 3
8 1 2 I have no one to talk to about this and if 11
9 1 3 I’m honest 3
10 1 4 I think 2
11 1 5 I’m too scared to tell anyone because if 9
12 1 6 I do then it becomes real 5
13 1 7 I dont know what to do 5
14 2 1 I feel like I’m going to explode. 8
15 2 1 I have so many thoughts and feelings inside and 9
16 2 2 I don't know who to tell and 8
17 2 3 I was going to tell my friend about it but 10
18 2 4 I'm not sure 3
19 2 1 I keep saying omg!it's too much 7
答案 3 :(得分:0)
# Row-wise helper: given the fields of one row, either return the row as a
# one-row data frame or split its sentence with the next separator pattern.
# NOTE(review): this snippet is incomplete as extracted — the data.frame()
# call only shows its first and last arguments, the call that the trailing
# `attmpt = attmpt+1)` belongs to is missing, and the closing braces of the
# else branch and of the function are absent. Restore from the original
# answer before running.
check_and_split <- function(element_id, sentence_id, sentence, sentence_wc,
word_count, attmpt){
# Separator patterns; methods[attmpt+1] is the one used on this call.
methods <- c("\\.", ",\\s?(?=[I])", "and\\s?(?=[A-Z])")
df <- data.frame(element_id=element_id,
stringsAsFactors = FALSE)
# Stop when the word count is within 15 or three attempts have been made.
if(word_count<=15 | attmpt>=3){
return(df) #early return
} else{
df %>%
tidyr::separate_rows(sentence, sep=methods[attmpt+1]) %>%
attmpt = attmpt+1)
# Feeds the rows of posts_sentences through check_and_split() via pmap_dfr().
# NOTE(review): fragment as extracted — the call that `attmpt=0)` closes is
# missing, and the chain ends on a dangling pipe, so at least one further
# step is absent.
posts_sentences %>%
attmpt=0) %>%
pmap_dfr(check_and_split) %>%
pmap_dfr(check_and_split) %>%
和number of attempts
我把同一个函数应用了三次——这可以包装成一个循环(lapply / purrr::map 不适用,因为我们需要按顺序依次更新数据帧)。
答案 4 :(得分:0)
sentences_split = posts_sentences %>%
mutate(text_split=str_split(sentence, pattern = "\\.")) %>%
unnest(text_split) %>%
#Count number of words in text_split
mutate(wc_split = str_count(text_split, "\\w+")) %>%
filter(wc_split!=0) %>%
#Split again if text_split column has >15 words
mutate(text_split_again = ifelse(wc_split>15, str_split(text_split, pattern = ",\\sI"), text_split)) %>%