使用R拆分文本

时间:2015-03-14 13:10:42

标签: r strsplit

我有一个数据框,其中的字符型变量包含很长的段落,我需要在某些特定短语出现的位置把这些段落拆开。但问题是,在许多情况下,这些短语与前面的单词直接连在一起(中间没有空格)。

这是我正在做的事情:

# readLines() reads from stdin() by default; with n = 2 it consumes the next
# two lines entered after this call (the two sample strings) as `data`.
data  <- readLines(n=2)
= DAY 1 CHALLENGES = syndicated.= DAY 2 CHALLENGES = Red Sea.= DAY 3 CHALLENGES = framework.= DAY 4 CHALLENGES = Did ;-)= DAY 5 CHALLENGES = Paste ...= DAY 6 CHALLENGES = Name 
= DAY 1 CHALLENGES = very high.= DAY 2 CHALLENGES = Rank understand.= DAY 3 CHALLENGES = buy....= DAY 4 CHALLENGES = result.= DAY 5 CHALLENGES = coffee.= DAY 6 CHALLENGES = Bla.

# Wrap the character vector in a one-column data frame (accessed as df$data).
df  <- as.data.frame(data)

# The six section headers that are meant to act as split points.
delim  <- c("= DAY 1 CHALLENGES = ",
            "= DAY 2 CHALLENGES = ",
            "= DAY 3 CHALLENGES = ",
            "= DAY 4 CHALLENGES = ",
            "= DAY 5 CHALLENGES = ",
            "= DAY 6 CHALLENGES = ")

# NOTE: strsplit() recycles a multi-element `split` along `x`; it does not try
# every delimiter on every string. Row 1 is therefore split only on delim[1]
# and row 2 only on delim[2], which is why each row comes back in two pieces.
# do.call('rbind', ...) stacks the per-row pieces into a matrix, which is then
# converted to a data frame.
y  <- data.frame(do.call('rbind',
                         strsplit(as.character(df$data), delim, fixed = FALSE)))
y
                               X1
1                                
2 = DAY 1 CHALLENGES = very high.
                                                                                    X2
1 syndicated.= DAY 2 CHALLENGES = Red Sea.= DAY 3 CHALLENGES = framework.= DAY 4 CHALLENGES = Did ;-)= DAY 5 CHALLENGES = Paste ...= DAY 6 CHALLENGES = Name 
2                               Rank understand.= DAY 3 CHALLENGES = buy....= DAY 4 CHALLENGES = result.= DAY 5 CHALLENGES = coffee.= DAY 6 CHALLENGES = Bla.

我想把每个 = DAY x CHALLENGES = 标记连同它后面的文本(直到下一个这样的标记为止)取出来,作为一个单独的变量。

谢谢!

使用提议的方法进行更新:

> a  <- scan(file ="~/Desktop/alm/a.txt", what="")
Read 1 item
> a
[1] "= DAY 1 CHALLENGES = very high.= DAY 2 CHALLENGES = Rank understand.= DAY 3 CHALLENGES = buy....= DAY 4 CHALLENGES = result. = DAY 5 CHALLENGES = Paste the link(s) that you think is Paid Media.http://lebron11.nikeinc.com/ DAY 5 CHALLENGE: Paste the link(s) that you think is Owned Media.http://www.nike.com/ ; https://www.pinterest.com/nikewomen DAY 5 CHALLENGE: Paste the link(s) that you think is BONUS QUESTION DAY 5 = DAY 6 CHALLENGES = Bla."
> b  <- scan(file ="~/Desktop/alm/b.txt", what="")
Read 1 item
> b
[1] "= DAY 1 CHALLENGES = very high.= DAY 2 CHALLENGES = Rank understand.= DAY 3 CHALLENGES = buy....= DAY 4 CHALLENGES = result. Paste the link(s) that you think is Paid Media.http://lebron11.nikeinc.com/ DAY 5 CHALLENGE: Paste the link(s) that you think is Owned Media.http://www.nike.com/ ; https://www.pinterest.com/nikewomen DAY 5 CHALLENGE: Paste the link(s) that you think is BONUS QUESTION DAY 5 ?= DAY 6 CHALLENGES = Bla."
> c <- c(a,b)
> df  <- as.data.frame(c)
> lst <- strsplit(gsub(" (?=\\= DAY)", ".", c, perl=TRUE), 
+                 '(?<=[.)])(?=\\=)', perl=TRUE)
> out <-  do.call(cbind, lapply(lst, function(x) sub('^=.*= ', '', x)))
Warning message:
In (function (..., deparse.level = 1)  :
  number of rows of result is not a multiple of vector length (arg 2)
> out
     [,1]                                                                                                                                                                                                                                                                                
[1,] "very high."                                                                                                                                                                                                                                                                        
[2,] "Rank understand."                                                                                                                                                                                                                                                                  
[3,] "buy...."                                                                                                                                                                                                                                                                           
[4,] "result.."                                                                                                                                                                                                                                                                          
[5,] "Paste the link(s) that you think is Paid Media.http://lebron11.nikeinc.com/ DAY 5 CHALLENGE: Paste the link(s) that you think is Owned Media.http://www.nike.com/ ; https://www.pinterest.com/nikewomen DAY 5 CHALLENGE: Paste the link(s) that you think is BONUS QUESTION DAY 5."
[6,] "Bla."                                                                                                                                                                                                                                                                              
     [,2]              
[1,] "very high."      
[2,] "Rank understand."
[3,] "buy...."         
[4,] "Bla." #this is not the value from the input file           
[5,] "very high." #this is missing in the input file, yet a value is getting output      
[6,] "Rank understand." #incorrect recognition of ?= DAY 6 CHALLENGES =; the same happens with := and != or similar

问题已在上面输出的注释(# 之后的文字)中标出。对于缺失的值,最好能给出缺失标记(例如 NA),而不是填入不相关的值。

1 个答案:

答案 0 :(得分:0)

下面的方法可能有帮助:

library(stringr)
# Extract every "= WORD number WORD = text" section from each string.
# Pattern, piece by piece:
#   = [A-Za-z]+ \\d+ [A-Za-z]+ =    header such as "= DAY 1 CHALLENGES ="
#   [A-Za-z ]+                      the text itself (letters and spaces only)
#   (\\.+| ;-\\)| \\.+| +)          terminator: dots, " ;-)", space + dots,
#                                   or trailing spaces
# The result is a list holding one character vector of sections per input string.
str_extract_all(df$data, '= [A-Za-z]+ \\d+ [A-Za-z]+ = [A-Za-z ]+(\\.+| ;-\\)| \\.+| +)')
#[[1]]
#[1] "= DAY 1 CHALLENGES = syndicated." "= DAY 2 CHALLENGES = Red Sea."   
#[3] "= DAY 3 CHALLENGES = framework."  "= DAY 4 CHALLENGES = Did ;-)"    
#[5] "= DAY 5 CHALLENGES = Paste ..."   "= DAY 6 CHALLENGES = Name "      

#[[2]]
#[1] "= DAY 1 CHALLENGES = very high."      
#[2] "= DAY 2 CHALLENGES = Rank understand."
#[3] "= DAY 3 CHALLENGES = buy...."         
#[4] "= DAY 4 CHALLENGES = result."         
#[5] "= DAY 5 CHALLENGES = coffee."         
#[6] "= DAY 6 CHALLENGES = Bla."   

或使用strsplit

 # Split at every position that is preceded by "." or ")" (lookbehind) and
 # followed by "=" (lookahead). Both assertions are zero-width, so nothing is
 # removed: each piece keeps its trailing punctuation and its leading "=".
 # perl=TRUE is required because lookaround is PCRE syntax.
 lst <- strsplit(as.character(df$data), '(?<=[.)])(?=\\=)', perl=TRUE)
 lst
 #[[1]]
 #[1] "= DAY 1 CHALLENGES = syndicated." "= DAY 2 CHALLENGES = Red Sea."   
 #[3] "= DAY 3 CHALLENGES = framework."  "= DAY 4 CHALLENGES = Did ;-)"    
 #[5] "= DAY 5 CHALLENGES = Paste ..."   "= DAY 6 CHALLENGES = Name "      

 #[[2]]
 #[1] "= DAY 1 CHALLENGES = very high."      
 #[2] "= DAY 2 CHALLENGES = Rank understand."
 #[3] "= DAY 3 CHALLENGES = buy...."         
 #[4] "= DAY 4 CHALLENGES = result."         
 #[5] "= DAY 5 CHALLENGES = coffee."         
 #[6] "= DAY 6 CHALLENGES = Bla."   

如果要提取 "syndicated."、"very high." 等字符串本身(去掉前面的 = DAY x CHALLENGES = 标题),可以这样做:

  # Remove the leading "= ... = " header from every piece, then bind the
  # per-string results side by side (one column per input string).
  do.call(cbind, lapply(lst, sub, pattern = '^=.*= ', replacement = ''))
  #       [,1]          [,2]              
  #[1,] "syndicated." "very high."      
  #[2,] "Red Sea."    "Rank understand."
  #[3,] "framework."  "buy...."         
  #[4,] "Did ;-)"     "result."         
  #[5,] "Paste ..."   "coffee."         
  #[6,] "Name "       "Bla."            

更新

基于更新后的字符串“a”

  # 1. Replace the blank in front of each "= DAY" header with ".", so every
  #    header is directly preceded by "." or ")".
  # 2. Split at the zero-width boundary between that "."/")" and the "=".
  lst <- strsplit(
    x = gsub(pattern = " (?=\\= DAY)", replacement = ".", x = a, perl = TRUE),
    split = '(?<=[.)])(?=\\=)',
    perl = TRUE
  )
  # 3. Strip the "= ... = " header from each piece; one column per input string.
  out <- do.call(cbind, lapply(lst, sub, pattern = '^=.*= ', replacement = ''))
  out[, 1]
  #[1] "very high."                                                                                                                                                                                                                                                                        
  #[2] "Rank understand."                                                                                                                                                                                                                                                                  
  #[3] "buy...."                                                                                                                                                                                                                                                                           
  #[4] "result.."                                                                                                                                                                                                                                                                          
  #[5] "Paste the link(s) that you think is Paid Media.http://lebron11.nikeinc.com/ DAY 5 CHALLENGE: Paste the link(s) that you think is Owned Media.http://www.nike.com/ ; https://www.pinterest.com/nikewomen DAY 5 CHALLENGE: Paste the link(s) that you think is BONUS QUESTION DAY 5."
  #[6] "Bla."   

UPDATE2

我又针对 c 试了一次(已将对象名称改为 c1,因为 c 是 R 中的函数):

  # Combine both input strings; named c1 so the base function c() is not masked.
  c1 <- c(a,b)
  # Insert a " ." marker in two situations so that a split point exists:
  #   - a space or "?" that is immediately followed by "= DAY"
  #   - ".com", any one character and a space, when followed by "DAY"
  c2 <- gsub("( |\\?)(?=\\= DAY)|\\.com. (?=DAY)", " .", c1, perl=TRUE)
  # Split at positions preceded by "." or ")" and followed by "=" or "DAY".
  lst <- strsplit(c2, '(?<=[.)])(?=(\\=|DAY))', perl=TRUE)
  # Per input string: key every piece by the number after its first "DAY "
  # (a piece without such a match is keyed by its own full text), paste the
  # pieces that share a key together with a space, and drop the group names.
  lst2 <- lapply(lst, function(x) unname(unlist(tapply(x,
      gsub('.*?DAY (\\d+).*', '\\1', x), FUN=paste, collapse= ' '))))
  # Inner sub(): prefix "= " to strings that start with "DAY" so all headers
  # share one shape. Outer sub(): strip the leading "= ... =" or "= ... :"
  # header plus the following space. cbind gives one column per input string.
  out <- do.call(cbind,lapply(lst2, function(x)
       sub('^=[^=:]+(\\=|:) ', '', sub('^(?=DAY)', '= ', x, perl=TRUE))))

  out[,1]
  #[1] "very high."                                                                                                                                                                                                                                                                      
  #[2] "Rank understand."                                                                                                                                                                                                                                                                
  #[3] "buy...."                                                                                                                                                                                                                                                                         
  #[4] "result. ."                                                                                                                                                                                                                                                                       
  #[5] "Paste the link(s) that you think is Paid Media.http://lebron11.nikeinc . DAY 5 CHALLENGE: Paste the link(s) that you think is Owned Media.http://www.nike.com/ ; https://www.pinterest.com/nikewomen DAY 5 CHALLENGE: Paste the link(s) that you think is BONUS QUESTION DAY 5 ."
  #[6] "Bla."                                                                                               

 out[,2]
 #[1] "very high."                                                                                                                                                                             
 #[2] "Rank understand."                                                                                                                                                                       
 #[3] "buy...."                                                                                                                                                                                
 #[4] "result. Paste the link(s) that you think is Paid Media.http://lebron11.nikeinc ."                                                                                                       
 #[5] "Paste the link(s) that you think is Owned Media.http://www.nike.com/ ; https://www.pinterest.com/nikewomen DAY 5 CHALLENGE: Paste the link(s) that you think is BONUS QUESTION DAY 5  ."
 #[6] "Bla."