具有特定模式的拆分字符串

时间:2016-05-25 11:30:34

标签: r

我有一个包含字符字段的表,可以使用以下任一模式:

input              

97                 # a single number
210 foo            # a number and a word
87 bar 89          # a number, a word, a number
21 23              # two numbers
123 2 fizzbuzz     # two number, a word           
12 fizz 34 buzz    # a number, a word, a number, a word 

我希望将每行最多拆分为 4 个部分,分别包含:第一个数字、第一个单词(如果存在)、第二个数字(如果存在)以及第二个单词(如果存在)。因此,我的示例应得到:

input               nb_1    word_1    nb_2    word_2

97                  97
210 foo             210     foo
87 bar 89           87      bar       89
21 23               21                23
123 2 fizzbuzz      123               2       fizzbuzz
12 fizz 34 buzz     12      fizz      34      buzz

请注意 "two number, a word"(倒数第二个例子)的情况:word_1 中没有任何内容,因为这两个数字之间没有单词。

如果没有繁琐的if / if / else结构,有没有办法做到这一点?

如果可以提供帮助,则所有单词都属于10个特定单词的列表。此外,如果有两个单词,它们可以相同或不同。此外,数字可以是一位,两位或三位数。

谢谢

3 个答案:

答案 0 :(得分:1)

以下是使用 gsub 以及 splitstackshape 包中的 cSplit 的一种思路:

library(splitstackshape)

# Derive two helper columns from V1: in `num` every non-digit character is
# replaced by a space (only the numbers survive), in `wrds` every digit is
# replaced by a space (only the words survive).
df[["num"]] <- gsub("\\D", " ", df[["V1"]])
df[["wrds"]] <- gsub("\\d", " ", df[["V1"]])

# Split columns 2 and 3 (the two helper columns just added) on spaces and
# spread the pieces into wide columns.
newdf <- cSplit(df, splitCols = 2:3, sep = " ", direction = "wide")
newdf
#                                    V1 num_1 num_2   wrds_1 wrds_2
#1:                                  97    97    NA       NA     NA
#2:                             210 foo   210    NA      foo     NA
#3:                           87 bar 89    87    89      bar     NA
#4:                               21 23    21    23       NA     NA 
#5:                      123 2 fizzbuzz   123     2 fizzbuzz     NA
#6:                     12 fizz 34 buzz    12    34     fizz   buzz

唯一的问题是第5行,可以修复如下,

# Rows shaped "<number> <number> <word>" end up with their only word in
# wrds_1, although no word sits between the two numbers. Locate those rows
# once (instead of repeating the same grep() three times) and move the word
# over to wrds_2.
two_nums_then_word <- grep("[0-9]+\\s+[0-9]+\\s+[A-Za-z]", newdf$V1)

# Make sure both word columns are plain character vectors before editing
# individual elements.
newdf$wrds_1 <- as.character(newdf$wrds_1)
newdf$wrds_2 <- as.character(newdf$wrds_2)

newdf$wrds_2[two_nums_then_word] <- newdf$wrds_1[two_nums_then_word]
newdf$wrds_1[two_nums_then_word] <- NA_character_

最终给出了,

newdf
#                                    V1 num_1 num_2 wrds_1   wrds_2
#1:                                  97    97    NA     NA       NA
#2:                             210 foo   210    NA    foo       NA
#3:                           87 bar 89    87    89    bar       NA
#4:                               21 23    21    23     NA       NA
#5:                      123 2 fizzbuzz   123     2     NA fizzbuzz
#6:                     12 fizz 34 buzz    12    34   fizz     buzz

数据

# Reproducible input data: dput(df) prints the structure(...) expression
# shown below. Note that every V1 string except the first carries leading
# spaces.
dput(df)
structure(list(V1 = c("97", "                  210 foo", "                          87 bar 89", 
"                    21 23", "                    123 2 fizzbuzz", 
"                    12 fizz 34 buzz")), .Names = "V1", row.names = c(NA, 
-6L), class = "data.frame")

答案 1 :(得分:1)

Tried in a different way...
library(splitstackshape)
    # Example inputs; c() coerces the bare 97 to the string "97".
    abc <- data.frame(
      a = c(97, "210 foo", "87 bar 89", "21 23", "123 2 fizzbuzz",
            "12 fizz 34 buzz")
    )
    # Split every input on single spaces into positional token columns and
    # keep the original string next to them.
    abc1 <- data.frame(cSplit(abc, "a", " ", stripWhite = FALSE))
    abc <- cbind(abc, abc1)
    names(abc) <- c("input", "nb_1", "word_1", "nb_2", "word_2")

    # Convert column by column. apply() would first turn the data frame into
    # a matrix, which format()s non-character columns and pads numbers to a
    # common width (e.g. " 97").
    abc[] <- lapply(abc, as.character)

    # At this point the columns are purely positional (token 1..4), so the
    # 2nd slot may hold a number and the 3rd slot may hold a word.
    third_has_word <- grepl("[a-z]", abc$nb_2)
    second_has_num <- grepl("[0-9]", abc$word_1)

    # A word found in the 3rd slot belongs in word_2 when word_2 is empty.
    to_word_2 <- is.na(abc$word_2) & third_has_word
    abc$word_2[to_word_2] <- abc$nb_2[to_word_2]

    # When the 3rd slot is empty or holds a word, the second number (if any)
    # is the token sitting in the 2nd slot.
    fix_nb_2 <- is.na(abc$nb_2) | third_has_word
    abc$nb_2[fix_nb_2] <- ifelse(
      second_has_num[fix_nb_2], abc$word_1[fix_nb_2], NA_character_
    )

    # Finally clear numbers out of the first-word column and blank the NAs.
    abc$word_1[second_has_num] <- NA
    abc[is.na(abc)] <- ""
    print(abc)
            input nb_1 word_1 nb_2   word_2
1              97   97                     
2         210 foo  210    foo              
3       87 bar 89   87    bar   89         
4           21 23   21          23         
5  123 2 fizzbuzz  123           2 fizzbuzz
6 12 fizz 34 buzz   12   fizz   34     buzz

答案 2 :(得分:1)

这是一个比较取巧(hacky)的函数……不过你可能会遇到其他会让它失效的情况。

#' Split one input string into (nb_1, word_1, nb_2, word_2)
#'
#' Tokens are separated by whitespace; a token counts as a number when it
#' contains at least one digit, otherwise it counts as a word.
#'
#' @param x A single character string such as "87 bar 89".
#' @return A character vector of length 4: first number, first word, second
#'   number, second word. Missing parts are `NA`. When the second token is a
#'   number, no word sits between the two numbers, so the first word found
#'   is reported as word_2 and word_1 is `NA`.
f <- function(x) {
  # Trim and split on runs of whitespace so leading, trailing or repeated
  # spaces do not create empty tokens (which would be mistaken for words).
  tokens <- strsplit(trimws(x), "\\s+")[[1]]
  tokens <- tokens[!is.na(tokens) & nzchar(tokens)]

  is_num <- grepl("\\d", tokens)
  nums <- tokens[is_num]
  words <- tokens[!is_num]

  # Indexing past the end of a character vector yields NA_character_, so
  # empty and short inputs still produce a length-4 character result.
  second_is_num <- length(tokens) >= 2 && is_num[2]
  if (second_is_num) {
    c(nums[1], NA_character_, nums[2], words[1])
  } else {
    c(nums[1], words[1], nums[2], words[2])
  }
}

> f("97")
[1] "97" NA   NA   NA  
> f("210 foo")
[1] "210" "foo" NA    NA   
> f("87 bar 89")
[1] "87"  "bar" "89"  NA   
> f("21 23")
[1] "21" NA   "23" NA  
> f("123 2 fizzbuzz")
[1] "123"      NA         "2"        "fizzbuzz"
> f("12 fizz 34 buzz")
[1] "12"   "fizz" "34"   "buzz"