如何在R中分别分割字符和数字

时间:2015-03-19 13:17:30

标签: regex r

我有一个如下所示的数据框:

# Example input: an id, a name and a score glued together with no delimiter.
df <- data.frame(
  name = c("1Alex100.00", "12Rina Faso92.31", "113john00.00")
)

我想将其拆分为一个包含3列的数据框,以便输出如下:

name1 name2      name3
1     Alex       100.00
12    Rina Faso  92.31
113   john       00.00

我尝试过 stringr 包和 grep(),但收效甚微。由于缺少分隔符,这个问题变得更加困难。

6 个答案:

答案 0 :(得分:6)

你可以尝试

library(tidyr)
# Split `name` into three new columns, one per regex capture group:
# leading digits, a run of non-digit characters, then trailing digits/dots.
res <- extract(df, name, into=c('name1', 'name2', 'name3'),
                  '(\\d+)([^0-9]+)([0-9.]+)', convert=TRUE)
res
# NOTE(review): the output below shows ids 1/2/3 and a last score of 50.00,
# which does not match the df defined in the question (ids 1/12/113, last
# score 00.00). It appears to come from an earlier version of the data --
# re-run to confirm the current result.
#    name1     name2  name3
#1     1      Alex 100.00
#2     2 Rina Faso  92.31
#3     3      john  50.00

# The structure listing below shows name1 as int and name3 as num
# (the call above passes convert=TRUE).
str(res)
# 'data.frame': 3 obs. of  3 variables:
# $ name1: int  1 2 3
# $ name2: Factor w/ 3 levels "Alex","john",..: 1 3 2
# $ name3: num  100 92.3 50

更新

基于 @DavidArenburg 帖子中的 'df'

 # Same extract() call and regex, this time run on the data with accented
 # letters and an apostrophe (see the names in the output below).
 res <- extract(df, name, into=c('name1', 'name2', 'name3'),
                   '(\\d+)([^0-9]+)([0-9.]+)', convert=TRUE)
 res
 #    name1         name2 name3
 #1   121       Réunion 13.76
 #2     2 Côte d'Ivoire 22.40
 #3     3          john 50.00

答案 1 :(得分:4)

尝试使用 stringr 包中的 str_match

# str_match() comes from the stringr package, which must be loaded first.
# The pattern has three capture groups: leading digits, ASCII letters/spaces,
# then trailing digits and dots. The result is a character matrix whose first
# column is the full match and whose remaining columns are the groups.
# NOTE(review): [A-Za-z ] only covers ASCII letters and spaces, so names with
# accents or apostrophes would presumably stop the match early -- verify.
str_match(df$name, "^([0-9]*)([A-Za-z ]*)([0-9\\.]*)")
# NOTE(review): the full matches below ("2Rina Faso92.31", "3john50.00") differ
# from the df defined in the question; this output appears to come from an
# earlier version of the data.
#      [,1]              [,2] [,3]        [,4]
# [1,] "1Alex100.00"     "1"  "Alex"      "100.00"
# [2,] "2Rina Faso92.31" "2"  "Rina Faso" "92.31"
# [3,] "3john50.00"      "3"  "john"      "50.00"

所以as.data.frame(str_match(df$name, "^([0-9]*)([A-Za-z ]*)([0-9\\.]*)")[,-1])应该会给你想要的结果。

答案 2 :(得分:3)

你也可以这样做。

# Split each string at every letter/digit boundary and bind the pieces
# row-wise into a three-column data frame.
df <- data.frame(name = c("1Alex100.00", "12Rina Faso92.31", "113john00.00"))

# Zero-width split points: a letter followed by a digit, or a digit followed
# by a letter. Lookarounds are only available with perl = TRUE.
boundary <- "(?<=[A-Za-z])(?=\\d)|(?<=\\d)(?=[A-Za-z])"

# as.character() keeps this working when `name` is stored as a factor.
pieces <- strsplit(as.character(df$name), boundary, perl = TRUE)

x <- do.call(rbind.data.frame, pieces)
colnames(x) <- c("name1", "name2", "name3")
print(x, row.names = FALSE)
#  name1     name2  name3
#      1      Alex 100.00
#     12 Rina Faso  92.31
#    113      john  00.00

答案 3 :(得分:2)

使用基础 R 也可以做到,只是写法稍微丑一点,不过它同样适用于特殊字符

# Base R: build each output column with its own substitution on `name`.
#   column 1: cut everything from the first non-digit onwards -> leading id
#   column 2: remove every digit and dot                      -> name
#   column 3: strip the leading id and the non-digit run      -> trailing score
# Column 3 uses "^\\d+\\D+" rather than ".*[A-Za-z]" so that names ending in a
# non-ASCII letter (e.g. "Café") do not leave residue in front of the score.
with(df, cbind(sub("\\D.*", "", name),
               gsub("[0-9.]", "", name),
               sub("^\\d+\\D+", "", name)))

# NOTE(review): the matrix below shows ids 1/2/3 and a last score of 50.00,
# which does not match the df defined in the question (ids 1/12/113, last
# score 00.00); it appears to come from an earlier version of the data.
#     [,1]  [,2]        [,3]
# [1,] "1"  "Alex"      "100.00"
# [2,] "2"  "Rina Faso" "92.31"
# [3,] "3"  "john"      "50.00"

关于特殊字符的示例

# Example data whose name part contains accented letters and an apostrophe.
df <- data.frame(
  name = c("121Réunion13.76", "2Côte d'Ivoire22.40", "3john50.00")
)

# Build each output column with its own substitution on `name`:
#   column 1: cut everything from the first non-digit onwards -> leading id
#   column 2: remove every digit and dot                      -> name
#   column 3: strip the leading id and the non-digit run      -> trailing score
# Column 3 uses "^\\d+\\D+" rather than ".*[A-Za-z]" so that names ending in a
# non-ASCII letter (e.g. "Café") do not leave residue in front of the score.
with(df, cbind(sub("\\D.*", "", name),
               gsub("[0-9.]", "", name),
               sub("^\\d+\\D+", "", name)))

#     [,1]  [,2]            [,3]
# [1,] "121" "Réunion"       "13.76"
# [2,] "2"   "Côte d'Ivoire" "22.40"
# [3,] "3"   "john"          "50.00"

答案 4 :(得分:0)

一个并不丑陋的基础 R 解决方案:

 # Zero-row prototype: its column names and types define the shape of the result.
 proto <- data.frame(
   name1 = numeric(),
   name2 = character(),
   name3 = numeric()
 )
 # One capture group per prototype column: digits, non-digits, then the rest.
 strcapture(
   pattern = "(\\d+)(\\D+)(.*)",
   x = as.character(df$name),
   proto = proto
 )
  name1     name2  name3
1     1      Alex 100.00
2    12 Rina Faso  92.31
3   113      john   0.00
 # Insert "|" between the three captured parts, then parse the text as a table.
 # quote = "" and comment.char = "" stop apostrophes (e.g. "Côte d'Ivoire") and
 # "#" inside a name from being treated as quoting or comment markers.
 read.table(text = gsub("(\\d+)(\\D+)(.*)", "\\1|\\2|\\3", df$name),
            sep = "|", quote = "", comment.char = "")
   V1        V2     V3
1   1      Alex 100.00
2  12 Rina Faso  92.31
3 113      john   0.00

答案 5 :(得分:0)

您可以使用软件包 unglue

# Example input: an id, a name and a score glued together with no delimiter.
df <- data.frame(name= c("1Alex100.00","12Rina Faso92.31","113john00.00"))
library(unglue)
# The pattern names one {placeholder} per output column; name2 carries an
# explicit regex (\\D+, i.e. non-digits), which pins down where the id ends
# and the score begins.
# NOTE(review): how the unconstrained {name1} and {name3} are matched depends
# on unglue's defaults -- confirm against the unglue documentation.
# The output below shows numeric-looking columns ("00.00" printed as 0.00),
# consistent with convert = TRUE.
unglue_unnest(df, name, "{name1}{name2=\\D+}{name3}", convert = TRUE)
#>   name1     name2  name3
#> 1     1      Alex 100.00
#> 2    12 Rina Faso  92.31
#> 3   113      john   0.00