将数据框中的两个混乱向量拆分为一个公共列

时间:2018-03-01 18:33:03

标签: r split dplyr reshape data-cleaning

数据集示例:

library(dplyr)
sample <- structure(list(Rank = c(15, 17, 20, 2, 16, 8, 21, 5, 13, 31, 22, 18, 2, 19, 11, 11, 8, 7, 12, 9, 5, 23, 17, 16, 15, 14, 4, 20, 13, 2), Athlete = c("François Gourmet(BEL)", "Agustín Félix(ESP)", "Keisuke Ushiro", "Michael Schrader", "Pieter Braun", "Laurent Hernu(FRA)", "Dmitriy Karpov", "Laurent Hernu(FRA)", "Thomas van der Plaetsen", "Attila Szabó", "Nadir El Fassi", "Eduard Mikhan", "Leonel Suárez", "Janek Õiglane", "Hans van Alphen(BEL)", "Roman Šebrle", "André Niklaus(GER)", "Pascal Behrenbruch", "Pieter Braun", "Oleksandr Yurkov(UKR)", "Eelco Sintnicolaas", "Brent Newdick", "Kim Kun-woo", "Akihiko Nakamura", "Bastien Auzeil", "Frédéric Xhonneux", "Janek Õiglane", "Keisuke Ushiro", "Roman Šebrle", "Rico Freimuth"), Total = c(7974, 7749, 7498, 8670, 7890, 8280, 7550, 8218, 8069, 7610, 7922, 7968, 8640, 7581, 8034, 8266, 8020, 8211, 8114, 8264, 8298, 7915, 7860, 7745, 7922, 7616, 8371, 7532, 8069, 8564), `100m` = c(10.67, 11.17, 11.53, 10.73, 11.22, 10.97, 11.24, 11.2, 11.2, 11.15, 11.12, 10.97, 11.13, 11.51, 11.11, 11.16, 11.19, 11.08, 11.11, 10.93, 10.76, 11.11, 11.11, 10.86, 11.35, 11.28, 11.08, 11.51, 11.25, 10.53), LJ = c(7.15, 7.12, 6.64, 7.85, 7.17, 7.31, 6.86, 7.22, 7.79, 7.09, 7.26, 7.42, 7.24, 6.78, 7.35, 7.8, 7.21, 6.8, 7.29, 7.37, 7.29, 7.42, 7.24, 7.26, 6.87, 7.21, 7.33, 6.73, 7.3, 7.48), SP = c(13.74, 13.29, 13.43, 14.56, 14.48, 14.43, 15.69, 13.99, 12.76, 13.92, 13.62, 14.15, 15.2, 14.43, 14.67, 14.98, 13.87, 16.01, 13.9, 15.15, 14.13, 14.35, 12.96, 11.67, 15.23, 12.92, 15.13, 14.93, 15.2, 14.85), HJ = c(1.85, 2.03, 1.96, 1.99, 1.93, 2.03, 1.93, 2.03, 2.17, 1.84, 1.99, 1.96, 2.11, 1.92, 1.88, 2.11, 1.97, 1.93, 2.04, 1.97, 1.93, 1.99, 1.96, 1.95, 1.96, 2.03, 2.05, 1.89, 2.05, 1.99), `400m` = c(47.98, 52.08, 51.43, 47.66, 48.54, 49.31, 52.01, 48.95, 49.46, 49.79, 51.35, 48.8, 48, 50.95, 48.52, 50.42, 49.95, 49.9, 48.24, 49.45, 48.35, 50.1, 49.24, 47.81, 50.36, 49.04, 49.58, 50.85, 51.18, 48.41), `110mh` = c(15.02, 14.75, 15.35, 14.29, 
14.67, 14.01, 14.64, 14.15, 14.79, 14.65, 14.9, 14.82, 14.45, 15.33, 14.77, 14.44, 14.5, 14.33, 14.37, 14.41, 14.42, 14.82, 14.95, 14.72, 14.59, 15.75, 14.56, 15.43, 14.75, 13.68), DT = c(39.87, 43.67, 47.64, 46.44, 42.59, 43.93, 47.1, 46.13, 37.2, 43.75, 42.25, 48, 44.71, 40.94, 44.3, 46.3, 42.68, 48.56, 42.09, 48.1, 42.23, 43.6, 39.53, 33.48, 46.86, 38.62, 42.11, 46.85, 46.93, 51.17), PV = c(5, 5, 4.6, 5, 4.7, 5.1, 4.8, 4.9, 5.1, 4.4, 4.8, 4.6, 5, 4.6, 4.3, 4.6, 5.1, 4.9, 4.9, 5, 5.2, 4.8, 4.9, 4.7, 4.8, 4.7, 5.1, 4.7, 4.8, 4.8), JT = c(57.73, 56.69, 63.28, 65.67, 59.26, 59.9, 46.91, 59.63, 58.91, 59.56, 57.65, 50.74, 75.19, 68.51, 65.71, 65.61, 57.55, 66.5, 56.95, 58.63, 61.07, 51.52, 53.33, 53.57, 60.8, 50.18, 71.73, 56.52, 67.28, 62.34), `1500m` = c(265.51, 288.27, 291.9, 265.38, 278.4, 277.41, 298.41, 268.4, 285.86, 285.64, 256.51, 273.71, 267.25, 283.06, 262.5, 290.33, 268.8, 276.64, 272.46, 278.43, 265.4, 270.57, 255.63, 256.36, 279.8, 262.71, 279.24, 283.51, 296.5, 281.57), Year = structure(c(4L, 4L, 9L, 7L, 9L, 1L, 6L, 2L, 6L, 5L, 5L, 7L, 5L, 8L, 4L, 5L, 2L, 6L, 8L, 1L, 6L, 5L, 6L, 8L, 9L,     3L, 9L, 8L, 6L, 9L), .Label = c("2001", "2003", "2005", "2007",     "2009", "2011", "2013", "2015", "2017"), class = "factor"),     Nationality = c(NA, NA, "Japan(JPN)", "Germany(GER)", "Netherlands(NED)",     NA, "Kazakhstan(KAZ)", NA, "Belgium(BEL)", "Hungary", "France",     "Belarus(BLR)", "Cuba", "Estonia(EST)", NA, "Czech Republic",     NA, "Germany(GER)", "Netherlands(NED)", NA, "Netherlands(NED)",     "New Zealand", "South Korea(KOR)", "Japan(JPN)", "France(FRA)",     NA, "Estonia(EST)", "Japan(JPN)", "Czech Republic(CZE)",     "Germany(GER)"), Notes = c(NA, NA, NA, "PB", NA, NA, NA,     NA, NA, NA, "SB", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,     "PB", "NR", NA, "SB", NA, "PB", NA, NA, NA)), .Names = c("Rank", "Athlete", "Total", "100m", "LJ", "SP", "HJ", "400m", "110mh", "DT", "PV", "JT", "1500m", "Year", "Nationality", "Notes"), row.names = c(NA, -30L), 
class = c("tbl_df", "tbl", "data.frame"))

# A tibble: 30 x 16
    Rank                 Athlete Total `100m`    LJ    SP    HJ `400m` `110mh`    DT    PV    JT `1500m`   Year      Nationality Notes
   <dbl>                   <chr> <dbl>  <dbl> <dbl> <dbl> <dbl>  <dbl>   <dbl> <dbl> <dbl> <dbl>   <dbl> <fctr>            <chr> <chr>
 1    15   François Gourmet(BEL)  7974  10.67  7.15 13.74  1.85  47.98   15.02 39.87   5.0 57.73  265.51   2007             <NA>  <NA>
 2    17      Agustín Félix(ESP)  7749  11.17  7.12 13.29  2.03  52.08   14.75 43.67   5.0 56.69  288.27   2007             <NA>  <NA>
 3    20          Keisuke Ushiro  7498  11.53  6.64 13.43  1.96  51.43   15.35 47.64   4.6 63.28  291.90   2017       Japan(JPN)  <NA>
 4     2        Michael Schrader  8670  10.73  7.85 14.56  1.99  47.66   14.29 46.44   5.0 65.67  265.38   2013     Germany(GER)    PB
 5    16            Pieter Braun  7890  11.22  7.17 14.48  1.93  48.54   14.67 42.59   4.7 59.26  278.40   2017 Netherlands(NED)  <NA>
 6     8      Laurent Hernu(FRA)  8280  10.97  7.31 14.43  2.03  49.31   14.01 43.93   5.1 59.90  277.41   2001             <NA>  <NA>
 7    21          Dmitriy Karpov  7550  11.24  6.86 15.69  1.93  52.01   14.64 47.10   4.8 46.91  298.41   2011  Kazakhstan(KAZ)  <NA>
 8     5      Laurent Hernu(FRA)  8218  11.20  7.22 13.99  2.03  48.95   14.15 46.13   4.9 59.63  268.40   2003             <NA>  <NA>
 9    13 Thomas van der Plaetsen  8069  11.20  7.79 12.76  2.17  49.46   14.79 37.20   5.1 58.91  285.86   2011     Belgium(BEL)  <NA>
10    31            Attila Szabó  7610  11.15  7.09 13.92  1.84  49.79   14.65 43.75   4.4 59.56  285.64   2009          Hungary  <NA>
# ... with 20 more rows

我的数据集中有两个字符向量 Athlete 和 Nationality,其中一些条目的末尾附有括号括起来的国家/地区代码。我希望把这两个向量中的国家/地区代码拆分到一个新变量(比如 countrycode)中,同时去掉括号。我不确定最合适的拆分方法或语法是什么——也许可以用 tidyr::separate?不过我不清楚在拆分时该如何匹配括号内的字符组合,也不清楚如何处理某些条目根本不需要拆分的情况。

拆分之后,我打算用类似下面的代码去掉新变量中的括号。

# Drop every opening and closing parenthesis from the new column in one pass.
sample$countrycode <- gsub("[()]", "", sample$countrycode)

谢谢

3 个答案:

答案 0 :(得分:2)

一种丑陋的方法是使用sub

library(data.table)
DT <- data.table(sample)

# Match the whole string and capture the three characters found between
# parentheses; the replacement keeps only that captured group.
code_pattern <- "^.*\\((.{3})\\).*$"
code_group <- "\\1"

# Fill `cc` by reference, only on rows whose value contains a "(xxx)" code.
# Athlete is processed first; a code found in Nationality then overwrites it.
DT[grepl(code_pattern, Athlete), cc := sub(code_pattern, code_group, Athlete)]
DT[
  grepl(code_pattern, Nationality),
  cc := sub(code_pattern, code_group, Nationality)
]

如果您已经在使用 tidyverse 系列软件包,那么 stringr 包中的 str_extract 之类的函数可能会更简洁。此外,若想用 dplyr 实现与上面代码等价的写法,可以参考 case_when 函数。(我对这些工具不太熟悉,无法给出确切的语法。)

结果看起来像......

> DT[, .(Athlete, Nationality, cc)]
                    Athlete         Nationality  cc
 1:   François Gourmet(BEL)                  NA BEL
 2:      Agustín Félix(ESP)                  NA ESP
 3:          Keisuke Ushiro          Japan(JPN) JPN
 4:        Michael Schrader        Germany(GER) GER
 5:            Pieter Braun    Netherlands(NED) NED
 6:      Laurent Hernu(FRA)                  NA FRA
 7:          Dmitriy Karpov     Kazakhstan(KAZ) KAZ
 8:      Laurent Hernu(FRA)                  NA FRA
 9: Thomas van der Plaetsen        Belgium(BEL) BEL
10:            Attila Szabó             Hungary  NA
11:          Nadir El Fassi              France  NA
12:           Eduard Mikhan        Belarus(BLR) BLR
13:           Leonel Suárez                Cuba  NA
14:           Janek Õiglane        Estonia(EST) EST
15:    Hans van Alphen(BEL)                  NA BEL
16:            Roman Šebrle      Czech Republic  NA
17:      André Niklaus(GER)                  NA GER
18:      Pascal Behrenbruch        Germany(GER) GER
19:            Pieter Braun    Netherlands(NED) NED
20:   Oleksandr Yurkov(UKR)                  NA UKR
21:      Eelco Sintnicolaas    Netherlands(NED) NED
22:           Brent Newdick         New Zealand  NA
23:             Kim Kun-woo    South Korea(KOR) KOR
24:        Akihiko Nakamura          Japan(JPN) JPN
25:          Bastien Auzeil         France(FRA) FRA
26:       Frédéric Xhonneux                  NA  NA
27:           Janek Õiglane        Estonia(EST) EST
28:          Keisuke Ushiro          Japan(JPN) JPN
29:            Roman Šebrle Czech Republic(CZE) CZE
30:           Rico Freimuth        Germany(GER) GER
                    Athlete         Nationality  cc

答案 1 :(得分:2)

希望这适合你:

library(dplyr)

# Return the text enclosed by the parentheses of a "Name(CODE)" string.
between_parens <- function(x) {
  gsub(".*?\\((.*)\\)", "\\1", x)
}

res <- mutate(
  sample,
  countrycode = case_when(
    # Nationality is missing: use the code appended to the athlete's name.
    is.na(Nationality) & grepl("\\(", Athlete) ~ between_parens(Athlete),
    # Nationality carries its own code in parentheses.
    grepl("\\(", Nationality) ~ between_parens(Nationality),
    # No code found: keep Nationality unchanged (a plain country name or NA).
    TRUE ~ Nationality
  )
)

示例输出:

# Show only the two source columns next to the derived code.
select(res, Athlete, Nationality, countrycode)
# # A tibble: 30 x 3
#    Athlete                 Nationality      countrycode
# <chr>                   <chr>            <chr>     
# 1 François Gourmet(BEL)   NA               BEL       
# 2 Agustín Félix(ESP)      NA               ESP       
# 3 Keisuke Ushiro          Japan(JPN)       JPN       
# 4 Michael Schrader        Germany(GER)     GER       
# 5 Pieter Braun            Netherlands(NED) NED       
# 6 Laurent Hernu(FRA)      NA               FRA       
# 7 Dmitriy Karpov          Kazakhstan(KAZ)  KAZ       
# 8 Laurent Hernu(FRA)      NA               FRA       
# 9 Thomas van der Plaetsen Belgium(BEL)     BEL       
# 10 Attila Szabó            Hungary          Hungary   
# # ... with 20 more rows

按照 Frank 在评论中的建议,删除 TRUE ~ Nationality 这一分支,即可只提取国家/地区代码(没有代码的行将得到 NA):

# Same extraction as before, but rows without any parenthesised code get NA
# instead of falling back to the Nationality text.
sample %>%
  mutate(
    countrycode = if_else(
      # Nationality is missing: use the code appended to the athlete's name.
      is.na(Nationality) & grepl("\\(", Athlete),
      gsub(".*?\\((.*)\\)", "\\1", Athlete),
      if_else(
        # Nationality carries its own code in parentheses.
        grepl("\\(", Nationality),
        gsub(".*?\\((.*)\\)", "\\1", Nationality),
        NA_character_
      )
    )
  )

答案 2 :(得分:1)

这个简单的解决方案也有效。

library(stringr)
# Extract the upper-case code enclosed in parentheses, e.g. "Japan(JPN)" -> "JPN".
# `data1` is the answerer's name for the data frame; presumably it holds the
# question's data -- confirm before running.
#
# Anchoring on the parentheses (look-behind / look-ahead) instead of taking the
# second run of capital letters keeps multi-word countries correct:
# "Czech Republic(CZE)" yields "CZE" rather than "R". Values without a code,
# and NA, yield NA. str_extract() comes from the attached stringr package and is
# vectorised, so no sapply() wrapper is needed.
data1$country_code <- str_extract(data1$Nationality, "(?<=\\()[A-Z]+(?=\\))")

        Nationality country_code
1:               NA           NA
2:               NA           NA
3:       Japan(JPN)          JPN
4:     Germany(GER)          GER
5: Netherlands(NED)          NED
6:               NA           NA