使用R和正则表达式从字符串中删除特殊字符

时间:2017-11-15 01:29:33

标签: r regex

我使用R,data.frame中有一个字符向量,我需要从特定列中删除一些特殊字符。该data.frame是大学橄榄球比分表。一些球队名称以'([0-9])'形式的前缀开头,表示该球队当前的排名。我想删除球队名称前面的排名,只保留球队名称。我用下面的代码已经接近目标,但仍然无法干净地去掉'()',而且球队名称前面还残留一个空格' '。有什么想法吗?

# remove the numbers before team names
# Start from the Winner column; Scores itself is never assigned to here.
rr <- Scores$Winner
# Drop a parenthesised single-digit rank such as "(2)". The pattern stops at
# the ")" and does not include the space after it, so that space stays.
rr <- gsub("\\([0-9]\\)","",rr)
# Same for a parenthesised two-digit rank such as "(19)"; the space that
# follows the ")" is again left in place.
rr <- gsub("\\([0-9][0-9]\\)","",rr)
# Print the result.
rr

以下是data.frame的一个例子

> dput(Scores[1:50,])
structure(list(Rk = c("1", "2", "3", "4", "5", "6", "7", "8", 
"9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", 
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", 
"31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", 
"42", "43", "44", "45", "46", "47", "48", "49", "50"), Wk = c("1", 
"1", "1", "1", "1", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2"), Date = c("Aug 26, 2017", 
"Aug 26, 2017", "Aug 26, 2017", "Aug 26, 2017", "Aug 26, 2017", 
"Aug 31, 2017", "Aug 31, 2017", "Aug 31, 2017", "Aug 31, 2017", 
"Aug 31, 2017", "Aug 31, 2017", "Aug 31, 2017", "Aug 31, 2017", 
"Aug 31, 2017", "Aug 31, 2017", "Aug 31, 2017", "Aug 31, 2017", 
"Aug 31, 2017", "Sep 1, 2017", "Sep 1, 2017", "Sep 1, 2017", 
"Sep 1, 2017", "Sep 1, 2017", "Sep 1, 2017", "Sep 1, 2017", "Sep 1, 2017", 
"Sep 2, 2017", "Sep 2, 2017", "Sep 2, 2017", "Sep 2, 2017", "Sep 2, 2017", 
"Sep 2, 2017", "Sep 2, 2017", "Sep 2, 2017", "Sep 2, 2017", "Sep 2, 2017", 
"Sep 2, 2017", "Sep 2, 2017", "Sep 2, 2017", "Sep 2, 2017", "Sep 2, 2017", 
"Sep 2, 2017", "Sep 2, 2017", "Sep 2, 2017", "Sep 2, 2017", "Sep 2, 2017", 
"Sep 2, 2017", "Sep 2, 2017", "Sep 2, 2017", "Sep 2, 2017"), 
    Time = c("3:00 PM", "2:30 PM", "6:00 PM", "7:30 PM", "10:00 PM", 
    "10:30 PM", "8:00 PM", "7:00 PM", "7:00 PM", "7:30 PM", "9:00 PM", 
    "9:00 PM", "7:00 PM", "8:00 PM", "7:00 PM", "7:00 PM", "7:30 PM", 
    "6:30 PM", "6:00 PM", "9:30 PM", "8:00 PM", "6:30 PM", "8:00 PM", 
    "7:00 PM", "8:00 PM", "9:00 PM", "2:00 PM", "8:00 PM", "3:30 PM", 
    "11:00 PM", "7:30 PM", "3:45 PM", "12:20 PM", "12:00 PM", 
    "12:00 PM", "7:00 PM", "6:00 PM", "10:00 PM", "6:15 PM", 
    "11:59 PM", "9:00 PM", "12:00 PM", "12:00 PM", "8:00 PM", 
    "6:00 PM", "7:00 PM", "7:10 PM", "4:00 PM", "7:00 PM", "9:30 PM"
    ), Day = c("Sat", "Sat", "Sat", "Sat", "Sat", "Thu", "Thu", 
    "Thu", "Thu", "Thu", "Thu", "Thu", "Thu", "Thu", "Thu", "Thu", 
    "Thu", "Thu", "Fri", "Fri", "Fri", "Fri", "Fri", "Fri", "Fri", 
    "Fri", "Sat", "Sat", "Sat", "Sat", "Sat", "Sat", "Sat", "Sat", 
    "Sat", "Sat", "Sat", "Sat", "Sat", "Sat", "Sat", "Sat", "Sat", 
    "Sat", "Sat", "Sat", "Sat", "Sat", "Sat", "Sat"), Winner = c("Brigham Young", 
    "Colorado State", "Hawaii", "(19) South Florida", "(14) Stanford", 
    "Arizona State", "Arkansas", "Central Michigan", "Cincinnati", 
    "Connecticut", "Idaho", "Memphis", "Minnesota", "(2) Ohio State", 
    "Tennessee State", "Toledo", "Utah", "Wake Forest", "Army", 
    "Boston College", "Colorado", "Eastern Michigan", "Navy", 
    "Syracuse", "(8) Washington", "(9) Wisconsin", "Air Force", 
    "(1) Alabama", "Alabama-Birmingham", "Arizona", "(12) Auburn", 
    "Boise State", "California", "Central Florida", "(5) Clemson", 
    "Coastal Carolina", "Duke", "Fresno State", "(15) Georgia", 
    "Hawaii", "Howard", "Illinois", "Iowa", "Iowa State", "James Madison", 
    "Kansas", "(20) Kansas State", "Kentucky", "Liberty", "(13) Louisiana State"
    ), Pts = c("20", "58", "38", "42", "62", "37", "49", "30", 
    "26", "27", "28", "37", "17", "49", "17", "47", "37", "51", 
    "64", "23", "17", "24", "42", "50", "30", "59", "62", "24", 
    "38", "62", "41", "24", "35", "61", "56", "38", "60", "66", 
    "31", "41", "43", "24", "24", "42", "34", "38", "55", "24", 
    "48", "27"), c("", "", "@", "@", "", "", "", "", "", "", 
    "", "", "", "@", "@", "", "", "", "", "@", "", "", "@", "", 
    "@", "", "", "", "", "", "", "", "@", "", "", "", "", "", 
    "", "", "@", "", "", "", "@", "", "", "@", "@", ""), Loser = c("Portland State", 
    "Oregon State", "Massachusetts", "San Jose State", "Rice", 
    "New Mexico State", "Florida A&M", "Rhode Island", "Austin Peay", 
    "Holy Cross", "Sacramento State", "Louisiana-Monroe", "Buffalo", 
    "Indiana", "Georgia State", "Elon", "North Dakota", "Presbyterian", 
    "Fordham", "Northern Illinois", "Colorado State", "Charlotte", 
    "Florida Atlantic", "Central Connecticut State", "Rutgers", 
    "Utah State", "Virginia Military Institute", "(3) Florida State", 
    "Alabama A&M", "Northern Arizona", "Georgia Southern", "Troy", 
    "North Carolina", "Florida International", "Kent State", 
    "Massachusetts", "North Carolina Central", "Incarnate Word", 
    "Appalachian State", "Western Carolina", "Nevada-Las Vegas", 
    "Ball State", "Wyoming", "Northern Iowa", "East Carolina", 
    "Southeast Missouri State", "Central Arkansas", "Southern Mississippi", 
    "Baylor", "Brigham Young"), Pts = c("6", "27", "35", "22", 
    "7", "31", "7", "27", "14", "20", "6", "29", "7", "21", "10", 
    "13", "16", "7", "6", "20", "3", "7", "19", "7", "14", "10", 
    "0", "7", "7", "24", "7", "13", "30", "17", "3", "28", "7", 
    "0", "10", "18", "40", "21", "3", "24", "14", "16", "19", 
    "17", "45", "0"), TV = c("", "", "", "", "", "", "", "", 
    "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
    "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
    "", "", "", "", "", "", "", "", "", "", "", ""), Notes = c("LaVell Edwards Stadium - Provo, Utah", 
    "Sonny Lubrick Field at Colorado State Stadium - Fort Collins, Colorado", 
    "Warren McGuirk Alumni Stadium - Amherst, Massachusetts", 
    "CEFCU Stadium - San Jose, California", "Allianz Stadium - Sydney, AUS", 
    "Sun Devil Stadium - Tempe, Arizona", "War Memorial Stadium - Little Rock, Arkansas", 
    "Kelly/Shorts Stadium - Mount Pleasant, Michigan", "Nippert Stadium - Cincinnati, Ohio", 
    "Pratt & Whitney Stadium at Rentschler Field - East Hartford, Connecticut", 
    "Kibbie-Asui Activity Center - Moscow, Idaho", "Liberty Bowl Memorial Stadium - Memphis, Tennessee", 
    "TCF Bank Stadium - Minneapolis, Minnesota", "Memorial Stadium \"The Rock\" - Bloomington, Indiana", 
    "Georgia State Stadium - Atlanta, Georgia", "Glass Bowl - Toledo, Ohio", 
    "Rice-Eccles Stadium - Salt Lake City, Utah", "BB&T Field - Winston-Salem, North Carolina", 
    "Michie Stadium - West Point, New York", "Huskie Stadium - DeKalb, Illinois", 
    "Sports Authority Field - Denver, Colorado", "Rynearson Stadium - Ypsilanti, Michigan", 
    "FAU Football Stadium - Boca Raton, Florida", "Carrier Dome - Syracuse, New York", 
    "High Point Solutions Stadium - Piscataway, New Jersey", 
    "Camp Randall Stadium - Madison, Wisconsin", "Falcon Stadium - Colorado Springs, Colorado", 
    "Mercedes-Benz Stadium - Atlanta, Georgia", "Legion Field - Birmingham, Alabama", 
    "Arizona Stadium - Tucson, Arizona", "Jordan-Hare Stadium - Auburn, Alabama", 
    "Albertsons Stadium - Boise, Idaho", "Kenan Memorial Stadium - Chapel Hill, North Carolina", 
    "", "Clemson Memorial Stadium - Clemson, South Carolina", 
    "Brooks Stadium - Conway, South Carolina", "Wallace Wade Stadium - Durham, North Carolina", 
    "Bulldog Stadium - Fresno, California", "Sanford Stadium - Athens, Georgia", 
    "Aloha Stadium - Honolulu, Hawaii", "Sam Boyd Stadium - Las Vegas, Nevada", 
    "Memorial Stadium - Champaign, Illinois", "Kinnick Stadium - Iowa City, Iowa", 
    "Jack Trice Stadium - Ames, Iowa", "Dowdy-Ficklen Stadium - Greenville, North Carolina", 
    "Memorial Stadium - Lawrence, Kansas", "Bill Snyder Family Stadium - Manhattan, Kansas", 
    "M. M. Roberts Stadium - Hattiesburg, Mississippi", "McLane Stadium - Waco, Texas", 
    "Mercedes-Benz Superdome - New Orleans, Louisiana")), .Names = c("Rk", 
"Wk", "Date", "Time", "Day", "Winner", "Pts", "", "Loser", "Pts", 
"TV", "Notes"), row.names = c(1L, 2L, 3L, 4L, 5L, 7L, 8L, 9L, 
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 
23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 
36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 
49L, 50L, 51L), class = "data.frame")

1 个答案:

答案 0 :(得分:1)

您可以使用以下正则表达式并将其替换为空字符串:

^\\([0-9]+\\)\\s+

或者使用等价的简写形式:

^\\(\\d+\\)\\s+

这将删除字符串开头由括号括起的数字,并同时去掉其后紧跟的空白字符,因此球队名称前面不会再残留空格。

示例:

(3) Florida State -> Florida State