试图从网站上删除数据但不断出现错误

时间:2018-03-04 19:16:46

标签: r dataframe web-scraping

好吧,基本上我要做的就是遍历在过去6年中参加过NCAA男子篮球锦标赛的每支球队,并将他们的名单从this website中删除。这是我的代码:

for (i in c(1:length(team_performance$Team))){
  burner_mpg_link <- getURL(paste("https://basketball.realgm.com/ncaa/conferences/",gsub(team_performance$Conference[i]," ","-"),
  "/16/",gsub(team_performance$Team[i]," ","-"),"/",team_performance$Number[i],"/stats/",team_performance$Year[i],
  "/Averages/All/All/Season/All/minutes/desc/1/",sep = ""))

  webpage <- read_html(burner_mpg_link)

  tables <- readHTMLTable(burner_mpg_link)
  table1 <- html_table(tables[1], fill = TRUE)
  data.frame(table1)

  temp_data$NULL.Year <- NULL
  temp_data$NULL.Year <- x

  mpg_data <- rbind(mpg_data,temp_data)

  percent_complete <- (100/384)*i
  print(paste(round(percent_complete,digits = 2),"% complete",sep=""))
}

team_performance是一个数据框,在此程序中,包含团队名称,年份,锦标赛中的种子,高级回合,会议以及与该团队对应的网站的网址中的数字。我的问题是table1 <- html_table(tables[1], fill = TRUE)告诉我html_table不起作用,因为tables[1]是一个列表。好的,我明白了。然后,当我尝试使用unlist取消列出tables[1]时,它会给出Error in attributes(.Data) <- c(attributes(.Data), attrib) 'names' attribute [345] must be the same length as the vector [23]。知道我能做些什么来解决这个问题吗?

编辑:可重复的例子。

> head(team_performance)
                 Team Year Seed Rounds.Advanced              Conference Number
1               Akron 2013   12               1 Mid-American Conference    174
2             Alabama 2012    9               1 Southeastern Conference    253
3              Albany 2015   14               1 America East Conference      6
4              Albany 2014   16               1 America East Conference      6
5              Albany 2013   15               1 America East Conference      6
6 American University 2014   15               1         Patriot League     245

我希望抓取的数据是数据框中the given website上的玩家统计表。

编辑2:

> dput(head(team_performance))
structure(list(Team = structure(c(1L, 2L, 3L, 3L, 3L, 4L), .Label = c("Akron", 
"Alabama", "Albany", "American University", "Arizona", "Arizona State", 
"Arkansas", "Austin Peay", "Baylor", "Belmont", "Brigham Young", 
"Bucknell", "Buffalo", "Butler", "Cal Poly", "Cal State Bakersfield", 
"California", "Chattanooga", "Cincinnati", "Coastal Carolina", 
"Colorado", "Colorado State", "Connecticut", "Creighton", "Davidson", 
"Dayton", "Delaware", "Detroit-Mercy", "Duke", "East Tennessee State", 
"Eastern Kentucky", "Eastern Washington", "Florida", "Florida Gulf Coast", 
"Florida State", "Fresno State", "George Washington", "Georgetown", 
"Georgia", "Georgia State", "Gonzaga", "Green Bay", "Hampton", 
"Harvard", "Hawaii", "Holy Cross", "Illinois", "Indiana", "Iona", 
"Iowa", "Iowa State", "Jacksonville State", "James Madison", 
"Kansas", "Kansas State", "Kent State", "Kentucky", "La Salle", 
"Lafayette", "Lehigh", "Little Rock", "Long Beach State", "Long Island", 
"Louisville", "Loyola (MD)", "LSU", "Manhattan", "Marquette", 
"Maryland", "Massachusetts", "Memphis", "Mercer", "Miami (FL)", 
"Michigan", "Michigan State", "Middle Tennessee State", "Milwaukee", 
"Minnesota", "Missouri", "Montana", "Mount St. Mary's", "Murray State", 
"NC State", "Nebraska", "Nevada", "New Mexico", "New Mexico State", 
"Norfolk State", "North Carolina", "North Carolina A&T", "North Carolina Central", 
"North Dakota", "North Dakota State", "Northeastern", "Northern Iowa", 
"Northern Kentucky", "Northwestern", "Northwestern State", "Notre Dame", 
"Ohio", "Ohio State", "Oklahoma", "Oklahoma State", "Ole Miss", 
"Oregon", "Oregon State", "Pacific", "Pittsburgh", "Princeton", 
"Providence", "Purdue", "Rhode Island", "Robert Morris", "Saint Joseph's", 
"Saint Louis", "Saint Mary's", "San Diego State", "Seton Hall", 
"South Carolina", "South Dakota State", "South Florida", "Southern", 
"Southern Methodist", "Southern Mississippi", "St. Bonaventure", 
"St. John's", "Stanford", "Stephen F. Austin", "Stony Brook", 
"Syracuse", "Temple", "Tennessee", "Texas", "Texas A&M", "Texas Southern", 
"Texas Tech", "Troy", "Tulsa", "UAB", "UC Davis", "UC Irvine", 
"UCLA", "UNC Asheville", "UNC Wilmington", "UNLV", "USC", "Utah", 
"Valparaiso", "Vanderbilt", "VCU", "Vermont", "Villanova", "Virginia", 
"Virginia Tech", "Weber State", "West Virginia", "Western Kentucky", 
"Western Michigan", "Wichita State", "Winthrop", "Wisconsin", 
"Wofford", "Wyoming", "Xavier", "Yale"), class = "factor"), Year = c(2013L, 
2012L, 2015L, 2014L, 2013L, 2014L), Seed = c(12L, 9L, 14L, 16L, 
15L, 15L), Rounds.Advanced = c(1L, 1L, 1L, 1L, 1L, 1L), Conference = structure(c(17L, 
25L, 1L, 1L, 1L, 24L), .Label = c("America East Conference", 
"American Athletic Conference", "Atlantic 10 Conference", "Atlantic Coast Conference", 
"Atlantic Sun Conference", "Big 12 Conference", "Big East Conference", 
"Big Sky Conference", "Big South Conference", "Big Ten Conference", 
"Big West Conference", "Colonial Athletic Association ", "ConferenceUSA ", 
"Horizon League ", "Ivy League ", "Metro Atlantic Athletic Conference", 
"Mid-American Conference", "Mid-Eastern Athletic Conference", 
"Missouri Valley Conference", "Mountain West Conference", "Northeast Conference", 
"Ohio Valley Conference", "Pacific-12 Conference", "Patriot League ", 
"Southeastern Conference", "Southern Conference", "Southland Conference", 
"Southwestern Athletic Conference", "Sun Belt Conference", "The Summit League ", 
"West Coast Conference", "Western Athletic Conference"), class = "factor"), 
Number = c(174L, 253L, 6L, 6L, 6L, 245L)), .Names = c("Team", 
"Year", "Seed", "Rounds.Advanced", "Conference", "Number"), row.names = c(NA, 
6L), class = "data.frame")

1 个答案:

答案 0 :(得分:0)

鉴于dat=team_performance您可以执行以下操作:

library(rvest)
s=gsub("\\s","-",paste0("https://basketball.realgm.com/ncaa/conferences/",dat[,5],"/16/",
                      dat[,1],"/",dat[,6],"/stats/",dat[,2],
                      "/Averages/All/All/Season/All/minutes/desc/1/"))


A=lapply(s,function(x){
  read_html(x)%>%html_nodes("table")%>%html_table()
})