我在抓取 Transfermarkt 网站时遇到了麻烦。我想收集欧洲五大联赛(英超、西甲、意甲、法甲、德甲)过去20个赛季的数据,包括一系列详细信息:球员姓名、年龄、球员位置、球员加入的俱乐部、球员离开的俱乐部以及转会费。但是,即使只用下面这段非常基础的代码,抓取一页 18/19 赛季英超转会数据中的球队和球员姓名(转入球员),我也得到了一个我看不懂的错误。我还一直在使用 SelectorGadget 工具来获取选择器。
我的代码:
# Asker's original script, kept exactly as posted: it is the code that produces
# the errors shown in the console transcript below.
# NOTE(review): require() returns FALSE instead of erroring when the package is
# missing; library(rvest) is the usual choice for loading a dependency.
require(rvest)
# `page` is only the URL, i.e. a character string.
page = "https://www.transfermarkt.com/premier-league/transfers/wettbewerb/GB1/plus/?saison_id=2012&s_w=&leihe=0&leihe=1&intern=0"
# The parsed HTML document is stored in `scraped_page`.
scraped_page <- read_html(page)
# BUG: html_nodes() is given the URL string `page` rather than the parsed
# document `scraped_page`, which triggers the "no applicable method for
# 'xml_find_all' applied to an object of class \"character\"" error.
Team_html = html_nodes(page, ".tooltipstered+ .tooltipstered")
Team = html_text(Team_html)
# BUG: same mistake as above -- `page` should be `scraped_page`.
Addition_html = html_nodes(page, ".table-header+ .responsive-table .spielprofil_tooltip")
Addition = html_text(Addition_html)
# data.frame() needs both vectors to have the same number of rows; in the
# transcript it fails with "differing number of rows: 0, 922".
df <- data.frame(Team, Addition)
# Because the assignment above failed, `df` here resolves to the built-in
# function stats::df, which is why the transcript prints a function body.
head(df)
R返回什么:
> page = "https://www.transfermarkt.com/premier-league/transfers/wettbewerb/GB1/plus/?saison_id=2012&s_w=&leihe=0&leihe=1&intern=0"
>
> scraped_page <- read_html(page)
>
> Team_html = html_nodes(page, ".tooltipstered+ .tooltipstered")
Error in UseMethod("xml_find_all") :
no applicable method for 'xml_find_all' applied to an object of class "character"
> Team = html_text(Team_html)
> Addition_html = html_nodes(page, ".table-header+ .responsive-table .spielprofil_tooltip")
Error in UseMethod("xml_find_all") :
no applicable method for 'xml_find_all' applied to an object of class "character"
> Addition = html_text(Addition_html)
>
>
> df <- data.frame(Team, Addition)
Error in data.frame(Team, Addition) :
arguments imply differing number of rows: 0, 922
>
> head(df)
1 function (x, df1, df2, ncp, log = FALSE)
2 {
3 if (missing(ncp))
4 .Call(C_df, x, df1, df2, log)
5 else .Call(C_dnf, x, df1, df2, ncp, log)
6 }
我原本打算从这里入手,然后使用 gsub 和其他一些命令,逐个联赛、逐个赛季地循环抓取……
答案 0 :(得分:0)
您遇到的主要问题是
Team_html = html_nodes(page, ".tooltipstered+ .tooltipstered")
应该是
Team_html = html_nodes(scraped_page, ".tooltipstered+ .tooltipstered")
此外,我认为您没有正确指定选择器。我想您可能想做这样的事情...
更新
潜在的解决方案1:
按球队分别抓取每张表,然后在合并数据之前手动添加球队名称。在下面的代码中,我对前5支球队做了这样的处理。
# Potential solution 1: scrape each club's "in" and "out" transfer tables
# separately, tag every row with the club name, then stack the results.
# Expects rvest to be attached and `scraped_page` to hold the parsed page.

# Club names in the order their boxes are addressed on the page. The first
# club's box is the 4th child of the content column, so club i is found at
# div:nth-child(i + 3).
teams <- c("Arsenal", "Man U", "West Brom", "Fulham", "New Castle")

# Scrape a single transfer table for one club and label its rows.
#   page        - parsed HTML document (result of read_html())
#   box_index   - nth-child position of the club's box in the content column
#   table_index - nth-child position of the table wrapper inside that box
#                 (2 for the "in" tables, 4 for the "out" tables)
#   team        - club name written to the `team` column
# Returns the table as a data frame with an extra `team` column.
# Stops with a descriptive error when the selector matches no table, instead
# of the opaque "subscript out of bounds" that `[[1]]` would raise.
scrape_team_table <- function(page, box_index, table_index, team) {
  selector <- sprintf(
    paste0(
      "#main > div:nth-child(13) > div.large-8.columns > ",
      "div:nth-child(%d) > div:nth-child(%d) > table"
    ),
    box_index, table_index
  )
  tables <- html_table(html_nodes(page, selector))
  if (length(tables) == 0) {
    stop(
      "No table found for ", team, " with selector: ", selector,
      call. = FALSE
    )
  }
  team_table <- tables[[1]]
  team_table$team <- team
  team_table
}

# Scrape the same kind of table ("in" or "out") for every club in `teams`
# and stack the per-club tables into one data frame.
scrape_all_teams <- function(page, table_index) {
  per_team <- lapply(seq_along(teams), function(i) {
    scrape_team_table(page, i + 3L, table_index, teams[[i]])
  })
  do.call(rbind, per_team)
}

ins <- scrape_all_teams(scraped_page, 2L)
outs <- scrape_all_teams(scraped_page, 4L)
潜在解决方案2:
此解决方案不保留球队名称,但抓取效率更高:先一次性取出所有单元格,再把它们拼接成表。
# Potential solution 2: grab the text of every table cell in one pass and
# reshape the flat vector into rows. Club names are not retained.
# Expects rvest to be attached and `scraped_page` to hold the parsed page.
n_cols <- 9L
tab <- html_nodes(scraped_page, ".responsive-table td") %>% html_text()

# NOTE(review): the original snippet used `index` without ever defining it.
# It is reconstructed here as the repeating column position 1..9 of each
# cell, assuming every table row contributes exactly nine <td> cells --
# confirm against the page.
if (length(tab) %% n_cols != 0L) {
  stop(
    "Expected the number of cells to be a multiple of ", n_cols,
    " but found ", length(tab), ".",
    call. = FALSE
  )
}
index <- rep_len(seq_len(n_cols), length(tab))
temp <- data.frame(value = tab, index = index)

# The cells are in row order, so filling a matrix by row places the cells with
# column position i into column i. The row count is derived from the data
# rather than hard-coded as 981.
df <- as.data.frame(
  matrix(tab, ncol = n_cols, byrow = TRUE),
  stringsAsFactors = FALSE
)
names(df) <- paste0("x", seq_len(n_cols))
head(df)