当HTML表有两个标题时使用rvest包

时间:2017-10-07 09:18:20

标签: r dataframe rvest

我使用以下代码来抓取AFL球员数据的HTML表格:

library(rvest)

# Download and parse the page that holds the stats tables.
website <-read_html("https://afltables.com/afl/stats/teams/adelaide/2017_gbg.html")
# `.[(1)]` keeps only the first <table> node, but the selection is still a node
# set, so html_table() returns a list holding one data frame rather than the
# data frame itself. That is why nrow()/ncol() give NULL and `table[, 1]` fails.
table   <- website %>%
           html_nodes("table") %>%
           .[(1)] %>%
           html_table()

结果表有34个观测值、27个变量,但是 nrow(table) 和 ncol(table) 都返回 NULL。这是因为数据框中有两行表头吗?我希望能够对各列进行计算,但是下面的代码会报错:

table[,1]
# Error in table[, 1] : incorrect number of dimensions

它产生了这个错误,我该如何解决呢?

2 个答案:

答案 0 :(得分:0)

首先,与您的问题无关:不要使用 table 作为对象的名称,因为这个名称在 R 中已经被用于其他功能。这被认为是不好的做法,而且我听说它迟早会反过来给您带来麻烦。

继续讨论问题:您困惑的是 html_table() 返回给您的数据类型。它返回的是一个列表,其中包含一个常规的 data.frame。对这个列表求列数和行数会得到 NULL,因为该列表只有一个元素:data.frame。通过选择列表中的第一个(也是唯一的)元素,您就能得到真正感兴趣的数据框。此数据框有27列和34行:

website <-read_html("https://afltables.com/afl/stats/teams/adelaide/2017_gbg.html")
scraped <- website %>%
  html_nodes("table") %>%
  .[(1)] %>%
  html_table() %>%
  `[[`(1) # Select the first element of the list, like scraped[[1]]
ncol(scraped) # 27
nrow(scraped) # 34

// NOTE(review): this Java/Android fragment looks unrelated to the surrounding
// R answer — confirm whether it belongs in this document.
float number1 = 1;
float number2 = 2;
float input_util;

// getText() and toString() are methods, so both need call parentheses.
input_util = Float.parseFloat(yourEditText.getText().toString());

// Linear transform of the parsed input: input * number1 + number2.
float one_operate = input_util * number1 + number2;

// Difference between the raw input and its transformed value.
float calPS = input_util - one_operate;

答案 1 :(得分:0)


library(rvest)
#> Le chargement a nécessité le package : xml2

# Fetch and parse the stats page once; every later step reuses `website`.
website <- read_html(
  "https://afltables.com/afl/stats/teams/adelaide/2017_gbg.html"
)

这个网站上有多张表格:主页上显示的表格上方的每个链接各对应一张。对 html_nodes("table") 的结果使用 html_table(),可以一次性把所有表格读入一个列表。

# Select every <table> node on the page and parse them all in one call;
# the result is a list with one data frame per table.
all_tables <- html_table(html_nodes(website, "table"))

# Show only the top level of the list: one line per parsed table.
str(all_tables, max.level = 1)
#> List of 23
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:
#>  $ :'data.frame':    34 obs. of  27 variables:

然后,您可以选择所需的表,但表头仍然不正确:

# Preview the first parsed table; the real column names are still in row 1.
all_tables[[1]] %>% head()
#>          Disposals Disposals Disposals Disposals Disposals Disposals
#> 1           Player        R1        R2        R3        R4        R5
#> 2     Atkins, Rory        19        19        19        23        29
#> 3  Beech, Jonathon                                                  
#> 4     Betts, Eddie        18        13        16        22        12
#> 5      Brown, Luke        18        12        13         9        15
#> 6 Cameron, Charlie        23        17        16        16        13
#>   Disposals Disposals Disposals Disposals Disposals Disposals Disposals
#> 1        R6        R7        R8        R9       R10       R11       R12
#> 2        23        20        21        28        37        14        25
#> 3                                                                    15
#> 4        16        13         9        16        14        12        11
#> 5        17        13        20        25        16        12          
#> 6        13        14        10        18        13         8        13
#>   Disposals Disposals Disposals Disposals Disposals Disposals Disposals
#> 1       R14       R15       R16       R17       R18       R19       R20
#> 2        28        15        23        18        19        16        16
#> 3        12        11                                                  
#> 4        14        11        13        16         8                  16
#> 5        10        15        14        17        11        10        20
#> 6        15                  10        20         6         9        17
#>   Disposals Disposals Disposals Disposals Disposals Disposals Disposals
#> 1       R21       R22       R23        QF        PF        GF       Tot
#> 2        27        21        21        16        22        17       536
#> 3                                                                    38
#> 4         7        16        12        13        13         7       318
#> 5        17        17         9        20        10        13       353
#> 6        13        10        10        15        19        16       334

借助 purrr 和 dplyr 对列表和表格做一些处理,就可以整理好带有两行表头的表格:

# Re-parse every table with header = FALSE so that rvest does not turn the
# first row into column names; both header rows stay in the data for now.
all_tables <- html_table(html_nodes(website, "table"), header = FALSE)

library(purrr)
#> 
#> Attachement du package : 'purrr'
#> The following object is masked from 'package:rvest':
#> 
#>     pluck
# Store the cleaned tables under `all_tables_res`, the name the rest of the
# example inspects.
all_tables_res <- all_tables %>%
  # Name each list element after the value in its first column, first row.
  # lmap() hands the function a length-one list, so pluck(.x, 1, 1, 1) acts
  # like x[[1]][1, 1] here.
  lmap(~ set_names(.x, nm = pluck(.x, 1, 1, 1))) %>%
  # For each table, use the second row as the column names, then drop the
  # first and second rows. The row is flattened to a plain character vector
  # because set_names() expects a vector of names, not a one-row data frame.
  # slice() is namespaced because dplyr is never attached in this example.
  map(
    ~ set_names(.x, nm = as.character(unlist(.x[2, ], use.names = FALSE))) %>%
      dplyr::slice(-c(1, 2))
  )
str(all_tables_res, 1)
#> List of 23
#>  $ Disposals              :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Kicks                  :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Marks                  :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Handballs              :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Goals                  :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Behinds                :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Hit Outs               :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Tackles                :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Rebounds               :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Inside 50s             :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Clearances             :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Clangers               :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Frees                  :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Frees Against          :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Brownlow Votes         :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Contested Possessions  :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Uncontested Possessions:Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Contested Marks        :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Marks Inside 50        :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ One Percenters         :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Bounces                :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ Goal Assists           :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
#>  $ % Played               :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:

现在,您可以按名称访问网站上的任意一张表。

# Look a table up by its name and preview the first rows.
head(all_tables_res[["Goals"]])
#> # A tibble: 6 x 27
#>             Player    R1    R2    R3    R4    R5    R6    R7    R8    R9
#>              <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1     Atkins, Rory     3     1     -     2     1     -     1     1     -
#> 2  Beech, Jonathon                                                      
#> 3     Betts, Eddie     4     3     3     6     3     1     3     2     3
#> 4      Brown, Luke     -     1     -     -     1     -     -     -     -
#> 5 Cameron, Charlie     2     1     -     1     2     2     2     -     4
#> 6     Crouch, Brad                             -     -     -     -     1
#> # ... with 17 more variables: R10 <chr>, R11 <chr>, R12 <chr>, R14 <chr>,
#> #   R15 <chr>, R16 <chr>, R17 <chr>, R18 <chr>, R19 <chr>, R20 <chr>,
#> #   R21 <chr>, R22 <chr>, R23 <chr>, QF <chr>, PF <chr>, GF <chr>,
#> #   Tot <chr>