Question

我想获取某个 URL 中的内容，其原始数据以简单的行和列排列。我尝试了 readHTMLTable，但它显然无法正常工作。使用网页抓取（web scraping）和 XPath，如何得到不含 '\n...' 的干净数据，并将其保存到 data.frame 中？是否可以不先保存为 csv 文件就做到这一点？请帮助我改进代码。谢谢。

library(rvest)
library(dplyr)
# Download and parse the sounding page.
page <- read_html("http://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR=2006&MONTH=09&FROM=0100&TO=0100&STNM=48657")

# Absolute XPath of the first <pre> element under <body>.
xpath <- "/html/body/pre[1]"

# Pull that node's text out as a single character string, then print it.
txt <- html_text(html_node(page, xpath = xpath))
txt

[1] "\n-----------------------------------------------------------------------------\n   PRES   HGHT   TEMP   DWPT   RELH   MIXR   DRCT   SKNT   THTA   THTE   THTV\n    hPa     m      C      C      %    g/kg    deg   knot     K      K      K \n-----------------------------------------------------------------------------\n 1009.0     16   23.8   22.7     94  17.56    170      2  296.2  346.9  299.3\n 1002.0     78   24.6   21.6     83  16.51    252      4  297.6  345.6  300.5\n 1000.0     96   24.4   21.3     83  16.23    275      4  297.6  344.8  300.4\n  962.0    434   22.9   20.0     84  15.56    235     10  299.4  345.0  302.1\n  925.0    777   21.4   18.7     85  14.90    245     11  301.2  345.2  303.9\n  887.0   1142   20.3   16.0     76  13.04    255     15  303.7  342.7  306.1\n  850.0   1512   19.2   13.2     68  11.34    230     17  306.2  340.6  308.3\n  839.0   1624   18.8   11.8     64  10.47    225     17  307.0  338.8  308.9\n  828.0   1735   18.0   11.4     65  10.33   ... <truncated>

Answer 1

您贴出的数据被截断了，所以我只能用现有的这部分来演示：

# Truncated sample of the text scraped from the page's first <pre> element;
# the individual lines are separated by "\n".
txt <- "\n-----------------------------------------------------------------------------\n   PRES   HGHT   TEMP   DWPT   RELH   MIXR   DRCT   SKNT   THTA   THTE   THTV\n    hPa     m      C      C      %    g/kg    deg   knot     K      K      K \n-----------------------------------------------------------------------------\n 1009.0     16   23.8   22.7     94  17.56    170      2  296.2  346.9  299.3\n 1002.0     78   24.6   21.6     83  16.51    252      4  297.6  345.6  300.5\n 1000.0     96   24.4   21.3     83  16.23    275      4  297.6  344.8  300.4\n  962.0    434   22.9   20.0     84  15.56    235     10  299.4  345.0  302.1\n  925.0    777   21.4   18.7     85  14.90    245     11  301.2  345.2  303.9\n  887.0   1142   20.3   16.0     76  13.04    255     15  303.7  342.7  306.1\n  850.0   1512   19.2   13.2     68  11.34    230     17  306.2  340.6  308.3\n"

数据看起来是固定宽度格式，各行通过 \n 分隔符连接成了一个字符串，所以我们先把它按行拆分：

# Break the blob into one string per line; strsplit() returns a list here.
strsplit(x = txt, split = "\n")
# [[1]]
#  [1] ""                                                                             
#  [2] "-----------------------------------------------------------------------------"
#  [3] "   PRES   HGHT   TEMP   DWPT   RELH   MIXR   DRCT   SKNT   THTA   THTE   THTV"
#  [4] "    hPa     m      C      C      %    g/kg    deg   knot     K      K      K "
#  [5] "-----------------------------------------------------------------------------"
#  [6] " 1009.0     16   23.8   22.7     94  17.56    170      2  296.2  346.9  299.3"
#  [7] " 1002.0     78   24.6   21.6     83  16.51    252      4  297.6  345.6  300.5"
#  [8] " 1000.0     96   24.4   21.3     83  16.23    275      4  297.6  344.8  300.4"
#  [9] "  962.0    434   22.9   20.0     84  15.56    235     10  299.4  345.0  302.1"
# [10] "  925.0    777   21.4   18.7     85  14.90    245     11  301.2  345.2  303.9"
# [11] "  887.0   1142   20.3   16.0     76  13.04    255     15  303.7  342.7  306.1"
# [12] "  850.0   1512   19.2   13.2     68  11.34    230     17  306.2  340.6  308.3"

可以看到第1行是空行，第2行和第5行是由短横线组成的分隔线，都需要删除。第3行和第4行分别是列标题和单位。由于R不支持多行表头，我会把单位那一行也删掉；如果您需要这些单位，可以自行把它们保存到别处。

下面是一个简单的调用（注意这里用了 [[1]]，因为 strsplit 返回的是一个列表）：

# Split the blob into lines, then drop the blank line (1), the two dashed
# rules (2 and 5) and the units row (4) so that line 3 becomes the header.
sounding_lines <- strsplit(txt, "\n")[[1]]
read.table(text = sounding_lines[-c(1, 2, 4, 5)], header = TRUE)
#   PRES HGHT TEMP DWPT RELH  MIXR DRCT SKNT  THTA  THTE  THTV
# 1 1009   16 23.8 22.7   94 17.56  170    2 296.2 346.9 299.3
# 2 1002   78 24.6 21.6   83 16.51  252    4 297.6 345.6 300.5
# 3 1000   96 24.4 21.3   83 16.23  275    4 297.6 344.8 300.4
# 4  962  434 22.9 20.0   84 15.56  235   10 299.4 345.0 302.1
# 5  925  777 21.4 18.7   85 14.90  245   11 301.2 345.2 303.9
# 6  887 1142 20.3 16.0   76 13.04  255   15 303.7 342.7 306.1
# 7  850 1512 19.2 13.2   68 11.34  230   17 306.2 340.6 308.3

Answer 2

我们可以扩展您的基本代码，并将网页视为API端点，因为它带有参数：

library(httr)
library(rvest)

下面我通过 :: 调用了好几个包里的函数，因为我不想污染命名空间。

我通常最终会写一个小的参数化函数，或者一个包含几个参数化函数的小型包，来封装下面的逻辑。

# Request the sounding page. The CGI parameters are passed as a named list so
# that httr assembles (and URL-encodes) the query string for us.
res <- httr::GET(
  url = "http://weather.uwyo.edu/cgi-bin/sounding",
  query = list(
    region = "seasia",
    TYPE = "TEXT:LIST",
    YEAR = "2006",
    MONTH = "09",
    FROM = "0100",
    TO = "0100",
    STNM = "48657"
  )
)

# Stop here with a clear HTTP error rather than going on to parse an error
# page as if it contained sounding data.
httr::stop_for_status(res)

^^发出网页请求并收集响应。

# Parse the response body and keep every <pre> node found in the document.
wx_dat <- html_nodes(httr::content(res, as = "parsed"), "pre")

^^ 将响应解析为 html_document，并取出其中所有的 <pre> 节点。

现在，我们提取读数：

# Turn the first <pre> node into text and split it into individual lines.
reading_lines <- strsplit(html_text(wx_dat[[1]]), "\n")[[1]]

# Line 3 is the column-name row; keep it so the parsed columns can be named
# later. Assigned directly rather than via `<<-` inside a pipe, so this also
# behaves correctly when the code is moved into a function.
col_names <- reading_lines[3]

# Strip the five header lines (blank, rule, names, units, rule) and glue the
# remaining data rows back into one newline-separated text blob.
readings <- paste0(reading_lines[-(1:5)], collapse = "\n")

^^ 这样就得到了清理后的表格文本，接下来用 readr::read_table() 对其进行解析，并把之前提取出来的那一行列名转换成实际的列名：

# Split the saved header line on runs of spaces to get one name per column,
# then parse the data rows using the lower-cased names.
header_fields <- strsplit(trimws(col_names), "\ +")[[1]]
readr::read_table(readings, col_names = tolower(header_fields))
## # A tibble: 106 x 11
##     pres  hght  temp  dwpt  relh  mixr  drct  sknt  thta  thte  thtv
##    <dbl> <int> <dbl> <dbl> <int> <dbl> <int> <int> <dbl> <dbl> <dbl>
##  1  1009    16  23.8  22.7    94 17.6    170     2  296.  347.  299.
##  2  1002    78  24.6  21.6    83 16.5    252     4  298.  346.  300.
##  3  1000    96  24.4  21.3    83 16.2    275     4  298.  345.  300.
##  4   962   434  22.9  20      84 15.6    235    10  299.  345   302.
##  5   925   777  21.4  18.7    85 14.9    245    11  301.  345.  304.
##  6   887  1142  20.3  16      76 13.0    255    15  304.  343.  306.
##  7   850  1512  19.2  13.2    68 11.3    230    17  306.  341.  308.
##  8   839  1624  18.8  11.8    64 10.5    225    17  307   339.  309.
##  9   828  1735  18    11.4    65 10.3    220    17  307.  339.  309.
## 10   789  2142  15.1  10      72  9.84   205    16  308.  339.  310.
## # ... with 96 more rows

您没有说是否需要测站的元数据，不过我们也可以获取它（位于第二个 <pre> 中）：

# Split the second <pre> node into lines, trim surrounding whitespace and
# drop the first (blank) line.
metadata_lines <- trimws(strsplit(html_text(wx_dat[[2]]), "\n")[[1]])[-1]

# Each line is "field: value"; split it and label the two parts.
metadata_pairs <- lapply(
  strsplit(metadata_lines, ": "),
  function(pair) setNames(as.list(pair), c("measure", "value"))
)

# Stack the named pairs into a two-column data frame.
metadata <- dplyr::bind_rows(metadata_pairs)

metadata
## # A tibble: 30 x 2
##    measure                                 value      
##    <chr>                                   <chr>      
##  1 Station identifier                      WMKD       
##  2 Station number                          48657      
##  3 Observation time                        060901/0000
##  4 Station latitude                        3.78       
##  5 Station longitude                       103.21     
##  6 Station elevation                       16.0       
##  7 Showalter index                         0.34       
##  8 Lifted index                            -1.40      
##  9 LIFT computed using virtual temperature -1.63      
## 10 SWEAT index                             195.39     
## # ... with 20 more rows

将Web中的内容另存为data.frame

2 个答案: