从R中的原始html中提取数据

时间:2015-07-15 07:21:47

标签: xml r web-scraping rcurl

我正在尝试从此页面的所有选项卡中提取全部数值。 http://www.imd.gov.in/section/hydro/dynamic/rfmaps/weekrain.htm

我首先尝试将其下载为 Excel 文件,但没有成功,只能下载为文本文件。如果直接从网页读取,得到的只是原始的 HTML 页面。我不知道该如何从中提取这些值。以下是我尝试过的代码。

# Save the raw HTML of the weekly rainfall page to a local file.
# Packages are attached with library() so that a missing package stops the
# script immediately instead of require() merely returning FALSE.
library(RCurl)
library(XML)
url <- "http://www.imd.gov.in/section/hydro/dynamic/rfmaps/weekrain.htm"
download.file(url = url, destfile = "E:\\indiaprecip")

2 个答案:

答案 0 :(得分:1)

只需使用 XML 包中的函数“htmlTreeParse”:

library(XML)
# Parse the page straight from its URL into an internal (C-level) document so
# that XPath queries can be run against it.
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
html <- htmlTreeParse("http://www.imd.gov.in/section/hydro/dynamic/rfmaps/weekrain.htm",
                      useInternalNodes = TRUE)
# List the name attribute of every <meta> element in the parsed page.
xpathSApply(html, "//meta/@name")

但在你的情况下,还有另一个问题:你要访问的数据位于 HTML 框架(frame)中。以下代码可以帮助你读取这些数据:

library(XML)
library(RCurl)  # provides getURL()

# Parse the landing page and read the src attribute of its first <frame>.
url <- "http://www.imd.gov.in/section/hydro/dynamic/rfmaps/weekrain.htm"
html <- htmlTreeParse(url, useInternalNodes = TRUE)

# The frame's src is appended to the directory of the landing page to build
# the address of the framed document.
frameUrl <- paste0("http://www.imd.gov.in/section/hydro/dynamic/rfmaps/",
                   xpathSApply(html, "//frame[1]/@src"))

# Fetch the framed document, sending explicit User-Agent and Referer headers
# (the Referer is the landing page itself).
htmlWithData <- getURL(frameUrl,
                       httpheader = c("User-Agent" = "RCurl",
                                      "Referer" = url))

# htmlWithData holds the markup itself rather than a file name or URL, so it
# is parsed explicitly as text.
dataXml <- htmlTreeParse(htmlWithData, asText = TRUE, useInternalNodes = TRUE)

# Return every <table> node that sits directly under <body>.
xpathSApply(dataXml, "//body/table")

答案 1 :(得分:1)

唉。Excel -> HTML + iframe。以下内容可帮助您入门。无论您使用何种方法,抓取之后都必须进行大量的清理工作。这里没有演示它的用法,但 htmltab 包在这里也能提供帮助。

library(xml2)  # devtools::install_github("hadley/xml2")
library(rvest) # devtools::install_github("hadley/rvest")
library(httr)
library(magrittr)

# Directory that the sheet pages and the tab strip are fetched from.
base_url <- "http://www.imd.gov.in/section/hydro/dynamic/rfmaps/weekrain_files"

# Get all the state sheet URLs --------------------------------------------

state_sheets <- read_html(sprintf("%s/%s", base_url, "tabstrip.htm"))

# Every tab is an <a> whose target is the "frSheet" frame. Look the anchors up
# once and reuse them for both the links and the labels.
sheet_anchors <- html_nodes(state_sheets, xpath = "//a[@target='frSheet']")

# href of each anchor: the sheet file name, relative to base_url.
state_links <- html_attr(sheet_anchors, "href")

# Visible text of each anchor: the state name shown on the tab.
state_names <- html_text(sheet_anchors)

# Print the scraped sheet file names; the `##` lines that follow show the
# console output.
state_links
##  [1] "sheet001.htm" "sheet002.htm" "sheet003.htm" "sheet004.htm"
##  [5] "sheet005.htm" "sheet006.htm" "sheet007.htm" "sheet008.htm"
##  [9] "sheet009.htm" "sheet010.htm" "sheet011.htm" "sheet012.htm"
## [13] "sheet013.htm" "sheet014.htm" "sheet015.htm" "sheet016.htm"
## [17] "sheet017.htm" "sheet018.htm" "sheet019.htm" "sheet020.htm"
## [21] "sheet021.htm" "sheet022.htm" "sheet023.htm" "sheet024.htm"
## [25] "sheet025.htm" "sheet026.htm" "sheet027.htm" "sheet028.htm"
## [29] "sheet029.htm" "sheet030.htm" "sheet031.htm" "sheet032.htm"
## [33] "sheet033.htm" "sheet034.htm" "sheet035.htm"

# Print the scraped tab labels; the `##` lines that follow show the console
# output.
state_names
##  [1] "A & N ISLAND"      "ANDHRA PRADESH"    "ARUNACHAL PRADESH"
##  [4] "ASSAM"             "BIHAR"             "CHANDIGARH"       
##  [7] "CHHATTISGARH"      "DELHI"             "DIU"              
## [10] "DNH & DAMAN"       "GOA"               "GUJARAT"          
## [13] "HARYANA"           "HIMACHAL PRADESH"  "JAMMU & KASHMIR"  
## [16] "JHARKHAND"         "KARNATAKA"         "KERALA"           
## [19] "LAKSHADWEEP"       "MADHYA PRADESH"    "MAHARASHTRA"      
## [22] "MANIPUR"           "MEGHALAYA"         "MIZORAM"          
## [25] "NAGALAND"          "ORISSA"            "PONDICHERRY"      
## [28] "PUNJAB"            "RAJASTHAN"         "SIKKIM"           
## [31] "TAMILNADU"         "TRIPURA"           "UTTAR PRADESH"    
## [34] "UTTARAKHAND"       "WEST BENGAL"

# Fetch a single state sheet ----------------------------------------------

# I had to send a Referer header to get the page; you may not need to.

# Address of the second sheet in state_links.
sheet_url <- sprintf("%s/%s", base_url, state_links[2])
referer_header <- add_headers(
  Referer = "http://www.imd.gov.in/section/hydro/dynamic/rfmaps/weekrain_files/tabstrip.htm"
)
rain <- html_session(sheet_url, referer_header)


# Pull out the entire table (messy) ---------------------------------------

# expect to write a lot of cleanup code after this step

# Parse every <table> on the page, keep the first one, and show its first
# ten rows.
rain %>%
  html_nodes("table") %>%
  html_table() %>%
  extract2(1) %>%
  head(10)
##                                                             X1   X2
## 1                                                                  
## 2     To view the\r\n  Districtwise Rainfall of another State, <NA>
## 3  Click the desired\r\n  STATE tab on the strip at the bottom <NA>
## 4                                                              <NA>
## 5                                                              <NA>
## 6                     DISTRICTWISE RAINFALL\r\n  DISTRIBUTION  <NA>
## 7                                                                  
## 8                                                              <NA>
## 9                                     STATE/UT/MET.SUBDIVISION <NA>
## 10                                        DISTRICT\r\n  (NAME) <NA>
##            X3   X4         X5       X6                          X7   X8 X9
## 1                                                                       NA
## 2        <NA> <NA>       <NA>          Back to Rainfall Statistics <NA> NA
## 3        <NA> <NA>       <NA>                                 <NA> <NA> NA
## 4        <NA> <NA>       <NA>     <NA>                        <NA> <NA> NA
## 5        <NA> <NA>       <NA>     <NA>                        <NA> <NA> NA
## 6        <NA> <NA>       <NA>     <NA>                        <NA> <NA> NA
## 7  16.07.2015   TO 22.07.2015                                           NA
## 8        <NA> <NA>                <NA>                                  NA
## 9        <NA>          ACTUAL   NORMAL                        %DEP CAT. NA
## 10                       (mm)     (mm)                                  NA


# Pull out individual columns ---------------------------------------------

# tedious; cleaning up the whole table may well be less work

# Text of every <td> carrying the class "xl88".
rain %>%
  html_nodes("td.xl88") %>%
  html_text()
##  [1] "East Godavary"  "Guntur"         "Krishna"        "Nellore"       
##  [5] "Prakasam"       "Srikakulam"     "Vishakhapatnam" "Vizianagaram"  
##  [9] "West Godavary"  "Adilabad"       "Hyderabad"      "Karimnagar"    
## [13] "Khamman"        "Mahabubnagar"   "Medak"          "Nalgonda"      
## [17] "Nizamabad"      "Rangareddy"     "Warangal"       "Anantapur"     
## [21] "Chittor"        "Cuddapah"       "Kurnool"

# Text of the <td class="xl85"> cells whose x:num attribute is the empty
# string (the colon is escaped so it is not read as a pseudo-class).
rain %>%
  html_nodes("td[class='xl85'][x\\:num='']") %>%
  html_text()
##  [1] "43.1" "35.8" "48.2" "22.1" "26.5" "46.8" "44.3" "42.5" "52.4" "16.8"
## [11] "26.2" "26.3" "25.4" "0.0"  "0.0"  "0.0"  "54.8" "0.0"  "0.0"  "0.0" 
## [21] "73.9" "43.1" "53.2" "41.9" "64.7" "44.1" "62.1"

# Text of the <td class="xl85"> cells that the selector's
# x:num != '' condition matches.
rain %>%
  html_nodes("td[class='xl85'][x\\:num!='']") %>%
  html_text()
##  [1] "94.3" "E"    "50.6" "E"    "44.8" "N"    "8.2"  "S"    "10.4" "S"   
## [11] "53.2" "N"    "44.5" "N"    "43.3" "N"    "82.7" "E"    ""     ""    
## [21] "9.8"  "D"    "16.0" "D"    "8.9"  "S"    "6.7"  "S"    "25.2" "D"   
## [31] "59.3" "D"    "7.8"  "S"    "34.2" "D"    ""     ""     "16.1" "S"   
## [41] "27.6" "D"    "13.2" "S"    "17.1" "S"    ""