选择 <option> 标签的值之后,抓取页面的 HTML 内容

时间:2015-09-03 11:47:54

标签: html xml r sharepoint

我需要在选定某个选项的值之后,获取页面返回的 HTML。

<commandlineArgs>-Xms512m -Xmx2g</commandlineArgs>

这段代码可以得到所有可用选项的值。现在我想访问每个值对应的页面。我尝试过用查询字符串修改 URL,但没有成功。

1 个答案:

答案 0 :(得分:1)

当然可以(解决)......

library(xml2)
library(rvest)
library(httr)
library(pbapply)

# Get the constituencies --------------------------------------------------

# Download the search page, read the `value` attribute of every <option>
# element, and keep all values except the one that is exactly "0"
# (presumably the placeholder entry of the drop-down -- confirm).
URL <- "http://ceogoa.nic.in/appln/UIL/ElectoralRoll.aspx"
electoral <- read_html(URL)
constituency <- electoral %>%
  html_nodes("option") %>%
  html_attr("value") %>%
  grep("^0$", ., value = TRUE, invert = TRUE)

# Value sent back to the server as the `__VIEWSTATE` form field by get_parts().
# Found this and the other wretched SharePoint parameters via
# Developer Tools in Chrome
# NOTE(review): this was captured from one browser session; presumably it has
# to be re-captured if the server ever starts rejecting it -- confirm.

view_state <- "/wEPDwULLTEzNjg2NjEyMTEPZBYCZg9kFgICAw9kFgQCcw9kFgYCBQ9kFgJmD2QWAmYPZBYCAgEPZBYCZg9kFgICAQ8WAh4FY2xhc3NlFgICAQ8PFgIeB1Zpc2libGVoZBYCAgEPZBYCZg9kFgJmD2QWAgIBDw8WAh4EVGV4dGVkZAIJDxBkDxYpZgIBAgICAwIEAgUCBgIHAggCCQIKAgsCDAINAg4CDwIQAhECEgITAhQCFQIWAhcCGAIZAhoCGwIcAh0CHgIfAiACIQIiAiMCJAIlAiYCJwIoFikQBQlTZWxlY3QuLi4FATBnEAUJMS1NYW5kcmVtBQUwNTAwMWcQBQgyLVBlcm5lbQUFMDUwMDJnEAUKMy1CaWNob2xpbQUFMDUwMDNnEAUHNC1UaXZpbQUFMDUwMDRnEAUINS1NYXB1c2EFBTA1MDA1ZxAFCDYtU2lvbGltBQUwNTAwNmcQBQk3LVNhbGlnYW8FBTA1MDA3ZxAFCzgtQ2FsYW5ndXRlBQUwNTAwOGcQBQo5LVBvcnZvcmltBQUwNTAwOWcQBQkxMC1BbGRvbmEFBTA1MDEwZxAFCTExLVBhbmFqaQUFMDUwMTFnEAULMTItVGFsZWlnYW8FBTA1MDEyZxAFCjEzLVN0LkNydXoFBTA1MDEzZxAFDDE0LVN0LiBBbmRyZQUFMDUwMTRnEAUMMTUtQ3VtYmFyanVhBQUwNTAxNWcQBQcxNi1NYWVtBQUwNTAxNmcQBQwxNy1TYW5xdWVsaW0FBTA1MDE3ZxAFCTE4LVBvcmllbQUFMDUwMThnEAUJMTktVmFscG9pBQUwNTAxOWcQBQgyMC1QcmlvbAUFMDUwMjBnEAUIMjEtUG9uZGEFBTA1MDIxZxAFCTIyLVNpcm9kYQUFMDUwMjJnEAUKMjMtTWFyY2FpbQUFMDUwMjNnEAULMjQtTW9ybXVnYW8FBTA1MDI0ZxAFEDI1LVZhc2NvLURhLUdhbWEFBTA1MDI1ZxAFCjI2LURhYm9saW0FBTA1MDI2ZxAFCzI3LUNvcnRhbGltBQUwNTAyN2cQBQgyOC1OdXZlbQUFMDUwMjhnEAULMjktQ3VydG9yaW0FBTA1MDI5ZxAFCjMwLUZhdG9yZGEFBTA1MDMwZxAFCTMxLU1hcmdhbwUFMDUwMzFnEAULMzItQmVuYXVsaW0FBTA1MDMyZxAFCjMzLU5hdmVsaW0FBTA1MDMzZxAFCzM0LUN1bmNvbGltBQUwNTAzNGcQBQgzNS1WZWxpbQUFMDUwMzVnEAUJMzYtUXVlcGVtBQUwNTAzNmcQBQwzNy1DdXJjaG9yZW0FBTA1MDM3ZxAFDDM4LVNhbnZvcmRlbQUFMDUwMzhnEAUKMzktU2FuZ3VlbQUFMDUwMzlnEAULNDAtQ2FuYWNvbmEFBTA1MDQwZ2RkAhEPZBYCZg9kFgICAQ88KwANAGQCdQ9kFgQCAQ9kFgICAQ9kFgICAQ8PFgIfAgUHwqkgMjAxNWRkAgIPZBYCAgEPZBYCAgMPDxYCHwIFBjg2ODM5NGRkGAEFD2N0bDAwJE1haW4kZ3ZBQw9nZPW/9I4EvbbDNDoGBM07vkZaWPFH"

#' Fetch the result table for one constituency
#'
#' Submits the search form of the electoral-roll page for `cons` and parses
#' the first `table.mGrid` element found in the response.
#'
#' @param cons Constituency value; sent as the `ctl00$Main$drpAC` form field.
#' @param url Address the form is posted to. Defaults to the page address this
#'   function previously hard-coded, so existing one-argument calls still work.
#' @return The parsed table, with an extra `constituency` column set to `cons`.
#'   Signals an error when the HTTP request fails or when the response
#'   contains no `table.mGrid` element.
get_parts <- function(cons,
                      url = "http://ceogoa.nic.in/appln/UIL/ElectoralRoll.aspx") {

  # The form fields below mirror the request recorded in the browser; the
  # global `view_state` supplies the `__VIEWSTATE` value.
  # NOTE(review): the NULL-valued fields are presumably dropped by httr when
  # the body is encoded -- confirm.
  res <- POST(
    url,
    body = list(
      `ctl00$Main$drpAC` = cons, # <------ here's where we use the parameter
      `ctl00$ToolkitScriptManager` = "ctl00$ToolkitScriptManager|ctl00$Main$btnSearch",
      `_TSM_HiddenField_` = "gw7jpIJ8LMgM7u8gLjQBxxbgFlVTP1p_vIL8EuJVw1w1",
      `ctl00$Main$vcAC_ClientState` = NULL,
      `__ASYNCPOST` = TRUE,
      `__EVENTTARGET` = NULL,
      `__EVENTARGUMENT` = NULL,
      `__VIEWSTATE` = view_state,
      `ctl00$Main$btnSearch` = "Search"
    ),
    encode = "form"
  )

  # Surface HTTP failures directly instead of failing later while parsing an
  # error page.
  stop_for_status(res)

  doc <- read_html(content(res, as = "text"))

  # Guard the `[[1]]` below: without a match it would fail with an opaque
  # "subscript out of bounds" error.
  grids <- html_nodes(doc, "table.mGrid")
  if (length(grids) == 0L) {
    stop(
      "No 'table.mGrid' element in the response for constituency ", cons,
      call. = FALSE
    )
  }

  tab <- html_table(grids[[1]], fill = TRUE)
  tab$constituency <- cons
  tab

}

# Run get_parts() once per constituency value; `dat` is a list with one
# result per element of `constituency`, in the same order.
dat <- pblapply(X = constituency, FUN = get_parts)

请注意,如果你需要这些表格并想把它们合并成一个大的数据框,还需要对表格做进一步的清理。如果你想要的是 PDF 链接,提取起来应该也很容易;不过关于清理或其他提取方式的问题,请不要再在这个回答下追问我。