在下拉列表中选定某个选项的值之后,我需要获取对应页面的HTML。
<commandlineArgs>-Xms512m -Xmx2g</commandlineArgs>
这段代码可以得到所有可用选项的值。现在我想访问每个值对应的页面。我尝试过通过查询字符串修改URL,但没有成功。
答案 0 :(得分:1)
当然可以解决……
library(xml2)
library(rvest)
library(httr)
library(pbapply)
# Get the constituencies --------------------------------------------------
# Download the search page and collect the `value` attribute of every
# <option> element, dropping the placeholder entry whose value is "0".
URL <- "http://ceogoa.nic.in/appln/UIL/ElectoralRoll.aspx"
electoral <- read_html(URL)
constituency <- local({
  option_values <- html_attr(html_nodes(electoral, "option"), "value")
  option_values[!grepl("^0$", option_values)]
})
# Form state string that get_parts() posts as the `__VIEWSTATE` field.
# Found this and the other SharePoint form parameters via Developer Tools
# in Chrome.
# NOTE(review): this is a snapshot copied from one browser session;
# presumably it can go stale if the server-side page changes -- confirm
# before relying on it.
view_state <- "/wEPDwULLTEzNjg2NjEyMTEPZBYCZg9kFgICAw9kFgQCcw9kFgYCBQ9kFgJmD2QWAmYPZBYCAgEPZBYCZg9kFgICAQ8WAh4FY2xhc3NlFgICAQ8PFgIeB1Zpc2libGVoZBYCAgEPZBYCZg9kFgJmD2QWAgIBDw8WAh4EVGV4dGVkZAIJDxBkDxYpZgIBAgICAwIEAgUCBgIHAggCCQIKAgsCDAINAg4CDwIQAhECEgITAhQCFQIWAhcCGAIZAhoCGwIcAh0CHgIfAiACIQIiAiMCJAIlAiYCJwIoFikQBQlTZWxlY3QuLi4FATBnEAUJMS1NYW5kcmVtBQUwNTAwMWcQBQgyLVBlcm5lbQUFMDUwMDJnEAUKMy1CaWNob2xpbQUFMDUwMDNnEAUHNC1UaXZpbQUFMDUwMDRnEAUINS1NYXB1c2EFBTA1MDA1ZxAFCDYtU2lvbGltBQUwNTAwNmcQBQk3LVNhbGlnYW8FBTA1MDA3ZxAFCzgtQ2FsYW5ndXRlBQUwNTAwOGcQBQo5LVBvcnZvcmltBQUwNTAwOWcQBQkxMC1BbGRvbmEFBTA1MDEwZxAFCTExLVBhbmFqaQUFMDUwMTFnEAULMTItVGFsZWlnYW8FBTA1MDEyZxAFCjEzLVN0LkNydXoFBTA1MDEzZxAFDDE0LVN0LiBBbmRyZQUFMDUwMTRnEAUMMTUtQ3VtYmFyanVhBQUwNTAxNWcQBQcxNi1NYWVtBQUwNTAxNmcQBQwxNy1TYW5xdWVsaW0FBTA1MDE3ZxAFCTE4LVBvcmllbQUFMDUwMThnEAUJMTktVmFscG9pBQUwNTAxOWcQBQgyMC1QcmlvbAUFMDUwMjBnEAUIMjEtUG9uZGEFBTA1MDIxZxAFCTIyLVNpcm9kYQUFMDUwMjJnEAUKMjMtTWFyY2FpbQUFMDUwMjNnEAULMjQtTW9ybXVnYW8FBTA1MDI0ZxAFEDI1LVZhc2NvLURhLUdhbWEFBTA1MDI1ZxAFCjI2LURhYm9saW0FBTA1MDI2ZxAFCzI3LUNvcnRhbGltBQUwNTAyN2cQBQgyOC1OdXZlbQUFMDUwMjhnEAULMjktQ3VydG9yaW0FBTA1MDI5ZxAFCjMwLUZhdG9yZGEFBTA1MDMwZxAFCTMxLU1hcmdhbwUFMDUwMzFnEAULMzItQmVuYXVsaW0FBTA1MDMyZxAFCjMzLU5hdmVsaW0FBTA1MDMzZxAFCzM0LUN1bmNvbGltBQUwNTAzNGcQBQgzNS1WZWxpbQUFMDUwMzVnEAUJMzYtUXVlcGVtBQUwNTAzNmcQBQwzNy1DdXJjaG9yZW0FBTA1MDM3ZxAFDDM4LVNhbnZvcmRlbQUFMDUwMzhnEAUKMzktU2FuZ3VlbQUFMDUwMzlnEAULNDAtQ2FuYWNvbmEFBTA1MDQwZ2RkAhEPZBYCZg9kFgICAQ88KwANAGQCdQ9kFgQCAQ9kFgICAQ9kFgICAQ8PFgIfAgUHwqkgMjAxNWRkAgIPZBYCAgEPZBYCAgMPDxYCHwIFBjg2ODM5NGRkGAEFD2N0bDAwJE1haW4kZ3ZBQw9nZPW/9I4EvbbDNDoGBM07vkZaWPFH"
# Fetch the results table for one constituency.
#
# Submits the search form with `cons` selected in the `ctl00$Main$drpAC`
# field and parses the first `table.mGrid` element of the response.
#
# Args:
#   cons:  Constituency code to search for (one of the <option> values
#          collected in `constituency`).
#   url:   Form endpoint to POST to; defaults to the electoral-roll page.
#   state: Value sent as `__VIEWSTATE`; defaults to the global `view_state`.
#
# Returns:
#   The data frame produced by html_table() for the first `table.mGrid`,
#   with an added `constituency` column holding `cons`.
#
# Errors:
#   Stops if the HTTP request returns an error status, or if the response
#   contains no `table.mGrid` element.
get_parts <- function(cons,
                      url = "http://ceogoa.nic.in/appln/UIL/ElectoralRoll.aspx",
                      state = view_state) {
  res <- POST(
    url,
    body = list(
      `ctl00$Main$drpAC` = cons, # the only field that varies per request
      `ctl00$ToolkitScriptManager` = "ctl00$ToolkitScriptManager|ctl00$Main$btnSearch",
      `_TSM_HiddenField_` = "gw7jpIJ8LMgM7u8gLjQBxxbgFlVTP1p_vIL8EuJVw1w1",
      `ctl00$Main$vcAC_ClientState` = NULL,
      `__ASYNCPOST` = TRUE,
      `__EVENTTARGET` = NULL,
      `__EVENTARGUMENT` = NULL,
      `__VIEWSTATE` = state,
      `ctl00$Main$btnSearch` = "Search"
    ),
    encode = "form"
  )

  # Fail loudly on a 4xx/5xx reply instead of trying to parse an error page.
  stop_for_status(res)

  doc <- read_html(content(res, as = "text"))
  grids <- html_nodes(doc, "table.mGrid")

  # Without this guard a missing table surfaces as an opaque
  # "subscript out of bounds" error from `[[1]]`.
  if (length(grids) == 0L) {
    stop(
      "No 'table.mGrid' element in the response for constituency ", cons,
      call. = FALSE
    )
  }

  tab <- html_table(grids[[1]], fill = TRUE)
  tab$constituency <- cons
  tab
}
# Apply get_parts() to every constituency code; `dat` holds one result per
# constituency, in the same order as `constituency`.
dat <- pblapply(X = constituency, FUN = get_parts)
请注意,如果您需要这些表格并想把它们合并成一个大的数据框,还需要对表格做进一步的清理。如果您想要的是PDF链接,提取起来应该很容易,但请不要把数据清理或其他提取方式的问题追加到本问题中。