我想用名字和姓氏在网站 (https://npiregistry.cms.hhs.gov/registry/) 上进行多次搜索,然后把搜索结果整理成一个数据框。
我发现这与《How to automate multiple requests to a web search form using R》中描述的问题类似,但出于某种原因,我一直收到错误 "错误:无法加载外部实体"(failed to load external entity)。
以下是我用来提取记录的代码
# First and last names to search for, paired by position.
fn <- c("HARVEY", "HARVEY")
ln <- c("BIDWELL", "ADELSON")
# One row per person; the columns are named `fn` and `ln`.
mydf <- data.frame(fn = fn, ln = ln)
# Request the registry search-results page for one person and parse its HTML
# tables.
#
# Args:
#   df: a row of the lookup table with columns `fn` (first name) and `ln`
#       (last name).
#
# Returns (invisibly) whatever readHTMLTable() parses from the page.
get_data <- function(df) {
  library(XML)
  # NOTE(review): this address uses plain http; the answer that follows
  # reports that the site has to be requested over https instead.
  base_url <- "http://npiregistry.cms.hhs.gov/"
  query <- paste0("first_name=", df$fn, "&last_name=", df$ln)
  raw_url <- paste0(base_url, "registry/search-results-table?", query)
  # percent-encode characters that are not allowed in a URL
  encoded_url <- URLencode(raw_url)
  # parse the HTML tables found at that address
  invisible(readHTMLTable(encoded_url))
}
library(plyr)
# Call get_data() once per row of mydf and combine the per-row results.
mydata <- adply(.data = mydf, .margins = 1, .fun = get_data)
感谢您的帮助
答案 0 :(得分:2)
请求需要使用 https: 而不是 http:。我还去掉了 plyr 库,改为只用基础 R 的函数:
library(rvest)
# Names to look up: fn[i] and ln[i] together describe one person.
fn <- c("HARVEY", "HARVEY")
ln <- c("BIDWELL", "ADELSON")
mydf <- data.frame(fn, ln)
# Fetch the registry search results for one person and return them as a
# data frame.
#
# Args:
#   df: one row of the lookup table, first name in position 1 and last name
#       in position 2. Both the character vector that apply() passes for each
#       row and a one-row data frame are accepted.
#
# Returns:
#   A data frame built from the first <table> on the results page, or an
#   empty data frame when the page contains no table.
get_data <- function(df) {
  root <- "https://npiregistry.cms.hhs.gov/"
  # Escape each value on its own with reserved = TRUE. Encoding the finished
  # URL instead leaves characters such as "&", "=" and "#" inside a name
  # untouched, which would corrupt the query string.
  # `[[` extracts the value itself, so a factor column yields its label.
  first_name <- URLencode(as.character(df[[1]]), reserved = TRUE)
  last_name <- URLencode(as.character(df[[2]]), reserved = TRUE)
  url <- paste0(
    root, "registry/search-results-table?",
    "first_name=", first_name, "&last_name=", last_name
  )
  #print(url)
  # extract data from the right table
  page <- read_html(url)
  tables <- html_nodes(page, "table")
  if (length(tables) == 0) {
    # No table on the page: return an empty result rather than failing
    # while parsing.
    return(data.frame())
  }
  newresult <- html_table(tables[1])
  # convert result into a data frame
  as.data.frame(newresult)
}
# Run one lookup per row of mydf; apply() passes each row to get_data().
mydata <- apply(mydf, MARGIN = 1, FUN = get_data)
# mydata is a list of data frames; do.call() stacks them into a single one.
finalanswer <- do.call("rbind", mydata)
# finalanswer still needs some clean-up.
答案 1 :(得分:2)
该网站提供了一个无需身份验证的 API……为什么不直接用它呢?
library(httr)
library(jsonlite)
library(tidyverse)
# Query the NPI registry API for one first-name / last-name pair.
#
# Args:
#   f_name: first name, sent as the `first_name` query parameter.
#   l_name: last name, sent as the `last_name` query parameter.
#
# Returns:
#   A tibble built from the `results` element of the JSON response.
#   Signals an error when the HTTP request reports a failure status.
npi_query <- function(f_name, l_name) {
  response <- httr::GET(
    url = "https://npiregistry.cms.hhs.gov/api/",
    query = list(first_name = f_name, last_name = l_name)
  )
  httr::stop_for_status(response)

  # Read the body as UTF-8 text, then parse it; flatten = TRUE turns nested
  # records into dotted columns such as `basic.first_name`.
  payload <- httr::content(response, as = "text", encoding = "UTF-8")
  parsed <- jsonlite::fromJSON(payload, flatten = TRUE)

  tibble::as_tibble(parsed$results)
}
# Lookup table: one row per person to search for.
# tibble() replaces the deprecated data_frame() constructor.
lkp <- tibble(
  fn = c("HARVEY", "HARVEY"),
  ln = c("BIDWELL", "ADELSON")
)

# Query the API once per (first name, last name) pair, row-bind the results
# and print a compact overview of the columns.
map2_df(lkp$fn, lkp$ln, npi_query) %>%
  glimpse()
## Observations: 2
## Variables: 19
## $ taxonomies <list> [<MA, 207R00000X, TRUE, 36065, Interna...
## $ addresses <list> [<c("DORCHESTER", "DORCHESTER"), c("23...
## $ created_epoch <int> 1152230400, 1168992000
## $ identifiers <list> [[], []]
## $ other_names <list> [[], []]
## $ number <int> 1336171859, 1205988342
## $ last_updated_epoch <int> 1183852800, 1183852800
## $ enumeration_type <chr> "NPI-1", "NPI-1"
## $ basic.status <chr> "A", "A"
## $ basic.credential <chr> "M.D.", "DMD"
## $ basic.first_name <chr> "HARVEY", "HARVEY"
## $ basic.last_name <chr> "BIDWELL", "ADELSON"
## $ basic.middle_name <chr> "W", "JEROME"
## $ basic.name <chr> "BIDWELL HARVEY", "ADELSON HARVEY"
## $ basic.gender <chr> "M", "M"
## $ basic.sole_proprietor <chr> "NO", "NO"
## $ basic.last_updated <chr> "2007-07-08", "2007-07-08"
## $ basic.enumeration_date <chr> "2006-07-07", "2007-01-17"
## $ basic.name_prefix <chr> NA, "DR."