删除更新的NPPES NPI提取文件

时间:2018-07-17 11:39:55

标签: r survey

我在R中使用ajdamico的lodown程序包。首先,我说这个程序包很棒,所以谢谢Anthony。过去成功运行脚本后,我有两个问题:

  1. NPPES发布了7月份的文件。在此最新版本上运行脚本时,我仅看到6月数据。也就是说,根据NPPES每周发布的文件,尽管我知道记录在7月已更新,但我看不到7月已更新的任何记录。知道为什么会这样吗?我怀疑这至少部分是由于7月份的zip文件包含的数据比历史上提供的要多得多。
  2. 鉴于zip文件中现在包含更多文件,我如何指导脚本提取哪个文件?

下面的完整脚本,在应用任何子设置之前:

    get_catalog_nppes <-
        function( data_name = "nppes" , output_dir , ... ){

        # read in the whole NPI files page
        npi.datapage <- suppressWarnings( readLines( "http://download.cms.gov/nppes/NPI_Files.html" ) )

        # find the first line containing the data dissemination link
        npi.dataline <- npi.datapage[ grep( "NPPES_Data_Dissemination_" , npi.datapage ) ][1]

        # pull out the zipped file's name from that line
        fn <- paste0( "http://download.cms.gov/nppes/" , gsub( "(.*)(NPPES_Data_Dissemination_.*\\.zip)(.*)$" , "\\2" , npi.dataline ) )

        catalog <-
            data.frame(
                full_url = fn ,
                output_filename = paste0( output_dir , "/nppes.csv" ) ,
                stringsAsFactors = FALSE
            )

        catalog

    }


    lodown_nppes <-
        function( data_name = "nppes" , catalog , path_to_7za = '7za' , ... ){

            on.exit( print( catalog ) )

            if( nrow( catalog ) != 1 ) stop( "nppes catalog must be exactly one record" )

            if( ( .Platform$OS.type != 'windows' ) && ( system( paste0('"', path_to_7za , '" -h' ) ) != 0 ) ) stop( "you need to install 7-zip.  if you already have it, include a path_to_7za='/directory/7za' parameter" )

            tf <- tempfile()

            download.file( catalog$full_url , tf , mode = 'wb' )

            # extract the file, platform-specific
            if ( .Platform$OS.type == 'windows' ){

                unzipped_files <- unzip_warn_fail( tf , exdir = tempdir() )

            } else {

                # build the string to send to the terminal on non-windows systems
                dos.command <- paste0( '"' , path_to_7za , '" x ' , tf , ' -o"' , tempdir() , '"' )
                system( dos.command )
                unzipped_files <- list.files( tempdir() , full.names = TRUE , recursive = TRUE )

            }

            csv.file <- unzipped_files[ grepl( '\\.csv$' , basename( unzipped_files ) , ignore.case = TRUE ) & !grepl( 'FileHeader' , basename( unzipped_files ) , ignore.case = TRUE ) ]

            file.copy( csv.file , catalog$output_filename )

            catalog$case_count <- R.utils::countLines( csv.file ) - 1

            file.remove( unzipped_files , tf )

            on.exit()

            catalog

        }


##Load Dataset
column_names <-
  names( 
    read.csv( 
      file.path( path.expand( "~" ) , "NPPES" , "nppes.csv" ) , 
      nrow = 1 )[ FALSE , , ] 
  )

##Load Dataframe
column_names <-
  names( 
    read.csv( 
      file.path( path.expand( "~" ) , "NPPES" , "nppes.csv" ) , 
      nrow = 1 )[ FALSE , , ] 
  )

column_names <- gsub( "\\." , "_" , tolower( column_names ) )

column_types <-
  ifelse( 
    grepl( "npi" , column_names ) & 
      !grepl( "npi|codified|first|last|blergh|whistle|rule|exist|orange|way|brown|houses|outrageous|boring|substance|superficial|good|fancy|sneaky|terrific|paint|nasty|gentle|decorous|smile", column_names ) , 
    'n' , 'c' 
  )

columns_to_import <-
  c( "npi","entity_type_code","provider_business_mailing_address_postal_code","last_update_date","npi_deactivation_reason_code","npi_deactivation_date","npi_reactivation_date","authorized_official_last_name","authorized_official_first_name","healthcare_provider_taxonomy_code_1","healthcare_provider_taxonomy_code_2","healthcare_provider_taxonomy_code_3","healthcare_provider_taxonomy_code_4","healthcare_provider_taxonomy_code_5","healthcare_provider_taxonomy_code_6","healthcare_provider_taxonomy_code_7","healthcare_provider_taxonomy_code_8","healthcare_provider_taxonomy_code_9","healthcare_provider_taxonomy_code_10","healthcare_provider_taxonomy_code_11","healthcare_provider_taxonomy_code_12","healthcare_provider_taxonomy_code_13","healthcare_provider_taxonomy_code_14","healthcare_provider_taxonomy_code_15","is_sole_proprietor"
  )

stopifnot( all( columns_to_import %in% column_names ) )

columns_to_import <-
  columns_to_import[ order( match( columns_to_import , column_names ) ) ]

nppes <- 
  data.frame( 
    readr::read_csv( 
      file.path( path.expand( "~" ) , "NPPES" , 
                 "nppes.csv" ) , 
      col_names = columns_to_import , 
      col_types = 
        paste0( 
          ifelse( column_names %in% columns_to_import , column_types , '_' ) , 
          collapse = "" 
        ) ,
      skip = 1
    ) 
  )

0 个答案:

没有答案