将数据从xml提取到R数据帧

时间:2018-05-09 01:56:02

标签: r xml dataframe xml2

我对R中的XMLxml2个软件包很新,我很难将数据从xml提取到数据框中。

来自xml文件的示例数据

<?xml version="1.0" encoding="utf-8"?>
<mod:ModificationSet xmlns:hci="http://riziv.fgov.be/szv/HealthCareInstitution" xmlns:per="http://riziv.fgov.be/szv/Person" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:pto="http://riziv.fgov.be/szv/PersonToOrganization" xmlns:org="http://riziv.fgov.be/szv/Organization" xmlns:hca="http://riziv.fgov.be/szv/HealthCareAppliance" xmlns:ati="http://riziv.fgov.be/szv/HcApplianceToHcInstitution" xmlns:p12="http://www.w3.org/2001/XMLSchema-instance" xmlns:szv="http://riziv.fgov.be/szv/BasicTypes" xmlns:hcw="http://riziv.fgov.be/szv/HealthCareWorker" xMmmlns:mod="http://riziv.fgov.be/szv/ModificationSet" xmlns:dev="http://riziv.fgov.be/szv/Device" xmlns:wti="http://riziv.fgov.be/szv/HcWorkerToHcInstitution">
  <mod:Payload>
    <mod:Modifications>
      <mod:Modification>
        <mod:Context>
          <szv:Origin>63080900</szv:Origin>
          <szv:CreationDate>2018-04-05</szv:CreationDate>
          <szv:OperationType>01</szv:OperationType>
          <szv:OperationDate>2018-04-05</szv:OperationDate>
        </mod:Context>
        <mod:HealthCareAppliance>
          <hca:Identification>
            <hca:RizivNumber>00000182</hca:RizivNumber>
          </hca:Identification>
          <hca:Device>
            <dev:DeviceNumber>30016</dev:DeviceNumber>
            <dev:DeviceType>PET-CT</dev:DeviceType>
            <dev:Model>Philips-Gemini TF Big Bore PET/CT</dev:Model>
            <dev:StartDateInvoicing>2016-06-01</dev:StartDateInvoicing>
            <dev:EndDateInvoicing p12:nil="true" />
            <dev:LocationIsAddress>false</dev:LocationIsAddress>
            <dev:IsFixedDevice>true</dev:IsFixedDevice>
            <dev:IsExtraMuros>false</dev:IsExtraMuros>
          </hca:Device>
        </mod:HealthCareAppliance>
      </mod:Modification>
      <mod:Modification>
        <mod:Context>
          <szv:Origin>63080900</szv:Origin>
          <szv:CreationDate>2018-04-05</szv:CreationDate>
          <szv:OperationType>01</szv:OperationType>
          <szv:OperationDate>2010-07-13</szv:OperationDate>
        </mod:Context>
        <mod:HealthCareAppliance>
          <hca:Identification>
            <hca:RizivNumber>00000182</hca:RizivNumber>
          </hca:Identification>
          <hca:Status>
            <hca:StatusCode>InUse</hca:StatusCode>
            <hca:StatusStartDate>2010-07-13</hca:StatusStartDate>
          </hca:Status>
        </mod:HealthCareAppliance>
      </mod:Modification>
      <mod:Modification>
        <mod:Context>
          <szv:Origin>63080900</szv:Origin>
          <szv:CreationDate>2018-04-05</szv:CreationDate>
          <szv:OperationType>01</szv:OperationType>
          <szv:OperationDate>2018-04-05</szv:OperationDate>
        </mod:Context>
        <mod:HcApplianceToHcInstitution>
          <ati:HealthCareInstitution>
            <ati:RizivNumber>71024388</ati:RizivNumber>
            <ati:InstitutionCode>710</ati:InstitutionCode>
          </ati:HealthCareInstitution>
          <ati:HealthCareAppliance>
            <ati:RizivNumber>00000182</ati:RizivNumber>
          </ati:HealthCareAppliance>
          <ati:Period>
            <szv:StartDate>2016-08-19</szv:StartDate>
            <szv:EndDate p12:nil="true" />
          </ati:Period>
        </mod:HcApplianceToHcInstitution>
      </mod:Modification>
    </mod:Modifications>
  </mod:Payload>

这是我迄今为止所取得的成就!该脚本运行时没有任何错误,但它无法提取任何数据并在最后返回Null值。

library(XML)
xmldoc <- xmlParse("BAS_SIT_HCA_20180405141931.xml", useInternalNodes=TRUE)
class(xmldoc)

namespace_list <- c(mod="http://riziv.fgov.be/szv/ModificationSet",
                    szv="http://riziv.fgov.be/szv/BasicTypes",
                    hca="http://riziv.fgov.be/szv/HealthCareAppliance",
                    dev="http://riziv.fgov.be/szv/Device",
                    ati="http://riziv.fgov.be/szv/HcApplianceToHcInstitution")

do.call(rbind, xpathApply(xmldoc, "//mod:ModificationSet/mod:Payload/mod:Modifications
                                   /mod:Modification", namespaces=namespace_list, function(node) {
                                      Origin <- xmlValue(node[["./mod:Context/szv:Origin"]])
                                      CreationDate <- xmlValue(node[["./mod:Context/szv:CreationDate"]])
                                      OperationType <- xmlValue(node[["./mod:Context/szv:OperationType"]])
                                      OperationDate <- xmlValue(node[["./mod:Context/szv:OperationDate"]])
      xp1 <- "./mod:HealthCareAppliance/hca:Identification/hca:RizivNumber"
      RizivNumber <- xpathSApply(node, xp1, namespaces=namespace_list, xmlValue)
      if(is.null(RizivNumber)) RizivNumber <- NA

      xp2 <- "./mod:HealthCareAppliance/hca:Device/dev:DeviceNumber"
      DeviceNumber <- xpathSApply(node, xp2, namespaces=namespace_list, xmlValue)
      if(is.null(DeviceNumber)) DeviceNumber <- NA
      xp3 <- "./mod:HealthCareAppliance/hca:Device/dev:DeviceType"
      DeviceType <- xpathSApply(node, xp3, namespaces=namespace_list, xmlValue)
      if(is.null(DeviceType)) DeviceType <- NA
      xp4 <- "./mod:HealthCareAppliance/hca:Device/dev:DeviceSubType"
      DeviceSubType <- xpathSApply(node, xp4, namespaces=namespace_list, xmlValue)
      if(is.null(DeviceSubType)) DeviceSubType <- NA
      xp5 <- "./mod:HealthCareAppliance/hca:Device/dev:Model"
      Model <- xpathSApply(node, xp5, namespaces=namespace_list, xmlValue)
      if(is.null(Model)) Model <- NA
      xp6 <- "./mod:HealthCareAppliance/hca:Device/dev:StartDateInvoicing"
      StartDateInvoicing <- xpathSApply(node, xp6, namespaces=namespace_list, xmlValue)
      if(is.null(StartDateInvoicing)) StartDateInvoicing <- NA
      xp7 <- "./mod:HealthCareAppliance/hca:Device/dev:EndDateInvoicing"
      EndDateInvoicing <- xpathSApply(node, xp7, namespaces=namespace_list, xmlValue)
      if(is.null(EndDateInvoicing)) EndDateInvoicing <- NA
      xp8 <- "./mod:HealthCareAppliance/hca:Device/dev:LocationIsAddress"
      LocationIsAddress <- xpathSApply(node, xp8, namespaces=namespace_list, xmlValue)
      if(is.null(LocationIsAddress)) LocationIsAddress <- NA
      xp9 <- "./mod:HealthCareAppliance/hca:Device/dev:IsFixedDevice"
      IsFixedDevice <- xpathSApply(node, xp9, namespaces=namespace_list, xmlValue)
      if(is.null(IsFixedDevice)) IsFixedDevice <- NA
      xp10 <- "./mod:HealthCareAppliance/hca:Device/dev:IsExtraMuros"
      IsExtraMuros <- xpathSApply(node, xp10, namespaces=namespace_list, xmlValue)
      if(is.null(IsExtraMuros)) IsExtraMuros <- NA

      xp11 <- "./mod:HealthCareAppliance/hca:Status/hca:StatusCode"
      StatusCode <- xpathSApply(node, xp11, namespaces=namespace_list, xmlValue)
      if(is.null(StatusCode)) StatusCode <- NA
      xp12 <- "./mod:HealthCareAppliance/hca:Status/hca:StatusStartDate"
      StatusStartDate <- xpathSApply(node, xp12, namespaces=namespace_list, xmlValue)
      if(is.null(StatusStartDate)) StatusStartDate <- NA

      xp13 <- "./mod:HcApplianceToHcInstitution/ati:HealthCareInstitution/ati:RizivNumber"
      RizivNumber_ <- xpathSApply(node, xp13, namespaces=namespace_list, xmlValue)
      if(is.null(RizivNumber_)) RizivNumber_ <- NA
      xp14 <- "./mod:HcApplianceToHcInstitution/ati:HealthCareInstitution/ati:InstitutionCode"
      InstitutionCode <- xpathSApply(node, xp14, namespaces=namespace_list, xmlValue)
      if(is.null(InstitutionCode)) InstitutionCode <- NA

      xp15 <- "./mod:HcApplianceToHcInstitution/ati:HealthCareAppliance/ati:RizivNumber"
      RizivNumber2 <- xpathSApply(node, xp15, namespaces=namespace_list, xmlValue)
      if(is.null(RizivNumber2)) RizivNumber2 <- NA

      xp16 <- "./mod:HcApplianceToHcInstitution/ati:Period/szv:StartDate"
      StartDate <- xpathSApply(node, xp16, namespaces=namespace_list, xmlValue)
      if(is.null(StartDate)) StartDate <- NA
      xp17 <- "./mod:HcApplianceToHcInstitution/ati:Period/szv:EndDate"
      EndDate <- xpathSApply(node, xp17, namespaces=namespace_list, xmlValue)
      if(is.null(EndDate)) EndDate <- NA



#      xmldoc_df <- data.frame(Origin, CreationDate, OperationType, OperationDate, RizivNumber,
#                              DeviceNumber, DeviceType, Model, StartDateInvoicing, EndDateInvoicing,
#                              LocationIsAddress, IsFixedDevice, IsExtraMuros, stringsAsFactors = FALSE)
                                            }))

提前谢谢你的帮助。附:对于提前发布重复问题的道歉。

1 个答案:

答案 0 :(得分:0)

这是你希望实现的目标吗?

library(xml2)
library(dplyr)

xmldoc <- read_xml("./Desktop/test.xml", encoding = "utf-8", as_html = FALSE)

RizivNumber <- xmldoc %>% 
               xml_find_all(".//hca:RizivNumber") %>% 
               xml_text()
#> RizivNumber
#[1] "00000182" "00000182"

DeviceNumber <- xmldoc %>% 
                xml_find_all(".//dev:DeviceNumber") %>% 
                xml_text()
#> DeviceNumber
#[1] "30016"

DeviceType <- xmldoc %>% 
              xml_find_all(".//dev:DeviceType") %>% 
              xml_text()
#> DeviceType
#[1] "PET-CT"

......等等