使用R

时间:2018-01-03 11:21:16

标签: r xml xml-parsing

我正在读取一组XML文件,并希望了解处理以下内容的最佳方法:

<MyDecision>
    <Decision>
        <DecisionID>X1234</DecisionID>
        <DecisionReasons xmlns:a="http://schemas.datacontract.org/2004/07/Contracts">
            <a:Reason>
                <a:Description>DOBMismatch</a:Description>
            </a:Reason>
            <a:Reason>
                <a:Description>PrimaryChecksFail</a:Description>
            </a:Reason>
            <a:Reason>
                <a:Description>IncomeReferral</a:Description>
            </a:Reason>
        </DecisionReasons>
    </Decision>
</MyDecision>

目前,我运行了一些R代码,但得到了以下错误:

Error: Duplicate identifiers for rows (2, 3, 4)

预期输出是一个类似于下面这样的数据框:

fieldname                                                                   |contents
MyDecision_Decision_DecisionID                                              |X1234
MyDecision_Decision_DecisionReasons_Reason_Description_DOBMismatch          |DOBMismatch
MyDecision_Decision_DecisionReasons_Reason_Description_PrimaryChecksFail    |PrimaryChecksFail
MyDecision_Decision_DecisionReasons_Reason_Description_IncomeReferral       |IncomeReferral

我目前的代码如下:

library(profvis)
library(XML)
library(xml2)
library(plyr)
library(tidyverse)
library(reshape2)
library(foreign)
library(rio)

# Work inside the folder that holds the XML inputs.
setwd("c:/temp/xml/t")

# Empty accumulators. `df` is the seed passed to findchildren() and
# `allxmldata` collects the per-file results; `transposed.df1` is only
# initialised in the code shown here.
allxmldata     <- data.frame()
transposed.df1 <- data.frame()
df             <- data.frame()

# Files to process (a character vector of paths relative to the folder above).
inputfiles <- "test.xml"

# Recursively flatten an XML node set into a long (fieldname, contents) table.
#
# Each childless element in `nodes` becomes one row. Its field name is the
# underscore-joined chain of ancestor names followed by its own name. Elements
# that do have children are descended into, one level per recursive call.
#
# Args:
#   nodes: xml2 node set to inspect at the current depth.
#   df:    data frame of rows gathered so far; new rows are appended to it.
#
# Returns:
#   `df` plus one row per childless element found in `nodes` or below it,
#   with columns `fieldname` and `contents` (empty text stored as NA).
findchildren <- function(nodes, df) {
  # vapply() keeps the result an integer vector even when `nodes` is empty.
  numchild <- vapply(nodes, function(x) length(xml_children(x)), integer(1))

  leaves   <- nodes[numchild == 0]
  xmlvalue <- xml_text(leaves)
  xmlname  <- xml_name(leaves)

  # Ancestor names from the root down, joined with "_".
  xmlpath <- vapply(
    leaves,
    function(x) paste(rev(xml_name(xml_parents(x))), collapse = "_"),
    character(1)
  )

  fieldname <- paste(xmlpath, xmlname, sep = "_")

  # Repeated <Reason> entries all share one path, so their text is appended to
  # keep the field names distinct. The test has to be element-wise: isTRUE()
  # on a comparison of more than one element is always FALSE, which left
  # every Reason row with the same name.
  is_reason <- xmlpath == "MyDecision_Decision_DecisionReasons_Reason"
  fieldname[is_reason] <- paste(
    fieldname[is_reason], xmlvalue[is_reason], sep = "_"
  )

  # Empty element text is recorded as missing.
  contents <- xmlvalue
  contents[!nzchar(contents)] <- NA_character_

  # Always build both columns so a level with no leaves still binds cleanly.
  dftemp <- data.frame(
    fieldname = fieldname,
    contents = contents,
    stringsAsFactors = FALSE
  )
  df <- rbind(df, dftemp)
  print(dim(df))  # progress trace: rows x columns gathered so far

  if (any(numchild > 0)) {
    return(findchildren(xml_children(nodes[numchild > 0]), df))
  }
  df
}

# Flatten each input file into a single wide row and stack the rows.
for (x in inputfiles) {
  df1 <- findchildren(xml_children(read_xml(x)), df)

  # spread() aborts with "Duplicate identifiers" when two rows share a key,
  # so repeated field names get a numeric suffix (_1, _2, ...) instead.
  keys <- as.character(df1$fieldname)
  if (anyDuplicated(keys) > 0) {
    warning(
      "Duplicate field names in ", x, "; suffixing them to keep rows distinct",
      call. = FALSE
    )
    df1$fieldname <- make.unique(keys, sep = "_")
  }

  xml.df1 <- data.frame(
    spread(df1, key = fieldname, value = contents),
    fix.empty.names = TRUE
  )

  # rbind.fill() pads columns missing from either side with NA, so files with
  # different sets of fields can still be stacked.
  allxmldata <- rbind.fill(allxmldata, xml.df1)
}

我希望有人可以指出我做错了什么......

1 个答案:

答案 0 :(得分:0)

我提出了以下适用于您的示例数据集的解决方案。希望这种方法也可以用于更大的数据集。

# we need these two packages
library(xml2)
library(tidyverse)

# Parse the sample document from an inline string (no file is read here).
# The <Reason>/<Description> elements carry the "a:" namespace prefix that is
# declared on <DecisionReasons>.
xml <- read_xml(
  '<MyDecision>
    <Decision>
      <DecisionID>X1234</DecisionID>
      <DecisionReasons xmlns:a="http://schemas.datacontract.org/2004/07/Contracts">
        <a:Reason>
          <a:Description>DOBMismatch</a:Description>
        </a:Reason>
        <a:Reason>
          <a:Description>PrimaryChecksFail</a:Description>
        </a:Reason>
        <a:Reason>
          <a:Description>IncomeReferral</a:Description>
        </a:Reason>
      </DecisionReasons>
  </Decision>
  </MyDecision>'
)

# Select the leaf elements directly. In XPath `*` matches elements only, so
# `*[not(*)]` is every element that has no element children. Asking for
# elements (rather than childless node()s and then their parents) keeps an
# empty element such as <Foo/> under its own name and ignores comment nodes.
leaves <- xml_find_all(xml, ".//*[not(*)]")

# One row per leaf element: its name and its text content.
tibble(name = xml_name(leaves), text = xml_text(leaves))

这会产生以下输出:

# A tibble: 4 x 2
         name              text
        <chr>             <chr>
1  DecisionID             X1234
2 Description       DOBMismatch
3 Description PrimaryChecksFail
4 Description    IncomeReferral