使用R语言从XML数据文件构建表

时间:2019-12-25 12:22:49

标签: r xml dataframe

我是R编程的新手,我有示例XML文件,如下所示

<Attribute ID="GroupSEO" MultiValued="false" ProductMode="Property" FullTextIndexed="false" ExternallyMaintained="false" Derived="false" Mandatory="false">
  <Name>Group SEO Name</Name>
  <Validation BaseType="text" MinValue="" MaxValue="" MaxLength="1024" InputMask=""/>
  <DimensionLink DimensionID="Language"/>
  <MetaData>
    <Value AttributeID="Attribute-Group-Order">1</Value>
    <Value AttributeID="Enterprise-Label">NAV-GR-SEONAME</Value>
    <Value ID="#NAMED" AttributeID="Attribute-Group-Name">#NAMED</Value>
    <Value AttributeID="Enterprise-Description">Navigation Group SEO Name</Value>
    <Value AttributeID="Attribute-Order">3</Value>
  </MetaData>
  <AttributeGroupLink AttributeGroupID="HTCategorizationsNavigation"/>
  <AttributeGroupLink AttributeGroupID="HTDigitalServicesModifyClassifications"/>
  <UserTypeLink UserTypeID="ENT-Group"/>
  <UserTypeLink UserTypeID="NAVGRP"/>
  <UserTypeLink UserTypeID="ENT-SubCategory"/>
  <UserTypeLink UserTypeID="ENT-Category"/>

我想使用R语言将其转换为数据帧。我的预期输出是

##   FullTextIndexed  MultiValued  ProductMode  ExternallyMaintained  Derived  Mandatory  Attribute-Group-Order  Enterprise-Description      UserTypeID 
1         false         false       Property            false          false    false             1     Navigation group seo name   ENT-Group,ENT-Category,..

我已经在互联网上搜索过,但找不到解决我问题的方法。我从网上找到了下面这段代码

# Load the XML package (and methods) used by the call below.
library("XML")
library("methods")
# Switch to the project folder so the relative file name "Sample.xml" resolves.
setwd("E:/Project")
# Try to convert the whole XML file into a data frame in one call.
# The assignment never completes in the run shown below: the error is raised
# here, which is why the later print() reports that 'xmldata' is not found.
xmldata<-xmlToDataFrame("Sample.xml")
print(xmldata)

但是当我执行代码时,出现以下错误

Error in `[<-.data.frame`(`*tmp*`, i, names(nodes[[i]]), value = c(Name = "You YoutubeLink7 (URL)",  : 
duplicate subscripts for columns
In addition: Warning message:
In names(x) == varNames :
longer object length is not a multiple of shorter object length
> print(xmldata)
Error in print(xmldata) : object 'xmldata' not found

有人能帮我理解这个错误的含义,并给出解决我问题的方法吗?格式上如有问题,还请见谅。预先感谢您提供解决方案。

谢谢

2 个答案:

答案 0 :(得分:1)

首先需要格式正确的 XML 数据(在文件末尾补上缺失的 </Attribute> 结束标签)。

<?xml version="1.0" encoding="UTF-8"?>
<Attribute ID="GroupSEO" MultiValued="false" ProductMode="Property" FullTextIndexed="false" ExternallyMaintained="false" Derived="false" Mandatory="false">
  <Name>Group SEO Name</Name>
  <Validation BaseType="text" MinValue="" MaxValue="" MaxLength="1024" InputMask=""/>
  <DimensionLink DimensionID="Language"/>
  <MetaData>
    <Value AttributeID="Attribute-Group-Order">1</Value>
    <Value AttributeID="Enterprise-Label">NAV-GR-SEONAME</Value>
    <Value ID="#NAMED" AttributeID="Attribute-Group-Name">#NAMED</Value>
    <Value AttributeID="Enterprise-Description">Navigation Group SEO Name</Value>
    <Value AttributeID="Attribute-Order">3</Value>
  </MetaData>
  <AttributeGroupLink AttributeGroupID="HTCategorizationsNavigation"/>
  <AttributeGroupLink AttributeGroupID="HTDigitalServicesModifyClassifications"/>
  <UserTypeLink UserTypeID="ENT-Group"/>
  <UserTypeLink UserTypeID="NAVGRP"/>
  <UserTypeLink UserTypeID="ENT-SubCategory"/>
  <UserTypeLink UserTypeID="ENT-Category"/>
</Attribute>

然后,我们使用 XPath 来获取所需的全部信息。请在 htmlParse 这一步中,把路径改成您自己的 XML 文件路径。

library(XML)

# Read the file with htmlParse(); every XPath expression below addresses
# elements and attributes by their lower-case names.
# NOTE(review): htmlParse() is the HTML parser - presumably chosen so that
# names can be matched in lower case; confirm before swapping in xmlParse().
data <- htmlParse("C:/Users/.../yourxmlfile.xml")

# Whitespace-normalised value of one attribute of the <Attribute> element.
attribute_of_root <- function(name) {
  xpathSApply(data, paste0("normalize-space(//attribute/@", name, ")"))
}

# Text content of the <Value> element(s) carrying the given AttributeID.
metadata_text <- function(id) {
  xpathSApply(data, paste0("//value[@attributeid='", id, "']"), xmlValue)
}

# Flags stored as attributes on the root element.
fulltextindexed <- attribute_of_root("fulltextindexed")
multivalued <- attribute_of_root("multivalued")
productmode <- attribute_of_root("productmode")
externallymaintained <- attribute_of_root("externallymaintained")
derived <- attribute_of_root("derived")
mandatory <- attribute_of_root("mandatory")

# Selected metadata entries.
attribute.group.order <- metadata_text("Attribute-Group-Order")
enterprise.description <- metadata_text("Enterprise-Description")

# All UserTypeID values joined into a single "|"-separated string.
user.type.id <- paste(
  xpathSApply(data, "//usertypelink/@usertypeid"),
  collapse = "|"
)

# Assemble the result; column names are taken from the variable names.
df <- data.frame(
  fulltextindexed,
  multivalued,
  productmode,
  externallymaintained,
  derived,
  mandatory,
  attribute.group.order,
  enterprise.description,
  user.type.id
)

结果:

enter image description here

答案 1 :(得分:1)

使用 tidyverse 和 xml2

数据

# read_xml() comes from xml2; load it so this snippet runs on its own.
library(xml2)

# Parse the sample document from an inline string (note the closing
# </Attribute> tag, which makes the XML well formed).
data <- read_xml('<Attribute ID="GroupSEO" MultiValued="false" ProductMode="Property" FullTextIndexed="false" ExternallyMaintained="false" Derived="false" Mandatory="false">
  <Name>Group SEO Name</Name>
  <Validation BaseType="text" MinValue="" MaxValue="" MaxLength="1024" InputMask=""/>
  <DimensionLink DimensionID="Language"/>
  <MetaData>
    <Value AttributeID="Attribute-Group-Order">1</Value>
    <Value AttributeID="Enterprise-Label">NAV-GR-SEONAME</Value>
    <Value ID="#NAMED" AttributeID="Attribute-Group-Name">#NAMED</Value>
    <Value AttributeID="Enterprise-Description">Navigation Group SEO Name</Value>
    <Value AttributeID="Attribute-Order">3</Value>
  </MetaData>
  <AttributeGroupLink AttributeGroupID="HTCategorizationsNavigation"/>
  <AttributeGroupLink AttributeGroupID="HTDigitalServicesModifyClassifications"/>
  <UserTypeLink UserTypeID="ENT-Group"/>
  <UserTypeLink UserTypeID="NAVGRP"/>
  <UserTypeLink UserTypeID="ENT-SubCategory"/>
  <UserTypeLink UserTypeID="ENT-Category"/>
</Attribute>')

代码

# Packages this snippet relies on: xml2 for the xml_*() functions, tidyverse
# for %>%, map()/map_df() and tibble().
library(tidyverse)
library(xml2)

# For attribute tag: one row per <Attribute> element, one column per XML
# attribute found on it.
Attributes <- xml_find_all(data, "//Attribute") %>%
  map(xml_attrs) %>%
  map_df(~ as.list(.x))

# Find all <Value> nodes in the document and read their AttributeID once.
nodes <- xml_find_all(data, "//Value")
value_ids <- xml_attr(nodes, "AttributeID")

# %in% rather than ==: a <Value> without an AttributeID yields NA, which
# %in% treats as "no match" instead of producing an NA index.
AGO <- nodes[value_ids %in% "Attribute-Group-Order"]
Attributes["Attribute-Group-Order"] <- xml_text(AGO)

ED <- nodes[value_ids %in% "Enterprise-Description"]
Attributes["Enterprise-Description"] <- xml_text(ED)

# UserTypeLink tags: join every UserTypeID into one comma-separated string.
# toString() is called once on the whole vector; passing the per-row value as
# an extra argument (as a row-wise map would) lets it leak into the result.
UserTypeLink <- tibble(
  UserTypeID = toString(
    xml_attr(xml_find_all(data, "//UserTypeLink"), "UserTypeID")
  )
)

# Final output: the attribute columns and the joined UserTypeID side by side.
cbind(Attributes, UserTypeLink)