将XML节点转换为数据帧

时间:2016-10-13 19:58:36

标签: r xml

我正在使用XML文件,我设法从中提取了一组我感兴趣的节点:

# Parse the XML file into an in-memory document (xmlParse is from the XML package).
XMLData  <- xmlParse("MX_Test_K_Charge_2-6.xml")
# Select every <Mono> element anywhere in the parsed document. The query must
# run against the object created above (`XMLData`).
XMLNodes <- getNodeSet(XMLData, "//Mono")

我得到一组看起来像这样的节点:

[[1]]
<Mono MonoisoMass="3333.7954" SumIntensity="31795.6617" AveragineMass="3334.7653">
  <Charged>
    <Match Chg="5" MoiM="3332.8416" avgMoiM="3332.7844" MoiMz="667.5756" calcMoiMz="667.56261" MabMz="667.7632" MabInt="5995" Dev="15.65" Scale="1.087"/>
    <Match Chg="6" MoiM="3332.7776" avgMoiM="3332.7768" MoiMz="556.4702" calcMoiMz="556.47006" MabMz="556.6372" MabInt="5554" Dev="2.02" Scale="0.999"/>
    <Match Chg="4" MoiM="3332.7775" avgMoiM="3332.7774" MoiMz="834.2016" calcMoiMz="834.20145" MabMz="834.4524" MabInt="5613" Dev="2.00" Scale="0.999"/>
    <Match Chg="3" MoiM="3332.7760" avgMoiM="3332.7765" MoiMz="1111.9326" calcMoiMz="1111.93284" MabMz="1112.2676" MabInt="5501" Dev="2.17" Scale="1.001"/>
    <Match Chg="2" MoiM="3332.7786" avgMoiM="3332.7770" MoiMz="1667.3966" calcMoiMz="1667.39562" MabMz="1667.8966" MabInt="5574" Dev="1.40" Scale="0.999"/>
  </Charged>
</Mono> 

[[2]]
<Mono MonoisoMass="2667.3004" SumIntensity="24359.4426" AveragineMass="2667.3000">
  <Charged>
    <Match Chg="2" MoiM="2666.2969" avgMoiM="2666.2968" MoiMz="1334.1557" calcMoiMz="1334.15581" MabMz="1334.6573" MabInt="5596" Dev="5.32" Scale="0.971"/>
    <Match Chg="3" MoiM="2666.2970" avgMoiM="2666.2969" MoiMz="889.7730" calcMoiMz="889.77296" MabMz="890.1073" MabInt="5482" Dev="7.28" Scale="0.971"/>
    <Match Chg="4" MoiM="2666.2733" avgMoiM="2666.2926" MoiMz="667.5756" calcMoiMz="667.58154" MabMz="667.8324" MabInt="4790" Dev="33.98" Scale="0.854"/>
    <Match Chg="5" MoiM="2666.2976" avgMoiM="2666.2976" MoiMz="534.2668" calcMoiMz="534.26669" MabMz="534.4674" MabInt="5437" Dev="7.52" Scale="0.970"/>
    <Match Chg="6" MoiM="2666.2977" avgMoiM="2666.2975" MoiMz="445.3902" calcMoiMz="445.39012" MabMz="445.5574" MabInt="5442" Dev="7.62" Scale="0.970"/>
  </Charged>
</Mono> 

[[3]]
<Mono MonoisoMass="2000.8526" SumIntensity="20204.6338" AveragineMass="1999.8456">
  <Charged>
    <Match Chg="6" MoiM="1999.8456" avgMoiM="1999.8451" MoiMz="334.3149" calcMoiMz="334.31481" MabMz="334.4820" MabInt="5115" Dev="6.90" Scale="0.954"/>
    <Match Chg="5" MoiM="1999.8456" avgMoiM="1999.8452" MoiMz="400.9764" calcMoiMz="400.97632" MabMz="401.1769" MabInt="5040" Dev="6.63" Scale="0.956"/>
    <Match Chg="4" MoiM="1999.8457" avgMoiM="1999.8452" MoiMz="500.9687" calcMoiMz="500.96858" MabMz="501.2194" MabInt="5014" Dev="6.88" Scale="0.954"/>
    <Match Chg="3" MoiM="1999.8453" avgMoiM="1999.8480" MoiMz="667.6224" calcMoiMz="667.62234" MabMz="667.9603" MabInt="4353" Dev="3.44" Scale="1.010"/>
    <Match Chg="2" MoiM="1999.8450" avgMoiM="1999.8447" MoiMz="1000.9298" calcMoiMz="1000.92988" MabMz="1001.4312" MabInt="5030" Dev="4.91" Scale="0.955"/>
  </Charged>
</Mono> 

attr(,"class")
[1] "XMLNodeSet"

现在我想把这组节点转换为数据框,使其包含以下列:MonoisoMass、SumIntensity、AveragineMass(在父节点 Mono 中作为属性出现),以及 Chg、MoiM、avgMoiM、MoiMz、calcMoiMz、MabMz、MabInt、Dev、Scale(在子节点 Match 中作为属性出现)。

如果这个问题之前已经有人问过,我很抱歉,但我发现XML包的文档非常复杂,难以理解。如果您能向我推荐一本以通俗易懂的方式讲解如何在R中处理XML文件的教程或书籍,我将不胜感激。

1 个答案:

答案 0 :(得分:1)

您可以执行以下操作:

# library() stops with an error if a package is missing; require() would only
# return FALSE and let the script fail later with a less obvious message.
library(tidyverse)
library(xml2)

# `dat` is a list of parsed xml2 documents.
# For each document, collect its <Mono> elements as a node set.
mono <- dat %>% map(xml_find_all, "//Mono")

# Attributes of every <Mono>, each turned into a one-row tibble
# (named character vector -> 1 x k matrix -> tibble).
parent_dat <- mono %>%
  map(xml_attrs) %>%
  flatten() %>%
  map(~t(.) %>% as_tibble)

# Attributes of the <Match> descendants: one tibble per element of `mono`,
# one row per <Match>. The leading "." keeps the search relative to the <Mono>
# context nodes; a bare "//Match" is an absolute path that always searches the
# whole document regardless of the context node.
child_dat <- mono %>%
  map(xml_find_all, ".//Match") %>%
  map(xml_attrs) %>%
  map(~map_df(., ~t(.) %>% as_tibble))

# Column-bind each parent row onto its block of child rows, stack the results,
# and let type_convert() turn the character columns into numbers.
# NOTE(review): the pairing assumes one <Mono> per document, as in the sample
# data -- `parent_dat` has one entry per <Mono>, `child_dat` one per document.
map2_df(parent_dat, child_dat, cbind) %>% type_convert()

得到的结果如下:

   MonoisoMass SumIntensity AveragineMass Chg     MoiM  avgMoiM     MoiMz calcMoiMz     MabMz MabInt   Dev Scale
1     3333.795     31795.66      3334.765   5 3332.842 3332.784  667.5756  667.5626  667.7632   5995 15.65 1.087
2     3333.795     31795.66      3334.765   6 3332.778 3332.777  556.4702  556.4701  556.6372   5554  2.02 0.999
3     3333.795     31795.66      3334.765   4 3332.778 3332.777  834.2016  834.2015  834.4524   5613  2.00 0.999
4     3333.795     31795.66      3334.765   3 3332.776 3332.776 1111.9326 1111.9328 1112.2676   5501  2.17 1.001
5     3333.795     31795.66      3334.765   2 3332.779 3332.777 1667.3966 1667.3956 1667.8966   5574  1.40 0.999
6     2667.300     24359.44      2667.300   2 2666.297 2666.297 1334.1557 1334.1558 1334.6573   5596  5.32 0.971
7     2667.300     24359.44      2667.300   3 2666.297 2666.297  889.7730  889.7730  890.1073   5482  7.28 0.971
8     2667.300     24359.44      2667.300   4 2666.273 2666.293  667.5756  667.5815  667.8324   4790 33.98 0.854
9     2667.300     24359.44      2667.300   5 2666.298 2666.298  534.2668  534.2667  534.4674   5437  7.52 0.970
10    2667.300     24359.44      2667.300   6 2666.298 2666.298  445.3902  445.3901  445.5574   5442  7.62 0.970

假设您的数据如下所示:

# Sample document 1: a single <Mono> element whose <Charged> child holds five
# <Match> elements; all values are stored as XML attributes.
txt1 <- '<Mono MonoisoMass="3333.7954" SumIntensity="31795.6617" AveragineMass="3334.7653">
  <Charged>
    <Match Chg="5" MoiM="3332.8416" avgMoiM="3332.7844" MoiMz="667.5756" calcMoiMz="667.56261" MabMz="667.7632" MabInt="5995" Dev="15.65" Scale="1.087"/>
    <Match Chg="6" MoiM="3332.7776" avgMoiM="3332.7768" MoiMz="556.4702" calcMoiMz="556.47006" MabMz="556.6372" MabInt="5554" Dev="2.02" Scale="0.999"/>
    <Match Chg="4" MoiM="3332.7775" avgMoiM="3332.7774" MoiMz="834.2016" calcMoiMz="834.20145" MabMz="834.4524" MabInt="5613" Dev="2.00" Scale="0.999"/>
    <Match Chg="3" MoiM="3332.7760" avgMoiM="3332.7765" MoiMz="1111.9326" calcMoiMz="1111.93284" MabMz="1112.2676" MabInt="5501" Dev="2.17" Scale="1.001"/>
    <Match Chg="2" MoiM="3332.7786" avgMoiM="3332.7770" MoiMz="1667.3966" calcMoiMz="1667.39562" MabMz="1667.8966" MabInt="5574" Dev="1.40" Scale="0.999"/>
  </Charged>
</Mono> '

# Sample document 2: same structure as the first sample -- one <Mono> element
# with five <Match> elements inside <Charged>.
txt2 <- '<Mono MonoisoMass="2667.3004" SumIntensity="24359.4426" AveragineMass="2667.3000">
  <Charged>
<Match Chg="2" MoiM="2666.2969" avgMoiM="2666.2968" MoiMz="1334.1557" calcMoiMz="1334.15581" MabMz="1334.6573" MabInt="5596" Dev="5.32" Scale="0.971"/>
<Match Chg="3" MoiM="2666.2970" avgMoiM="2666.2969" MoiMz="889.7730" calcMoiMz="889.77296" MabMz="890.1073" MabInt="5482" Dev="7.28" Scale="0.971"/>
<Match Chg="4" MoiM="2666.2733" avgMoiM="2666.2926" MoiMz="667.5756" calcMoiMz="667.58154" MabMz="667.8324" MabInt="4790" Dev="33.98" Scale="0.854"/>
<Match Chg="5" MoiM="2666.2976" avgMoiM="2666.2976" MoiMz="534.2668" calcMoiMz="534.26669" MabMz="534.4674" MabInt="5437" Dev="7.52" Scale="0.970"/>
<Match Chg="6" MoiM="2666.2977" avgMoiM="2666.2975" MoiMz="445.3902" calcMoiMz="445.39012" MabMz="445.5574" MabInt="5442" Dev="7.62" Scale="0.970"/>
</Charged>
</Mono> '



# Parse each sample string into its own xml2 document; `dat` is an unnamed
# list holding the two parsed documents, in the order txt1, txt2.
dat <- lapply(list(txt1, txt2), read_xml)