我的XML文件的简化版本如下:
<?xml version="1.0" encoding="UTF-8"?>
<MzIdentML id="MS-GF+">
<SequenceCollection xmlns="http://psidev.info/psi/pi/mzIdentML/1.1">
<DBSequence length="146" id="DBSeq143">
<cvParam cvRef="PSI-MS" accession="MS:1001088"></cvParam>
</DBSequence>
<Peptide id="Pep7">
<PeptideSequence>MFLSFPTTK</PeptideSequence>
<Modification location="1" monoisotopicMassDelta="15.994915">
<cvParam cvRef="UNIMOD" accession="UNIMOD:35" name="Oxidation"></cvParam>
</Modification>
</Peptide>
<PeptideEvidence dBSequence_ref="DBSeq143" id="PepEv_160_1_18"></PeptideEvidence>
<PeptideEvidence dBSequence_ref="DBSeq143" id="PepEv_275_8_133"></PeptideEvidence>
</SequenceCollection>
</MzIdentML>
我想分别获取DBSequence、Peptide和PeptideEvidence的详细信息,并且要同时包含父元素和子元素(以及嵌套的子元素,如果有的话)的属性。换句话说,我希望每个部分中的所有属性都以键值对的形式输出,如下所示:
----------------------------------------------------------------------
<DBSequence length="146" id="DBSeq143">
<cvParam cvRef="PSI-MS" accession="MS:1001088"></cvParam>
</DBSequence>
----------------------------------------------------------------------
<Peptide id="Pep7">
<PeptideSequence>MFLSFPTTK</PeptideSequence>
<Modification location="1" monoisotopicMassDelta="15.994915">
<cvParam cvRef="UNIMOD" accession="UNIMOD:35" name="Oxidation"></cvParam>
</Modification>
</Peptide>
----------------------------------------------------------------------
<PeptideEvidence dBSequence_ref="DBSeq143" id="PepEv_160_1_18"></PeptideEvidence>
<PeptideEvidence dBSequence_ref="DBSeq143" id="PepEv_275_8_133"></PeptideEvidence>
----------------------------------------------------------------------
例如,如果我们考虑<DBSequence>
部分:
<DBSequence length="146" id="DBSeq143">
<cvParam cvRef="PSI-MS" accession="MS:1001088"></cvParam>
</DBSequence>
应输出为:
DBSequence=>length=146;id=DBSeq143;cvRef=PSI-MS;accession=MS:1001088;
这是我用SAX编写的代码:
package lucene.parse;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler that prints the attributes of every {@code DBSequence} element
 * (together with those of the {@code cvParam} elements nested inside it) and of
 * every {@code PeptideEvidence} element as {@code name=value;} pairs.
 *
 * <p>Parsing runs inside the constructor; each section is written to
 * {@code System.out} as {@code <tag>=><pairs>} when its closing tag is reached.
 */
public class MzIdentMLSAXParser extends DefaultHandler {
    /** True while the parser is between a DBSequence start tag and its end tag. */
    private boolean isDBsequence = false;
    /** Attribute pairs collected for the DBSequence currently being parsed. */
    String DBSequenceSection;
    /** Attribute pairs collected for the most recent PeptideEvidence element. */
    String PeptideEvidenceDocument;

    public static void main(String[] argv) throws SAXException, ParserConfigurationException, IOException {
        // Constructing the handler parses the file and prints the sections.
        new MzIdentMLSAXParser("file_path_here/sample.xml");
    }

    /**
     * Parses the XML file at the given path with this object as the SAX handler.
     *
     * @param dataDir path of the XML file to parse
     * @throws FileNotFoundException if the file cannot be opened
     * @throws SAXException if the parser reports an error
     * @throws ParserConfigurationException if no SAX parser can be created
     * @throws IOException if reading or closing the file fails
     */
    public MzIdentMLSAXParser(String dataDir) throws FileNotFoundException, SAXException, ParserConfigurationException, IOException {
        // try-with-resources closes the stream even when parsing throws.
        try (FileInputStream fis = new FileInputStream(dataDir)) {
            SAXParserFactory spf = SAXParserFactory.newInstance();
            SAXParser parser = spf.newSAXParser();
            parser.parse(fis, this);
        }
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
        if (qName.equals("DBSequence")) {
            // Each new DBSequence starts a fresh section.
            DBSequenceSection = formatAttributes(atts);
            isDBsequence = true;
        } else if (qName.equals("cvParam") && isDBsequence) {
            // Only cvParam elements nested in a DBSequence are collected;
            // cvParam can also appear under other parents.
            DBSequenceSection += formatAttributes(atts);
        } else if (qName.equals("PeptideEvidence")) {
            // Each new PeptideEvidence starts a fresh document.
            PeptideEvidenceDocument = formatAttributes(atts);
        }
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (qName.equals("DBSequence")) {
            System.out.println(qName + "=>" + DBSequenceSection);
            isDBsequence = false;
        } else if (qName.equals("PeptideEvidence")) {
            System.out.println(qName + "=>" + PeptideEvidenceDocument);
        }
    }

    /** Renders every attribute as {@code qName=value;} in the order the parser reports them. */
    private static String formatAttributes(Attributes atts) {
        StringBuilder pairs = new StringBuilder();
        for (int i = 0; i < atts.getLength(); i++) {
            pairs.append(atts.getQName(i)).append('=').append(atts.getValue(i)).append(';');
        }
        return pairs.toString();
    }
}
有没有更简单的方法?因为我有很多这样带嵌套节点的标签。这里的难点在于<cvParam>
不仅出现在<DBSequence>
标记中,还出现在<Modification>
等其他标记中。我也尝试过使用StAX,但没有成功。
答案 0 :(得分:0)
以下是使用StAX的工作示例。 StAX在解析已知的XML结构时表现出色,但也可以用于动态解析。
此代码依赖于对XML结构的了解,例如知道我们想要`DBSequence`、`Peptide`和`PeptideEvidence`的内容,并且知道`PeptideSequence`有文本内容,而其他元素则没有。
这些方法使用递归来遵循XML的结构。
<strong>输出</strong>
/**
 * Demo entry point: builds the sample mzIdentML document in memory, opens a
 * StAX reader on it, advances to the root element and hands the reader to
 * {@code search}. The reader is closed whether or not the search succeeds.
 */
public static void main(String[] args) throws Exception {
    // One array element per line of the document; joined with '\n' below.
    String document = String.join("\n",
            "<?xml version=\"1.0\" encoding=\"UTF-8\"?>",
            "<MzIdentML id=\"MS-GF+\">",
            " <SequenceCollection xmlns=\"http://psidev.info/psi/pi/mzIdentML/1.1\">",
            " <DBSequence length=\"146\" id=\"DBSeq143\">",
            " <cvParam cvRef=\"PSI-MS\" accession=\"MS:1001088\"></cvParam>",
            " </DBSequence>",
            " <Peptide id=\"Pep7\">",
            " <PeptideSequence>MFLSFPTTK</PeptideSequence>",
            " <Modification location=\"1\" monoisotopicMassDelta=\"15.994915\">",
            " <cvParam cvRef=\"UNIMOD\" accession=\"UNIMOD:35\" name=\"Oxidation\"></cvParam>",
            " </Modification>",
            " </Peptide>",
            " <PeptideEvidence dBSequence_ref=\"DBSeq143\" id=\"PepEv_160_1_18\"></PeptideEvidence>",
            " <PeptideEvidence dBSequence_ref=\"DBSeq143\" id=\"PepEv_275_8_133\"></PeptideEvidence>",
            " </SequenceCollection>",
            "</MzIdentML>");
    XMLInputFactory factory = XMLInputFactory.newFactory();
    XMLStreamReader xmlReader = factory.createXMLStreamReader(new StringReader(document));
    try {
        // Move onto the root START_ELEMENT, which is what search() expects.
        xmlReader.nextTag();
        search(xmlReader);
    } finally {
        xmlReader.close();
    }
}
/**
 * Walks the children of the current element. For every {@code DBSequence},
 * {@code Peptide} or {@code PeptideEvidence} child it gathers the properties
 * via {@code collectProps} and prints them as {@code name: {props}}; any other
 * child is searched recursively.
 *
 * <p>The reader must be on a START_ELEMENT upon entry and is on the matching
 * END_ELEMENT on return.
 */
private static void search(XMLStreamReader reader) throws XMLStreamException {
    assert reader.getEventType() == XMLStreamConstants.START_ELEMENT;
    for (int event = reader.nextTag();
            event == XMLStreamConstants.START_ELEMENT;
            event = reader.nextTag()) {
        String tag = reader.getLocalName();
        boolean isSection = "DBSequence".equals(tag)
                || "Peptide".equals(tag)
                || "PeptideEvidence".equals(tag);
        if (!isSection) {
            // Not a section of interest: look for sections among its descendants.
            search(reader);
            continue;
        }
        Map<String, String> collected = new LinkedHashMap<>();
        collectProps(reader, collected);
        System.out.println(tag + ": " + collected);
    }
}
/**
 * Adds the attributes of the current element, and recursively those of all its
 * descendants, to {@code props} keyed by attribute local name. A
 * {@code PeptideSequence} element additionally contributes its text content
 * under its own element name. Later entries overwrite earlier ones that share
 * a key.
 *
 * <p>The reader must be on a START_ELEMENT upon entry and is on the matching
 * END_ELEMENT on return.
 */
private static void collectProps(XMLStreamReader reader, Map<String, String> props) throws XMLStreamException {
    assert reader.getEventType() == XMLStreamConstants.START_ELEMENT;
    int attributeCount = reader.getAttributeCount();
    for (int idx = 0; idx < attributeCount; idx++) {
        props.put(reader.getAttributeLocalName(idx), reader.getAttributeValue(idx));
    }
    String tag = reader.getLocalName();
    if ("PeptideSequence".equals(tag)) {
        // The only element treated as having text content; getElementText()
        // consumes up to the element's END_ELEMENT.
        props.put(tag, reader.getElementText());
        return;
    }
    while (reader.nextTag() == XMLStreamConstants.START_ELEMENT) {
        collectProps(reader, props);
    }
}