perl查找包含字符串1的行,然后在字符串2和3之间提取字符

时间:2016-11-03 14:49:09

标签: arrays regex perl file

您好我编写了以下脚本来提取包含字符串列表之一的行:

#!/usr/bin/perl

use strict;
use warnings;

# Print every line of the given file that mentions one of the two
# annotation tags of interest.
#
# Usage: script.pl FILE

# Insist on a file name: a three-argument open() given undef as the path
# opens an anonymous temporary file instead of failing, so a forgotten
# argument would otherwise silently produce no output at all.
my $file_name = shift @ARGV;
die "Usage: $0 FILE\n" unless defined $file_name;

# Name the file in the message so a failed open is easy to diagnose.
open(my $file, '<', $file_name)
    or die "Cannot open '$file_name': $!";

while (my $line = <$file>) {
    # Non-capturing group: the match is only tested, nothing is extracted.
    print $line
        if $line =~ /textsem:(?:DiseaseDisorderMention|SignSymptomMention)/;
}

close $file;

输出结果如下:

<textsem:DiseaseDisorderMention xmi:id="278" sofa="6" begin="56" end="65" id="6" ontologyConceptArr="221 251 261 231 241" typeID="2" segmentID="SIMPLE_SEGMENT" discoveryTechnique="1" confidence="0.0" polarity="1" uncertainty="0" conditional="false" generic="false" subject="patient" historyOf="1"/>
<textsem:DiseaseDisorderMention xmi:id="421" sofa="6" begin="18" end="26" id="0" ontologyConceptArr="359 399 369 339 309 349 389 379 319 329" typeID="2" segmentID="SIMPLE_SEGMENT" discoveryTechnique="1" confidence="0.0" polarity="1" uncertainty="0" conditional="false" generic="false" subject="patient" historyOf="0"/>
<textsem:SignSymptomMention xmi:id="520" sofa="6" begin="38" end="55" id="2" ontologyConceptArr="492 462 472 502 452 482" typeID="3" segmentID="SIMPLE_SEGMENT" discoveryTechnique="1" confidence="0.0" polarity="1" uncertainty="0" conditional="false" generic="false" subject="patient" historyOf="0"/>
<textsem:SignSymptomMention xmi:id="563" sofa="6" begin="45" end="52" id="5" ontologyConceptArr="550" typeID="3" segmentID="SIMPLE_SEGMENT" discoveryTechnique="1" confidence="0.0" polarity="1" uncertainty="0" conditional="false" generic="false" subject="patient" historyOf="0"/>
<textsem:SignSymptomMention xmi:id="606" sofa="6" begin="45" end="55" id="4" ontologyConceptArr="593" typeID="3" segmentID="SIMPLE_SEGMENT" discoveryTechnique="1" confidence="0.0" polarity="1" uncertainty="0" conditional="false" generic="false" subject="patient" historyOf="0"/>
<textsem:SignSymptomMention xmi:id="704" sofa="6" begin="38" end="52" id="3" ontologyConceptArr="686 666 676 646 656 636" typeID="3" segmentID="SIMPLE_SEGMENT" discoveryTechnique="1" confidence="0.0" polarity="1" uncertainty="0" conditional="false" generic="false" subject="patient" historyOf="0"/>
<textsem:SignSymptomMention xmi:id="758" sofa="6" begin="38" end="65" id="1" ontologyConceptArr="744 734" typeID="3" segmentID="SIMPLE_SEGMENT" discoveryTechnique="1" confidence="0.0" polarity="1" uncertainty="0" conditional="false" generic="false" subject="patient" historyOf="0"/>

你可以看到我正在提取一些xml(实际上它是一个xmi文件,似乎与XML::Simple不太兼容)。

我希望将其中一些属性(如 id="278"、sofa="6"、begin="56"、segmentID="SIMPLE_SEGMENT")存储到一个数组的数组中,而不是仅仅打印这些行,类似于:

@mentions =  ( [ 278, 6, 56, "SIMPLE_SEGMENT" ],
               [ 421, 6, 18, "SIMPLE_SEGMENT" ],
               [ 520, 6, 38, "SIMPLE_SEGMENT" ]
                      .
                      .
                      .
             );

我只是不知道如何以编程方式执行此操作。我该怎么做?

完整的XMI文件:

<?xml version="1.0" encoding="UTF-8"?><xmi:XMI xmlns:syntax="http:///org/apache/ctakes/typesystem/type/syntax.ecore" xmlns:refsem="http:///org/apache/ctakes/typesystem/type/refsem.ecore" xmlns:cas="http:///uima/cas.ecore" xmlns:xmi="http://www.omg.org/XMI" xmlns:util="http:///org/apache/ctakes/typesystem/type/util.ecore" xmlns:textspan="http:///org/apache/ctakes/typesystem/type/textspan.ecore" xmlns:assertion="http:///org/apache/ctakes/typesystem/type/temporary/assertion.ecore" xmlns:structured="http:///org/apache/ctakes/typesystem/type/structured.ecore" xmlns:relation="http:///org/apache/ctakes/typesystem/type/relation.ecore" xmlns:textsem="http:///org/apache/ctakes/typesystem/type/textsem.ecore" xmlns:type="http:///org/apache/ctakes/drugner/type.ecore" xmlns:tcas="http:///uima/tcas.ecore" xmi:version="2.0">
    <cas:NULL xmi:id="0"/>
    <cas:Sofa xmi:id="6" sofaNum="1" sofaID="_InitialView" mimeType="text" sofaString="Mary doesn't have epilepsy, she has a family history of migraines."/>
    <tcas:DocumentAnnotation xmi:id="1" sofa="6" begin="0" end="66" language="en"/>
    <textspan:Segment xmi:id="13" sofa="6" begin="0" end="66" id="SIMPLE_SEGMENT"/>
    <textspan:Sentence xmi:id="19" sofa="6" begin="0" end="66" sentenceNumber="0"/>
    <syntax:WordToken xmi:id="25" sofa="6" begin="0" end="4" tokenNumber="0" normalizedForm="Mary" partOfSpeech="NNP" capitalization="1" numPosition="0" canonicalForm="Mary"/>
    <syntax:WordToken xmi:id="37" sofa="6" begin="5" end="9" tokenNumber="1" normalizedForm="do" partOfSpeech="VBZ" capitalization="0" numPosition="0" canonicalForm="do"/>
    <syntax:WordToken xmi:id="57" sofa="6" begin="13" end="17" tokenNumber="3" normalizedForm="have" partOfSpeech="VB" capitalization="0" numPosition="0" canonicalForm="have"/>
    <syntax:WordToken xmi:id="69" sofa="6" begin="18" end="26" tokenNumber="4" normalizedForm="epilepsy" partOfSpeech="NN" capitalization="0" numPosition="0" canonicalForm="epilepsy"/>
    <syntax:WordToken xmi:id="89" sofa="6" begin="28" end="31" tokenNumber="6" normalizedForm="she" partOfSpeech="PRP" capitalization="0" numPosition="0" canonicalForm="she"/>
    <syntax:WordToken xmi:id="101" sofa="6" begin="32" end="35" tokenNumber="7" normalizedForm="HA" partOfSpeech="VBZ" capitalization="0" numPosition="0" canonicalForm="HA"/>
    <syntax:WordToken xmi:id="113" sofa="6" begin="36" end="37" tokenNumber="8" normalizedForm="A" partOfSpeech="DT" capitalization="0" numPosition="0" canonicalForm="A"/>
    <syntax:WordToken xmi:id="125" sofa="6" begin="38" end="44" tokenNumber="9" normalizedForm="family" partOfSpeech="NN" capitalization="0" numPosition="0" canonicalForm="family"/>
    <syntax:WordToken xmi:id="137" sofa="6" begin="45" end="52" tokenNumber="10" normalizedForm="history" partOfSpeech="NN" capitalization="0" numPosition="0" canonicalForm="history"/>
    <syntax:WordToken xmi:id="149" sofa="6" begin="53" end="55" tokenNumber="11" normalizedForm="of" partOfSpeech="IN" capitalization="0" numPosition="0"/>
    <syntax:WordToken xmi:id="161" sofa="6" begin="56" end="65" tokenNumber="12" normalizedForm="migraine" partOfSpeech="NNS" capitalization="0" numPosition="0" canonicalForm="migraine"/>
    <syntax:ContractionToken xmi:id="49" sofa="6" begin="9" end="12" tokenNumber="2" normalizedForm="n't" partOfSpeech="RB"/>
    <syntax:PunctuationToken xmi:id="81" sofa="6" begin="26" end="27" tokenNumber="5" normalizedForm="," partOfSpeech=","/>
    <syntax:PunctuationToken xmi:id="173" sofa="6" begin="65" end="66" tokenNumber="13" normalizedForm="." partOfSpeech="."/>
    <syntax:NP xmi:id="181" sofa="6" begin="0" end="4" chunkType="NP"/>
    <syntax:NP xmi:id="191" sofa="6" begin="18" end="26" chunkType="NP"/>
    <syntax:NP xmi:id="201" sofa="6" begin="28" end="31" chunkType="NP"/>
    <syntax:NP xmi:id="211" sofa="6" begin="36" end="65" chunkType="NP"/>
    <syntax:NP xmi:id="216" sofa="6" begin="56" end="65" chunkType="NP"/>
    <syntax:VP xmi:id="186" sofa="6" begin="5" end="17" chunkType="VP"/>
    <syntax:VP xmi:id="206" sofa="6" begin="32" end="35" chunkType="VP"/>
    <syntax:O xmi:id="196" sofa="6" begin="26" end="27" chunkType="O"/>
    <textsem:DiseaseDisorderMention xmi:id="278" sofa="6" begin="56" end="65" id="6" ontologyConceptArr="221 251 261 231 241" typeID="2" segmentID="SIMPLE_SEGMENT" discoveryTechnique="1" confidence="0.0" polarity="1" uncertainty="0" conditional="false" generic="false" subject="patient" historyOf="1"/>
    <textsem:DiseaseDisorderMention xmi:id="421" sofa="6" begin="18" end="26" id="0" ontologyConceptArr="359 399 369 339 309 349 389 379 319 329" typeID="2" segmentID="SIMPLE_SEGMENT" discoveryTechnique="1" confidence="0.0" polarity="1" uncertainty="0" conditional="false" generic="false" subject="patient" historyOf="0"/>
    <textsem:SignSymptomMention xmi:id="520" sofa="6" begin="38" end="55" id="2" ontologyConceptArr="492 462 472 502 452 482" typeID="3" segmentID="SIMPLE_SEGMENT" discoveryTechnique="1" confidence="0.0" polarity="1" uncertainty="0" conditional="false" generic="false" subject="patient" historyOf="0"/>
    <textsem:SignSymptomMention xmi:id="563" sofa="6" begin="45" end="52" id="5" ontologyConceptArr="550" typeID="3" segmentID="SIMPLE_SEGMENT" discoveryTechnique="1" confidence="0.0" polarity="1" uncertainty="0" conditional="false" generic="false" subject="patient" historyOf="0"/>
    <textsem:SignSymptomMention xmi:id="606" sofa="6" begin="45" end="55" id="4" ontologyConceptArr="593" typeID="3" segmentID="SIMPLE_SEGMENT" discoveryTechnique="1" confidence="0.0" polarity="1" uncertainty="0" conditional="false" generic="false" subject="patient" historyOf="0"/>
    <textsem:SignSymptomMention xmi:id="704" sofa="6" begin="38" end="52" id="3" ontologyConceptArr="686 666 676 646 656 636" typeID="3" segmentID="SIMPLE_SEGMENT" discoveryTechnique="1" confidence="0.0" polarity="1" uncertainty="0" conditional="false" generic="false" subject="patient" historyOf="0"/>
    <textsem:SignSymptomMention xmi:id="758" sofa="6" begin="38" end="65" id="1" ontologyConceptArr="744 734" typeID="3" segmentID="SIMPLE_SEGMENT" discoveryTechnique="1" confidence="0.0" polarity="1" uncertainty="0" conditional="false" generic="false" subject="patient" historyOf="0"/>
    <syntax:ConllDependencyNode xmi:id="792" sofa="6" begin="0" end="66" id="0"/>
    <syntax:ConllDependencyNode xmi:id="806" sofa="6" begin="0" end="4" id="1" form="Mary" lemma="mary" cpostag="NNP" postag="NNP" feats="_" head="848" deprel="nsubj" pdeprel="_"/>
    <syntax:ConllDependencyNode xmi:id="820" sofa="6" begin="5" end="9" id="2" form="does" lemma="do" cpostag="VBZ" postag="VBZ" feats="_" head="848" deprel="aux" pdeprel="_"/>
    <syntax:ConllDependencyNode xmi:id="834" sofa="6" begin="9" end="12" id="3" form="n't" lemma="not" cpostag="RB" postag="RB" feats="_" head="848" deprel="neg" pdeprel="_"/>
    <syntax:ConllDependencyNode xmi:id="848" sofa="6" begin="13" end="17" id="4" form="have" lemma="have" cpostag="VB" postag="VB" feats="_" head="904" deprel="ccomp" pdeprel="_"/>
    <syntax:ConllDependencyNode xmi:id="862" sofa="6" begin="18" end="26" id="5" form="epilepsy" lemma="epilepsy" cpostag="NN" postag="NN" feats="_" head="848" deprel="dobj" pdeprel="_"/>
    <syntax:ConllDependencyNode xmi:id="876" sofa="6" begin="26" end="27" id="6" form="," lemma="," cpostag="," postag="," feats="_" head="904" deprel="punct" pdeprel="_"/>
    <syntax:ConllDependencyNode xmi:id="890" sofa="6" begin="28" end="31" id="7" form="she" lemma="she" cpostag="PRP" postag="PRP" feats="_" head="904" deprel="nsubj" pdeprel="_"/>
    <syntax:ConllDependencyNode xmi:id="904" sofa="6" begin="32" end="35" id="8" form="has" lemma="have" cpostag="VBZ" postag="VBZ" feats="_" head="792" deprel="root" pdeprel="_"/>
    <syntax:ConllDependencyNode xmi:id="918" sofa="6" begin="36" end="37" id="9" form="a" lemma="a" cpostag="DT" postag="DT" feats="_" head="946" deprel="det" pdeprel="_"/>
    <syntax:ConllDependencyNode xmi:id="932" sofa="6" begin="38" end="44" id="10" form="family" lemma="family" cpostag="NN" postag="NN" feats="_" head="946" deprel="nn" pdeprel="_"/>
    <syntax:ConllDependencyNode xmi:id="946" sofa="6" begin="45" end="52" id="11" form="history" lemma="history" cpostag="NN" postag="NN" feats="_" head="904" deprel="dobj" pdeprel="_"/>
    <syntax:ConllDependencyNode xmi:id="960" sofa="6" begin="53" end="55" id="12" form="of" lemma="of" cpostag="IN" postag="IN" feats="_" head="946" deprel="prep" pdeprel="_"/>
    <syntax:ConllDependencyNode xmi:id="974" sofa="6" begin="56" end="65" id="13" form="migraines" lemma="migraine" cpostag="NNS" postag="NNS" feats="_" head="960" deprel="pobj" pdeprel="_"/>
    <syntax:ConllDependencyNode xmi:id="988" sofa="6" begin="65" end="66" id="14" form="." lemma="." cpostag="." postag="." feats="_" head="904" deprel="punct" pdeprel="_"/>
    <textsem:Predicate xmi:id="1002" sofa="6" begin="5" end="9" relations="" frameSet="do.01"/>
    <textsem:Predicate xmi:id="1009" sofa="6" begin="13" end="17" relations="1080 1090 1100" frameSet="have.03"/>
    <textsem:Predicate xmi:id="1016" sofa="6" begin="32" end="35" relations="1053 1063" frameSet="have.03"/>
    <textsem:SemanticArgument xmi:id="1023" sofa="6" begin="0" end="4" relation="1080" label="A0"/>
    <textsem:SemanticArgument xmi:id="1029" sofa="6" begin="9" end="12" relation="1090" label="AM-NEG"/>
    <textsem:SemanticArgument xmi:id="1035" sofa="6" begin="18" end="26" relation="1100" label="A1"/>
    <textsem:SemanticArgument xmi:id="1041" sofa="6" begin="28" end="31" relation="1053" label="A0"/>
    <textsem:SemanticArgument xmi:id="1047" sofa="6" begin="45" end="52" relation="1063" label="A1"/>
    <textsem:SemanticRoleRelation xmi:id="1053" id="0" category="A0" discoveryTechnique="0" confidence="0.0" polarity="0" uncertainty="0" conditional="false" predicate="1016" argument="1041"/>
    <textsem:SemanticRoleRelation xmi:id="1063" id="0" category="A1" discoveryTechnique="0" confidence="0.0" polarity="0" uncertainty="0" conditional="false" predicate="1016" argument="1047"/>
    <textsem:SemanticRoleRelation xmi:id="1080" id="0" category="A0" discoveryTechnique="0" confidence="0.0" polarity="0" uncertainty="0" conditional="false" predicate="1009" argument="1023"/>
    <textsem:SemanticRoleRelation xmi:id="1090" id="0" category="AM-NEG" discoveryTechnique="0" confidence="0.0" polarity="0" uncertainty="0" conditional="false" predicate="1009" argument="1029"/>
    <textsem:SemanticRoleRelation xmi:id="1100" id="0" category="A1" discoveryTechnique="0" confidence="0.0" polarity="0" uncertainty="0" conditional="false" predicate="1009" argument="1035"/>
    <cas:NonEmptyFSList xmi:id="1073" head="1053" tail="1076"/>
    <cas:NonEmptyFSList xmi:id="1076" head="1063" tail="1079"/>
    <cas:NonEmptyFSList xmi:id="1110" head="1080" tail="1113"/>
    <cas:NonEmptyFSList xmi:id="1113" head="1090" tail="1116"/>
    <cas:NonEmptyFSList xmi:id="1116" head="1100" tail="1119"/>
    <syntax:TerminalTreebankNode xmi:id="1334" sofa="6" begin="0" end="4" nodeType="NNP" nodeValue="Mary" leaf="true" parent="1570" headIndex="0" index="0" tokenIndex="0"/>
    <syntax:TerminalTreebankNode xmi:id="1347" sofa="6" begin="5" end="9" nodeType="VBZ" nodeValue="does" leaf="true" parent="1587" headIndex="0" index="1" tokenIndex="1"/>
    <syntax:TerminalTreebankNode xmi:id="1360" sofa="6" begin="9" end="12" nodeType="RB" nodeValue="n't" leaf="true" parent="1587" headIndex="0" index="2" tokenIndex="2"/>
    <syntax:TerminalTreebankNode xmi:id="1373" sofa="6" begin="13" end="17" nodeType="VB" nodeValue="have" leaf="true" parent="1605" headIndex="0" index="3" tokenIndex="3"/>
    <syntax:TerminalTreebankNode xmi:id="1386" sofa="6" begin="18" end="26" nodeType="NN" nodeValue="epilepsy" leaf="true" parent="1622" headIndex="0" index="4" tokenIndex="4"/>
    <syntax:TerminalTreebankNode xmi:id="1399" sofa="6" begin="26" end="27" nodeType="," nodeValue="," leaf="true" parent="1534" headIndex="0" index="5" tokenIndex="5"/>
    <syntax:TerminalTreebankNode xmi:id="1412" sofa="6" begin="28" end="31" nodeType="PRP" nodeValue="she" leaf="true" parent="1655" headIndex="0" index="6" tokenIndex="6"/>
    <syntax:TerminalTreebankNode xmi:id="1425" sofa="6" begin="32" end="35" nodeType="VBZ" nodeValue="has" leaf="true" parent="1672" headIndex="0" index="7" tokenIndex="7"/>
    <syntax:TerminalTreebankNode xmi:id="1438" sofa="6" begin="36" end="37" nodeType="DT" nodeValue="a" leaf="true" parent="1706" headIndex="0" index="8" tokenIndex="8"/>
    <syntax:TerminalTreebankNode xmi:id="1451" sofa="6" begin="38" end="44" nodeType="NN" nodeValue="family" leaf="true" parent="1706" headIndex="0" index="9" tokenIndex="9"/>
    <syntax:TerminalTreebankNode xmi:id="1464" sofa="6" begin="45" end="52" nodeType="NN" nodeValue="history" leaf="true" parent="1706" headIndex="0" index="10" tokenIndex="10"/>
    <syntax:TerminalTreebankNode xmi:id="1477" sofa="6" begin="53" end="55" nodeType="IN" nodeValue="of" leaf="true" parent="1724" headIndex="0" index="11" tokenIndex="11"/>
    <syntax:TerminalTreebankNode xmi:id="1490" sofa="6" begin="56" end="65" nodeType="NNS" nodeValue="migraines" leaf="true" parent="1741" headIndex="0" index="12" tokenIndex="12"/>
    <syntax:TerminalTreebankNode xmi:id="1503" sofa="6" begin="65" end="66" nodeType="." nodeValue="." leaf="true" parent="1534" headIndex="0" index="13" tokenIndex="13"/>
    <syntax:TreebankNode xmi:id="1534" sofa="6" begin="0" end="66" nodeType="S" leaf="false" parent="1516" children="1553 1399 1638 1503" headIndex="7"/>
    <syntax:TreebankNode xmi:id="1553" sofa="6" begin="0" end="26" nodeType="S" leaf="false" parent="1534" children="1570 1587" headIndex="1"/>
    <syntax:TreebankNode xmi:id="1570" sofa="6" begin="0" end="4" nodeType="NP" leaf="false" parent="1553" children="1334" headIndex="0">
        <nodeTags>SBJ</nodeTags>
    </syntax:TreebankNode>
    <syntax:TreebankNode xmi:id="1587" sofa="6" begin="5" end="26" nodeType="VP" leaf="false" parent="1553" children="1347 1360 1605" headIndex="1"/>
    <syntax:TreebankNode xmi:id="1605" sofa="6" begin="13" end="26" nodeType="VP" leaf="false" parent="1587" children="1373 1622" headIndex="3"/>
    <syntax:TreebankNode xmi:id="1622" sofa="6" begin="18" end="26" nodeType="NP" leaf="false" parent="1605" children="1386" headIndex="4"/>
    <syntax:TreebankNode xmi:id="1638" sofa="6" begin="28" end="65" nodeType="S" leaf="false" parent="1534" children="1655 1672" headIndex="7"/>
    <syntax:TreebankNode xmi:id="1655" sofa="6" begin="28" end="31" nodeType="NP" leaf="false" parent="1638" children="1412" headIndex="6">
        <nodeTags>SBJ</nodeTags>
    </syntax:TreebankNode>
    <syntax:TreebankNode xmi:id="1672" sofa="6" begin="32" end="65" nodeType="VP" leaf="false" parent="1638" children="1425 1689" headIndex="7"/>
    <syntax:TreebankNode xmi:id="1689" sofa="6" begin="36" end="65" nodeType="NP" leaf="false" parent="1672" children="1706 1724" headIndex="10"/>
    <syntax:TreebankNode xmi:id="1706" sofa="6" begin="36" end="52" nodeType="NP" leaf="false" parent="1689" children="1438 1451 1464" headIndex="10"/>
    <syntax:TreebankNode xmi:id="1724" sofa="6" begin="53" end="65" nodeType="PP" leaf="false" parent="1689" children="1477 1741" headIndex="11"/>
    <syntax:TreebankNode xmi:id="1741" sofa="6" begin="56" end="65" nodeType="NP" leaf="false" parent="1724" children="1490" headIndex="12"/>
    <syntax:TopTreebankNode xmi:id="1516" sofa="6" begin="0" end="66" nodeType="TOP" leaf="false" children="1534" headIndex="0" treebankParse="(TOP (S (S (NP-SBJ (NNP Mary)) (VP (VBZ does)(RB n't) (VP (VB have) (NP (NN epilepsy)))))(, ,) (S (NP-SBJ (PRP she)) (VP (VBZ has) (NP (NP (DT a) (NN family) (NN history)) (PP (IN of) (NP (NNS migraines))))))(. .)))" terminals="1334 1347 1360 1373 1386 1399 1412 1425 1438 1451 1464 1477 1490 1503"/>
    <cas:EmptyFSList xmi:id="1119"/>
    <cas:EmptyFSList xmi:id="1079"/>
    <refsem:UmlsConcept xmi:id="734" codingScheme="SNOMEDCT" code="137741002" oid="137741002#SNOMEDCT" score="0.0" disambiguated="false" cui="C0455393" tui="T033" preferredText="FH: Migraine"/>
    <refsem:UmlsConcept xmi:id="744" codingScheme="SNOMEDCT" code="160342001" oid="160342001#SNOMEDCT" score="0.0" disambiguated="false" cui="C0455393" tui="T033" preferredText="FH: Migraine"/>
    <refsem:UmlsConcept xmi:id="636" codingScheme="SNOMEDCT" code="57177007" oid="57177007#SNOMEDCT" score="0.0" disambiguated="false" cui="C0241889" tui="T033" preferredText="Family history"/>
    <refsem:UmlsConcept xmi:id="656" codingScheme="SNOMEDCT" code="276502004" oid="276502004#SNOMEDCT" score="0.0" disambiguated="false" cui="C0241889" tui="T033" preferredText="Family history"/>
    <refsem:UmlsConcept xmi:id="646" codingScheme="SNOMEDCT" code="160470003" oid="160470003#SNOMEDCT" score="0.0" disambiguated="false" cui="C0241889" tui="T033" preferredText="Family history"/>
    <refsem:UmlsConcept xmi:id="676" codingScheme="SNOMEDCT" code="266909004" oid="266909004#SNOMEDCT" score="0.0" disambiguated="false" cui="C0241889" tui="T033" preferredText="Family history"/>
    <refsem:UmlsConcept xmi:id="666" codingScheme="SNOMEDCT" code="137863003" oid="137863003#SNOMEDCT" score="0.0" disambiguated="false" cui="C0241889" tui="T033" preferredText="Family history"/>
    <refsem:UmlsConcept xmi:id="686" codingScheme="SNOMEDCT" code="137667000" oid="137667000#SNOMEDCT" score="0.0" disambiguated="false" cui="C0241889" tui="T033" preferredText="Family history"/>
    <refsem:UmlsConcept xmi:id="593" codingScheme="SNOMEDCT" code="392521001" oid="392521001#SNOMEDCT" score="0.0" disambiguated="false" cui="C0262926" tui="T033" preferredText="Medical History"/>
    <refsem:UmlsConcept xmi:id="550" codingScheme="SNOMEDCT" code="392521001" oid="392521001#SNOMEDCT" score="0.0" disambiguated="false" cui="C0262926" tui="T033" preferredText="Medical History"/>
    <refsem:UmlsConcept xmi:id="482" codingScheme="SNOMEDCT" code="137863003" oid="137863003#SNOMEDCT" score="0.0" disambiguated="false" cui="C0241889" tui="T033" preferredText="Family history"/>
    <refsem:UmlsConcept xmi:id="452" codingScheme="SNOMEDCT" code="57177007" oid="57177007#SNOMEDCT" score="0.0" disambiguated="false" cui="C0241889" tui="T033" preferredText="Family history"/>
    <refsem:UmlsConcept xmi:id="502" codingScheme="SNOMEDCT" code="137667000" oid="137667000#SNOMEDCT" score="0.0" disambiguated="false" cui="C0241889" tui="T033" preferredText="Family history"/>
    <refsem:UmlsConcept xmi:id="472" codingScheme="SNOMEDCT" code="276502004" oid="276502004#SNOMEDCT" score="0.0" disambiguated="false" cui="C0241889" tui="T033" preferredText="Family history"/>
    <refsem:UmlsConcept xmi:id="462" codingScheme="SNOMEDCT" code="160470003" oid="160470003#SNOMEDCT" score="0.0" disambiguated="false" cui="C0241889" tui="T033" preferredText="Family history"/>
    <refsem:UmlsConcept xmi:id="492" codingScheme="SNOMEDCT" code="266909004" oid="266909004#SNOMEDCT" score="0.0" disambiguated="false" cui="C0241889" tui="T033" preferredText="Family history"/>
    <refsem:UmlsConcept xmi:id="329" codingScheme="SNOMEDCT" code="246545002" oid="246545002#SNOMEDCT" score="0.0" disambiguated="false" cui="C0014544" tui="T047" preferredText="Epilepsy"/>
    <refsem:UmlsConcept xmi:id="319" codingScheme="SNOMEDCT" code="267593008" oid="267593008#SNOMEDCT" score="0.0" disambiguated="false" cui="C0014544" tui="T047" preferredText="Epilepsy"/>
    <refsem:UmlsConcept xmi:id="379" codingScheme="SNOMEDCT" code="271788002" oid="271788002#SNOMEDCT" score="0.0" disambiguated="false" cui="C0014544" tui="T047" preferredText="Epilepsy"/>
    <refsem:UmlsConcept xmi:id="389" codingScheme="SNOMEDCT" code="193026007" oid="193026007#SNOMEDCT" score="0.0" disambiguated="false" cui="C0014544" tui="T047" preferredText="Epilepsy"/>
    <refsem:UmlsConcept xmi:id="349" codingScheme="SNOMEDCT" code="128613002" oid="128613002#SNOMEDCT" score="0.0" disambiguated="false" cui="C0014544" tui="T047" preferredText="Epilepsy"/>
    <refsem:UmlsConcept xmi:id="309" codingScheme="SNOMEDCT" code="155036009" oid="155036009#SNOMEDCT" score="0.0" disambiguated="false" cui="C0014544" tui="T047" preferredText="Epilepsy"/>
    <refsem:UmlsConcept xmi:id="339" codingScheme="SNOMEDCT" code="267698007" oid="267698007#SNOMEDCT" score="0.0" disambiguated="false" cui="C0014544" tui="T047" preferredText="Epilepsy"/>
    <refsem:UmlsConcept xmi:id="369" codingScheme="SNOMEDCT" code="155045005" oid="155045005#SNOMEDCT" score="0.0" disambiguated="false" cui="C0014544" tui="T047" preferredText="Epilepsy"/>
    <refsem:UmlsConcept xmi:id="399" codingScheme="SNOMEDCT" code="313307000" oid="313307000#SNOMEDCT" score="0.0" disambiguated="false" cui="C0014544" tui="T047" preferredText="Epilepsy"/>
    <refsem:UmlsConcept xmi:id="359" codingScheme="SNOMEDCT" code="84757009" oid="84757009#SNOMEDCT" score="0.0" disambiguated="false" cui="C0014544" tui="T047" preferredText="Epilepsy"/>
    <refsem:UmlsConcept xmi:id="241" codingScheme="SNOMEDCT" code="267699004" oid="267699004#SNOMEDCT" score="0.0" disambiguated="false" cui="C0149931" tui="T047" preferredText="Migraine Disorders"/>
    <refsem:UmlsConcept xmi:id="231" codingScheme="SNOMEDCT" code="193041007" oid="193041007#SNOMEDCT" score="0.0" disambiguated="false" cui="C0149931" tui="T047" preferredText="Migraine Disorders"/>
    <refsem:UmlsConcept xmi:id="261" codingScheme="SNOMEDCT" code="155048007" oid="155048007#SNOMEDCT" score="0.0" disambiguated="false" cui="C0149931" tui="T047" preferredText="Migraine Disorders"/>
    <refsem:UmlsConcept xmi:id="251" codingScheme="SNOMEDCT" code="37796009" oid="37796009#SNOMEDCT" score="0.0" disambiguated="false" cui="C0149931" tui="T047" preferredText="Migraine Disorders"/>
    <refsem:UmlsConcept xmi:id="221" codingScheme="SNOMEDCT" code="155046006" oid="155046006#SNOMEDCT" score="0.0" disambiguated="false" cui="C0149931" tui="T047" preferredText="Migraine Disorders"/>
    <cas:View sofa="6" members="1 13 19 25 37 57 69 89 101 113 125 137 149 161 49 81 173 181 191 201 211 216 186 206 196 278 421 520 563 606 704 758 792 806 820 834 848 862 876 890 904 918 932 946 960 974 988 1002 1009 1016 1023 1029 1035 1041 1047 1053 1063 1080 1090 1100 1073 1076 1110 1113 1116 1334 1347 1360 1373 1386 1399 1412 1425 1438 1451 1464 1477 1490 1503 1534 1553 1570 1587 1605 1622 1638 1655 1672 1689 1706 1724 1741 1516"/>
</xmi:XMI>

4 个答案:

答案 0 :(得分:2)

如果您不想从XML文档中解析大量结构,但实际上已经将其过滤为这些标记,那么您可以将其视为一种奇怪的文本格式。

有时可以务实,只要你能保证这些属性中只有数字,字母和空格。

use strict;
use warnings;
use Data::Dumper;

# Treat each matching element as plain text and collect its attributes
# into a hash keyed by attribute name, then dump that hash.
while ( my $line = <DATA> ) {
    next unless $line =~ /textsem:(?:DiseaseDisorderMention|SignSymptomMention)/;

    # The attribute name may carry a namespace prefix.  Keeping the prefix
    # stops xmi:id="278" from being stored under plain "id" and then being
    # overwritten by the later id="6".
    # The value is everything up to the closing quote, so values such as
    # confidence="0.0" are kept as well.  XML entities are not decoded.
    my %attributes;
    while ( $line =~ m/([\w:]+)="([^"]*)"/g ) {
        $attributes{$1} = $2;
    }

    print Dumper \%attributes;
}

__DATA__
<textsem:DiseaseDisorderMention xmi:id="278" sofa="6" begin="56" end="65" id="6" ontologyConceptArr="221 251 261 231 241" typeID="2" segmentID="SIMPLE_SEGMENT" discoveryTechnique="1" confidence="0.0" polarity="1" uncertainty="0" conditional="false" generic="false" subject="patient" historyOf="1"/>

这会给你:

$VAR1 = {
          'end' => '65',
          'id' => '6',
          'polarity' => '1',
          'discoveryTechnique' => '1',
          'uncertainty' => '0',
          'historyOf' => '1',
          'sofa' => '6',
          'typeID' => '2',
          'begin' => '56'
        };

接下来您可以在此基础上继续处理。

答案 1 :(得分:2)

因为它出现在评论中 - 我的答案是“使用解析器”。

XML::Twig完全能够解析您的XMI,用法大致如下:

#!/usr/bin/env perl

use strict;
use warnings;
use Data::Dumper;

use XML::Twig;

# Attributes to report for every mention element.
my @target_keys = qw ( id sofa begin segmentID );

# The two element types to report.
my %is_wanted_tag = map { $_ => 1 }
    qw( textsem:SignSymptomMention textsem:DiseaseDisorderMention );

my $twig = XML::Twig->new->parsefile('your_file.xml');

# Visit every element that has an "id" attribute and print the chosen
# attributes of those whose tag is one of the wanted types.
ELEMENT:
foreach my $thing ( $twig->get_xpath('//*[@id]') ) {
    my $tag = $thing->tag;
    next ELEMENT unless $is_wanted_tag{$tag};

    print $tag;
    foreach my $key (@target_keys) {
        print "$key => " . $thing->att($key) . " ";
    }
    print "\n";
}

您也可以得到您想要的确切结果(不过就个人而言,我建议不要先把数据提取出来,而是直接“就地”使用它们):

my @target_keys = qw ( id sofa begin segmentID );

my $twig = XML::Twig->new->parsefile('your_file.xml');

# Among the elements that have an "id" attribute, keep only the two
# mention element types.
my @mention_elts = grep {
    my $tag = $_->tag;
    $tag eq 'textsem:SignSymptomMention'
        || $tag eq 'textsem:DiseaseDisorderMention'
} $twig->get_xpath('//*[@id]');

# One array reference per mention, holding the values of @target_keys
# in the order they are listed.
my @results = map {
    my $elt = $_;
    [ map { $elt->att($_) } @target_keys ]
} @mention_elts;

print Dumper \@results;

给出了:

$VAR1 = [
          [
            '6',
            '6',
            '56',
            'SIMPLE_SEGMENT'
          ],
          [
            '0',
            '6',
            '18',
            'SIMPLE_SEGMENT'
          ],
          [
            '2',
            '6',
            '38',
            'SIMPLE_SEGMENT'
          ],
          [
            '5',
            '6',
            '45',
            'SIMPLE_SEGMENT'
          ],
          [
            '4',
            '6',
            '45',
            'SIMPLE_SEGMENT'
          ],
          [
            '3',
            '6',
            '38',
            'SIMPLE_SEGMENT'
          ],
          [
            '1',
            '6',
            '38',
            'SIMPLE_SEGMENT'
          ]
        ];

答案 2 :(得分:1)

这是使用XML::Twig和twig处理程序的另一种方法,用于要解析的两个标记。可以构建一个xpath表达式来匹配两个标签,但我更喜欢只分配相同的代码引用两次,因为它更清晰,更容易阅读。

use strict;
use warnings;
use XML::Twig;
use Data::Printer;

# Rows collected by the handler, one array reference per element.
# Declared before parsing: the handler pushes onto this array while
# parse() is running, so the declaration should already have been
# executed by then rather than afterwards.
my @mentions;

# The same handler is registered for both element types.
my $twig = XML::Twig->new(
    twig_handlers => {
        'textsem:DiseaseDisorderMention' => \&match,
        'textsem:SignSymptomMention'     => \&match,
    }
)->parse(\*DATA);

# Twig handler: record the attributes of interest for one element.
# The element is read from $_ (XML::Twig also passes it as the second
# argument to the handler).
sub match {
    push @mentions, [
        $_->att('xmi:id'),
        $_->att('sofa'),
        $_->att('begin'),
        $_->att('segmentID'),
    ];
}

p @mentions;

__DATA__
...

或者,您也可以使用较短的map方法处理程序。

# Twig handler: append one row of attribute values for the handled element.
# Arguments are the twig (unused) and the element.
sub match {
    my (undef, $elt) = @_;
    my @row = map { $elt->att($_) } qw(xmi:id sofa begin segmentID);
    push @mentions, \@row;
}

在这两种情况下,输出如下:

[
    [0] [
        [0] 278,
        [1] 6,
        [2] 56,
        [3] "SIMPLE_SEGMENT"
    ],
    [1] [
        [0] 421,
        [1] 6,
        [2] 18,
        [3] "SIMPLE_SEGMENT"
    ],
    [2] [
        [0] 520,
        [1] 6,
        [2] 38,
        [3] "SIMPLE_SEGMENT"
    ],
    [3] [
        [0] 563,
        [1] 6,
        [2] 45,
        [3] "SIMPLE_SEGMENT"
    ],
    [4] [
        [0] 606,
        [1] 6,
        [2] 45,
        [3] "SIMPLE_SEGMENT"
    ],
    [5] [
        [0] 704,
        [1] 6,
        [2] 38,
        [3] "SIMPLE_SEGMENT"
    ],
    [6] [
        [0] 758,
        [1] 6,
        [2] 38,
        [3] "SIMPLE_SEGMENT"
    ]
]

答案 3 :(得分:0)

使用XML::SAX包:

主程序:

#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper qw(Dumper);
use XML::SAX::ParserFactory;
use MyHandler;

# Build a SAX parser that reports its events to a MyHandler instance,
# then run it over the input file.
my $parser = XML::SAX::ParserFactory->parser( Handler => MyHandler->new() );
$parser->parse_uri('./yourxmlfile.xml');

# The handler accumulates its rows in the package variable
# $MyHandler::result; dump them once parsing has finished.
my $rows = $MyHandler::result;
print Dumper($rows);

包:MyHandler.pm

package MyHandler;
use strict;
use warnings;
use base qw(XML::SAX::Base);

# Collected rows, one array reference per matching element, in the order
# xmi:id, sofa, begin, segmentID.  A package variable so the calling
# script can read it as $MyHandler::result.
our $result = [];

# Keys of the SAX attribute hash for the values we keep: the namespace
# URI in braces followed by the local name, with empty braces for
# attributes that have no namespace.
my @ATTRIBUTE_KEYS = (
    '{http://www.omg.org/XMI}id',
    '{}sofa',
    '{}begin',
    '{}segmentID',
);

# SAX callback invoked for each opening tag.
#   $self - the handler object
#   $data - hash reference describing the element (Name, Attributes)
# Appends one row to $result when the element is one of the two mention
# types; other elements are ignored.
sub start_element {
    my ($self, $data) = @_;

    if ($data->{'Name'} =~ /^textsem:(?:DiseaseDisorderMention|SignSymptomMention)$/) {
        my $attributes = $data->{'Attributes'};

        # Explicit dereference: calling push on a bare array reference
        # was an experimental feature that was removed in Perl 5.24.
        push @{$result}, [ map { $attributes->{$_}{'Value'} } @ATTRIBUTE_KEYS ];
    }
}

1;