使用re在多个标签之间查找文本

时间:2015-04-17 09:26:53

标签: python regex

我想用 Python 提取所有 <parse> 标签内的字符。以下是我的代码:

import re

text=''' <parse>(ROOT
      (S
      (NP (NNP Stanford) (NNP University))
      (VP (VBZ is)
      (ADJP (JJ located)
      (PP (IN in)
      (NP (NNP California)))))
      (. .)))

      </parse>
    <dependencies type="basic-dependencies">
      <dep type="root">
        <governor idx="0">ROOT</governor>
        <dependent idx="4">located</dependent>
      </dep>
      <dep type="nn">
        <governor idx="2">University</governor>
        <dependent idx="1">Stanford</dependent>
      </dep>
      <dep type="nsubj">
        <governor idx="4">located</governor>
        <dependent idx="2">University</dependent>
      </dep>
      <dep type="cop">
        <governor idx="4">located</governor>
        <dependent idx="3">is</dependent>
      </dep>
      <dep type="prep">
        <governor idx="4">located</governor>
        <dependent idx="5">in</dependent>
      </dep>
      <dep type="pobj">
        <governor idx="5">in</governor>
        <dependent idx="6">California</dependent>
      </dep>
    </dependencies>
    <dependencies type="collapsed-dependencies">
      <dep type="root">
        <governor idx="0">ROOT</governor>
        <dependent idx="4">located</dependent>
      </dep>
      <dep type="nn">
        <governor idx="2">University</governor>
        <dependent idx="1">Stanford</dependent>
      </dep>
      <dep type="nsubj">
        <governor idx="4">located</governor>
        <dependent idx="2">University</dependent>
      </dep>
      <dep type="cop">
        <governor idx="4">located</governor>
        <dependent idx="3">is</dependent>
      </dep>
      <dep type="prep_in">
        <governor idx="4">located</governor>
        <dependent idx="6">California</dependent>
      </dep>
    </dependencies>
    <dependencies type="collapsed-ccprocessed-dependencies">
      <dep type="root">
        <governor idx="0">ROOT</governor>
        <dependent idx="4">located</dependent>
      </dep>
      <dep type="nn">
        <governor idx="2">University</governor>
        <dependent idx="1">Stanford</dependent>
      </dep>
      <dep type="nsubj">
        <governor idx="4">located</governor>
        <dependent idx="2">University</dependent>
      </dep>
      <dep type="cop">
        <governor idx="4">located</governor>
        <dependent idx="3">is</dependent>
      </dep>
      <dep type="prep_in">
        <governor idx="4">located</governor>
        <dependent idx="6">California</dependent>
      </dep>
    </dependencies>
  </sentence>
  <sentence id="2">
    <tokens>
      <token id="1">
        <word>It</word>
        <lemma>it</lemma>
        <CharacterOffsetBegin>46</CharacterOffsetBegin>
        <CharacterOffsetEnd>48</CharacterOffsetEnd>
        <POS>PRP</POS>
        <NER>O</NER>
      </token>
      <token id="2">
        <word>is</word>
        <lemma>be</lemma>
        <CharacterOffsetBegin>49</CharacterOffsetBegin>
        <CharacterOffsetEnd>51</CharacterOffsetEnd>
        <POS>VBZ</POS>
        <NER>O</NER>
      </token>
      <token id="3">
        <word>a</word>
        <lemma>a</lemma>
        <CharacterOffsetBegin>52</CharacterOffsetBegin>
        <CharacterOffsetEnd>53</CharacterOffsetEnd>
        <POS>DT</POS>
        <NER>O</NER>
      </token>
      <token id="4">
        <word>great</word>
        <lemma>great</lemma>
        <CharacterOffsetBegin>54</CharacterOffsetBegin>
        <CharacterOffsetEnd>59</CharacterOffsetEnd>
        <POS>JJ</POS>
        <NER>O</NER>
      </token>
      <token id="5">
        <word>university</word>
        <lemma>university</lemma>
        <CharacterOffsetBegin>60</CharacterOffsetBegin>
        <CharacterOffsetEnd>70</CharacterOffsetEnd>
        <POS>NN</POS>
        <NER>O</NER>
      </token>
      <token id="6">
        <word>,</word>
        <lemma>,</lemma>
        <CharacterOffsetBegin>70</CharacterOffsetBegin>
        <CharacterOffsetEnd>71</CharacterOffsetEnd>
        <POS>,</POS>
        <NER>O</NER>
      </token>
      <token id="7">
        <word>founded</word>
        <lemma>found</lemma>
        <CharacterOffsetBegin>72</CharacterOffsetBegin>
        <CharacterOffsetEnd>79</CharacterOffsetEnd>
        <POS>VBN</POS>
        <NER>O</NER>
      </token>
      <token id="8">
        <word>in</word>
        <lemma>in</lemma>
        <CharacterOffsetBegin>80</CharacterOffsetBegin>
        <CharacterOffsetEnd>82</CharacterOffsetEnd>
        <POS>IN</POS>
        <NER>O</NER>
      </token>
      <token id="9">
        <word>1891</word>
        <lemma>1891</lemma>
        <CharacterOffsetBegin>83</CharacterOffsetBegin>
        <CharacterOffsetEnd>87</CharacterOffsetEnd>
        <POS>CD</POS>
        <NER>DATE</NER>
        <NormalizedNER>1891</NormalizedNER>
        <Timex tid="t1" type="DATE">1891</Timex>
      </token>
      <token id="10">
        <word>.</word>
        <lemma>.</lemma>
        <CharacterOffsetBegin>87</CharacterOffsetBegin>
        <CharacterOffsetEnd>88</CharacterOffsetEnd>
        <POS>.</POS>
        <NER>O</NER>
      </token>
    </tokens>
    <parse>(ROOT
      (S
      (NP (PRP It))
      (VP (VBZ is)
      (NP
      (NP (DT a) (JJ great) (NN university))
      (, ,)
      (VP (VBN founded)
      (PP (IN in)
      (NP (CD 1891))))))
      (. .)))

      </parse>
    <dependencies type="basic-dependencies">
      <dep type="root">
        <governor idx="0">ROOT</governor>
        <dependent idx="5">university</dependent>
      </dep>
      <dep type="nsubj">
        <governor idx="5">university</governor>
        <dependent idx="1">It</dependent>
      </dep>
      <dep type="cop">
        <governor idx="5">university</governor>
        <dependent idx="2">is</dependent>
      </dep>
      <dep type="det">
        <governor idx="5">university</governor>
        <dependent idx="3">a</dependent>
      </dep>
      <dep type="amod">
        <governor idx="5">university</governor>
        <dependent idx="4">great</dependent>
      </dep>
      <dep type="vmod">
        <governor idx="5">university</governor>
        <dependent idx="7">founded</dependent>
      </dep>
      <dep type="prep">
        <governor idx="7">founded</governor>
        <dependent idx="8">in</dependent>
      </dep>
      <dep type="pobj">
        <governor idx="8">in</governor>
        <dependent idx="9">1891</dependent>
      </dep>
    </dependencies>
    <dependencies type="collapsed-dependencies">
      <dep type="root">
        <governor idx="0">ROOT</governo'''


# Compile with DOTALL so that `.` also matches the newlines inside the tree.
# NOTE(review): `.*` is greedy, so the single match runs from the first
# <parse> to the last </parse> in `text` (see the output shown below).
p1=re.compile("<parse>(.*)</parse>",re.DOTALL)
parse=p1.findall(text)
print parse

上述代码的输出是:

['(ROOT\n          (S\n          (NP (NNP Stanford) (NNP University))\n          (VP (VBZ is)\n          (ADJP (JJ located)\n          (PP (IN in)\n          (NP (NNP California)))))\n          (. .)))\n\n          </parse>\n        <dependencies type="basic-dependencies">\n          <dep type="root">\n            <governor idx="0">ROOT</governor>\n            <dependent idx="4">located</dependent>\n          </dep>\n          <dep type="nn">\n            <governor idx="2">University</governor>\n            <dependent idx="1">Stanford</dependent>\n          </dep>\n          <dep type="nsubj">\n            <governor idx="4">located</governor>\n            <dependent idx="2">University</dependent>\n          </dep>\n          <dep type="cop">\n            <governor idx="4">located</governor>\n            <dependent idx="3">is</dependent>\n          </dep>\n          <dep type="prep">\n            <governor idx="4">located</governor>\n            <dependent idx="5">in</dependent>\n          </dep>\n          <dep type="pobj">\n            <governor idx="5">in</governor>\n            <dependent idx="6">California</dependent>\n          </dep>\n        </dependencies>\n        <dependencies type="collapsed-dependencies">\n          <dep type="root">\n            <governor idx="0">ROOT</governor>\n            <dependent idx="4">located</dependent>\n          </dep>\n          <dep type="nn">\n            <governor idx="2">University</governor>\n            <dependent idx="1">Stanford</dependent>\n          </dep>\n          <dep type="nsubj">\n            <governor idx="4">located</governor>\n            <dependent idx="2">University</dependent>\n          </dep>\n          <dep type="cop">\n            <governor idx="4">located</governor>\n            <dependent idx="3">is</dependent>\n          </dep>\n          <dep type="prep_in">\n            <governor idx="4">located</governor>\n            <dependent idx="6">California</dependent>\n          </dep>\n        
</dependencies>\n        <dependencies type="collapsed-ccprocessed-dependencies">\n          <dep type="root">\n            <governor idx="0">ROOT</governor>\n            <dependent idx="4">located</dependent>\n          </dep>\n          <dep type="nn">\n            <governor idx="2">University</governor>\n            <dependent idx="1">Stanford</dependent>\n          </dep>\n          <dep type="nsubj">\n            <governor idx="4">located</governor>\n            <dependent idx="2">University</dependent>\n          </dep>\n          <dep type="cop">\n            <governor idx="4">located</governor>\n            <dependent idx="3">is</dependent>\n          </dep>\n          <dep type="prep_in">\n            <governor idx="4">located</governor>\n            <dependent idx="6">California</dependent>\n          </dep>\n        </dependencies>\n      </sentence>\n      <sentence id="2">\n        <tokens>\n          <token id="1">\n            <word>It</word>\n            <lemma>it</lemma>\n            <CharacterOffsetBegin>46</CharacterOffsetBegin>\n            <CharacterOffsetEnd>48</CharacterOffsetEnd>\n            <POS>PRP</POS>\n            <NER>O</NER>\n          </token>\n          <token id="2">\n            <word>is</word>\n            <lemma>be</lemma>\n            <CharacterOffsetBegin>49</CharacterOffsetBegin>\n            <CharacterOffsetEnd>51</CharacterOffsetEnd>\n            <POS>VBZ</POS>\n            <NER>O</NER>\n          </token>\n          <token id="3">\n            <word>a</word>\n            <lemma>a</lemma>\n            <CharacterOffsetBegin>52</CharacterOffsetBegin>\n            <CharacterOffsetEnd>53</CharacterOffsetEnd>\n            <POS>DT</POS>\n            <NER>O</NER>\n          </token>\n          <token id="4">\n            <word>great</word>\n            <lemma>great</lemma>\n            <CharacterOffsetBegin>54</CharacterOffsetBegin>\n            <CharacterOffsetEnd>59</CharacterOffsetEnd>\n            <POS>JJ</POS>\n            
<NER>O</NER>\n          </token>\n          <token id="5">\n            <word>university</word>\n            <lemma>university</lemma>\n            <CharacterOffsetBegin>60</CharacterOffsetBegin>\n            <CharacterOffsetEnd>70</CharacterOffsetEnd>\n            <POS>NN</POS>\n            <NER>O</NER>\n          </token>\n          <token id="6">\n            <word>,</word>\n            <lemma>,</lemma>\n            <CharacterOffsetBegin>70</CharacterOffsetBegin>\n            <CharacterOffsetEnd>71</CharacterOffsetEnd>\n            <POS>,</POS>\n            <NER>O</NER>\n          </token>\n          <token id="7">\n            <word>founded</word>\n            <lemma>found</lemma>\n            <CharacterOffsetBegin>72</CharacterOffsetBegin>\n            <CharacterOffsetEnd>79</CharacterOffsetEnd>\n            <POS>VBN</POS>\n            <NER>O</NER>\n          </token>\n          <token id="8">\n            <word>in</word>\n            <lemma>in</lemma>\n            <CharacterOffsetBegin>80</CharacterOffsetBegin>\n            <CharacterOffsetEnd>82</CharacterOffsetEnd>\n            <POS>IN</POS>\n            <NER>O</NER>\n          </token>\n          <token id="9">\n            <word>1891</word>\n            <lemma>1891</lemma>\n            <CharacterOffsetBegin>83</CharacterOffsetBegin>\n            <CharacterOffsetEnd>87</CharacterOffsetEnd>\n            <POS>CD</POS>\n            <NER>DATE</NER>\n            <NormalizedNER>1891</NormalizedNER>\n            <Timex tid="t1" type="DATE">1891</Timex>\n          </token>\n          <token id="10">\n            <word>.</word>\n            <lemma>.</lemma>\n            <CharacterOffsetBegin>87</CharacterOffsetBegin>\n            <CharacterOffsetEnd>88</CharacterOffsetEnd>\n            <POS>.</POS>\n            <NER>O</NER>\n          </token>\n        </tokens>\n        <parse>(ROOT\n          (S\n          (NP (PRP It))\n          (VP (VBZ is)\n          (NP\n          (NP (DT a) (JJ great) (NN university))\n     
     (, ,)\n          (VP (VBN founded)\n          (PP (IN in)\n          (NP (CD 1891))))))\n          (. .)))\n\n          ']

但是我只需要 <parse> 标签内的字符,不需要其他内容。请问该如何解决?期望的输出如下:

'(ROOT\n          (S\n          (NP (NNP Stanford) (NNP University))\n          (VP (VBZ is)\n          (ADJP (JJ located)\n          (PP (IN in)\n          (NP (NNP California)))))\n          (. .)))\n\n        
(ROOT\n          (S\n          (NP (PRP It))\n          (VP (VBZ is)\n          (NP\n          (NP (DT a) (JJ great) (NN university))\n          (, ,)\n          (VP (VBN founded)\n          (PP (IN in)\n          (NP (CD 1891))))))\n          (. .)))\n\n          

2 个答案:

答案 0 :(得分:1)

如果您一定要使用正则表达式,请使用下面的模式(`.*?` 是非贪婪匹配,遇到第一个 </parse> 就停止;`(?s)` 等价于 re.DOTALL):

(?s)<parse>(.*?)</parse>

请参阅demo

import re
# Non-greedy `.*?` stops at the first </parse> after each <parse>, so every
# <parse> element yields its own match; the inline (?s) flag (same as
# re.DOTALL) lets `.` cross the newlines inside the tree.
# A plain raw string replaces the Python 2-only `ur''` prefix, which is a
# SyntaxError on Python 3; print(...) with one argument works on both.
p = re.compile(r'(?s)<parse>(.*?)</parse>')
parse = p.findall(text)
print(parse)

答案 1 :(得分:0)

使用BeautifulSoup解析器。

>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup(text)
>>> for i in soup.select('parse'):
        print(i.text)


(ROOT
      (S
      (NP (NNP Stanford) (NNP University))
      (VP (VBZ is)
      (ADJP (JJ located)
      (PP (IN in)
      (NP (NNP California)))))
      (. .)))


(ROOT
      (S
      (NP (PRP It))
      (VP (VBZ is)
      (NP
      (NP (DT a) (JJ great) (NN university))
      (, ,)
      (VP (VBN founded)
      (PP (IN in)
      (NP (CD 1891))))))
      (. .)))