在 XML 中获取标签外部的标点符号

时间:2015-08-09 17:53:46

标签: python regex xml

我有一个类似以下的文件

<xml>
<LM lm="blablabla" catg="bla">some</LM>
<LM lm="blablabla" catg="bla">word</LM>:
<LM lm="blablabla" catg="bla">some</LM>
<LM lm="blablabla" catg="bla">other</LM>,
<LM lm="blablabla" catg="bla">word</LM>
<LM lm="blablabla" catg="bla">and</LM>.
<LM lm="blablabla" catg="bla">so</LM>
<LM lm="blablabla" catg="bla">on</LM>.
</xml>

现在,我可以使用 Python 中的 minidom 正确解析该文档,但我还想获取位于标签外部的标点符号,并把每个标点符号放入它自己的、带有相应 catg 属性的 LM 标签中。像这样:

    <LM lm="blablabla" catg="bla">some</LM>
    <LM lm="blablabla" catg="bla">word</LM>
    <LM lm="blablabla" catg="colon">:</LM>
    <LM lm="blablabla" catg="bla">some</LM>
    <LM lm="blablabla" catg="bla">other</LM>
    <LM lm="blablabla" catg="comma">,</LM>
    <LM lm="blablabla" catg="bla">word</LM>
    <LM lm="blablabla" catg="bla">and</LM>
    <LM lm="blablabla" catg="eos">.</LM>
    <LM lm="blablabla" catg="bla">so</LM>
    <LM lm="blablabla" catg="bla">on</LM>
    <LM lm="blablabla" catg="eos">.</LM>

我觉得不应该用正则表达式来做这件事,但应该怎么做呢?有没有更稳妥的方法?

这是我到目前为止所做的:

# -*- coding: utf-8 -*-
import sys
import codecs
import io
from xml.dom.minidom import parseString
from xml.sax.saxutils import unescape
import pandas as pd
import numpy as np
import unicodedata
import re

# Maps a regular expression (tested against the start of a raw tag) to the
# converted part-of-speech label.
pos_conversion_rules = {
    'a$': 'A',
    'blablabla': 'BLABLABLA',
}


def look_up(i):
    """Return the converted label for the raw tag *i*.

    Every key of ``pos_conversion_rules`` is tried as a regular expression
    anchored at the beginning of *i*; the value of the first key that
    matches is returned.  ``None`` is returned when no key matches.
    """
    for regex, converted in pos_conversion_rules.items():
        if re.match(regex, i) is not None:
            return converted
    return None


# The file is not opened directly with minidom because minidom cannot
# convert the special XML characters (named entities) back.
# The context manager guarantees the file handle is closed after reading.
with codecs.open("/CORPUS.xml", "r") as corpus_file:
    datasource = corpus_file.read()

# Do not convert to UTF-8 right away: each named entity is first replaced
# by an ASCII placeholder.
# NOTE(review): only "&agrave;", "&apos;" and "&quot;" include the trailing
# ";"; the other keys omit it, so a ";" following those entities would stay
# in the text after replacement -- confirm whether that is intended.
datasource = unescape(datasource,
                      {
                          "&agrave;":"x01",
                          "&egrave":"x02",
                          "&igrave":"x03",
                          "&ograve":"x04",
                          "&ugrave":"x05",
                          "&eacute":"y01",
                          "&oacute":"y02",
                          "&apos;": "z01",
                          "&quot;": 'z02'
                      })

# Parsing
document = parseString(datasource)

node = document.getElementsByTagName('LM')

# Initialise the vectors with the same length as the node list; an entry
# stays 0 when the corresponding attribute is absent from the element.
token = [0]*len(node)
pos = [0]*len(node)
lemma = [0]*len(node)

# Extract the data and convert the POS tags through look_up (defined above).
for i, element in enumerate(node):
    token[i] = element.firstChild.data
    for attrName, attrValue in element.attributes.items():
        if attrName == "lemma":
            # Guard against empty lemmas.
            lemma[i] = attrValue if attrValue else "NaN"
        elif attrName == "catg":
            # Keep both the converted tag and the original one.
            pos[i] = [look_up(attrValue), attrValue]


# Build the dataframe
df = pd.DataFrame({
    'pos':pos,
    'token':token,
    'lemma': lemma
},columns=('token','pos','lemma'))

# to_string must be *called*; printing the bare attribute only shows the
# bound-method representation.  The parenthesised form also runs on
# Python 2.
print(df.to_string())

1 个答案:

答案 0 :(得分:0)

这是解决方案:

from lxml import etree
import sys
import os
import glob
import argparse
import pandas as pd
import re


# Maps a regular expression (tested against the start of a tag or of a
# punctuation token) to the converted part-of-speech label.
pos_conversion_rules = {
    '[.;:?!]': 'XPS',
    '[,]': 'XPW',
    'blablabla': 'BLABLA',
}


def look_up(i):
    """Return the converted label for *i*, or ``None`` when nothing applies.

    Every key of ``pos_conversion_rules`` is tried as a regular expression
    anchored at the beginning of *i*; the value of the first matching key
    is returned.

    *i* may be ``None`` (for example an XML attribute that is missing from
    the element); in that case ``None`` is returned instead of letting
    ``re`` raise ``TypeError``.
    """
    if i is None:
        return None
    for regex, converted in pos_conversion_rules.items():
        if re.match(regex, i):
            return converted
    return None

path = "~/where/my/files/are/"
f = "*.xml"
# glob does not expand "~", so the home-directory shortcut is resolved
# explicitly; otherwise no file would ever be found.
docs = glob.glob(os.path.join(os.path.expanduser(path), f))

# recover=True asks lxml to try to keep parsing through broken XML.
parser = etree.XMLParser(recover=True)


for d in docs:
    # Rows of [token, lemma, converted tag] for the current document.
    x = []
    tree = etree.parse(d, parser)

    for node in tree.iterfind(".//LM"):
        x.append([node.text,
                  node.get('lemma'),
                  look_up(node.get('catg'))])
        # The text that follows the closing tag holds the punctuation.
        # tail is None when nothing follows the element, so fall back to
        # an empty string before stripping the trailing whitespace.
        tail = (node.tail or "").rstrip()
        if tail:
            # A punctuation mark serves as both its own token and lemma.
            x.append([tail, tail, look_up(tail)])

    # NOTE: df is rebound on every iteration, so only the frame of the last
    # document is left once the loop ends.
    df = pd.DataFrame(x)