我有一个类似以下的文件
<xml>
<LM lm="blablabla" catg="bla">some</LM>
<LM lm="blablabla" catg="bla">word</LM>:
<LM lm="blablabla" catg="bla">some</LM>
<LM lm="blablabla" catg="bla">other</LM>,
<LM lm="blablabla" catg="bla">word</LM>
<LM lm="blablabla" catg="bla">and</LM>.
<LM lm="blablabla" catg="bla">so</LM>
<LM lm="blablabla" catg="bla">on</LM>.
</xml>
现在,我可以用 Python 的 minidom 正确解析该文档,但我还想提取位于标记之外的标点符号,并把每个标点符号放进它自己的专用标记中,像这样:
<LM lm="blablabla" catg="bla">some</LM>
<LM lm="blablabla" catg="bla">word</LM>
<LM lm="blablabla" catg="colon">:</LM>
<LM lm="blablabla" catg="bla">some</LM>
<LM lm="blablabla" catg="bla">other</LM>
<LM lm="blablabla" catg="comma">,</LM>
<LM lm="blablabla" catg="bla">word</LM>
<LM lm="blablabla" catg="bla">and</LM>
<LM lm="blablabla" catg="eos">.</LM>
<LM lm="blablabla" catg="bla">so</LM>
<LM lm="blablabla" catg="bla">on</LM>
<LM lm="blablabla" catg="eos">.</LM>
我觉得不应该用正则表达式来做这件事,但还能怎么做呢?有没有安全的方法?
这是我到目前为止所做的:
# -*- coding: utf-8 -*-
import sys
import codecs
import io
from xml.dom.minidom import parseString
from xml.sax.saxutils import unescape
import pandas as pd
import numpy as np
import unicodedata
import re
# Maps a regular expression to the converted part-of-speech tag.
pos_conversion_rules = {
    'a$': 'A',
    'blablabla': 'BLABLABLA',
}

# Compiled once at import time instead of on every look_up() call.
_compiled_pos_rules = [
    (re.compile(rule), tag) for rule, tag in pos_conversion_rules.items()
]


def look_up(i):
    """Return the converted tag for the original POS tag *i*.

    Rules are tried with ``match``, so a pattern only has to match at the
    beginning of *i*. The first rule that matches wins. ``None`` is
    returned when *i* is ``None`` or when no rule matches.
    """
    if i is None:
        # re raises TypeError on None; treat it as "no rule matches".
        return None
    for pattern, tag in _compiled_pos_rules:
        if pattern.match(i):
            return tag
    return None
# The file is not opened directly with minidom because minidom fails to
# convert the XML special characters back.
with codecs.open("/CORPUS.xml", "r") as corpus_file:
    datasource = corpus_file.read()

# Not converted to UTF-8 right away: each named entity is first replaced by
# an ASCII placeholder.
# NOTE(review): unescape() also expands &amp;, &lt; and &gt; before parsing,
# which would corrupt the markup if the corpus text contains them - confirm
# that it does not.
datasource = unescape(datasource,
    {
        "&agrave;": "x01",
        "&egrave;": "x02",
        "&igrave;": "x03",
        "&ograve;": "x04",
        "&ugrave;": "x05",
        "&eacute;": "y01",
        "&oacute;": "y02",
        "&apos;": "z01",
        "&quot;": 'z02'
    })

# Parsing
document = parseString(datasource)
node = document.getElementsByTagName('LM')

# Initialise the vectors with one slot per node; a slot stays 0 when the
# corresponding attribute is absent.
token = [0]*len(node)
pos = [0]*len(node)
lemma = [0]*len(node)

# Extract the data and convert the POS tags through the look_up function
# defined above.
for i, element in enumerate(node):
    token[i] = element.firstChild.data
    for attrName, attrValue in element.attributes.items():
        # Make sure empty lemmas do not cause problems.
        if attrName == "lemma":
            lemma[i] = attrValue if attrValue else "NaN"
        elif attrName == "catg":
            # Keep the converted tag together with the original one.
            pos[i] = [look_up(attrValue), attrValue]

# Build the dataframe
df = pd.DataFrame({
    'pos': pos,
    'token': token,
    'lemma': lemma
    }, columns=('token', 'pos', 'lemma'))
# to_string has to be called; printing the bare attribute only shows the
# repr of the bound method.
print(df.to_string())
答案 0 :(得分:0)
这是解决方案:
from lxml import etree
import sys
import os
import glob
import argparse
import pandas as pd
import re
# Maps a regular expression to the converted part-of-speech tag.
pos_conversion_rules = {
    '[.;:?!]': 'XPS',
    '[,]': 'XPW',
    'blablabla': 'BLABLA',
}

# Compiled once at import time instead of on every look_up() call.
_compiled_pos_rules = [
    (re.compile(rule), tag) for rule, tag in pos_conversion_rules.items()
]


def look_up(i):
    """Return the converted tag for the tag or punctuation string *i*.

    Rules are tried with ``match``, so a pattern only has to match at the
    beginning of *i*. The first rule that matches wins. ``None`` is
    returned when *i* is ``None`` or when no rule matches.
    """
    if i is None:
        # re raises TypeError on None; treat it as "no rule matches".
        return None
    for pattern, tag in _compiled_pos_rules:
        if pattern.match(i):
            return tag
    return None
# glob does not expand "~", so the home directory is resolved explicitly.
path = os.path.expanduser("~/where/my/files/are/")
f = "*.xml"
docs = glob.glob(os.path.join(path, f))
# recover=True asks lxml to keep parsing through broken XML.
parser = etree.XMLParser(recover=True)
for d in docs:
    # Rows of [token, lemma, converted tag] for the current document.
    x = []
    tree = etree.parse(d, parser)
    for node in tree.iterfind(".//LM"):
        catg = node.get('catg')
        x.append([node.text,
                  node.get('lemma'),
                  # An element without a catg attribute gets no converted tag.
                  look_up(catg) if catg is not None else None])
        # tail is the text between this element and the next one; it is
        # None when nothing follows. Strip both sides so that leading
        # whitespace does not defeat the anchored match in look_up().
        punctuation = (node.tail or "").strip()
        if punctuation:
            # Punctuation becomes a row of its own and is its own lemma.
            x.append([punctuation,
                      punctuation,
                      look_up(punctuation)])
    # NOTE(review): df is rebound for every document, so after the loop only
    # the last document's frame remains - confirm this is intended.
    df = pd.DataFrame(x)