嗨我有一个以下格式的输入文件。
.....
......
<TABLE COLS="3">
<ROW>
<R>data</R>
<R>data</R>
</ROW>
<ROW>
<R>data</R>
<R>data</R>
<R>data</R>
</ROW>
</TABLE>
<TABLE COLS="4">
<ROW>
<R>data</R>
<R>data</R>
<R>data</R>
<R>data</R>
<R>data</R>
</ROW>
<ROW>
<R>data</R>
<R>data</R>
</ROW>
</TABLE>
.......
.....
.
...
输出文件应为:
....
....
.
..
<table ct="3">
<ent="1">
<ent="2">
<ent="3">
<row>
<rvn ="1">data</rvn>
<rvn ="2">data</rvn>
</row>
<row>
<rvn ="1">data</rvn>
<rvn ="2">data</rvn>
<rvn ="3">data</rvn>
</row>
</table>
<table ct="4">
<ent="1">
<ent="2">
<ent="3">
<ent="4">
<row>
<rvn ="1">data</rvn>
<rvn ="2">data</rvn>
<rvn ="3">data</rvn>
<rvn ="4">data</rvn>
<rvn ="5">data</rvn>
</row>
<row>
<rvn ="1">data</rvn>
<rvn ="2">data</rvn>
</row>
</table>
...
...
...
我编写了以下代码。运行后,每个表的 COLS 值都会被最后一个表的 COLS 值替换;另外,我不知道如何让 <rvn> 的编号在每一行内逐个递增。
请帮我解决这两个问题。
import re
def tblcnv(st, val):
    """Rewrite the opening ``<table ...>`` tag(s) in *st* with a column header.

    Every opening table tag (matched case-insensitively) is replaced by
    ``<table ct='VAL'>`` followed by one ``<colspec>`` line for each column
    number from 1 to ``int(val)`` and a trailing blank line.

    Args:
        st: Text containing the table markup.
        val: Column count as a string of digits; it is both concatenated
            into the new tag and converted with ``int()``.

    Returns:
        *st* with the opening table tag(s) replaced; the rest is untouched.
    """
    colspec_lines = [
        "<colspec col='" + str(col) + "' colwidth=''/>\n"
        for col in range(1, int(val) + 1)
    ]
    replacement = "<table ct='" + val + "'>\n" + "".join(colspec_lines) + "\n"
    return re.sub(r"(?i)<table.*?>", replacement, st)
# Driver: read in.txt, rewrite its tables via tblcnv, write the result to out.txt.
# Neither file handle is closed explicitly in this snippet.
in_data = open("in.txt", "r")
out_data = open("out.txt", "w")
Rdata = in_data.read()
# Flatten the file onto one line so that `.` in the pattern below can span
# what used to be separate lines (`.` does not match "\n" by default).
Rval = Rdata.replace("\n", " ")
# NOTE(review): the first `.*` (between <TABLE and cols=) is greedy. On input
# with several tables it runs to the LAST cols="..." in the text, so there is a
# single match covering all tables and group 2 holds the last table's column
# count. That matches the reported symptom: every table gets the last value.
Rval = re.sub("(?i)(<TABLE.*cols=\"(\d+).*?</TABLE>)", lambda m: tblcnv(m.group(1), m.group(2)), Rval)
out_data.write(Rval)
答案 0 :(得分:1)
下面是可以正常运行的代码……
注意:不建议用正则表达式处理这类标记文本……使用解析器始终是更好的方法……
import re
# Cell numbering for the row currently being converted. rowcnv() resets it at
# the start of every row and datacnv() consumes it; it stays None until the
# first row has been processed.
counter = None


def datacnv(st):
    """Wrap one cell's text in an ``<rvn="N">`` tag.

    N is the next number taken from the module-level ``counter``, so this
    must be called while a row is being converted by ``rowcnv``.

    Args:
        st: The text found between ``<R>`` and ``</R>``.

    Returns:
        ``<rvn="N">st</rvn>`` followed by a newline.
    """
    global counter
    # str() keeps this working whether the counter yields ints or strings.
    return "<rvn=\"" + str(next(counter)) + "\">" + st + "</rvn>\n"


def rowcnv(st):
    """Convert the inside of one ``<ROW>`` element.

    Every ``<R>...</R>`` cell (matched case-insensitively) becomes an
    ``<rvn="N">`` element, numbered from 1 within this row.

    Args:
        st: The text found between ``<ROW>`` and ``</ROW>``.

    Returns:
        The converted cells wrapped in ``<row>`` ... ``</row>`` plus newlines.
    """
    # Imported here so the function is self-contained.
    import itertools

    global counter
    # An unbounded counter: the previous iter("123456789") ran out after nine
    # cells and raised StopIteration on the tenth.
    counter = itertools.count(1)
    st = re.sub("(?i)<R>(.*?)</R>", lambda m: datacnv(m.group(1)), st)
    return "<row>\n" + st + "</row>\n"
def tblcnv(st, val):
    """Convert one table: new opening tag, colspec header, converted rows.

    The opening ``<table ...>`` tag (matched case-insensitively) is replaced
    by a newline, ``<table ct='VAL'>``, one ``<colspec>`` line per column
    number from 1 to ``int(val)``, and a blank line. Each
    ``<ROW>...</ROW>`` element is then passed through ``rowcnv``.

    Args:
        st: Text of the table, from its opening tag to its closing tag.
        val: Column count as a string of digits.

    Returns:
        The converted table text.
    """
    colspecs = "".join(
        "<colspec col='" + str(col) + "' colwidth=''/>\n"
        for col in range(1, int(val) + 1)
    )
    opening = "\n<table ct='" + val + "'>\n" + colspecs + "\n"
    with_header = re.sub(r"(?i)<table.*?>", opening, st)
    return re.sub(
        "(?i)<ROW>(.*?)</ROW>",
        lambda row: rowcnv(row.group(1)),
        with_header,
    )
def _convert_table(match):
    """Normalise one matched ``<TABLE>...</TABLE>`` span and convert it.

    Only the table itself is flattened onto one line (the row and cell
    patterns used by ``tblcnv`` do not match across newlines), and only tag
    names are lower-cased. Cell text, and everything outside the tables,
    keeps its original case and line breaks.
    """
    table = match.group(1).replace("\n", " ")
    table = re.sub(r"</?[A-Za-z][^\s>/]*", lambda tag: tag.group(0).lower(), table)
    return tblcnv(table, match.group(2))


# Read the whole input first; `with` closes the file even if reading fails.
with open("in.txt", "r") as in_data:
    Rdata = in_data.read()

# (?s) lets `.` span newlines, so the file no longer has to be flattened.
# [^>]*? keeps the cols= lookup inside the opening tag, so a table without a
# cols attribute cannot swallow the following table.
Rval = re.sub(r'(?is)(<TABLE[^>]*?cols="(\d+).*?</TABLE>)', _convert_table, Rdata)

with open("out.txt", "w") as out_data:
    out_data.write(Rval)
输出:
<table ct='3'>
<colspec col='1' colwidth=''/>
<colspec col='2' colwidth=''/>
<colspec col='3' colwidth=''/>
<row>
<rvn="1">data</rvn>
<rvn="2">data</rvn>
</row>
<row>
<rvn="1">data</rvn>
<rvn="2">data</rvn>
<rvn="3">data</rvn>
</row>
</table>
<table ct='4'>
<colspec col='1' colwidth=''/>
<colspec col='2' colwidth=''/>
<colspec col='3' colwidth=''/>
<colspec col='4' colwidth=''/>
<row>
<rvn="1">data</rvn>
<rvn="2">data</rvn>
<rvn="3">data</rvn>
<rvn="4">data</rvn>
<rvn="5">data</rvn>
</row>
<row>
<rvn="1">data</rvn>
<rvn="2">data</rvn>
</row>
</table>
答案 1 :(得分:1)
使用HTML / XML解析器是一种操作HTML / XML的简单且不易出错的方式。
它更容易,因为解析器允许您处理更高级别的概念:标记和属性,而不是任意字符串上的正则表达式。
以下是使用lxml的示例:
import lxml.etree as ET
import itertools as IT
# Sample input: the two tables from the question wrapped in a single <root>
# element (presumably so the text parses as one XML document). The backslash
# after the opening quotes keeps the string from starting with a blank line.
content = '''\
<root>
<TABLE COLS="3">
<ROW>
<R>data</R>
<R>data</R>
</ROW>
<ROW>
<R>data</R>
<R>data</R>
<R>data</R>
</ROW>
</TABLE>
<TABLE COLS="4">
<ROW>
<R>data</R>
<R>data</R>
<R>data</R>
<R>data</R>
<R>data</R>
</ROW>
<ROW>
<R>data</R>
<R>data</R>
</ROW>
</TABLE>
</root>
'''
root = ET.fromstring(content)

# Fallback numbering so an <R> that appears before any <ROW> is still numbered
# instead of raising NameError; every <row> replaces it with a fresh counter.
count = IT.count(1)

# tag=ET.Element restricts the walk to real elements. Comments and processing
# instructions have a non-string .tag, which would break the .lower() call.
for elt in root.iter(tag=ET.Element):
    elt.tag = elt.tag.lower()
    if elt.tag == 'table':
        # Rename the COLS attribute to ct.
        elt.attrib['ct'] = elt.attrib.pop('COLS')
        # Add <ent> tags. Inserting at index 0 while counting down leaves
        # them in ascending order ahead of the rows.
        for i in range(int(elt.attrib['ct']), 0, -1):
            elt.insert(0, ET.Element('ent', value=str(i)))
    elif elt.tag == 'row':
        # Restart count every time <row> is encountered
        count = IT.count(1)
    elif elt.tag == 'r':
        # Change <R> to <rvn>
        elt.tag = 'rvn'
        elt.attrib['value'] = str(next(count))

# tostring() returns bytes by default, which print() would show as b'...' on
# Python 3; encoding="unicode" makes it return text instead.
print(ET.tostring(root, pretty_print=True, encoding="unicode"))
输出:
<root>
<table ct="3">
<ent value="1"/><ent value="2"/><ent value="3"/><row>
<rvn value="1">data</rvn>
<rvn value="2">data</rvn>
</row>
<row>
<rvn value="1">data</rvn>
<rvn value="2">data</rvn>
<rvn value="3">data</rvn>
</row>
</table>
<table ct="4">
<ent value="1"/><ent value="2"/><ent value="3"/><ent value="4"/><row>
<rvn value="1">data</rvn>
<rvn value="2">data</rvn>
<rvn value="3">data</rvn>
<rvn value="4">data</rvn>
<rvn value="5">data</rvn>
</row>
<row>
<rvn value="1">data</rvn>
<rvn value="2">data</rvn>
</row>
</table>
</root>