我有一个类似于以下格式的csv文件:
===============================================================
#Type 1 Definition
#============================================================================
#TYPE, <name>
#Some tag for type------------------------------------------------------
#TYPESomeTag, <id>, <name>, <param>
#Another tag for type----------------------------------------------
#TYPEAnothertag, <param_1>, <param_2>, <param_3>
TYPE, Name_1
TYPESomeTag, 1, 2, 3
TYPESomeTag, 4, 2, 5
TYPEAnothertag, a, b, c
TYPE, Name_2
TYPESomeTag, 1, 2, 3
TYPESomeTag, 4, 2, 5
TYPEAnothertag, a, b, c
#===============================================================================
#Type 2 Definition
#===============================================================================
#TYPE2, <name>
#Some tag for type------------------------------------------------------
#TYPE2SomeTag, <id>, <name>, <param>
#Another tag for type----------------------------------------------
#TYPE2Anothertag, <param_1>, <param_2>, <param_3>
TYPE2, Name_1
TYPE2SomeTag, 1, 2, 3
TYPE2SomeTag, 4, 2, 5
TYPE2Anothertag, a, b, c
TYPE2, Name_2
TYPE2SomeTag, 1, 2, 3
TYPE2SomeTag, 4, 2, 5
TYPE2Anothertag, a, b, c
and so on...
我的目标是将上面的csv转换为xml格式,我使用的也是Python。这是我开始实现的方式:
for row in csv.reader(open(csvFile)):
if(row): #check for blank lines
if row[0] == 'TYPE':
xmlData.write(' ' + '<TYPE'+ row[1] + '>'+"\n")
elif row[0] == 'TYPESomeTag'
xmlData.write(' ' + '<TYPESomeTag'+ row[2] + '>'+"\n")
elif
#write some more tags
else
#something else
xmlData.close()
我遵循的这种方法非常简陋,因为它不容易扩展。我将每行的第一列与字符串进行比较。现在问题出现了,如果有另一组类型定义,如TYPE2。然后我必须写另一组if..else语句,我认为这不是真正有效地做到这一点的方法。
有人可以建议我如何以更好的方式完成将上述csv转换为xml的任务。
编辑:
这是我的目标xml:
<tags>
<TYPE Name_1>
<TYPESomeTag>
<id>1</id>
<name>2</name>
<param>3</param>
</TYPESomeTag>
<TYPESomeTag>
<id>4</id>
<name>2</name>
<param>5</param>
</TYPESomeTag>
<TYPEAnothertag>
<param_1>a</param_1>
<param_2>b</param_2>
<param_3>c</param_3>
</TYPEAnothertag>
</TYPE>
<TYPE2 Name_2>
<TYPE2SomeTag>
<id>1</id>
<name>2</name>
<param>3</param>
</TYPE2SomeTag>
<TYPE2SomeTag>
<id>4</id>
<name>2</name>
<param>5</param>
</TYPE2SomeTag>
<TYPE2Anothertag>
<param_1>a</param_1>
<param_2>b</param_2>
<param_3>c</param_3>
</TYPE2Anothertag>
</TYPE2>
</tags>
答案 0 :(得分:2)
那是一个相当复杂的问题:
tags
我会用:
它以下面的代码结尾:
import re
import csv
from xml.etree import ElementTree as ET
import xml.dom.minidom as minidom
class DefFilter:
    """Iterator that hides ``#`` comment lines from a CSV reader.

    Wraps an iterator of text lines (typically an open file). Lines that
    start with ``#`` are never yielded. Those that look like a definition,
    ``#TAG, <field_1>, <field_2>, ...``, are recorded as
    ``conf['TAG'] = ['field_1', 'field_2', ...]``; every other comment line
    is silently dropped. All remaining lines are yielded stripped of
    leading/trailing whitespace.

    Parameters:
        fd: iterator yielding text lines.
        conf: optional dict to fill with the parsed definitions. The caller's
            dict is updated in place; a new dict is created when omitted.
    """

    def __init__(self, fd, conf=None):
        self.conf = {} if conf is None else conf
        self.fd = fd
        # "#TAG" followed by one or more ", <field>" groups.
        self.line = re.compile(r'#\s*(\w+)\s*((?:,\s*\<\w+\>)+)')
        # One leading ", <field>" group plus whatever groups remain after it.
        # Raw string: the escapes are regex escapes, not string escapes.
        self.tagname = re.compile(r',\s*<(\w*)>((?:,\s*\<\w+\>)*)')

    def _parse_tags(self, line):
        """Return the field names found in a ``, <a>, <b>, ...`` string.

        Names are peeled off one group at a time. Parsing stops at the first
        position that is not a ``, <name>`` group, so malformed input yields
        the names collected so far instead of raising.
        """
        names = []
        while True:
            m = self.tagname.match(line)
            if m is None:
                return names
            names.append(m.group(1))
            if not m.group(2):
                return names
            line = m.group(2)

    def __iter__(self):
        return self

    def next(self):
        """Return the next non-comment line, stripped.

        Definition comments met on the way are stored in ``self.conf``.
        ``StopIteration`` from the wrapped iterator propagates unchanged and
        ends the iteration.
        """
        while True:
            line = next(self.fd).strip()
            if not line.startswith('#'):
                return line
            m = self.line.match(line)
            if m:
                self.conf[m.group(1)] = self._parse_tags(m.group(2))

    def __next__(self):
        # Python 3 iterator protocol; ``next`` is kept for Python 2 callers.
        return self.next()
class Parser:
    """Build an XML tree rooted at ``<tags>`` from a commented CSV stream.

    Rows with exactly two columns open a new element directly under the
    root: the first column is the element name and the second becomes an
    attribute value. Rows with more than two columns become a child of the
    most recent two-column element, with one sub-element per value. Rows
    with fewer than two columns are ignored.

    Attribute and sub-element names come from the ``#TAG, <a>, <b>, ...``
    definition comments collected by ``DefFilter``. When a name is not
    declared, ``name`` is used for the attribute and ``param_<n>`` for the
    n-th value.

    Parameters:
        conf: optional dict of pre-declared names (tag -> list of names),
            passed through to ``DefFilter``.
    """

    def __init__(self, conf=None):
        self.conf = conf

    def parse(self, fd):
        """Parse the lines yielded by *fd* and store the tree in ``self.root``.

        Raises:
            ValueError: if a multi-column row appears before any two-column
                row, since there is no parent element to attach it to.
        """
        flt = DefFilter(fd, self.conf)
        root = ET.Element('tags')
        elt = None  # most recent two-column (parent) element
        for row in csv.reader(flt):
            if len(row) < 2:
                continue
            tag = row[0].strip()
            # Definitions are filled in by the filter as comments are consumed.
            declared = flt.conf.get(tag, ())
            if len(row) == 2:
                attr = declared[0] if declared else 'name'
                elt = ET.SubElement(root, tag, {attr: row[1].strip()})
                continue
            if elt is None:
                raise ValueError(
                    'row %r appears before any two-column type row' % (row,))
            x = ET.SubElement(elt, tag)
            for i, val in enumerate(row[1:]):
                # Fall back to a positional name for values beyond the
                # declared ones instead of failing with an IndexError.
                if i < len(declared):
                    child_name = declared[i]
                else:
                    child_name = 'param_' + str(i + 1)
                ET.SubElement(x, child_name).text = val.strip()
        self.root = root

    def parsefile(self, filename):
        """Open *filename* and parse its contents."""
        with open(filename) as fd:
            self.parse(fd)

    def prettyprint(self, fd, addindent = ' ', newl = '\n'):
        """Write the parsed tree to the file-like *fd* as indented XML.

        Serialises this instance's own tree (``self.root``), so it does not
        depend on a module-level variable holding the parser.
        """
        minidom.parseString(ET.tostring(self.root)).writexml(
            fd, newl=newl, addindent=addindent)
然后您可以使用:
# ``in`` is a reserved keyword and cannot be used as a variable name,
# so the two file handles are called ``src`` and ``dst``.
with open('in.csv') as src, open('out.xml', 'w') as dst:
    p = Parser()
    p.parse(src)        # read the CSV and build the tree
    p.prettyprint(dst)  # write the indented XML
答案 1 :(得分:0)
您需要将注释行中的参数存储到字典中以便处理,即把
#TYPESomeTag, <id>, <name>, <param>
转换为
tags = {"TYPESomeTag":["id", "name", "param"]}
import csv

# Convert the commented CSV in ``csvFile`` to XML-like text on stdout.
#
# Comment rows (first cell starts with '#') carry the definitions:
#   '#... Definition'         announces that a type name follows,
#   '#TYPE, <name>'           names the top-level type,
#   '#TYPESomeTag, <id>, ...' declares the parameter names of a child tag.
# Data rows either open a new top-level element (first cell equals the
# current type name) or add a child element with one sub-element per value.
csvFile = 'sample.csv'

nextLineIsTagName = False  # True between a 'Definition' banner and the type-name row
tags = dict()              # child tag -> list of declared parameter names
tag = None                 # name of the top-level type currently defined
tagOpened = False          # True while a top-level element is still open

# The ``with`` block makes sure the input file is closed.
with open(csvFile) as csvInput:
    for row in csv.reader(csvInput, skipinitialspace=True):
        if not row:  # skipping empty lines
            continue
        # startswith() is safe on an empty first cell, unlike row[0][0].
        if row[0].startswith('#'):  # types definition within csv comment block
            if tagOpened:  # there is an opened tag so we need to close it
                print("</" + tag + ">")
                tags = dict()
                tag = None
                tagOpened = False
            if len(row) == 1 and 'Definition' in row[0]:
                nextLineIsTagName = True
                continue
            if nextLineIsTagName and len(row) == 2:
                tag = row[0][1:]
                nextLineIsTagName = False
                continue
            if not nextLineIsTagName and len(row) > 1:
                # Store bare names: the cells read '<id>', '<name>', ... and
                # keeping the brackets would print '<<id>>1</<id>>'.
                tags[row[0][1:]] = [param.strip().strip('<>') for param in row[1:]]
        else:  # processing csv data
            if len(row) < 2:
                continue
            if row[0] == tag:  # we need to start a new TYPE element
                if tagOpened:  # close previous tag before opening a new one
                    print("</" + tag + ">")
                print("<" + tag + " " + row[1] + ">")
                tagOpened = True
            else:  # we need to add parameters to the open TYPE element
                names = tags.get(row[0], [])
                print("\t<" + row[0] + ">")
                for i, value in enumerate(row[1:]):
                    # Undeclared tags or surplus values get a positional name
                    # instead of raising KeyError / IndexError.
                    name = names[i] if i < len(names) else 'param_' + str(i + 1)
                    print("\t\t<" + name + ">" + value + "</" + name + ">")
                print("\t</" + row[0] + ">")

if tagOpened:  # closing last tag at end of file
    print("</" + tag + ">")
这样你就可以解析每个注释行,而无需手动硬编码参数列表。以下是处理您给定的csv的示例代码。
update NewTable
inner join OldTable on NewTabl.eCreatorId = OldTable.CreatorId
set latitude = avg(OldTable.latitude)
group by OldTable.latitutude;
答案 2 :(得分:0)
考虑使用xml模块构建xml文档,而不是拼接元素的字符串表示。通过这种方式,您可以逐行读取csv,根据行的位置有条件地添加子元素和文本值。下面的代码为孙节点添加了通用的<tag>元素:
import csv
import lxml.etree as ET
# Build an element tree rooted at <tags> from the rows of Type1.csv and
# print it as indented XML.
# NOTE(review): the indentation of this snippet has been lost, so the nesting
# of the if/else blocks below cannot be read off the text -- restore it from
# the original answer before running.
# INITIATE TREE
root = ET.Element('tags')
# READ CSV LINE BY LINE
# cnt is incremented once per processed row and reset to 0 by the else branch;
# strtype remembers the first field of the row that created the current
# top-level element.
cnt = 0; strtype = ''
with open('Type1.csv', 'r') as f:
csvr = csv.reader(f)
for line in csvr:
# CONDITIONALLY ADD CHILDREN ATTRIB OR ELEMENTS
if len(line) > 1:
if cnt==0 or line[0] == strtype:
strtype = line[0]
# New element under the root; the second field is stored in its 'attr' attribute.
typeNode = ET.SubElement(root, strtype.strip())
typeNode.set('attr', line[1].strip())
if cnt >= 1:
# Child of the current top-level element. Fields 1-3 are read unconditionally
# and each is stored in a generic <tag> sub-element, so rows reaching this
# point need at least four fields.
typesomeNode = ET.SubElement(typeNode, line[0].strip())
ET.SubElement(typesomeNode, 'tag').text = line[1].strip()
ET.SubElement(typesomeNode, 'tag').text = line[2].strip()
ET.SubElement(typesomeNode, 'tag').text = line[3].strip()
else:
cnt = 0
continue
cnt += 1
# CONVERT TREE TO STRING W/ INDENTATION
# tostring() output is decoded as UTF-8 before printing.
tree_out = ET.tostring(root, pretty_print=True)
print(tree_out.decode("utf-8"))
要使用<id>、<name>、<param>、<param_1>等替换通用标记,请考虑使用XSLT(一种用于重新设计/重组xml文件的转换语言)。Python的lxml模块可以运行这样的XSLT 1.0脚本。这是避免在上面第一次读取csv时编写许多条件判断的一种方法:
# XSLT 1.0 stylesheet applied to the tree held in `root`:
#   - the identity template copies every node and attribute unchanged;
#   - TYPESomeTag / TYPE2SomeTag: the 1st-3rd generic <tag> children are
#     rewritten as <id>, <name>, <param>;
#   - TYPEAnothertag / TYPE2Anothertag: they are rewritten as
#     <param_1>, <param_2>, <param_3>.
xslt_str = '''
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:output version="1.0" encoding="UTF-8" indent="yes" />
<xsl:strip-space elements="*"/>
<!-- Identity Transform -->
<xsl:template match="@*|node()">
<xsl:copy>
<xsl:apply-templates select="@*|node()"/>
</xsl:copy>
</xsl:template>
<xsl:template match="TYPESomeTag|TYPE2SomeTag">
<xsl:copy>
<id><xsl:value-of select="tag[1]"/></id>
<name><xsl:value-of select="tag[2]"/></name>
<param><xsl:value-of select="tag[3]"/></param>
</xsl:copy>
</xsl:template>
<xsl:template match="TYPEAnothertag|TYPE2Anothertag">
<xsl:copy>
<param_1><xsl:value-of select="tag[1]"/></param_1>
<param_2><xsl:value-of select="tag[2]"/></param_2>
<param_3><xsl:value-of select="tag[3]"/></param_3>
</xsl:copy>
</xsl:template>
</xsl:transform>
'''
# PARSE XSL STRING (CAN ALSO READ FROM FILE)
xslt = ET.fromstring(xslt_str)
# TRANSFORM SOURCE XML WITH XSLT
# ET.XSLT wraps the parsed stylesheet in a callable; calling it on `root`
# produces the transformed document.
transform = ET.XSLT(xslt)
newdom = transform(root)
# Print the transformed document.
print(str(newdom))
输出 (对于TYPE1,但类似于TYPE2)
<?xml version="1.0"?>
<tags>
<TYPE attr="Name_1">
<TYPESomeTag>
<id>1</id>
<name>2</name>
<param>3</param>
</TYPESomeTag>
<TYPESomeTag>
<id>4</id>
<name>2</name>
<param>5</param>
</TYPESomeTag>
<TYPEAnothertag>
<param_1>a</param_1>
<param_2>b</param_2>
<param_3>c</param_3>
</TYPEAnothertag>
</TYPE>
<TYPE attr="Name_2">
<TYPESomeTag>
<id>1</id>
<name>2</name>
<param>3</param>
</TYPESomeTag>
<TYPESomeTag>
<id>4</id>
<name>2</name>
<param>5</param>
</TYPESomeTag>
<TYPEAnothertag>
<param_1>a</param_1>
<param_2>b</param_2>
<param_3>c</param_3>
</TYPEAnothertag>
</TYPE>
</tags>