我正在使用 ElementTree 在 XML 文档中查找标签的值，然后想把这些值追加到 Excel 工作表中。我还用一些正则表达式查找其他值，并希望把它们追加到同一张工作表中。此外，我还想把文件名也追加到该工作表中。下面是我用来追加文件名的代码（至少是我的尝试）：
#!/usr/bin/python
from openpyxl import Workbook
import os, sys
# Directory whose entries are recorded in the workbook.
path = "xmls"
dirs = os.listdir(path)

wb = Workbook()
ws = wb.active

# Append inside the loop so every directory entry gets its own row.
# Appending once after the loop would record only the last entry and
# would raise NameError when the directory is empty.
for file in dirs:
    print(file)
    ws.append([file])

wb.save("sample.xlsx")
下面这段代码使用 ElementTree 从标签中获取值：
from openpyxl import Workbook
import xml.etree.ElementTree as ET
import os
# Directory that holds the XML documents to scan.
xml_dir = 'xml'

wb = Workbook()
# Grab the active worksheet.
ws = wb.active

for filename in os.listdir(xml_dir):
    # os.listdir() yields bare names; join them with the directory so that
    # parsing works regardless of the current working directory.
    element_tree = ET.parse(os.path.join(xml_dir, filename))
    root = element_tree.getroot()
    title = root.find(".//title")
    # A document without a <title> yields an empty cell instead of an
    # AttributeError on None.
    agreement = title.text if title is not None else None
    print(agreement)
    # Append per file so each document contributes one row; appending after
    # the loop would keep only the last title.
    ws.append([agreement])

# Save the file.
wb.save("sample.xlsx")
下面是我使用正则表达式查找值的代码：
import re
import os
import openpyxl
# Regular expressions for the <content> entries we want to count.
# Raw strings are used so that \s, \d and \= reach the regex engine as-is
# instead of being treated as (invalid) string escape sequences.
regexPattern1 = r">Data\s+\d*\s*\=*\s*</content>"
regexPattern2 = r">Some Data\s+\d*\s*\=*\s*</content>"
regexPattern3 = r">More Data\s+\d*\s*\=*\s*</content>"
regexPattern4 = r">Data More\s+\d*\s*\=*\s*</content>"
regexPattern5 = r">Some More Data\s+\d*\s*\=*\s*</content>"
#function to get the values of the various regular expressions above
def get_values(filepath):
    """Count the lines of a file that match each of the five patterns.

    The file is read as UTF-8, line by line. A line contributes at most one
    to a pattern's count, no matter how many times the pattern occurs in it.

    Args:
        filepath: Path of the file to scan.

    Returns:
        A 5-tuple of ints: the number of matching lines for regexPattern1
        through regexPattern5, in that order. All zeros when nothing matches.
    """
    patterns = (
        regexPattern1,
        regexPattern2,
        regexPattern3,
        regexPattern4,
        regexPattern5,
    )
    counts = [0] * len(patterns)

    # The context manager closes the handle even if reading or decoding fails.
    with open(filepath, encoding="utf8") as xml_file:
        for line in xml_file:
            for index, pattern in enumerate(patterns):
                if re.search(pattern, line):
                    counts[index] += 1

    return tuple(counts)
def process_folder(folder, output_xls_path):
    """Write the per-file pattern counts of a folder's XML files to a workbook.

    The first row is a header; each following row holds the five counts
    returned by get_values() for one XML file, stored as strings.

    Args:
        folder: Directory to scan (not recursive).
        output_xls_path: Path the workbook is saved to.
    """
    # Match on the extension only, so names such as "a.xml.bak" are skipped.
    xml_paths = [
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if name.endswith(".xml")
    ]

    wb = openpyxl.Workbook()
    sheet = wb.active

    # Header row; the second column was previously a duplicated "Data1".
    sheet.append(("Data1", "Data2", "Data3", "Data4", "Data5"))

    # On a new sheet, append() fills columns A..E of the next free row.
    for xml_path in xml_paths:
        sheet.append(tuple(str(count) for count in get_values(xml_path)))

    wb.save(output_xls_path)
# Script entry point: scan the "xmls" folder and write the counts workbook.
if __name__ == "__main__":
    process_folder(folder="xmls", output_xls_path="xml.xlsx")