我正在使用正则表达式查找json文件中的数据元素。我已经定义了5种模式。
from pathlib import Path # path, iterdir
import fileinput # read file input
import re # regular expressions
import os
def main():
mainList = [] # the list of elements to populate
p1 = re.compile('\s*(\[|{),') # pattern: open group e.g., ' {'
p2 = re.compile('\s*(\]|}),?') # pattern: close group e.g., ' }'
p3 = re.compile('\s*".*": (\[|{)') # pattern: element, no value e.g., ' "lnccodes": ['
p4 = re.compile('\s*".*": (".*"|[^[{]*),?') # pattern: element & value e.g., ' "abnormal": false,'
p5 = re.compile('\s*".*"') # pattern: value only (rare; lab) e.g., ' "urn:cl:ien:60:205:72"'
p = Path(os.getcwd()) # create a path
for file in p.glob('lab' + '*' + '.json'): # inner loop through files to read elements and values
infile = open(file.name, 'r') # open file
start = False # initialize start
lineNumber = 0
for line in infile: # for each line in the selected file
lineNumber = lineNumber + 1
if start: # if we've gotten to the data in the file
splitLine = line.split('"') # get parsed array of parts
if re.fullmatch(p4, line): # test most common match first -- this is element & value
element = splitLine[1]
if splitLine[2].strip() == ':': # if not string, get value from
value = splitLine[3]
else:
value = splitLine[2].split(':') # remove ':' from values
mainList.append((element, value, file.name, lineNumber))
continue # once a match is found & processed, get the next line
elif re.fullmatch(p1, line):
continue
elif re.fullmatch(p2, line):
continue
elif re.fullmatch(p3, line):
element = splitLine[1]
continue
elif re.fullmatch(p5, line):
value = splitLine[1]
mainList.append((element, value, file.name, lineNumber))
continue
else:
if "[" in line: # look for beginning of data
start = True # indicate it has started
element = None # initialize element
infile.readline() # start has one [ and one {
lineNumber = lineNumber + 1 # so we skip the { artificially
infile.close()
for k in mainList:
print(k)
main()
所有模式在IDLE中始终如一。
#4在脚本中一致地工作; #2有时会工作(没有前导空格)。添加原始标志似乎没有任何效果。
我做错了什么?