我目前有以下RegEx工作流程来匹配WordPress标题标记,并用Jekyll标题标记替换它们:
import sys, re
def re_sub(pattern, replacement, string):
def _r(m):
# Now this is ugly.
# Python has a "feature" where unmatched groups return None
# then re.sub chokes on this.
# see http://bugs.python.org/issue1519638
# this works around and hooks into the internal of the re module...
# the match object is replaced with a wrapper that
# returns "" instead of None for unmatched groups
class _m():
def __init__(self, m):
self.m=m
self.string=m.string
def group(self, n):
return m.group(n) or ""
return re._expand(pattern, _m(m), replacement)
return re.sub(pattern, _r, string)
def parseCaptions(content):
"""
[caption id="attachment_76716" align="aligncenter" width="500"]<a href="http://martin-thoma.com/wp-content/uploads/2013/11/WER-calculation.png"><img src="http://martin-thoma.com/wp-content/uploads/2013/11/WER-calculation.png" alt="WER calculation" width="500" height="494" class="size-full wp-image-76716" /></a> WER calculation[/caption]
to
{% caption align="aligncenter" width="500" alt="WER calculation" text="WER calculation" url="../images/2013/11/WER-calculation.png" %}
"""
import re
pattern = '\[caption(.*?)align="(?P<align>.*?)"(.*?)caption="(?P<caption>.*?)"(.*?)\]' + \
'<a(.*?)href=\"(?P<url>(.*?))\"(?P<asonst>.*?)>' + \
'<img(.*?)src=\"(?P<imgurl>http://martin-thoma.com/wp-content/uploads/(?P<innerurl>(.*?)))\" ' + \
'(class=\"(?P<imgclass>.*?)\")?\s*' + \
'title=\"(?P<title>.*?)\"\s*' + \
'(alt=\"(?P<alt>.*?)\")?\s*' + \
'width=\"(?P<width>.*?)\"\s*' + \
'height=\"(?P<height>.*?)\"\s*' + \
'(?P<isonst>.*?)/>' + \
'</a>\s*' + \
'(?P<text>.*?)\[/caption\]'
pattern = re.compile(pattern)
results = [m.groupdict() for m in pattern.finditer(content)]
for result in results:
for key, value in result.items():
print("%s:\t%s" % (key, value))
content = re_sub(pattern, '{% caption align="\g<align>" width="\g<width>" caption="\g<caption>\g<text>" url="../images/\g<innerurl>" alt="\g<alt>" title="\g<title>" height="\g<height>" class="\g<imgclass>" %}', content)
return content
我的正则表达式不是很好,因为标签内的属性顺序无关紧要,但它对我的正则表达式很重要。如何使正则表达式匹配标签内的任何订单(但最多一次或与最新出现的匹配)?
(编辑:此RegEx不是最理想的另一个原因是所有群组都应将"
替换为'
)