我正在尝试从非常丑陋的自由文本中提取地址详细信息:
import regex
# Verbose address pattern. Named groups: n_start/n_end (number range),
# n (single number), name (street name), type (street type).
# Raw string: \W, \d and \w must reach the regex engine untouched instead of
# being parsed as (invalid) Python string escapes.
pat_addr_verbose = r"""(?ix) # case insensitive and verbose flag
(?:(?:BND|BY|CNR|OF)\W+)* # non-capturing (list)
(?:(?!RD|HWY|TRAIL|St) # negative lookahead (list of street types)
(?: # either
(?P<n_start>\d+)-(?P<n_end>\d+) # number sequence
|(?<!-)(?P<n>\d+) # single number
)\W+)? # No number, maybe non word character follows
(?P<name>
(?:
(?!RD|HWY|TRAIL|St)\w+\W*)+)\W+ # capturing words not preceded by (list of street types)
(?P<type>RD|HWY|TRAIL|St)* # non-capturing (list of street types)
"""
# Flags are bit masks and have to be combined with `|`. The previous `&`
# of two distinct flags evaluated to 0, i.e. no flags were passed at all and
# the pattern only worked because of the inline (?ix) prefix.
pat_addr = regex.compile(pat_addr_verbose, regex.IGNORECASE | regex.VERBOSE)
# Sample free text holding several addresses, including one street that is
# preceded by a comma-separated list of numbers and number ranges.
text = """BND BY THOMAS RAIL TRAIL, 7 SNOW WHITE HWY & MICKEY RD,
337-343 BOGEYMAN RD, 4, 8, 9-13, 16-18 Fictional Rd & 17 Elm St"""
regex.findall(pat_addr, text)
对于简单的地址,我得到了正确的结果,但在 Fictional Rd 这一条上未能把前面的多个门牌号分别提取出来:
# Show the named groups of every successive match found in the sample text.
[found.groupdict() for found in pat_addr.finditer(text)]
[{'n': None,
'n_end': None,
'n_start': None,
'name': 'THOMAS RAIL',
'type': 'TRAIL'},
{'n': '7',
'n_end': None,
'n_start': None,
'name': 'SNOW WHITE',
'type': 'HWY'},
{'n': None, 'n_end': None, 'n_start': None, 'name': 'MICKEY', 'type': 'RD'},
{'n': None,
'n_end': '343',
'n_start': '337',
'name': 'BOGEYMAN',
'type': 'RD'},
{'n': '4',
'n_end': None,
'n_start': None,
'name': '8, 9-13, 16-18 Fictional',
'type': 'Rd'},
{'n': '17', 'n_end': None, 'n_start': None, 'name': 'Elm', 'type': 'St'}]
我想知道是否可以用正则表达式得到一个由这些数字组成的 list(分组是否命名无关紧要),或者得到一个 dict?
编辑:这是我期望得到的:
选项1:
{'numbers':
[
{
'n': '4',
'n_end': None,
'n_start': None,
},
{
'n': '8',
'n_end': None,
'n_start': None,
},
{
'n': None,
'n_end': '13',
'n_start': '9',
},
{
'n': None,
'n_end': '18',
'n_start': '16',
}
],
'name': 'Fictional',
'type': 'Rd'},
选项2:
{'numbers':
[
'4',
'8',
'9-13',
'16-18'
],
'name': '8, 9-13, 16-18 Fictional',
'type': 'Rd'},
答案 0 :(得分:1)
(?ix) # flags: case-insensitive matching and verbose (free-spacing) syntax
(?:(?:BND|BY|CNR|OF)\W+)* # skip any leading filler words (BND, BY, CNR, OF) without capturing them
(?: # start of the non-capturing wrapper around one number and its separator
(?!RD|HWY|TRAIL|St) # negative lookahead (list of street types)
# one number entry is either a range or a single number
(?P<numbers>\d+-\d+|\d+) # range "N-M" or single number, captured as "numbers"
\W+ # one or more non-word separator characters must follow the number
) # end of the number wrapper
*? # lazily repeat the wrapper zero or more times: an address may carry no number or several
(?P<name>
(?:
(?!RD|HWY|TRAIL|St)[A-Z]+\W*)+)\W+ # street name: consecutive letter-only words that do not start with a street type
(?P<type>RD|HWY|TRAIL|St)* # street type, if present
更新:使用新的 regex 模块
新的 regex 模块确实支持获取重复分组的每一次捕获。
import regex
# Sample free text containing several addresses.
text = (
    'BND BY THOMAS RAIL TRAIL, 7 SNOW WHITE HWY & MICKEY RD, '
    '337-343 BOGEYMAN RD, 4, 8, 9-13, 16-18 Fictional Rd & 17 Elm St'
)
# Address pattern, one adjacent raw literal per logical part:
# flags, leading filler words, repeated numbers, street name, street type.
reg = (
    r'(?ix)'
    r'(?:(?:BND|BY|CNR|OF)\W+)*'
    r'(?:(?!RD|HWY|TRAIL|St)(?P<numbers>\d+-\d+|\d+)\W+)*?'
    r'(?P<name>(?:(?!RD|HWY|TRAIL|St)[A-Z]+\W*)+)\W+'
    r'(?P<type>RD|HWY|TRAIL|St)*'
)
def updateD(m):
    """Return the match's named groups with 'numbers' holding every capture.

    The dictionary from ``m.groupdict()`` carries a single value per group,
    so the 'numbers' entry is overwritten with ``m.captures('numbers')``,
    the list of all values that group captured within this match.

    Args:
        m: match object providing ``groupdict()`` and ``captures(name)``.

    Returns:
        dict: the named groups of ``m``, with 'numbers' mapped to a list.
    """
    d = m.groupdict()
    d['numbers'] = m.captures('numbers')
    return d
# Build one dictionary per address match, each carrying its full list of numbers.
list(map(updateD, regex.finditer(reg, text)))
**输出**
[
{
'numbers': [],
'name': 'THOMAS RAIL',
'type': 'TRAIL'
},
{
'numbers': ['7'],
'name': 'SNOW WHITE',
'type': 'HWY'
},
{
'numbers': [],
'name': 'MICKEY',
'type': 'RD'
},
{
'numbers': ['337-343'],
'name': 'BOGEYMAN',
'type': 'RD'
},
{
'numbers': ['4', '8', '9-13', '16-18'],
'name': 'Fictional',
'type': 'Rd'
},
{
'numbers': ['17'],
'name': 'Elm',
'type': 'St'
}
]