I have the following example text/tweet:

RT @trader $AAPL 2012 is oooopen to ‘Talk’ about patents with GOOG definitely not the treatment #samsung got:-) heh url_that_cannot_be_posted_on_SO

They clean the tweet in such a way that the end result is:

{RT|123456} {USER|56789} {TICKER|AAPL} {NUMBER|2012} notooopen nottalk patent {COMPANY|GOOG} notdefinetli treatment {HASH|samsung} {EMOTICON|POS} haha {URL}

I use the following regex-based script to tokenize the tweet:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
emoticon_string = r"""
(?:
[<>]?
[:;=8] # eyes
[\-o\*\']? # optional nose
[\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
|
[\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
[\-o\*\']? # optional nose
[:;=8] # eyes
[<>]?
)"""
regex_strings = (
# URL:
r"""http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"""
,
# Twitter username:
r"""(?:@[\w_]+)"""
,
# Hashtags:
r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
,
# Cashtags:
r"""(?:\$+[\w_]+[\w\'_\-]*[\w_]+)"""
,
# Remaining word types:
r"""
(?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals.
|
(?:[\w_]+) # Words without apostrophes or dashes.
|
(?:\.(?:\s*\.){1,}) # Ellipsis dots.
|
(?:\S) # Everything else that isn't whitespace.
"""
)
# The emoticon pattern must be included (and tried first) so that smileys
# such as :-) survive as single tokens; regex_strings alone does not contain it:
word_re = re.compile(r"""(%s)""" % "|".join((emoticon_string,) + regex_strings), re.VERBOSE | re.I | re.UNICODE)
emoticon_re = re.compile(emoticon_string, re.VERBOSE | re.I | re.UNICODE)
######################################################################
class Tokenizer:
def __init__(self, preserve_case=False):
self.preserve_case = preserve_case
def tokenize(self, s):
        # In Python 3 all strings are Unicode, so a plain str() conversion is enough:
        s = str(s)
# Tokenize:
words = word_re.findall(s)
        if not self.preserve_case:
            # Lower-case everything except emoticons (e.g. :-D must keep its case):
            words = [w if emoticon_re.search(w) else w.lower() for w in words]
return words
if __name__ == '__main__':
tok = Tokenizer(preserve_case=False)
test = ' RT @trader $AAPL 2012 is oooopen to ‘Talk’ about patents with GOOG definitely not the treatment #samsung got:-) heh url_that_cannot_be_posted_on_SO'
tokenized = tok.tokenize(test)
print("\n".join(tokenized))
This produces the following output:
rt
@trader
$aapl
2012
is
oooopen
to
‘
talk
’
about
patents
with
goog
definitely
not
the
treatment
#samsung
got
:-)
heh
url_that_cannot_be_posted_on_SO
How can I adjust this script so that it produces the cleaned end result shown above? Thanks in advance for your help!
Answer (score: 1)
You indeed need to use named capturing groups (mentioned by thebjorn) and use groupdict() to get the name/value pairs upon each match. It requires some post-processing, though:

- All pairs where the value is None must be discarded.
- If self.preserve_case is false, the value can be turned to lower case right away.
- If the group name is WORD, ELLIPSIS or ELSE, the value is added to words as is.
- If the group name is HASHTAG, CASHTAG, USER or URL, the value is first stripped of its leading $, # and @ characters and then added to words as a {<GROUP_NAME>|<VALUE>} item.
- All other matches are added to words as {<GROUP_NAME>|<VALUE>} items.

Note that \w matches the underscore by default, so [\w_] = \w. I also optimized the patterns a little. With the test string in the demo below, the adjusted script outputs the token list shown at the end of this answer.
import re
emoticon_string = r"""
(?P<EMOTICON>
[<>]?
[:;=8] # eyes
[-o*']? # optional nose
[][()dDpP/:{}@|\\] # mouth
|
[][()dDpP/:}{@|\\] # mouth
[-o*']? # optional nose
[:;=8] # eyes
[<>]?
)"""
regex_strings = (
# URL:
r"""(?P<URL>https?://(?:[-a-zA-Z0-9_$@.&+!*(),]|%[0-9a-fA-F][0-9a-fA-F])+)"""
,
# Twitter username:
r"""(?P<USER>@\w+)"""
,
# Hashtags:
r"""(?P<HASHTAG>\#+\w+[\w'-]*\w+)"""
,
# Cashtags:
r"""(?P<CASHTAG>\$+\w+[\w'-]*\w+)"""
,
# Remaining word types:
r"""
(?P<NUMBER>[+-]?\d+(?:[,/.:-]\d+[+-]?)?) # Numbers, including fractions, decimals.
|
(?P<WORD>\w+) # Words without apostrophes or dashes.
|
(?P<ELLIPSIS>\.(?:\s*\.)+) # Ellipsis dots.
|
(?P<ELSE>\S) # Everything else that isn't whitespace.
"""
)
word_re = re.compile(r"""({}|{})""".format(emoticon_string, "|".join(regex_strings)), re.VERBOSE | re.I | re.UNICODE)
#print(word_re.pattern)
emoticon_re = re.compile(emoticon_string, re.VERBOSE | re.I | re.UNICODE)  # not used below; emoticons are handled by the EMOTICON named group
######################################################################
class Tokenizer:
def __init__(self, preserve_case=False):
self.preserve_case = preserve_case
def tokenize(self, s):
        # In Python 3 all strings are Unicode, so a plain str() conversion is enough:
        s = str(s)
# Tokenize:
words = []
for x in word_re.finditer(s):
for key, val in x.groupdict().items():
if val:
if not self.preserve_case:
val = val.lower()
if key in ['WORD','ELLIPSIS','ELSE']:
words.append(val)
elif key in ['HASHTAG','CASHTAG','USER','URL']: # Add more here if needed
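                        # Doubled braces in "{{{}|{}}}" are str.format() escapes for
                        # literal { and }, so this emits e.g. {USER|trader}.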
words.append("{{{}|{}}}".format(key, re.sub(r'^[#@$]+', '', val)))
else:
words.append("{{{}|{}}}".format(key, val))
return words
if __name__ == '__main__':
tok = Tokenizer(preserve_case=False)
test = ' RT @trader $AAPL 2012 is oooopen to ‘Talk’ about patents with GOOG definitely not the treatment #samsung got:-) heh http://some.site.here.com'
tokenized = tok.tokenize(test)
print("\n".join(tokenized))
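With the test string above, it outputs:

rt
{USER|trader}
{CASHTAG|aapl}
{NUMBER|2012}
is
oooopen
to
‘
talk
’
about
patents
with
goog
definitely
not
the
treatment
{HASHTAG|samsung}
got
{EMOTICON|:-)}
heh
{URL|http://some.site.here.com}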