What is the correct way to use gensim's Phrases together with preprocess_string? I am doing it like this, but it feels a bit contrived.
from gensim.models.phrases import Phrases
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_numeric
import re
from gensim import utils
# removed "_" from regular expression
punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^`{|}~"""
RE_PUNCT = re.compile(r'([%s])+' % re.escape(punctuation), re.UNICODE)
def strip_punctuation(s):
    """Replace punctuation characters with spaces in `s` using :const:`~gensim.parsing.preprocessing.RE_PUNCT`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string without punctuation characters.

    Examples
    --------
    >>> from gensim.parsing.preprocessing import strip_punctuation
    >>> strip_punctuation("A semicolon is a stronger break than a comma, but not as much as a full stop!")
    u'A semicolon is a stronger break than a comma but not as much as a full stop '

    """
    s = utils.to_unicode(s)
    return RE_PUNCT.sub(" ", s)
my_filter = [
    lambda x: x.lower(), strip_tags, strip_punctuation,
    strip_multiple_whitespaces, strip_numeric,
    remove_stopwords, strip_short, stem_text
]
documents = ["the mayor of new york was there", "machine learning can be useful sometimes","new york mayor was present"]
sentence_stream = [doc.split(" ") for doc in documents]
bigram = Phrases(sentence_stream, min_count=1, threshold=2)
sent = [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there']
test = " ".join(bigram[sent])
print(preprocess_string(test))
print(preprocess_string(test, filters=my_filter))
The result is:
['mayor', 'new', 'york']
['mayor', 'new_york'] #correct
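For what it's worth, the split can be reproduced in isolation: gensim's stock strip_punctuation builds its regex from string.punctuation, which contains "_", so the joined token is broken apart before it ever reaches the stemmer. A minimal check (the output in the comment is what I expect from the default filters, not pasted from a run):
from gensim.parsing.preprocessing import preprocess_string
# The default filter chain applies gensim's own strip_punctuation, whose
# character class includes "_", so the bigram token gets split in two.
print(preprocess_string("new_york"))  # expected: ['new', 'york']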
Answer (score: 0):
For your example I would suggest using gensim.utils.tokenize() instead of gensim.parsing.preprocessing.preprocess_string().
In many cases, tokenize() does the job well enough, because it returns only sequences of alphabetic characters (no digits). This saves you the extra cleaning steps for punctuation and the like.
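A quick illustration of that behaviour (the expected output in the comment follows from the documented behaviour, not a pasted run):
from gensim import utils
# tokenize() yields runs of word characters containing no digits;
# punctuation and numbers are skipped, and lower=True lowercases tokens.
print(list(utils.tokenize("The 2 mayors of New York!", lower=True)))
# expected: ['the', 'mayors', 'of', 'new', 'york']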
However, tokenize() does not cover removing stopwords or short tokens, nor stemming. If you are working with a language other than English, you would have to cut those steps out anyway.
Here is code for your (already quite clean) example documents that gives you the bigrams you wanted.
documents = ["the mayor of new york was there",
"machine learning can be useful sometimes",
"new york mayor was present"]
import gensim, pprint
# tokenize documents with gensim's tokenize() function
tokens = [list(gensim.utils.tokenize(doc, lower=True)) for doc in documents]
# build bigram model
bigram_mdl = gensim.models.phrases.Phrases(tokens, min_count=1, threshold=2)
# do more pre-processing on tokens (remove stopwords, stemming etc.)
# NOTE: this can be done better
from gensim.parsing.preprocessing import preprocess_string, remove_stopwords, stem_text
CUSTOM_FILTERS = [remove_stopwords, stem_text]
tokens = [preprocess_string(" ".join(doc), CUSTOM_FILTERS) for doc in tokens]
# apply bigram model on tokens
bigrams = bigram_mdl[tokens]
pprint.pprint(list(bigrams))
Output:
[['mayor', 'new_york'],
['machin', 'learn', 'us'],
['new_york', 'mayor', 'present']]
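As a follow-up, the trained Phrases model can also be applied to a single unseen token list; a minimal sketch (new_sent is a made-up example, and the expected result assumes the same model trained above):
# Transform one new, already-tokenized sentence with the trained model;
# adjacent tokens the model scored as a phrase are joined with "_".
new_sent = ['the', 'new', 'york', 'mayor', 'spoke']
print(bigram_mdl[new_sent])  # expected: ['the', 'new_york', 'mayor', 'spoke']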