token_pattern=r'(?u)\b\w\w+\b'
def preprocess(line,
token_pattern=token_pattern,
exclude_stoptword=config.cooccurrence_word_exclude_stopword,
encode_digit=False):
token_pattern=re.compile(token_pattern,re.UNICODE)
tokens=[x.lower() for x in token_pattern.findall(line)]
tokens_stemmed=stem_tokens(tokens,english_stemmer)
if exclude_stoptword:
tokens_stemmed=[x for x in tokens_stemmed if x not in stopwords]
return tokens_stemmed
def extract(df):
#Unigram Features
df["query_unigram"]=list(df.apply(lambda x:preprocess(df["query"]),axis=1))
出现以下错误:
tokens=[x.lower() for x in token_pattern.findall(line)]
TypeError: ('expected string or buffer', u'occurred at index 0')
df [" query"]包含:
0新娘淋浴装饰
1个led圣诞灯
2台投影机
3个酒架
4个灯泡
5奥克利偏振雷达
6条男朋友牛仔裤
7屏幕保护三星
8个锅和平底锅
<9>华夫饼干制造商