我有一项任务:为产品标题创建一个多类分类器,将它们分为 11 类。我正在使用 scikit-learn 的 LinearSVC 进行分类。
我首先对产品标题进行预处理:删除停用词,借助词性(POS)标注进行词形还原,然后用 TF-IDF 向量化器提取一元和二元语法(unigram 与 bigram)特征。
我现在想在训练之前使用 chi2 特征选择方法,从这些特征中剔除不重要的特征。但是,应该如何在模型中使用 chi2 呢?
以下是代码:
def identity(arg):
    """Return ``arg`` unchanged.

    Acts as a pass-through callable, e.g. as the ``tokenizer`` of a
    vectorizer whose input documents are already lists of tokens.
    """
    return arg
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that turns raw documents into lemma lists.

    Each document is split into sentences, tokenized, POS-tagged, cleaned
    (optional lower-casing and stripping), filtered (stopwords, pure digits,
    pure punctuation) and finally lemmatized with WordNet.

    Parameters
    ----------
    stopwords : collection of str, optional
        Tokens to discard. Falls back to NLTK's English stopword list when
        omitted or empty.
    punct : collection of str, optional
        Characters treated as punctuation. Falls back to
        ``string.punctuation`` when omitted or empty.
    lower : bool, default True
        Lower-case every token before filtering.
    strip : bool, default True
        Strip surrounding whitespace, then ``_``, then ``*`` from every token.
    """

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        self.stopwords = stopwords or set(sw.words('english'))
        self.punct = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def inverse_transform(self, X):
        """Join each token list in ``X`` back into a space-separated string."""
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        """Return one list of lemmas per document in ``X``."""
        return [list(self.tokenize(doc)) for doc in X]

    def tokenize(self, document):
        """Yield the cleaned, lemmatized tokens of ``document`` in order."""
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Tag the raw tokens first: the tagger sees the original casing.
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                if self.lower:
                    token = token.lower()
                if self.strip:
                    # Order matters: whitespace, then '_', then '*'.
                    token = token.strip().strip('_').strip('*')

                # Skip stopwords and purely numeric tokens
                if token in self.stopwords or token.isdigit():
                    continue

                # Skip pure punctuation; a token emptied by stripping is
                # dropped here as well, since all() over '' is True.
                if all(char in self.punct for char in token):
                    continue

                yield self.lemmatize(token, tag)

    def lemmatize(self, token, tag):
        """Lemmatize ``token`` using the WordNet POS derived from ``tag``.

        The first letter of the Penn Treebank tag selects the WordNet
        part of speech; anything unrecognised is treated as a noun.
        """
        # tag[:1] instead of tag[0]: an empty tag falls back to NOUN
        # rather than raising IndexError.
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[:1], wn.NOUN)
        return self.lemmatizer.lemmatize(token, tag)
def build_and_evaluate(X, y,
                       classifier=LinearSVC, outpath=None, verbose=True):
    """Train a text-classification pipeline and print a held-out report.

    The labels are integer-encoded, the data is split 80/20, a pipeline of
    ``NLTKPreprocessor`` -> ``TfidfVectorizer`` -> ``classifier`` is fitted on
    the training part, and a classification report for the test part is
    printed.

    Parameters
    ----------
    X : list
        Raw documents (product titles).
    y : list
        Class label of each document.
    classifier : estimator class or instance, default LinearSVC
        A class is instantiated with its default arguments; an instance is
        used as given.
    outpath : optional
        Accepted for interface compatibility; not used by this function.
    verbose : bool, default True
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    The pipeline fitted on the training split only.
    """

    def build(classifier, X, y=None):
        """Assemble the pipeline around ``classifier`` and fit it on X, y."""
        if isinstance(classifier, type):
            classifier = classifier()

        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            # Documents arrive pre-tokenized, hence the identity tokenizer
            # and lowercase=False.
            ('vectorizer', TfidfVectorizer(
                tokenizer=identity, preprocessor=None,
                ngram_range=(1, 2), min_df=4, lowercase=False
            )),
            ('classifier', classifier),
        ])
        model.fit(X, y)
        return model

    # Encode string labels as integers 0..n_classes-1
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model = build(classifier, X_train, y_train)

    y_pred = model.predict(X_test)
    # Pass the full label list explicitly: without it the report infers the
    # labels from y_test/y_pred and fails when a class is missing from the
    # random test split, because target_names would no longer line up.
    print(clsr(
        y_test, y_pred,
        labels=list(range(len(labels.classes_))),
        target_names=labels.classes_,
    ))
    return model
if __name__ == '__main__':
    # Load the category (column 6) and product title (column 12) from the
    # tab-separated file, skipping its first row.
    df = pd.read_csv('file.txt', sep='\t', quoting=csv.QUOTE_NONE,
                     usecols=[6, 12], skiprows=[0],
                     names=["category", "product_title"])

    # Keep the 10 most frequent categories; everything else becomes 'Other',
    # giving 11 classes in total.
    top_categories = set(df['category'].value_counts().head(10).index)
    df['new_categories'] = [
        category if category in top_categories else 'Other'
        for category in df['category']
    ]

    # Drop literal double quotes left in the titles (quoting is disabled).
    X = [title.replace('"', '') for title in df['product_title'].tolist()]

    # Decode only byte strings: Python 3 text has no .decode() method, while
    # Python 2 ``str`` is ``bytes`` and still gets decoded as before.
    newlist = [
        title.decode('utf8') if isinstance(title, bytes) else title
        for title in X
    ]

    y = df['new_categories'].tolist()
    model = build_and_evaluate(newlist, y)
有人能告诉我如何在上述代码中使用 chi2 吗?谢谢!
答案 0 :(得分:1)
像 NLTKPreprocessor 一样,把它声明为管道(Pipeline)中的一个步骤,但要放在分类器之前(即向量化器之后)。
按以下方式声明您的管道:
model = Pipeline([
    # Step 1: tokenize, filter and lemmatize the raw titles.
    ('preprocessor', NLTKPreprocessor()),
    # Step 2: TF-IDF over unigrams and bigrams of the pre-tokenized input.
    ('vectorizer', TfidfVectorizer(
        tokenizer=identity,
        preprocessor=None,
        ngram_range=(1, 2),
        min_df=4,
        lowercase=False,
    )),
    # Step 3: keep the k features with the highest chi-squared score.
    ('selector', SelectKBest(chi2, k=10)),
    # Step 4: the estimator trained on the selected features.
    ('classifier', classifier),
])
通过参数 k 尝试不同的所选特征数量。我在这里使用了 10,但你需要对它进行调优,例如使用 GridSearchCV。