我正在尝试使用sklearn管道。但我在网上尝试了各种教程,它没有帮助我。
import pandas as pd
import numpy as np
import json
import seaborn as sb
from sklearn.metrics import log_loss
from sklearn import linear_model
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from scipy.stats import zscore
from Transformers import TextTransformer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
%matplotlib inline
df = pd.read_json('data/train.json', encoding = 'utf-8', dtype = {'description': str})
len(df)
df = df[['description', 'interest_level']]
from sklearn.pipeline import Pipeline, FeatureUnion
a = TextTransformer('description', max_features=50)
b = TextTransformer('features', max_features=10)
pipeline = Pipeline([
('description',a ), # can pass in either a pipeline
#('features',b ) # or a transformer
J ('clf', SVC()) # classifier
])
pipeline.fit(df[:,'interest_level'])
我的文字转换器
from sklearn.base import BaseEstimator, TransformerMixin
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
class TextTransformer(BaseEstimator, TransformerMixin):
def __init__(self, column, max_features=5000):
self.tfidfVectorizer = TfidfVectorizer(use_idf=False, stop_words='english',
tokenizer=self._custom_tokenizer, analyzer='word',
max_features=max_features)
self._vectorizer = None
self._column = column
def _custom_tokenizer(self, string):
# string = re.sub('^[\w]', '', string)
tokens = nltk.word_tokenize(string)
cleaned = [x if not x.isdigit() else '_NUM_' for x in tokens]
return [str(x.encode('utf-8')) for x in cleaned if (x.isalpha() or x == '_NUM_')]
def _clean_html_tags(self, content):
return BeautifulSoup(content, 'lxml').text
def fit(self, df):
self._vectorizer = self.tfidfVectorizer.fit(df[self._column].apply(self._clean_html_tags))
return self
def transform(self, df):
return self._vectorizer.transform(df[self._column]).todense()
然而,我似乎无法做对。它继续在ipython notebook
中抛出此异常---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-11-b3788282dc5c> in <module>()
8 ('clf', SVC()) # classifier
9 ])
---> 10 pipeline.fit(df[:,'interest_level'])
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
2057 return self._getitem_multilevel(key)
2058 else:
-> 2059 return self._getitem_column(key)
2060
2061 def _getitem_column(self, key):
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
2064 # get column
2065 if self.columns.is_unique:
-> 2066 return self._get_item_cache(key)
2067
2068 # duplicate columns & possible reduce dimensionality
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
1382 """Return the cached item, item represents a label indexer."""
1383 cache = self._item_cache
-> 1384 res = cache.get(item)
1385 if res is None:
1386 values = self._data.get(item)
TypeError: unhashable type
数据描述
description interest_level
10 A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ... medium
10000 low
100004 Top Top West Village location, beautiful Pre-w... high
100007 Building Amenities - Garage - Garden - fitness... low
100013 Beautifully renovated 3 bedroom flex 4 bedroom... low
兴趣水平将是我的目标变量
答案 0 :(得分:2)
您只适合一列(enter code
hereprivate void mLoadData() {
mUtil.getBaseClassService(this,"").getContry(currentPage,new Callback<JsonArray>() {
@Override
public void success(JsonArray jsonArray, Response response) {
try {
if (jsonArrayList.size() > 2) {
if (jsonArrayList.get(jsonArrayList.size() - 2).equals(jsonArrayList.get(jsonArrayList.size() - 1))) {
Log.d(TAG, "success: jsonarray eaqual" + "equal");
mDuplicateValuesStarts = true;
return;
}
}
jsonArrayList.add(jsonArray);
if(!mDuplicateValuesStarts) {
for (int i = 0; i < jsonArray.size(); i++) {
CountriesModel countriesModel = new CountriesModel();
JsonObject jsonObject = jsonArray.get(i).getAsJsonObject();
countriesModel.setCountry(jsonObject.get("country").getAsString());
countriesModel.setId(jsonObject.get("id").getAsInt());
countriesModelList.add(countriesModel);
// countryList.add(jsonObject.get("country").getAsString());
}
// Log.d(TAG, "success: before"+countriesModelList.size());
filetDummyContriesListFromAPI(countriesModelList);
// Log.d(TAG, "success: after"+countriesModelList.size());
adapter.notifyDataSetChanged();
listview_ll.setVisibility(View.VISIBLE);
}
}catch (Exception e){
Log.d(TAG, "success: "+e.getMessage());
}
}
@Override
public void failure(RetrofitError error) {
Toast.makeText(SignupActivity.this, "Internet connection error!", Toast.LENGTH_SHORT).show();
Log.d(TAG, "failure: retro"+error.getMessage());
}
});
}
mListView.setOnScrollListener(new AbsListView.OnScrollListener() {
@Override
public void onScrollStateChanged(AbsListView absListView, int i) {
}
@Override
public void onScroll(AbsListView absListView, int firstVisibleItem, int visibleItemCount, int totalItemCount) {
if (loading) {
if (totalItemCount > previousTotal) {
loading = false;
previousTotal = totalItemCount;
currentPage++;
}
}
if (!loading && (totalItemCount - visibleItemCount) <= (firstVisibleItem + visibleThreshold)) {
// I load the next page of gigs using a background task,
// but you can call any function here.
// new LoadGigsTask().execute(currentPage + 1);
if(!mDuplicateValuesStarts)
mLoadData();
loading = true;
}
}
});
private void filetDummyContriesListFromAPI(List<CountriesModel> countriesModelList) {
// Log.d(TAG, "filetDummyContriesListFromAPI: before2"+countriesModelList.size());
TreeSet<CountriesModel> myset = new TreeSet<>(countriesModelList);
// Log.d(TAG, "filetDummyContriesListFromAPI: after"+countriesModelList.size());
this.countriesModelList.clear();
this.countriesModelList.addAll(myset);
}
),但您的第一步(变换器df[:, 'interest_level]
)正在尝试访问列a: TextTransformer
。
答案 1 :(得分:2)
使用装饰器可以更轻松地编写管道,请参见example
您的代码应如下所示:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
@SKTransform
def clean_num( txt):
return re.compile('\\d+').sub('_NUM_', txt)
@SKTransform
def clean_tags(content):
return BeautifulSoup(content, 'lxml').text
ppl = Pipeline([clean_tags,
clean_num,
TfidfVectorizer(use_idf=False, stop_words='english',tokenizer=nltk.word_tokenize,analyzer='word',max_features=max_features),
])
答案 2 :(得分:1)
用于所有特征分布(如分类,名义,序数等)的sklearn管道的简单示例。
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# All data encoders
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
std = StandardScaler()
# To fill the missing values
imp_mean = SimpleImputer(strategy='most_frequent')
X_dense_data = imp_mean.fit_transform(X)
X = pd.DataFrame(X_dense_data, columns=X.columns.values.tolist())
# All columns distribution
ohe_column_catagorical_feature = ['race', 'sex', 'age group']
std_column_numeric_feature = ['height', 'weight', 'temperature', 'blood glucose']
# Numaric feature transformer
feature_numeric_transformer = Pipeline(steps=[
('scaler_data', std)
])
# catagorical feature transformer
catagorical_numeric_transformer = Pipeline(steps=[
('onehot', ohe)
])
# column transformer to transform the value of each feature
preprocessor_feature = ColumnTransformer(
transformers=[
('num', feature_numeric_transformer, std_column_numeric_feature),
('cat', catagorical_numeric_transformer, ohe_column_catagorical_feature)
], remainder='drop'
)
确保您的数据值已满。如果不是这样,则是使用sklear SimpleImputer填充无效的示例
插补策略。
如果为“均值”,则使用每列中的均值替换缺失值。只能与数字数据一起使用。
如果为“中位数”,则使用每列中的中位数替换缺失值。只能与数字数据一起使用。
如果为“ most_frequent”,则使用每一列中的最频繁值替换“ missing”。可以与字符串或数字数据一起使用。
如果为“常量”,则将缺失的值替换为fill_value。可以与字符串或数字数据一起使用。
# To fill the missing values
imp_mean = SimpleImputer(strategy='most_frequent')
X_dense_data = imp_mean.fit_transform(X)