Complete sklearn pipeline example

Date: 2017-03-25 07:05:06

Tags: python pandas scikit-learn pipeline

I am trying to use an sklearn pipeline, but the various tutorials I have tried online have not helped me.

import pandas as pd 
import numpy as np
import json
import seaborn as sb 
from sklearn.metrics import log_loss
from sklearn import linear_model 
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from scipy.stats import zscore
from Transformers import TextTransformer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
%matplotlib inline
df = pd.read_json('data/train.json', encoding = 'utf-8', dtype = {'description': str})
len(df)
df = df[['description', 'interest_level']]
from sklearn.pipeline import Pipeline, FeatureUnion
a = TextTransformer('description', max_features=50)
b = TextTransformer('features', max_features=10)
pipeline = Pipeline([
    ('description', a),  # can pass in either a pipeline
    # ('features', b),   # or a transformer
    ('clf', SVC())       # classifier
])
pipeline.fit(df[:,'interest_level'])

My text transformer:

from sklearn.base import BaseEstimator, TransformerMixin
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk


class TextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, column, max_features=5000):
        self.tfidfVectorizer = TfidfVectorizer(use_idf=False, stop_words='english',
                                               tokenizer=self._custom_tokenizer, analyzer='word',
                                               max_features=max_features)
        self._vectorizer = None
        self._column = column

    def _custom_tokenizer(self, string):
        # string = re.sub('^[\w]', '', string)
        tokens = nltk.word_tokenize(string)
        cleaned = [x if not x.isdigit() else '_NUM_' for x in tokens]
        return [str(x.encode('utf-8')) for x in cleaned if (x.isalpha() or x == '_NUM_')]

    def _clean_html_tags(self, content):
        return BeautifulSoup(content, 'lxml').text

    def fit(self, df):
        self._vectorizer = self.tfidfVectorizer.fit(df[self._column].apply(self._clean_html_tags))
        return self

    def transform(self, df):
        return self._vectorizer.transform(df[self._column]).todense()
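
(For context, the transformer a defined above does work standalone; a quick sketch, assuming the df loaded earlier:)

a.fit(df)                   # learns a vocabulary from the cleaned description column
features = a.transform(df)  # dense term-frequency matrix of shape (n_samples, 50)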

However, I can't seem to get it right. It keeps throwing this exception in the IPython notebook:
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-11-b3788282dc5c> in <module>()
      8     ('clf', SVC())  # classifier
      9 ])
---> 10 pipeline.fit(df[:,'interest_level'])

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1382         """Return the cached item, item represents a label indexer."""
   1383         cache = self._item_cache
-> 1384         res = cache.get(item)
   1385         if res is None:
   1386             values = self._data.get(item)

TypeError: unhashable type

Data description

    description interest_level
10  A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...   medium
10000       low
100004  Top Top West Village location, beautiful Pre-w...   high
100007  Building Amenities - Garage - Garden - fitness...   low
100013  Beautifully renovated 3 bedroom flex 4 bedroom...   low

interest_level will be my target variable.

3 Answers:

Answer 0 (score: 2)

You are fitting on only one column (df[:, 'interest_level']), but your first step (the transformer a: TextTransformer) is trying to access the description column.
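
(A minimal sketch of the intended call, assuming the pipeline from the question: pass the feature DataFrame as X and the target column as y. Also note that df[:, 'interest_level'] is not valid pandas indexing; plain [] receives an unhashable tuple, which is exactly the TypeError above.)

X = df[['description']]   # the column the TextTransformer reads
y = df['interest_level']  # the target variable
# Note: for Pipeline to pass y through, TextTransformer.fit must accept it,
# e.g. def fit(self, df, y=None).
pipeline.fit(X, y)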

Answer 1 (score: 2)

Writing pipelines is easier with a decorator, see example

Your code would look like this:

import re
import nltk
from bs4 import BeautifulSoup
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

@SKTransform
def clean_num(txt):
    return re.compile('\\d+').sub('_NUM_', txt)

@SKTransform
def clean_tags(content):
    return BeautifulSoup(content, 'lxml').text

ppl = Pipeline([clean_tags,
                clean_num,
                TfidfVectorizer(use_idf=False, stop_words='english',
                                tokenizer=nltk.word_tokenize, analyzer='word',
                                max_features=50),  # e.g. 50, as in the question
      ])
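
(SKTransform itself comes from the linked example. A minimal sketch of such a decorator, under the assumption that it simply wraps a plain function as a stateless transformer, might look like this. Note that a stock sklearn Pipeline also expects (name, step) tuples, so each step would be named, e.g. ('clean_tags', clean_tags).)

from sklearn.base import BaseEstimator, TransformerMixin

def SKTransform(func):
    # Hypothetical sketch, not the linked implementation: wrap an
    # element-wise function as a fit-less sklearn transformer.
    class FuncTransformer(BaseEstimator, TransformerMixin):
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            return [func(x) for x in X]
    return FuncTransformer()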

Answer 2 (score: 1)

A simple example of an sklearn pipeline that handles all kinds of feature distributions (categorical, nominal, ordinal, etc.).

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# All data encoders
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
std = StandardScaler()

# To fill the missing values
imp_mean = SimpleImputer(strategy='most_frequent')

X_dense_data = imp_mean.fit_transform(X)
X = pd.DataFrame(X_dense_data, columns=X.columns.values.tolist())



# All columns by distribution
ohe_column_categorical_feature = ['race', 'sex', 'age group']
std_column_numeric_feature = ['height', 'weight', 'temperature', 'blood glucose']

# Numeric feature transformer
feature_numeric_transformer = Pipeline(steps=[
    ('scaler_data', std)
])

# Categorical feature transformer
feature_categorical_transformer = Pipeline(steps=[
    ('onehot', ohe)
])

# Column transformer to transform the value of each feature
preprocessor_feature = ColumnTransformer(
    transformers=[
        ('num', feature_numeric_transformer, std_column_numeric_feature),
        ('cat', feature_categorical_transformer, ohe_column_categorical_feature)
    ], remainder='drop'
)
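
(To use this preprocessor end to end, it can be chained with any estimator. A sketch, assuming X and y are the feature DataFrame and target from above; LogisticRegression is just one example classifier:)

from sklearn.linear_model import LogisticRegression

clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_feature),  # scale numeric, one-hot categorical
    ('classifier', LogisticRegression())     # any sklearn estimator fits here
])

clf_pipeline.fit(X, y)
predictions = clf_pipeline.predict(X)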

Make sure your data has no missing values. If it does, here is an example of filling them with sklearn's SimpleImputer.

The imputation strategies:

If 'mean', replace missing values using the mean along each column. Can only be used with numeric data.

If 'median', replace missing values using the median along each column. Can only be used with numeric data.

If 'most_frequent', replace missing values using the most frequent value along each column. Can be used with strings or numeric data.

If 'constant', replace missing values with fill_value. Can be used with strings or numeric data.

# To fill the missing values
imp_mean = SimpleImputer(strategy='most_frequent')

X_dense_data = imp_mean.fit_transform(X)
X = pd.DataFrame(X_dense_data, columns=X.columns.values.tolist())