# Imports required by the code below
import numpy as np
import pandas as pd
from sklearn import preprocessing, svm
from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier, RandomForestClassifier)
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

clfs = {'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm='SAMME', n_estimators=200),
        'LR': LogisticRegression(penalty='l1', C=1e5, solver='liblinear'),  # liblinear is required for L1
        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'NB': GaussianNB(),
        'DT': DecisionTreeClassifier()}
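
# A minimal usage sketch (not part of the original code): fit each classifier
# in the dictionary and print its held-out accuracy. X_train, X_test, y_train,
# and y_test are assumed to come from a prior train_test_split call.
def evaluate_clfs(clfs, X_train, y_train, X_test, y_test):
    for name, clf in clfs.items():
        clf.fit(X_train, y_train)
        print(name + ' accuracy: ' + str(clf.score(X_test, y_test)))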
class Model:
    def __init__(self, dataSet, dependentVar, doFeatureSelection=True, doPCA=False, nComponents=10):
        # One-hot encode every categorical (object-dtype) column. Iterating over
        # a snapshot of the column names avoids the index shifting that occurs
        # when columns are dropped and appended mid-loop.
        for col in dataSet.select_dtypes(include='object').columns:
            if col == dependentVar:
                continue  # never encode the target column itself
            print('Encoding feature "' + col + '" ...')
            print('Old dataset shape: ' + str(dataSet.shape))
            temp = pd.get_dummies(dataSet[col], prefix=col)
            dataSet = pd.concat([dataSet, temp], axis=1).drop(col, axis=1)
            print('New dataset shape: ' + str(dataSet.shape))
            # Alternative: integer-encode in place with
            # dataSet[col] = np.unique(dataSet[col], return_inverse=True)[1]
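        # Note: pd.get_dummies(dataSet) would encode every object column in
        # one call; the explicit loop is kept for its per-feature logging.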
        # Set the dependent variable (y) to the appropriate column
        y = dataSet.loc[:, dependentVar]
        # Encode the labels as integers for scikit-learn
        # (may be redundant when the target is already numeric)
        labels = preprocessing.LabelEncoder().fit_transform(y)
        # Remove the dependent variable from the training matrix
        X = dataSet.drop(dependentVar, axis=1).values
        # Perform entropy-based feature selection
        if doFeatureSelection:
            print('Performing Feature Selection:')
            print('Shape of dataset before feature selection: ' + str(X.shape))
            # Tree estimators no longer expose .transform(); SelectFromModel
            # wraps the fitted tree and keeps the features whose importance
            # exceeds the default threshold (the mean importance).
            clf = DecisionTreeClassifier(criterion='entropy')
            X = SelectFromModel(clf.fit(X, y), prefit=True).transform(X)
            print('Shape of dataset after feature selection: ' + str(X.shape) + '\n')
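        # Hedged sketch (assumption): the doPCA / nComponents parameters are
        # otherwise unused in this excerpt, so a PCA branch presumably belongs
        # here; this is one plausible implementation, not the original code.
        elif doPCA:
            from sklearn.decomposition import PCA
            print('Performing PCA:')
            X = PCA(n_components=nComponents).fit_transform(X)
            print('Shape of dataset after PCA: ' + str(X.shape) + '\n')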