Using GridSearchCV with KNN

Asked: 2018-07-22 06:19:44

Tags: python machine-learning scikit-learn

I am trying to solve a Twitter sentiment analysis problem. I am using this code:

print()
print("Importing")
print()
#IMPORTS
from __future__ import print_function
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split #deprecated; moved to sklearn.model_selection in scikit-learn >= 0.20
from sklearn.model_selection import GridSearchCV



def getting_data(train_dataset_name, test_dataset_name):
    print()
    print("Getting the data")
    print()
    #Parameter names are self explanatory - file names for datasets
    #This assumes you are executing this code statement from inside the directory with your datasets
    train = pd.read_csv(train_dataset_name).values
    train_y = train[:,1]
    train_x = train[:,2]

    test = pd.read_csv(test_dataset_name).values
    test = test[:,1]
    test = np.reshape(test,(test.shape[0],1))

    return train_x,train_y,test



def bagOfWords(test,train_x):
    print()
    print("Creating bag of words model")
    print()
    #Creates and returns bag-of-words versions of the test and train x

    #Train transformations
    corpus_train = []
    for i in range(0,train_x.shape[0]):
        review = re.sub('[^a-zA-Z]', ' ', train_x[i])
        review = review.lower().split()
        ps = PorterStemmer()
        review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
        review = ' '.join(review)
        corpus_train.append(review)

    #Test transformations
    corpus_test = []
    for i in range(0,test.shape[0]):
        review = re.sub('[^a-zA-Z]', ' ', test[i][0])
        review = review.lower().split()
        ps = PorterStemmer()
        review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
        review = ' '.join(review)
        corpus_test.append(review)

    return corpus_train,corpus_test



def dimensionality_reduction(corpus_train,corpus_test, return_ratio, components):
    print()
    print("Performing Dimensionality Reduction")
    print()
    #CountVectorizer
    cv = CountVectorizer(max_features = 1500)
    train_x = cv.fit_transform(corpus_train).toarray()

    #PCA
    pca = PCA(n_components=components)
    train_x = pca.fit_transform(train_x)
    explained_variance = pca.explained_variance_ratio_


    test = cv.transform(corpus_test).toarray()
    test = pca.transform(test)
    test = test.astype('float32')

    if (return_ratio):
        return train_x,test, explained_variance
    else:
        return train_x,test



def getOptimumParameters(train_x,train_y, return_stats):
    print()
    print("Getting optimum parameters")
    print("This optimization algorithm may take a while, so please be patient.")
    print("Please do not do other tasks while this runs.")
    print()
    train_x = train_x.astype('float32')
    train_y = train_y.astype('float32')

    classifier = KNeighborsClassifier() 
    classifier.fit(train_x,train_y)

    #For the sake of my program I used my own parameter lists.
    #If you use this code, please change them
    neighbor_list = [1,3,6,9,12,15,18,21,25]
    algorithm_list = ['brute', 'kd_tree', 'ball_tree']
    weights_list = ['uniform', 'distance']
    p_list = [1] #p_list = [1,2,3,4]
    leaf_list = [10,15,20,25,30,35,40,45,50]
    parameters = [{'n_neighbors':neighbor_list, 'weights':weights_list, 'algorithm':algorithm_list, 'p':p_list, 'leaf_size':leaf_list}]


    clf = GridSearchCV(estimator=classifier, param_grid = parameters, cv=5,refit=True, error_score=0, n_jobs = -1)
    clf = clf.fit(train_x,train_y)

    bc = clf.best_score_
    bp = clf.best_params_

    if return_stats:
        return clf, bc, bp
    else:
        return clf



def predictions(classifier, train_x, train_y, test, ratio):
    print()
    print("Making predictions")
    print()
    #Changing types to work with a classifier
    train_x= train_x.astype('float32')
    train_y = train_y.astype('float32')

    #Splitting training set into a training + dev set
    #(note: dev_x and dev_y are currently unused by the rest of this function)
    train_x,dev_x,train_y,dev_y = train_test_split(train_x,train_y,test_size = ratio, random_state=0)
    #Making predictions
    test = test.astype('float32')
    pred = classifier.predict(test)
    return pred



def convertPredToCsv(pred, csv_name):
    #Writes the predictions to a one-column csv with an 'id' index
    df = pd.DataFrame(pred)
    df.index.name = 'id'
    df.columns = ['label']

    df.to_csv(csv_name)




def main():
    #Retrieving the data
    train_x,train_y,test = getting_data('train.csv', 'test_tweets.csv')
    #Constructing Bag of words model
    corpus_train,corpus_test = bagOfWords(test,train_x)
    #Performing Dimensionality Reduction
    train_x,test = dimensionality_reduction(corpus_train,corpus_test,False,350)
    #Getting the optimum classifier
    classifier= getOptimumParameters(train_x,train_y, False)
    #Predicting + converting to csv
    pred = predictions(classifier, train_x, train_y, test, 0.1)
    convertPredToCsv(pred, 'predictions.csv')


if __name__ == "__main__":
    main()

Every time execution reaches the getOptimumParameters function, I run into a lot of errors. Some report an AttributeError, but for most of them I cannot even find an error name; I think most of the other errors are just cascading toward the AttributeError. I cannot figure out why this error occurs. I know something is wrong with the GridSearch call, but I do not know whether the problem is in the parameters (I have triple-checked them and cannot find anything wrong) or somewhere else. Any help is greatly appreciated. Thanks.

D:\Anaconda\lib\site-packages\numpy\core\fromnumeric.py in _wrapfunc(obj=array([[ 0.        , 30.70562651, 27.84020028, .... 38.11465899,
        25.22553572,  0.        ]]), method='argpartition', *args=(0,), **kwds={'axis': 1, 'kind': 'introselect', 'order': None})
     47     return result
     48 
     49 
     50 def _wrapfunc(obj, method, *args, **kwds):
     51     try:
---> 52         return getattr(obj, method)(*args, **kwds)
        obj = array([[ 0.        , 30.70562651, 27.84020028, .... 38.11465899,
        25.22553572,  0.        ]])
        method = 'argpartition'
        args = (0,)
        kwds = {'axis': 1, 'kind': 'introselect', 'order': None}
     53 
     54     # An AttributeError occurs if the object does not have
     55     # such a method in its class.
     56 

MemoryError: 
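Reading the traceback, the failure happens inside NumPy's argpartition while KNN computes brute-force neighbors, so one plausible cause (my assumption, not a confirmed diagnosis) is that each parallel grid-search worker holds its own pairwise distance matrix and together they exhaust memory. A minimal sketch of the same call with parallelism capped, in case that is the issue:

#Hedged sketch: identical grid search, but with fewer workers and dispatch
#capped so fewer jobs hold their distance matrices in memory at once.
#n_jobs=2 and pre_dispatch='2*n_jobs' are illustrative values, not tuned ones.
clf = GridSearchCV(estimator=KNeighborsClassifier(),
                   param_grid=parameters,
                   cv=5,
                   refit=True,
                   error_score=0,
                   n_jobs=2,                #instead of n_jobs=-1
                   pre_dispatch='2*n_jobs') #limit simultaneously queued jobs
clf = clf.fit(train_x, train_y)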

The data comes from an Analytics Vidhya problem. Here is the link to download the training data (it is a Dropbox link): https://www.dropbox.com/s/w4tagiewcuoxgkt/train.csv?dl=0

Here is the test data link: https://www.dropbox.com/s/qiitwlpnkbs2c3m/test_tweets.csv?dl=0

Thanks.

2 Answers:

Answer 0 (score: 0):

Have you updated your modules?

This is strange, because the following code runs on my MacBook without any errors:

print()
print("Importing")
print()
#IMPORTS
from __future__ import print_function
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split #deprecated; moved to sklearn.model_selection in scikit-learn >= 0.20
from sklearn.model_selection import GridSearchCV



def getting_data(train_dataset_name, test_dataset_name):
    print()
    print("Getting the data")
    print()
    #Parameter names are self explanatory - file names for datasets
    #This assumes you are executing this code statement from inside the directory with your datasets
    train = pd.read_csv(train_dataset_name).values
    train_y = train[:,1]
    train_x = train[:,2]

    test = pd.read_csv(test_dataset_name).values
    test = test[:,1]
    test = np.reshape(test,(test.shape[0],1))

    return train_x,train_y,test



def bagOfWords(test,train_x):
    print()
    print("Creating bag of words model")
    print()
    #Creates and returns bag-of-words versions of the test and train x

    #Train transformations
    corpus_train = []
    for i in range(0,train_x.shape[0]):
        review = re.sub('[^a-zA-Z]', ' ', train_x[i])
        review = review.lower().split()
        ps = PorterStemmer()
        review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
        review = ' '.join(review)
        corpus_train.append(review)

    #Test transformations
    corpus_test = []
    for i in range(0,test.shape[0]):
        review = re.sub('[^a-zA-Z]', ' ', test[i][0])
        review = review.lower().split()
        ps = PorterStemmer()
        review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
        review = ' '.join(review)
        corpus_test.append(review)

    return corpus_train,corpus_test



def dimensionality_reduction(corpus_train,corpus_test, return_ratio, components):
    print()
    print("Performing Dimensionality Reduction")
    print()
    #CountVectorizer
    cv = CountVectorizer(max_features = 1500)
    train_x = cv.fit_transform(corpus_train).toarray()

    #PCA
    pca = PCA(n_components=components)
    train_x = pca.fit_transform(train_x)
    explained_variance = pca.explained_variance_ratio_


    test = cv.transform(corpus_test).toarray()
    test = pca.transform(test)
    test = test.astype('float32')

    if (return_ratio):
        return train_x,test, explained_variance
    else:
        return train_x,test



def getOptimumParameters(train_x,train_y, return_stats):
    print()
    print("Getting optimum parameters")
    print("This optimization algorithm may take a while, so please be patient.")
    print("Please do not do other tasks while this runs.")
    print()
    train_x = train_x.astype('float32')
    train_y = train_y.astype('float32')

    classifier = KNeighborsClassifier() 
    #classifier.fit(train_x,train_y)

    #For the sake of my program I used my own parameter lists.
    #If you use this code, please change them
    neighbor_list = [1]
    algorithm_list = ['brute', 'kd_tree', 'ball_tree']
    weights_list = ['uniform', 'distance']
    p_list = [1] #p_list = [1,2,3,4]
    leaf_list = [10]
    parameters = [{'n_neighbors':neighbor_list, 'weights':weights_list, 'algorithm':algorithm_list, 'p':p_list, 'leaf_size':leaf_list}]


    clf = GridSearchCV(estimator=classifier, param_grid = parameters, cv=5,refit=True, error_score=0, n_jobs = -1)
    clf = clf.fit(train_x,train_y)

    bc = clf.best_score_
    bp = clf.best_params_

    if return_stats:
        return clf, bc, bp
    else:
        return clf



def predictions(classifier, train_x, train_y, test, ratio):
    print()
    print("Making predictions")
    print()
    #Changing types to work with a classifier
    train_x= train_x.astype('float32')
    train_y = train_y.astype('float32')

    #Splitting training set into a training + dev set
    #(note: dev_x and dev_y are currently unused by the rest of this function)
    train_x,dev_x,train_y,dev_y = train_test_split(train_x,train_y,test_size = ratio, random_state=0)
    #Making predictions
    test = test.astype('float32')
    pred = classifier.predict(test)
    return pred



def convertPredToCsv(pred, csv_name):
    #Writes the predictions to a one-column csv with an 'id' index
    df = pd.DataFrame(pred)
    df.index.name = 'id'
    df.columns = ['label']

    df.to_csv(csv_name)



def main():
    #Retrieving the data
    train_x,train_y,test = getting_data('train.csv', 'test_tweets.csv')
    #Constructing Bag of words model
    corpus_train,corpus_test = bagOfWords(test,train_x)
    #Performing Dimensionality Reduction
    train_x,test = dimensionality_reduction(corpus_train,corpus_test,False,350)
    #Getting the optimum classifier
    classifier= getOptimumParameters(train_x,train_y, False)
    #Predicting + converting to csv
    pred = predictions(classifier, train_x, train_y, test, 0.1)
    convertPredToCsv(pred, 'predictions.csv')


if __name__ == "__main__":
    main()

My versions:

import sklearn

print(sklearn.__version__)
#0.19.1


import nltk

print(nltk.__version__)
#3.3 

Answer 1 (score: 0):

I know it has been a while, so sorry for the late answer.

Just wanted to let everyone know that, for long grid searches, at least for Windows users, the import you need is not

sklearn.model_selection.GridSearchCV

but actually

sklearn.grid_search.GridSearchCV

The former almost always raises a memory error, while the latter works fine even for long grid searches.
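A minimal sketch of that import switch, assuming the script may also run on newer scikit-learn (sklearn.grid_search was deprecated in 0.18 and removed in 0.20, so a fallback keeps it importable either way):

#Hedged sketch: try the legacy module this answer recommends first, then
#fall back to the modern location on scikit-learn versions that removed it
try:
    from sklearn.grid_search import GridSearchCV      #legacy (<= 0.19)
except ImportError:
    from sklearn.model_selection import GridSearchCV  #modern (>= 0.20)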