我正在尝试解决Twitter情绪分析问题。我正在使用代码:
print()
print("Importing")
print()
#IMPORTS
from __future__ import print_function
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import GridSearchCV
def getting_data(train_dataset_name, test_dataset_name):
print()
print("Getting the data")
print()
#Parameter names are self explanatory - file names for datasets
#This assumes you are executing this code statement from inside the directory with your datasets
train = pd.read_csv(train_dataset_name).values
train_y = train[:,1]
train_x = train[:,2]
test = pd.read_csv(test_dataset_name).values
test = test[:,1]
test = np.reshape(test,(test.shape[0],1))
return train_x,train_y,test
def bagOfWords(test,train_x):
print()
print("Creating bag of words model")
print()
#Creates and returns bag-of-words versions of the test and train x
#Train transformations
corpus_train = []
for i in range(0,train_x.shape[0]):
review = re.sub('[^a-zA-Z]', ' ', train_x[i])
review = review.lower().split()
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review = ' '.join(review)
corpus_train.append(review)
#Test transformations
corpus_test = []
for i in range(0,test.shape[0]):
review = re.sub('[^a-zA-Z]', ' ', test[i][0])
review = review.lower().split()
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review = ' '.join(review)
corpus_test.append(review)
return corpus_train,corpus_test
def dimensionality_reduction(corpus_train,corpus_test, return_ratio, components):
print()
print("Performing Dimensionality Reduction")
print()
#CountVectorizer
cv = CountVectorizer(max_features = 1500)
train_x = cv.fit_transform(corpus_train).toarray()
#PCA
pca = PCA(n_components=components)
train_x = pca.fit_transform(train_x)
explained_variance = pca.explained_variance_ratio_
test = cv.transform(corpus_test).toarray()
test = pca.transform(test)
test = test.astype('float32')
if (return_ratio):
return train_x,test, explained_variance
else:
return train_x,test
def getOptimumParameters(train_x,train_y, return_stats):
print()
print("Getting optimum parameters")
print("This optimization algorithm may take a while, so please be patient.")
print("Please do not do other tasks while this runs.")
print()
train_x = train_x.astype('float32')
train_y = train_y.astype('float32')
classifier = KNeighborsClassifier()
classifier.fit(train_x,train_y)
#For the sake of my program I used my own parameter lists.
#If you use this code, please change them
neighbor_list = [1,3,6,9,12,15,18,21,25]
algorithm_list = ['brute', 'kd_tree', 'ball_tree']
weights_list = ['uniform', 'distance']
p_list = [1] #p_list = [1,2,3,4]
leaf_list = [10,15,20,25,30,35,40,45,50]
parameters = [{'n_neighbors':neighbor_list, 'weights':weights_list, 'algorithm':algorithm_list, 'p':p_list, 'leaf_size':leaf_list}]
clf = GridSearchCV(estimator=classifier, param_grid = parameters, cv=5,refit=True, error_score=0, n_jobs = -1)
clf = clf.fit(train_x,train_y)
bc = clf.best_score_
bp = clf.best_params_
if return_stats:
return clf, bc, bp
else:
return clf
def predictions(classifier, train_x, train_y, test, ratio):
print()
print("Making predictions")
print()
#Changing types to work with a classifier
train_x= train_x.astype('float32')
train_y = train_y.astype('float32')
#Splitting training set into a training + dev set
train_x,dev_x,train_y,dev_y = train_test_split(train_x,train_y,test_size = ratio, random_state=0)
#Making predictions
test = test.astype('float32')
pred = classifier.predict(test)
return pred
def convertPredToCsv(pred, csv_name):
df = pd.DataFrame(pred)
df.index.name = 'id'
df.columns = ['label']
df.to_csv("predictions.csv")
def main():
#Retrieving the data
train_x,train_y,test = getting_data('train.csv', 'test_tweets.csv')
#Constructing Bag of words model
corpus_train,corpus_test = bagOfWords(test,train_x)
#Performing Dimensionality Reduction
train_x,test = dimensionality_reduction(corpus_train,corpus_test,False,350)
#Getting the optimum classifier
classifier= getOptimumParameters(train_x,train_y, False)
#Predicting + converting to csv
pred = predictions(classifier, train_x, train_y, test, 0.1)
convertPredToCsv(pred, 'predictions.csv')
if __name__ == "__main__":
main()
每次涉及到getOptimumParameters函数时,都会遇到很多错误。有人说AttributeError,但是对于大多数错误,我找不到错误名称。我认为其他大多数错误都是为了将我引向AttributeError。我无法弄清楚为什么会发生此错误。我知道GridSearch出了点问题,但是我不知道参数是否出了点问题(我对其进行了三遍检查,找不到任何问题),或者是否还有其他问题。任何帮助是极大的赞赏。谢谢。
D:\Anaconda\lib\site-packages\numpy\core\fromnumeric.py in _wrapfunc(obj=array([[ 0. , 30.70562651, 27.84020028, .... 38.11465899,
25.22553572, 0. ]]), method='argpartition', *args=(0,), **kwds={'axis': 1, 'kind': 'introselect', 'order': None})
47 return result
48
49
50 def _wrapfunc(obj, method, *args, **kwds):
51 try:
---> 52 return getattr(obj, method)(*args, **kwds)
obj = array([[ 0. , 30.70562651, 27.84020028, .... 38.11465899,
25.22553572, 0. ]])
method = 'argpartition'
args = (0,)
kwds = {'axis': 1, 'kind': 'introselect', 'order': None}
53
54 # An AttributeError occurs if the object does not have
55 # such a method in its class.
56
MemoryError:
数据来自我的analyticsvidhya问题。这是下载培训数据的链接-这是一个保管箱链接。 https://www.dropbox.com/s/w4tagiewcuoxgkt/train.csv?dl=0
这是测试数据链接: https://www.dropbox.com/s/qiitwlpnkbs2c3m/test_tweets.csv?dl=0
谢谢。
答案 0 :(得分:0)
您是否更新了模块?
这很奇怪,因为以下代码在我的macbook上运行时没有任何错误:
print()
print("Importing")
print()
#IMPORTS
from __future__ import print_function
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import GridSearchCV
def getting_data(train_dataset_name, test_dataset_name):
print()
print("Getting the data")
print()
#Parameter names are self explanatory - file names for datasets
#This assumes you are executing this code statement from inside the directory with your datasets
train = pd.read_csv(train_dataset_name).values
train_y = train[:,1]
train_x = train[:,2]
test = pd.read_csv(test_dataset_name).values
test = test[:,1]
test = np.reshape(test,(test.shape[0],1))
return train_x,train_y,test
def bagOfWords(test,train_x):
print()
print("Creating bag of words model")
print()
#Creates and returns bag-of-words versions of the test and train x
#Train transformations
corpus_train = []
for i in range(0,train_x.shape[0]):
review = re.sub('[^a-zA-Z]', ' ', train_x[i])
review = review.lower().split()
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review = ' '.join(review)
corpus_train.append(review)
#Test transformations
corpus_test = []
for i in range(0,test.shape[0]):
review = re.sub('[^a-zA-Z]', ' ', test[i][0])
review = review.lower().split()
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review = ' '.join(review)
corpus_test.append(review)
return corpus_train,corpus_test
def dimensionality_reduction(corpus_train,corpus_test, return_ratio, components):
print()
print("Performing Dimensionality Reduction")
print()
#CountVectorizer
cv = CountVectorizer(max_features = 1500)
train_x = cv.fit_transform(corpus_train).toarray()
#PCA
pca = PCA(n_components=components)
train_x = pca.fit_transform(train_x)
explained_variance = pca.explained_variance_ratio_
test = cv.transform(corpus_test).toarray()
test = pca.transform(test)
test = test.astype('float32')
if (return_ratio):
return train_x,test, explained_variance
else:
return train_x,test
def getOptimumParameters(train_x,train_y, return_stats):
print()
print("Getting optimum parameters")
print("This optimization algorithm may take a while, so please be patient.")
print("Please do not do other tasks while this runs.")
print()
train_x = train_x.astype('float32')
train_y = train_y.astype('float32')
classifier = KNeighborsClassifier()
#classifier.fit(train_x,train_y)
#For the sake of my program I used my own parameter lists.
#If you use this code, please change them
neighbor_list = [1]
algorithm_list = ['brute', 'kd_tree', 'ball_tree']
weights_list = ['uniform', 'distance']
p_list = [1] #p_list = [1,2,3,4]
leaf_list = [10]
parameters = [{'n_neighbors':neighbor_list, 'weights':weights_list, 'algorithm':algorithm_list, 'p':p_list, 'leaf_size':leaf_list}]
clf = GridSearchCV(estimator=classifier, param_grid = parameters, cv=5,refit=True, error_score=0, n_jobs = -1)
clf = clf.fit(train_x,train_y)
bc = clf.best_score_
bp = clf.best_params_
if return_stats:
return clf, bc, bp
else:
return clf
def predictions(classifier, train_x, train_y, test, ratio):
print()
print("Making predictions")
print()
#Changing types to work with a classifier
train_x= train_x.astype('float32')
train_y = train_y.astype('float32')
#Splitting training set into a training + dev set
train_x,dev_x,train_y,dev_y = train_test_split(train_x,train_y,test_size = ratio, random_state=0)
#Making predictions
test = test.astype('float32')
pred = classifier.predict(test)
return pred
def convertPredToCsv(pred, csv_name):
df = pd.DataFrame(pred)
df.index.name = 'id'
df.columns = ['label']
df.to_csv("predictions.csv")
def main():
#Retrieving the data
train_x,train_y,test = getting_data('train.csv', 'test_tweets.csv')
#Constructing Bag of words model
corpus_train,corpus_test = bagOfWords(test,train_x)
#Performing Dimensionality Reduction
train_x,test = dimensionality_reduction(corpus_train,corpus_test,False,350)
#Getting the optimum classifier
classifier= getOptimumParameters(train_x,train_y, False)
#Predicting + converting to csv
pred = predictions(classifier, train_x, train_y, test, 0.1)
convertPredToCsv(pred, 'predictions.csv')
if __name__ == "__main__":
main()
我的版本:
import sklearn
print(sklearn.__version__)
#0.19.1
import nltk
print(nltk.__version__)
#3.3
答案 1 :(得分:0)
我知道已经有一段时间了,很抱歉。
只是想让大家知道,对于长时间的网格搜索,至少对于Windows用户而言,导入是不必要的
sklearn.model_selection.GridSearchCV
但实际上
sklearn.grid_search.GridSearchCV
前者几乎总是会引发内存错误,而后者即使在长时间的网格搜索中也能正常工作。