优化NLTK代码以从文本进行预测

时间:2014-09-12 21:32:44

标签: python performance nltk tokenize text-mining

我正在尝试建立一个模型来预测职位描述对应的薪水是高于还是低于第75个百分位(高于为1,低于为0)。我的数据大约有250,000行,很难对所有职位描述文本进行分词。我的代码似乎工作正常,但是一旦超过100行左右,运行就要花费大量时间。我需要找到提高效率的方法,以便在预测中包含更多的行。

import random
import nltk
import pandas
import csv
import numpy as np

# --- Load data and build a 60/40 train/test split ------------------------
# usecols=(2, 10): column 2 is the job-description text, column 10 the
# normalized salary (Kaggle "Job Salary Prediction", Train_rev1.csv).
io = pandas.read_csv('Train_rev1.csv', sep=',', usecols=(2, 10), nrows=501)
data = [np.array(x) for x in io.values]

random.shuffle(data)
size = int(len(data) * 0.6)
test_set, train_set = data[size:], data[:size]
train_set = np.array(train_set)
test_set = np.array(test_set)

# 75th-percentile salary thresholds for each split.
# NOTE(review): the test threshold is computed from the test split itself;
# for a leak-free evaluation you would normally reuse the training
# threshold (Sal75) -- kept as-is to preserve the original labeling.
Sal75 = np.percentile(train_set[:, 1].astype(float), 75)
Test75 = np.percentile(test_set[:, 1].astype(float), 75)

# Binarize salaries in place: 1 = at/above the 75th percentile, 0 = below.
train_set[:, 1] = (train_set[:, 1].astype(float) >= Sal75).astype(int)
test_set[:, 1] = (test_set[:, 1].astype(float) >= Test75).astype(int)

train_setT = [tuple(x) for x in train_set]
test_setT = [tuple(x) for x in test_set]


from nltk.tokenize import word_tokenize

# Tokenize each description exactly ONCE and keep its lowercased tokens in
# a set. The original code called word_tokenize(x[0]) inside the feature
# dict comprehension, i.e. re-tokenized every document once per vocabulary
# word -- O(vocab * docs * doc_length) -- which is why it could not get
# past ~100 rows. With precomputed token sets, membership is O(1).
train_tokens = [set(w.lower() for w in word_tokenize(doc))
                for doc, _ in train_setT]
test_tokens = [set(w.lower() for w in word_tokenize(doc))
               for doc, _ in test_setT]

# Vocabulary comes from the training split only. (Lowercased here AND in
# the token sets above -- the original lowercased the vocabulary but
# tested membership against raw tokens, so most features were always
# False.)
all_words = set().union(*train_tokens) if train_tokens else set()

# Boolean bag-of-words features for each training document.
t = [({word: (word in toks) for word in all_words}, label)
     for toks, (_, label) in zip(train_tokens, train_setT)]

classifier = nltk.NaiveBayesClassifier.train(t)

# Test features must use the TRAINING vocabulary: the classifier cannot
# score features it never saw during training. (The original built an
# unused all_words2 from the test split.)
tt = [({word: (word in toks) for word in all_words}, label)
      for toks, (_, label) in zip(test_tokens, test_setT)]


# print(...) with a single argument is valid in both Python 2 and 3.
print(nltk.classify.accuracy(classifier, tt))
classifier.show_most_informative_features(20)

# Gold labels and predictions for the confusion matrix.
testres = [label for _, label in tt]
predres = [classifier.classify(feats) for feats, _ in tt]

from nltk.metrics import ConfusionMatrix
cm = ConfusionMatrix(testres, predres)
print(cm)

csv文件是从Kaggle提取的,请使用 Train_rev1。

1 个答案:

答案 0 :(得分:1)

将数据拆分为60%和40%后,您可以执行以下操作。这需要用到新的工具(scikit-learn),而不再需要NLTK。

import random
import nltk
import pandas
import csv
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from operator import itemgetter
from sklearn.metrics import classification_report

# Assumes train_set / test_set are the 60/40 numpy splits built earlier,
# with column 0 = description text and column 1 = binary salary label
# (TODO confirm against the question's preprocessing).
train_setT = [tuple(x) for x in train_set]
test_setT = [tuple(x) for x in test_set]

# Raw description strings. ''.join(el[0]) is a no-op when el[0] is
# already a str; it is kept only to tolerate inputs that arrive as a
# sequence of text fragments. Renamed from train_set/test_set to avoid
# shadowing the label-carrying splits above.
train_docs = np.array([''.join(el[0]) for el in train_setT])
test_docs = np.array([''.join(el[0]) for el in test_setT])

y_train = np.array([el[1] for el in train_setT])
y_test = np.array([el[1] for el in test_setT])

# TF-IDF unigram + bigram features; min_df=2 drops terms that appear in
# only one document, strip_accents normalizes unicode accents.
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2),
                             strip_accents='unicode', norm='l2')

# Fit the vocabulary on the training split only, then reuse it to
# transform the test split (no test-set leakage into the features).
X_train = vectorizer.fit_transform(train_docs)
X_test = vectorizer.transform(test_docs)

nb_classifier = MultinomialNB().fit(X_train, y_train)

y_nb_predicted = nb_classifier.predict(X_test)

# print(...) with a single argument is valid in both Python 2 and 3.
print(metrics.confusion_matrix(y_test, y_nb_predicted))
print(classification_report(y_test, y_nb_predicted))