I am trying to build a model that predicts whether the salary for a job description is above or below the 75th percentile (1 if above, 0 if below). My data has about 250,000 rows, and tokenizing all of the job description text is proving difficult. The code seems to work correctly, but it takes a very long time once I go beyond about 100 rows. I need to find a way to make it more efficient so that I can include more rows in the prediction.
import random
import nltk
import pandas
import csv
import numpy as np
# column 2 is the job description text, column 10 the normalized salary
io = pandas.read_csv('Train_rev1.csv', sep=',', usecols=(2, 10), nrows=501)
#converted = df.apply(lambda io : int(io[0]))
# shuffle the rows and split 60% train / 40% test
data = [np.array(x) for x in io.values]
random.shuffle(data)
size = int(len(data) * 0.6)
test_set, train_set = data[size:], data[:size]
train_set = np.array(train_set)
test_set = np.array(test_set)
# 75th-percentile salary within each split, used as the classification threshold
x = train_set[:, 1]
Sal75 = np.percentile(x, 75)
y = test_set[:, 1]
Test75 = np.percentile(y, 75)
# binarize the salary column: 1 if at or above the 75th percentile, else 0
for i in range(len(train_set[:, 1])):
    if train_set[i, 1] >= Sal75:
        train_set[i, 1] = 1
    else:
        train_set[i, 1] = 0
for i in range(len(test_set[:, 1])):
    if test_set[i, 1] >= Test75:
        test_set[i, 1] = 1
    else:
        test_set[i, 1] = 0
train_setT = [tuple(x) for x in train_set]
test_setT = [tuple(x) for x in test_set]
from nltk.tokenize import word_tokenize

# vocabulary of lower-cased words seen in the training descriptions
all_words = set(word.lower() for passage in train_setT for word in word_tokenize(passage[0]))

def description_features(text):
    # tokenize each description once and match case-insensitively against the vocabulary
    tokens = set(w.lower() for w in word_tokenize(text))
    return {word: (word in tokens) for word in all_words}

t = [(description_features(x[0]), x[1]) for x in train_setT]
classifier = nltk.NaiveBayesClassifier.train(t)
# test features reuse the training vocabulary so the classifier sees the same feature names
tt = [(description_features(x[0]), x[1]) for x in test_setT]
print(nltk.classify.accuracy(classifier, tt))
classifier.show_most_informative_features(20)
# collect the true labels and the predicted labels for the confusion matrix
testres = [label for (features, label) in tt]
predres = [classifier.classify(features) for (features, label) in tt]
from nltk.metrics import ConfusionMatrix
cm = ConfusionMatrix(testres, predres)
print(cm)
The CSV file is taken from Kaggle; use Train_rev1.
Answer (score: 1)
After you have split the data 60% / 40%, you can do the following. It calls for a different set of tools, and you may not need NLTK at all.
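For reference, that preliminary split and percentile labeling can also be done without the per-row loops from the question. This is only a sketch: the column names FullDescription and SalaryNormalized are assumed from Kaggle's Train_rev1.csv, and it produces the same train_set / test_set arrays the code below starts from.
import numpy as np
import pandas as pd

# assumed column names in Kaggle's Train_rev1.csv
df = pd.read_csv('Train_rev1.csv', usecols=['FullDescription', 'SalaryNormalized'])
df = df.sample(frac=1, random_state=0)  # shuffle the rows
size = int(len(df) * 0.6)
train_df, test_df = df.iloc[:size], df.iloc[size:]

def to_labeled_array(frame):
    # object array of [description, 0/1 label] rows, labeled against the
    # split's own 75th percentile, as in the question
    labels = (frame['SalaryNormalized'] >= frame['SalaryNormalized'].quantile(0.75)).astype(int)
    out = np.empty((len(frame), 2), dtype=object)
    out[:, 0] = frame['FullDescription'].astype(str).values
    out[:, 1] = labels.values
    return out

train_set = to_labeled_array(train_df)
test_set = to_labeled_array(test_df)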
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import classification_report
train_setT = [tuple(x) for x in train_set]
test_setT = [tuple(x) for x in test_set]
# separate the description text from the 0/1 labels
train_set = np.array([''.join(el[0]) for el in train_setT])
test_set = np.array([''.join(el[0]) for el in test_setT])
y_train = np.array([el[1] for el in train_setT])
y_test = np.array([el[1] for el in test_setT])
# TF-IDF vectorization produces sparse matrices, so this scales to many more rows
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), strip_accents='unicode', norm='l2')
X_train = vectorizer.fit_transform(train_set)
X_test = vectorizer.transform(test_set)  # reuse the vocabulary learned on the training set
# multinomial Naive Bayes on the TF-IDF features
nb_classifier = MultinomialNB().fit(X_train, y_train)
y_nb_predicted = nb_classifier.predict(X_test)
print(metrics.confusion_matrix(y_test, y_nb_predicted))
print(classification_report(y_test, y_nb_predicted))
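If you also want something like show_most_informative_features from the question, a rough scikit-learn equivalent is to rank vocabulary terms by the difference of the per-class log probabilities. A minimal sketch, assuming scikit-learn 1.0+ (for get_feature_names_out) and that nb_classifier.classes_ is ordered [0, 1]:
import numpy as np

feature_names = np.array(vectorizer.get_feature_names_out())
# difference of per-class log probabilities; large values favour the high-salary class
log_prob_diff = nb_classifier.feature_log_prob_[1] - nb_classifier.feature_log_prob_[0]
top = np.argsort(log_prob_diff)[-20:][::-1]
for name, score in zip(feature_names[top], log_prob_diff[top]):
    print(name, round(score, 3))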