I am just trying to get this code to run, but I keep hitting the timeout limit on HackerRank. trainingdata.txt is the data file provided by HackerRank. The project is about document classification: the training data consists of sentences and their labels. I tokenize the sentences and work with those tokens, and at the end I print all of the y_head values, but my code never runs to completion so that it can print them.
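For reference, this is the line format I am assuming for trainingdata.txt (the sample string below is made up just to show the shape, it is not real data): the first line holds the number of training rows, and every other line is a label, a single space, then the sentence, so splitting on the first space separates the two parts:

line="1 this is a hypothetical training sentence"  # made-up example line, not real data
label,sentence=line.split(" ",1)                   # label -> "1", sentence -> rest of the line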
# Enter your code here. Read input from STDIN. Print output to STDOUT
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
import re
import pandas as pd
nltk.download('stopwords')
nltk.download('punkt')  # nltk.word_tokenize needs the punkt tokenizer models
ps=PorterStemmer()
documents=[]
data=pd.read_csv("trainingdata.txt",header=None)
T=data[0][0]  # the first line of trainingdata.txt holds the number of training rows
data.drop(data.index[0],inplace=True)  # drop that count row so only "label sentence" lines remain
sentences=[]
labels=[]
# each remaining line is "<label> <sentence>", so split on the first space only
for each in data.iloc[:,0]:
    each=each.split(" ",1)
    labels.append(each[0])
    sentences.append(each[1])
sentences=pd.DataFrame(sentences)
labels=pd.DataFrame(labels)
dataFrame=pd.concat([sentences,labels],axis=1)
dataFrame.columns=["Sentence","Label"]
# clean every sentence: keep letters only, lowercase, tokenize, drop stop words, stem
stop_words=set(stopwords.words("english"))  # build the stop word set once, outside the loop
for each in range(0,dataFrame.shape[0]):
    yorum=re.sub("[^a-zA-Z]",' ',dataFrame["Sentence"][each])
    yorum=yorum.lower()
    yorum=nltk.word_tokenize(yorum)
    yorum=[ps.stem(kelime) for kelime in yorum if kelime not in stop_words]
    yorum=' '.join(yorum)
    documents.append(yorum)
cv=CountVectorizer(max_features=2000)
x=cv.fit_transform(documents).toarray()  # vectorize the cleaned documents, not the raw sentences DataFrame
y=dataFrame["Label"].values
nb=GaussianNB()
nb.fit(x,y)
y_head=nb.predict(x)
for each in y_head:
    print(each)
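The part I suspect is slow (only a guess on my side) is converting the CountVectorizer output to a dense array with .toarray() and then fitting GaussianNB on it, since GaussianNB cannot work on the sparse matrix directly. Below is a minimal sketch of the variant I am trying instead, assuming the documents list and dataFrame from above: it keeps the matrix sparse and swaps in MultinomialNB, which accepts sparse input.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

cv=CountVectorizer(max_features=2000)
x_sparse=cv.fit_transform(documents)   # keep the scipy sparse matrix, no .toarray()
y=dataFrame["Label"].values
mnb=MultinomialNB()
mnb.fit(x_sparse,y)                    # MultinomialNB can be fit on sparse input directly
for pred in mnb.predict(x_sparse):
    print(pred)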