Why do I get bad results with 10-fold cross-validation in classification?

Asked: 2015-11-08 02:53:34

Tags: machine-learning scikit-learn classification text-classification naivebayes

I'm new to Python programming, and new to classification as well. This is my first attempt at preprocessing and classifying supervised data. I collected the data with tweepy.

I'm doing multi-class classification. I tried to clean the data before training the classifier; here is the script I used:

from flask import Flask, request 
from sklearn.cross_validation import cross_val_score 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 
from nltk.corpus import wordnet as wn 
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords 
from sklearn.svm import LinearSVC 
from sklearn.neighbors import KNeighborsClassifier 
from string import punctuation 
import nltk 
import re 

data = []  # the tweet array 
target = []  # the category array 

file = open("readin file.txt", "r") 
count = 0 
for line in file:  # read the file line by line
    line_array = line.split(",,,")
    try:
        data.append(line_array[4])  # append the tweet text to the data array
        target.append(line_array[0])  # append the category to the target array
    except IndexError:
        pass  # skip malformed lines

stopWords = stopwords.words('english') 

stemmer = PorterStemmer() 

def stem_tokens(tokens, stemmer): 
    stemmed = [] 
    for item in tokens: 
        stemmed.append(stemmer.stem(item)) 
    return stemmed 


def tokenize(text):
    text = text.lower()
    text = text.replace("#", "")
    text = re.sub(r'(www\.[^\s]+)|(https?://[^\s]+)', '', text)  # strip URLs
    text = re.sub(r'&[^\s]+', '', text)    # strip HTML entities
    text = re.sub(r'@[^\s]+', '', text)    # strip @mentions
    text = re.sub(r'#([^\s]+)', '', text)  # strip hashtags
    text = text.strip('\'"')
    text = re.sub(r'[\s]+', ' ', text)     # collapse whitespace

    tokens = nltk.word_tokenize(text)
    tokens_new = []

    for item in tokens:
        if "www." in item or "http" in item:
            item = "URL"
        if "@" in item:
            item = "AT_USER"
        if re.match("[a-zA-Z]", item) and len(item) > 2\
            and "#illegalimmigration" not in item\
            and "#illegalimmigrants" not in item\
            and "#GOPDepate" not in item\
            and "#WakeUpAmerica" not in item\
            and "#election2016" not in item\
            and "#trump" not in item\
            and "#SanctuaryCities" not in item\
            and "#Hillary2016" not in item\
            and "#PopeVisitsUS" not in item\
            and "#tcot" not in item\
            and "#DonaldTrump" not in item\
            and "#PopeFrancis" not in item\
            and "#ACA" not in item\
            and "#NoAmnesty" not in item\
            and "#blm" not in item:
            # expand each token into its WordNet synonym lemmas
            all = []
            for i, j in enumerate(wn.synsets(item)):
                all.extend(j.lemma_names())
                tokens_new.extend(list(set(all)))

    # print "Text "
    # print text
    # print "Tokens "
    # print tokens
    # print tokens_new
    stems = stem_tokens(tokens_new, stemmer)
    # print "Stems "
    # print stems
    return stems

from sklearn.feature_extraction.text import CountVectorizer 
count_vect = CountVectorizer(stop_words = stopWords,tokenizer=tokenize) 
X_train_counts = count_vect.fit_transform(data) 

from sklearn.feature_extraction.text import TfidfTransformer 
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts) 
X_train_tf = tf_transformer.transform(X_train_counts) 

tfidf_transformer = TfidfTransformer() 
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) 


clf = MultinomialNB().fit(X_train_tfidf, target) 
scores = cross_val_score(clf,X_train_tfidf,target,cv=10,scoring='accuracy') 
print 'Naive Bayes Classifier' 
print scores 
print scores.mean() 
print 

The data looks like this (one record per line): category ,,, countTweet ,,, profileName ,,, userName ,,, tweet

1,,, 4,,,cjlamb,,,16campaignbites,,,@thinkprogress Let's give GOPers citizenship test. If they fail, they leave the country and immigrants stay 

2,,, 191,,,Acadi Anna,,,Acadianna32,,,#Deport the millions of #IllegalImmigrants in the United States illegally and build a wall between the U.S. & Mexicohttp://t.co/AWJZBuZcJb 

3,,, 460,,,The Angry Vet,,,DemonTwoSix,,,RT @sweety125: @hempforfood @RickCanton But an illegal alien can be deported 5 times & come back & get #SanctuaryCities then kills a young

A leading 1 ,,, stands for opinion #1, in favor of letting illegal immigrants stay in the US; 2 ,,, stands for opinion #2, deporting all illegal immigrants; and 3 ,,, stands for opinion #3, deporting only illegal immigrants who commit crimes. I get this output, which is a poor classification result:
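For reference, the `,,,`-delimited records above can be parsed like this (a minimal sketch; `parse_line` is a hypothetical helper, not part of my script):

```python
def parse_line(line):
    # Split a ",,,"-delimited record into its five fields:
    # category, countTweet, profileName, userName, tweet
    fields = line.rstrip("\n").split(",,,")
    if len(fields) < 5:
        return None  # skip malformed lines instead of a bare except
    category = fields[0].strip()
    tweet = fields[4]
    return category, tweet

sample = "1,,, 4,,,cjlamb,,,16campaignbites,,,@thinkprogress Let's give GOPers citizenship test."
print(parse_line(sample))  # -> ('1', "@thinkprogress Let's give GOPers citizenship test.")
```

Returning `None` for short rows makes it explicit which lines are skipped, instead of silently swallowing every exception.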

Naive Bayes Classifier 
[ 0.51612903  0.51612903  0.5         0.58333333  0.74576271  0.62068966 
  0.53448276  0.60344828  0.5         0.5862069 ] 
0.570618169592
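A mean accuracy of 0.57 is easier to judge against a majority-class baseline, i.e. the accuracy of always predicting the most frequent class. A sketch of that comparison (the label counts here are made up for illustration; I have roughly 580 tweets across three classes):

```python
from collections import Counter

def majority_baseline(labels):
    # Accuracy of a classifier that always predicts the most frequent class
    counts = Counter(labels)
    most_common_count = counts.most_common(1)[0][1]
    return most_common_count / float(len(labels))

# Hypothetical label counts for opinions 1, 2 and 3
labels = ["1"] * 200 + ["2"] * 250 + ["3"] * 130
print(majority_baseline(labels))  # 250/580, about 0.431
```

If the real class distribution is similar, 0.57 is above chance but still weak for three classes.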

Here is one sample of the text, tokens, new tokens, and stems when printed:

Text 
let's give gopers citizenship test. if they fail, they leave the country and immigrants stay 
Tokens 
[u'let', u"'s", u'give', u'gopers', u'citizenship', u'test', u'.', u'if',     u'they', u'fail', u',', u'they', u'leave', u'the', u'country', u'and', u'immigrants', u'stay'] 
New Tokens 
[u'Army_of_the_Pure', u'Army_of_the_Righteous', u'LET', u'Lashkar-e-Taiba', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'Army_of_the_Pure', u'Army_of_the_Righteous', u'net_ball', u'LET', u'Lashkar-e-Taiba', u'let', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'Army_of_the_Pure', u'Army_of_the_Righteous', u'net_ball', u'LET', u'Lashkar-e-Taiba', u'allow', u'permit', u'let', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'Army_of_the_Pure', u'Army_of_the_Righteous', u'net_ball', u'LET', u'Lashkar-e-Taiba', u'allow', u'permit', u'let', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'Army_of_the_Pure', u'Army_of_the_Righteous', u'countenance', u'net_ball', u'LET', u'Lashkar-e-Taiba', u'allow', u'permit', u'let', u'Lashkar-e-Tayyiba', u'Army_of_the_Righteous', u'countenance', u'net_ball', u'LET', u'Lashkar-e-Taiba', u'allow', u'permit', u'let', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'have', u'get', u'Army_of_the_Pure', u'Army_of_the_Righteous', u'countenance', u'net_ball', u'LET', u'Lashkar-e-Taiba', u'allow', u'permit', u'rent', u'let', u'lease', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'spring', u'springiness', u'give', u'spring', u'springiness', u'give', u'afford', u'spring', u'yield', u'springiness', u'yield', u'springiness', u'pay', u'hold', u'throw', u'present', u'have', u'gift', u'give', u'afford', u'spring', u'make', u'devote', u'yield', u'springiness', u'pay', u'hold', u'throw', u'present', u'return', u'have', u'gift', u'give', u'afford', u'spring', u'make', u'devote', u'yield',  u'apply', u'establish', u'founder', u'open', u'grant', u'pay', u'make', u'devote', u'throw', u'cave_in', u'impart', u'fall_in', u'chip_in', u'return', u'collapse', u'turn_over', u'afford', u'sacrifice', u'give_way', u'reach', u'hand', u'break', u'hold', u'generate', u'present', u'gift', u'dedicate', u'yield', u'leave', u'ease_up', u'commit', u'pass_on', u'citizenship', u'citizenship', u'test', u'trial', u'trial_run', u'tryout', u'trial_run', u'mental_test'] 
Stems 
[u'Army_of_the_Pur', u'Army_of_the_Right', u'LET', u'Lashkar-e-Taiba', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'Army_of_the_Pur', u'Army_of_the_Right', u'net_bal', u'LET', u'Lashkar-e-Taiba', u'let', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'Army_of_the_Pur', u'Army_of_the_Right', u'net_bal', u'LET', u'Lashkar-e-Taiba', u'allow', u'permit', u'let', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'Army_of_the_Pur', u'Army_of_the_Right', u'net_bal', u'LET', u'Lashkar-e-Taiba', u'allow', u'permit', u'let', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'Army_of_the_Pur', u'Army_of_the_Right', u'counten', u'net_bal', u'LET', u'pass', u'appli', u'establish', u'founder', u'open', u'grant', u'pay', u'make', u'devot', u'throw', u'cave_in', u'tryout', u'test', u'exam', u'trial_run', u'mental_test', u'psychometric_test', u'trial', u'mental_test', u'tryout', u'test', u'examin', u'exam', u'trial_run', u'mental_test', u'psychometric_test', u'trial', u'mental_test', u'tryout', u'test', u'examin', u'run', u'exam', u'trial_run', u'mental_test', u'psychometric_test', u'trial', u'mental_test', u'tryout', u'test', u'examin', u'run', u'exam', u'trial_run', u'mental_test', u'psychometric_test', u'trial', u'mental_test', u'tryout', u'test', u'examin', u'essay', u'run', u'exam', u'prove', u'trial_run', u'mental_test', u'psychometric_test', u'tri', u'trial', u'examin', u'mental_test', u'try_out', u'tryout', u'test', u'examin', u'essay', u'run', u'exam', u'prove', u'screen', u'trial_run', u'mental_test', u'examin', u'mental_test', u'try_out', u'tryout', u'test', u'examin', u'essay', u'run', u'exam', u'prove', u'screen', u'trial_run', u'mental_test', u'psychometric_test', u'quiz', u'tri', u'trial', u'examin', u'mental_test', u'try_out'] 
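Part of the blow-up visible in "New Tokens" comes from where `tokens_new.extend(list(set(all)))` sits in my script: it runs inside the synset loop, so lemmas collected from earlier synsets are re-appended on every iteration. A minimal sketch with stand-in lemma lists (not real WordNet output) shows the difference:

```python
def expand_inside_loop(synset_lemmas):
    # Mirrors the script: extend runs once per synset,
    # re-adding everything accumulated so far
    all_lemmas, out = [], []
    for lemmas in synset_lemmas:
        all_lemmas.extend(lemmas)
        out.extend(list(set(all_lemmas)))
    return out

def expand_after_loop(synset_lemmas):
    # Extend once, after collecting every synset's lemmas
    all_lemmas = []
    for lemmas in synset_lemmas:
        all_lemmas.extend(lemmas)
    return list(set(all_lemmas))

fake_synsets = [["let", "allow"], ["let", "rent"], ["net_ball"]]
print(len(expand_inside_loop(fake_synsets)))  # 9: duplicates accumulate
print(len(expand_after_loop(fake_synsets)))   # 4 unique lemmas
```

Even with the duplication fixed, the WordNet expansion itself injects unrelated senses (e.g. "let" pulling in "Lashkar-e-Taiba"), which may be swamping the actual tweet content.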

I would greatly appreciate any advice on how to make sure my data is clean before classification and testing.
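For comparison, this is a sketch of the same Naive Bayes setup without the WordNet expansion step, using a `Pipeline` so the vectorizer is refit inside each cross-validation fold (the tiny corpus below is made up for illustration; `model_selection` is the module name in newer scikit-learn, `cross_validation` in the release I'm on):

```python
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

# Tiny made-up corpus standing in for the real tweets
texts = [
    "let the immigrants stay in the country",
    "immigrants should stay and get citizenship",
    "deport all the illegal immigrants now",
    "build a wall and deport everyone here illegally",
    "deport only the criminals among illegal immigrants",
    "only deport illegal immigrants who commit crimes",
]
labels = ["1", "1", "2", "2", "3", "3"]

# Vectorizer + classifier in one estimator, so CV refits both per fold
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("nb", MultinomialNB())])
scores = cross_val_score(pipe, texts, labels, cv=2, scoring="accuracy")
print(scores)
```

Fitting the vectorizer once on all the data and then cross-validating, as my script does, lets test-fold vocabulary leak into training, so the scores can be misleading on top of being low.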

0 Answers:

There are no answers yet.