I am new to Python and just as new to classification. This is my first attempt at preprocessing supervised data and running a classification task. I collected the data with tweepy.
I am doing multi-class classification. I tried to clean the data before training the classifiers; here is the script I used:
from flask import Flask, request
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from string import punctuation
import nltk
import re
data = [] # the tweet array
target = [] # the category array
file = open("readin file.txt", "r")
count = 0
for line in file:  # iterate over each line of the file
    line_array = line.split(",,,")
    try:
        data.append(line_array[4])    # append the tweet text to the data array
        target.append(line_array[0])  # append the category label to the target array
    except:
        pass
stopWords = stopwords.words('english')
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
def tokenize(text):
    text = text.lower()
    text = text.replace("#", "")
    text = re.sub(r'(www\.[^\s]+)|(https?://[^\s]+)', '', text)
    text = re.sub(r'&[^\s]+', '', text)
    text = re.sub(r'@[^\s]+', '', text)
    text = re.sub(r'#([^\s]+)', '', text)
    text = text.strip('\'"')
    text = re.sub(r'[\s]+', ' ', text)
    tokens = nltk.word_tokenize(text)
    tokens_new = []
    for item in tokens:
        if "www." in item or "http" in item:
            item = "URL"
        if "@" in item:
            item = "AT_USER"
        if re.match("[a-zA-Z]", item) and len(item) > 2\
                and "#illegalimmigration" not in item\
                and "#illegalimmigrants" not in item\
                and "#GOPDepate" not in item\
                and "#WakeUpAmerica" not in item\
                and "#election2016" not in item\
                and "#trump" not in item\
                and "#SanctuaryCities" not in item\
                and "#Hillary2016" not in item\
                and "#PopeVisitsUS" not in item\
                and "#tcot" not in item\
                and "#DonaldTrump" not in item\
                and "#PopeFrancis" not in item\
                and "#ACA" not in item\
                and "#NoAmnesty" not in item\
                and "#blm" not in item:
            all_lemmas = []  # collect every WordNet lemma name for this token
            for i, j in enumerate(wn.synsets(item)):
                all_lemmas.extend(j.lemma_names())
            tokens_new.extend(list(set(all_lemmas)))
    # print "Text "
    # print text
    # print "Tokens "
    # print tokens
    # print tokens_new
    stems = stem_tokens(tokens_new, stemmer)
    # print "Stems "
    # print stems
    return stems
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words=stopWords, tokenizer=tokenize)
X_train_counts = count_vect.fit_transform(data)
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, target)
scores = cross_val_score(clf,X_train_tfidf,target,cv=10,scoring='accuracy')
print 'Naive Bayes Classifier'
print scores
print scores.mean()
print
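As an aside, I am not sure the bare try/except in my reading loop is a good way to deal with malformed lines. A more defensive version I have been considering looks roughly like this (just a sketch, assuming every valid line has exactly five ",,,"-separated fields and the same placeholder file name as above):
data = []
target = []
with open("readin file.txt", "r") as f:
    for line in f:
        fields = line.rstrip("\n").split(",,,")
        if len(fields) != 5:      # skip lines that do not match the expected layout
            continue
        target.append(fields[0].strip())  # category label
        data.append(fields[4].strip())    # tweet text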
The data looks like this, one sample per line: category ,,, countTweet ,,, profileName ,,, userName ,,, tweet
1,,, 4,,,cjlamb,,,16campaignbites,,,@thinkprogress Let's give GOPers citizenship test. If they fail, they leave the country and immigrants stay
2,,, 191,,,Acadi Anna,,,Acadianna32,,,#Deport the millions of #IllegalImmigrants in the United States illegally and build a wall between the U.S. & Mexicohttp://t.co/AWJZBuZcJb
3,,, 460,,,The Angry Vet,,,DemonTwoSix,,,RT @sweety125: @hempforfood @RickCanton But an illegal alien can be deported 5 times & come back & get #SanctuaryCities then kills a young
Category 1 supports opinion #1, keeping illegal immigrants in the United States; category 2 supports opinion #2, deporting all illegal immigrants; and category 3 supports opinion #3, deporting only the illegal immigrants who commit crimes. This is the output I get, which is a poor classification result:
Naive Bayes Classifier
[ 0.51612903 0.51612903 0.5 0.58333333 0.74576271 0.62068966
0.53448276 0.60344828 0.5 0.5862069 ]
0.570618169592
Here is one example of the text, tokens, new tokens, and stems when they are printed:
Text
let's give gopers citizenship test. if they fail, they leave the country and immigrants stay
Tokens
[u'let', u"'s", u'give', u'gopers', u'citizenship', u'test', u'.', u'if', u'they', u'fail', u',', u'they', u'leave', u'the', u'country', u'and', u'immigrants', u'stay']
New Tokens
[u'Army_of_the_Pure', u'Army_of_the_Righteous', u'LET', u'Lashkar-e-Taiba', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'Army_of_the_Pure', u'Army_of_the_Righteous', u'net_ball', u'LET', u'Lashkar-e-Taiba', u'let', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'Army_of_the_Pure', u'Army_of_the_Righteous', u'net_ball', u'LET', u'Lashkar-e-Taiba', u'allow', u'permit', u'let', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'Army_of_the_Pure', u'Army_of_the_Righteous', u'net_ball', u'LET', u'Lashkar-e-Taiba', u'allow', u'permit', u'let', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'Army_of_the_Pure', u'Army_of_the_Righteous', u'countenance', u'net_ball', u'LET', u'Lashkar-e-Taiba', u'allow', u'permit', u'let', u'Lashkar-e-Tayyiba', u'Army_of_the_Righteous', u'countenance', u'net_ball', u'LET', u'Lashkar-e-Taiba', u'allow', u'permit', u'let', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'have', u'get', u'Army_of_the_Pure', u'Army_of_the_Righteous', u'countenance', u'net_ball', u'LET', u'Lashkar-e-Taiba', u'allow', u'permit', u'rent', u'let', u'lease', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'spring', u'springiness', u'give', u'spring', u'springiness', u'give', u'afford', u'spring', u'yield', u'springiness', u'yield', u'springiness', u'pay', u'hold', u'throw', u'present', u'have', u'gift', u'give', u'afford', u'spring', u'make', u'devote', u'yield', u'springiness', u'pay', u'hold', u'throw', u'present', u'return', u'have', u'gift', u'give', u'afford', u'spring', u'make', u'devote', u'yield', u'apply', u'establish', u'founder', u'open', u'grant', u'pay', u'make', u'devote', u'throw', u'cave_in', u'impart', u'fall_in', u'chip_in', u'return', u'collapse', u'turn_over', u'afford', u'sacrifice', u'give_way', u'reach', u'hand', u'break', u'hold', u'generate', u'present', u'gift', u'dedicate', u'yield', u'leave', u'ease_up', u'commit', u'pass_on', u'citizenship', u'citizenship', u'test', u'trial', u'trial_run', u'tryout', u'trial_run', u'mental_test']
Stems
[u'Army_of_the_Pur', u'Army_of_the_Right', u'LET', u'Lashkar-e-Taiba', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'Army_of_the_Pur', u'Army_of_the_Right', u'net_bal', u'LET', u'Lashkar-e-Taiba', u'let', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'Army_of_the_Pur', u'Army_of_the_Right', u'net_bal', u'LET', u'Lashkar-e-Taiba', u'allow', u'permit', u'let', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'Army_of_the_Pur', u'Army_of_the_Right', u'net_bal', u'LET', u'Lashkar-e-Taiba', u'allow', u'permit', u'let', u'Lashkar-e-Tayyiba', u'Lashkar-e-Toiba', u'Army_of_the_Pur', u'Army_of_the_Right', u'counten', u'net_bal', u'LET', u'pass', u'appli', u'establish', u'founder', u'open', u'grant', u'pay', u'make', u'devot', u'throw', u'cave_in', u'tryout', u'test', u'exam', u'trial_run', u'mental_test', u'psychometric_test', u'trial', u'mental_test', u'tryout', u'test', u'examin', u'exam', u'trial_run', u'mental_test', u'psychometric_test', u'trial', u'mental_test', u'tryout', u'test', u'examin', u'run', u'exam', u'trial_run', u'mental_test', u'psychometric_test', u'trial', u'mental_test', u'tryout', u'test', u'examin', u'run', u'exam', u'trial_run', u'mental_test', u'psychometric_test', u'trial', u'mental_test', u'tryout', u'test', u'examin', u'essay', u'run', u'exam', u'prove', u'trial_run', u'mental_test', u'psychometric_test', u'tri', u'trial', u'examin', u'mental_test', u'try_out', u'tryout', u'test', u'examin', u'essay', u'run', u'exam', u'prove', u'screen', u'trial_run', u'mental_test', u'examin', u'mental_test', u'try_out', u'tryout', u'test', u'examin', u'essay', u'run', u'exam', u'prove', u'screen', u'trial_run', u'mental_test', u'psychometric_test', u'quiz', u'tri', u'trial', u'examin', u'mental_test', u'try_out']
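Looking at the "New Tokens" above, it seems the WordNet synset expansion is what blows a word like "let" up into names such as Lashkar-e-Taiba, which looks more like noise than cleaning. One thing I am considering is a plainer tokenizer that keeps the original words and only strips URLs, mentions, and hashtags, roughly like this sketch (tokenize_plain is just a name I made up; it reuses stem_tokens and stemmer from above, and stop words would still be removed by the CountVectorizer):
def tokenize_plain(text):
    text = text.lower()
    text = re.sub(r'(www\.[^\s]+)|(https?://[^\s]+)', '', text)  # drop URLs
    text = re.sub(r'@[^\s]+', '', text)                          # drop @mentions
    text = re.sub(r'#[^\s]+', '', text)                          # drop hashtags
    text = re.sub(r'[\s]+', ' ', text).strip()
    tokens = [t for t in nltk.word_tokenize(text)
              if re.match("[a-zA-Z]", t) and len(t) > 2]
    return stem_tokens(tokens, stemmer)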
I would really appreciate any advice on how to make sure my data is clean before classification and testing.
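I am also wondering whether fitting the CountVectorizer and TfidfTransformer on all of the data before cross-validating lets information from the test folds leak into the features. If that matters, I think the vectorization and the classifier could be wrapped together so everything is re-fit inside each fold, roughly like this sketch (the step names and the choice of MultinomialNB are just for illustration):
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words=stopWords, tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
scores = cross_val_score(pipeline, data, target, cv=10, scoring='accuracy')
print scores
print scores.mean()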