I read a paper called "Unsupervised Personality Recognition for Social Network Sites", which is about extracting personality from text. It uses 22 features and 5 classes corresponding to the Big Five personality traits. By counting these features in a text, we can tell which classes the text belongs to, i.e. its personality. The paper also provides the correlation between every feature and every class, so a class score is computed by z-scoring each feature (subtracting that feature's mean over the whole corpus and dividing by its standard deviation), multiplying by the correlation coefficient given in the paper, and summing over all 22 features. A text is then assigned to a class based on that score.

I set a threshold per class to improve accuracy, but it is still not good enough: I only get around 50-60% accuracy and don't know how to improve it. Can anyone help me?
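To make the scoring rule concrete, here is a minimal sketch of what I described above; the feature values, corpus means, standard deviations, and correlation coefficients are placeholders, not the real numbers from the paper:

def class_score(feature_values, means, stdevs, correlations):
    # z-score each feature against the corpus statistics and weight it by
    # the feature/class correlation coefficient taken from the paper
    score = 0
    for x, mu, sigma, r in zip(feature_values, means, stdevs, correlations):
        score += r * (x - mu) / sigma
    return score

# A text is assigned to a class when its score exceeds a hand-tuned threshold:
# belongs_to_class = class_score(values, corpus_means, corpus_stdevs, paper_correlations) > threshold

This is exactly what the evaluation() function below does with the 22 features and one threshold per class. My full code follows: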
import csv
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
import pickle
from statistics import mean, stdev
# Load the mypersonality dataset: one row per status update plus its y/n trait labels.
with open('mypersonality_final.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    test = []
    for w in reader:
        test.append(w)
def all_punctuation(text):
    punctuations = ['.', ',', ';', ':']
    count = 0
    for w in text:
        if w in punctuations:
            count += 1
    return count
def count_commas(text):
    count = 0
    for w in text:
        if w == ',':
            count += 1
    return count
def count_pattern(text):
    # counts '@' characters in the raw text
    grammar = RegexpTokenizer(r'\@')
    pattern = grammar.tokenize(text)
    return len(pattern)
def count_exclamation(text):
    grammar = RegexpTokenizer(r'\!')
    pattern = grammar.tokenize(text)
    return len(pattern)
def ex_links(text):
    # counts http(s)://, www., and ftp links
    grammar = RegexpTokenizer(r'https?\S+\w(?:(?:\/[^\s/]*))*|www\.\S+\w(?:(?:\/[^\s/]*))*|ftp\S+\w(?:(?:\/[^\s/]*))*')
    pattern = grammar.tokenize(text)
    return len(pattern)
def firs_sinpronouns(text):
    # first person singular pronouns (note: 'we' is plural and may not belong in this list)
    sigpronouns = ['i', 'me', 'my', 'mine', 'we']
    count = 0
    for w in text:
        if w.lower() in sigpronouns:
            count += 1
    return count
def negative_particle(text):
    # counts words that appear in the negative-word lexicon (the lexicon file is re-read on every call)
    with open('negative-words.txt') as neg:
        neg = neg.read()
    words = set(nltk.word_tokenize(neg))
    grammar = RegexpTokenizer(r'\w+')
    nopunctuation = grammar.tokenize(text)
    count = 0
    for w in nopunctuation:
        if w.lower() in words:
            count += 1
    return count
def negative_emoticon(text):
    grammar = RegexpTokenizer(r"(?::|;|=)(?:-)?(?:\()")
    emoticons = grammar.tokenize(text)
    return len(emoticons)
def numbers(text):
    grammar = RegexpTokenizer(r'\d+')
    pattern = grammar.tokenize(text)
    return len(pattern)
def parenthesis(text):
    pat = r'\([^)]*\)'
    parent = re.findall(pat, text)
    return len(parent)
def positive_emoticon(text):
    grammar = RegexpTokenizer(r'(?::|;|=|<|>)(?:-|\.)?(?:\)|D|P|3|<)')
    emoticons = grammar.tokenize(text)
    return len(emoticons)
def prepositions(text):
    tagged = nltk.pos_tag(text)
    count = 0
    for w in tagged:
        if w[1] == 'IN':
            count += 1
    return count
def pronouns(text):
    tagged = nltk.pos_tag(text)
    count = 0
    for w in tagged:
        # PRP: personal, PRP$: possessive, WP: wh-pronoun, WP$: possessive wh-pronoun
        if w[1] in ('PRP', 'PRP$', 'WP', 'WP$'):
            count += 1
    return count
def count_question(text):
    grammar = RegexpTokenizer(r'\?')
    pattern = grammar.tokenize(text)
    return len(pattern)
def long_words(text):
    grammar = RegexpTokenizer(r'\w{7,}')
    pattern = grammar.tokenize(text)
    return len(pattern)
def firs_pronouns(text):
    firstpronouns = ['i', 'me', 'my', 'mine', 'we', 'our', 'ours', 'us']
    count = 0
    for w in text:
        if w.lower() in firstpronouns:
            count += 1
    return count
def swears_count(text):
    # counts words from the swear-word list (the list file is re-read on every call)
    with open('swears.txt') as swearfile:
        words = swearfile.read()
    swears = re.sub(r'[^\w+\s]+', '', words)
    swears = set(swears.split('\n'))
    count = 0
    for w in text:
        if w.lower() in swears:
            count += 1
    return count
def typetoken_ratio(text):
    typed = set(text)
    token = text
    ratio = len(typed) / len(token)
    return ratio
def count_words(text):
    grammar = RegexpTokenizer(r'\w+')
    pattern = grammar.tokenize(text)
    return len(pattern)
def firs_pluralpronouns(text):
    pluralpronouns = ['we', 'our', 'ours', 'us']
    count = 0
    for w in text:
        if w.lower() in pluralpronouns:
            count += 1
    return count
def sec_pronouns(text):
    secpronouns = ['you', 'your', 'yours']
    count = 0
    for w in text:
        if w.lower() in secpronouns:
            count += 1
    return count
def mean_freq(text):
    # mean word frequency: total tokens divided by number of distinct tokens
    words = word_tokenize(text)
    wordsl = []
    for w in words:
        wordsl.append(w.lower())
    unique = set(wordsl)
    return len(wordsl) / len(unique)
def mean_std(test):
    # Collect every feature over the whole corpus so the per-feature
    # mean and standard deviation can be computed.
    f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11 = [], [], [], [], [], [], [], [], [], [], []
    f12, f13, f14, f15, f16, f17, f18, f19, f20, f21, f22 = [], [], [], [], [], [], [], [], [], [], []
    for w in test[1:]:
        f1.append(all_punctuation(word_tokenize(w[1])))
        f2.append(count_commas(word_tokenize(w[1])))
        f3.append(count_pattern(w[1]))
        f4.append(count_exclamation(w[1]))
        f5.append(ex_links(w[1]))
        f6.append(firs_sinpronouns(word_tokenize(w[1])))
        f7.append(negative_particle(w[1]))
        f8.append(negative_emoticon(w[1]))
        f9.append(numbers(w[1]))
        f10.append(parenthesis(w[1]))
        f11.append(positive_emoticon(w[1]))
        f12.append(prepositions(word_tokenize(w[1])))
        f13.append(pronouns(word_tokenize(w[1])))
        f14.append(count_question(w[1]))
        f15.append(long_words(w[1]))
        f16.append(firs_pronouns(word_tokenize(w[1])))
        f17.append(swears_count(word_tokenize(w[1])))
        f18.append(typetoken_ratio(word_tokenize(w[1])))
        f19.append(count_words(w[1]))
        f20.append(firs_pluralpronouns(word_tokenize(w[1])))
        f21.append(sec_pronouns(word_tokenize(w[1])))
        f22.append(mean_freq(w[1]))
    value = [f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11,
             f12, f13, f14, f15, f16, f17, f18, f19, f20, f21, f22]
    mean1 = []
    stdev1 = []
    for a in value:
        mean1.append(round(mean(a), 2))
        stdev1.append(round(stdev(a), 2))
    return (mean1, stdev1)
# The corpus means/stdevs are computed once by mean_std() and cached in sample_value.pickle;
# the three commented lines below regenerate the cache (needed whenever a feature function changes).
##save_file = open('sample_value.pickle', 'wb')
##pickle.dump(mean_std(test), save_file)
##save_file.close()
with open('sample_value.pickle', 'rb') as savedfile:
    trained = pickle.load(savedfile)
def evaluation(test):
    ne = 0
    ns = 0
    na = 0
    nc = 0
    no = 0
    total = len(test) - 1          # number of data rows (header excluded)
    for w in test[1:]:
        # z-score every feature using the cached corpus means (trained[0]) and stdevs (trained[1])
        z1 = (all_punctuation(word_tokenize(w[1])) - trained[0][0]) / trained[1][0]
        z2 = (count_commas(word_tokenize(w[1])) - trained[0][1]) / trained[1][1]
        z3 = (count_pattern(w[1]) - trained[0][2]) / trained[1][2]
        z4 = (count_exclamation(w[1]) - trained[0][3]) / trained[1][3]
        z5 = (ex_links(w[1]) - trained[0][4]) / trained[1][4]
        z6 = (firs_sinpronouns(word_tokenize(w[1])) - trained[0][5]) / trained[1][5]
        z7 = (negative_particle(w[1]) - trained[0][6]) / trained[1][6]
        z8 = (negative_emoticon(w[1]) - trained[0][7]) / trained[1][7]
        z9 = (numbers(w[1]) - trained[0][8]) / trained[1][8]
        z10 = (parenthesis(w[1]) - trained[0][9]) / trained[1][9]
        z11 = (positive_emoticon(w[1]) - trained[0][10]) / trained[1][10]
        z12 = (prepositions(word_tokenize(w[1])) - trained[0][11]) / trained[1][11]
        z13 = (pronouns(word_tokenize(w[1])) - trained[0][12]) / trained[1][12]
        z14 = (count_question(w[1]) - trained[0][13]) / trained[1][13]
        z15 = (long_words(w[1]) - trained[0][14]) / trained[1][14]
        z16 = (firs_pronouns(word_tokenize(w[1])) - trained[0][15]) / trained[1][15]
        z17 = (swears_count(word_tokenize(w[1])) - trained[0][16]) / trained[1][16]
        z18 = (typetoken_ratio(word_tokenize(w[1])) - trained[0][17]) / trained[1][17]
        z19 = (count_words(w[1]) - trained[0][18]) / trained[1][18]
        z20 = (firs_pluralpronouns(word_tokenize(w[1])) - trained[0][19]) / trained[1][19]
        z21 = (sec_pronouns(word_tokenize(w[1])) - trained[0][20]) / trained[1][20]
        z22 = (mean_freq(w[1]) - trained[0][21]) / trained[1][21]
        # Class scores: z-scores weighted by the correlation coefficients from the paper.
        E = -0.08*z1-0.02*z2-0.07*z3-0.05*z5+0.05*z6-0.08*z7-0.03*z8-0.03*z9-0.06*z10+0.07*z11+0.07*z13-0.06*z14-0.06*z15+0.07*z16-0.01*z17-0.05*z18-0.01*z19+0.06*z20-0.01*z21+0.05*z22
        S = -0.04*z1+0.01*z2+0.02*z3-0.05*z4-0.02*z5-0.15*z6+0.12*z7-0.18*z8+0.05*z9+0.03*z10+0.07*z11+0.06*z12+0.12*z13-0.05*z14+0.06*z15-0.14*z16+0.1*z18+0.02*z19+0.07*z20+0.03*z21-0.06*z22
        A = -0.01*z1-0.02*z2+0.01*z3+0.06*z4-0.01*z5+0.05*z6+0.11*z7-0.11*z8-0.03*z9-0.04*z10+0.05*z11+0.04*z12+0.04*z13-0.04*z14-0.05*z15-0.06*z16-0.14*z17-0.04*z18+0.02*z19+0.04*z20-0.06*z21+0.03*z22
        C = -0.04*z1-0.01*z2+0.01*z3-0.03*z5+0.04*z6-0.07*z7-0.11*z8-0.02*z9-0.01*z10+0.02*z11+0.08*z12+0.02*z13-0.06*z14+0.02*z15-0.04*z16-0.11*z17-0.05*z18-0.02*z19+0.01*z20-0.04*z21+0.06*z22
        # NOTE: the z1 coefficient here is assumed to be -0.10; a correlation of -10 is impossible,
        # so the -10 that originally appeared in this line is treated as a typo.
        O = -0.10*z1+0.1*z2+0.06*z3-0.03*z4+0.09*z5-0.14*z6+0.01*z7+0.04*z8-0.06*z9+0.1*z10+0.02*z11-0.04*z12-0.06*z13+0.08*z14+0.1*z15-0.14*z16+0.08*z17+0.09*z18+0.06*z19+0.04*z20+0.11*z21-0.07*z22
        # A prediction counts as correct when the thresholded score matches the gold y/n label
        # stored in columns 7-11 of the row.
        if ('y' if E > 0.65 else 'n') == w[7]:
            ne += 1
        if ('y' if S > 0.75 else 'n') == w[8]:
            ns += 1
        if ('y' if A > 0.005 else 'n') == w[9]:
            na += 1
        if ('y' if C > 0.58 else 'n') == w[10]:
            nc += 1
        if ('y' if O > -0.05 else 'n') == w[11]:
            no += 1
    # Per-class accuracy in percent, printed in the order E, S, A, C, O.
    print(round((ne/total)*100, 2), round((ns/total)*100, 2), round((na/total)*100, 2),
          round((nc/total)*100, 2), round((no/total)*100, 2))
evaluation(test)
The sample data looks like this: (screenshot of mypersonality_final.csv not reproduced here)
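Since the screenshot may not be visible, this is the column layout the code above assumes, inferred purely from the indices it uses (w[1] for the text, w[7] through w[11] for the labels); the header names in the actual CSV may differ, so a quick sanity check looks like:

# Column layout assumed by the code (indices only, header names not confirmed):
#   w[0]     -> not used by this script
#   w[1]     -> the raw status-update text that gets scored
#   w[7..11] -> y/n labels for E, S, A, C, O respectively
print(test[0])                       # header row, to double-check the assumed layout
print(test[1][1], test[1][7:12])     # first status text and its five y/n labels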