How can I improve the accuracy of personality extraction?

Asked: 2016-10-05 19:39:16

Tags: python nltk

I read a paper called "Unsupervised Personality Recognition for Social Network Sites" about extracting personality from text. It uses 22 features and five classes, one per Big Five personality trait (the code below scores them as E, S, A, C and O). By computing the features over a piece of text we can score each class, i.e. decide which personality traits the text expresses. The paper provides the correlation between each feature and each class. A class's score is therefore obtained by z-scoring each feature (subtract the corpus mean of that feature, divide by its corpus standard deviation), multiplying by the correlation coefficient given in the paper, and summing over all 22 features. We can then judge from the score whether the text belongs to the class. I set decision thresholds on the scores to improve accuracy, but it is still not good enough: I only get around 50-60% accuracy, and I don't know how to improve it. Can anybody help me?
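In symbols (my reading of the description above, not notation taken from the paper), with $x_f$ the value of feature $f$ in a text, $\mu_f$ and $\sigma_f$ the corpus mean and standard deviation of that feature, and $r_{c,f}$ the feature-to-class correlation reported in the paper:

$$\text{score}_c = \sum_{f=1}^{22} r_{c,f}\,\frac{x_f - \mu_f}{\sigma_f}$$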

import csv
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
import pickle
from statistics import mean, stdev


# Load the myPersonality dataset: one row per Facebook status update,
# with a header row first.
with open('mypersonality_final.csv', newline='') as csvfile:
    test = list(csv.reader(csvfile))

def all_punctuation(text):
    # Count punctuation tokens (. , ; :) in a token list.
    punctuations = ['.', ',', ';', ':']
    count = 0
    for w in text:
        if w in punctuations:
            count += 1
    return count

def count_commas(text):
    # Count comma tokens in a token list.
    count = 0
    for w in text:
        if w == ',':
            count += 1
    return count

def count_pattern(text):
    # Count '@' characters (mentions) in the raw text.
    grammar = RegexpTokenizer(r'@')
    pattern = grammar.tokenize(text)
    return len(pattern)

def count_exclamation(text):
    # Count exclamation marks in the raw text.
    grammar = RegexpTokenizer(r'!')
    pattern = grammar.tokenize(text)
    return len(pattern)

def ex_links(text):
    # Count links (https?://, www., ftp) in the raw text
    # ('https?' here; the original 'http?' only made the 'p' optional).
    grammar = RegexpTokenizer(r'https?\S+\w(?:(?:/[^\s/]*))*|www\.\S+\w(?:(?:/[^\s/]*))*|ftp\S+\w(?:(?:/[^\s/]*))*')
    pattern = grammar.tokenize(text)
    return len(pattern)

def firs_sinpronouns(text):
    # First-person singular pronouns.
    # NB: 'we' is plural and arguably belongs only in firs_pluralpronouns.
    sigpronouns = ['i', 'me', 'my', 'mine', 'we']
    count = 0
    for w in text:
        if w.lower() in sigpronouns:
            count += 1
    return count

def negative_particle(text):
    # Count words that appear in the negative-words.txt lexicon.
    with open('negative-words.txt') as neg:
        words = set(nltk.word_tokenize(neg.read()))  # set lookup instead of a list scan
    grammar = RegexpTokenizer(r'\w+')
    nopunctuation = grammar.tokenize(text)
    count = 0
    for w in nopunctuation:
        if w.lower() in words:
            count += 1
    return count

def negative_emoticon(text):
    # Count frowning emoticons such as :( ;( =( :-(
    grammar = RegexpTokenizer(r"(?::|;|=)(?:-)?(?:\()")
    emoticons = grammar.tokenize(text)
    return len(emoticons)

def numbers(text):
    # Count runs of digits in the raw text.
    grammar = RegexpTokenizer(r'\d+')
    pattern = grammar.tokenize(text)
    return len(pattern)

def parenthesis(text):
    # Count parenthesised spans like (this).
    pat = r'\([^)]*\)'
    parent = re.findall(pat, text)
    return len(parent)

def positive_emoticon(text):
    # Count smiling emoticons such as :) ;) =D :P <3
    grammar = RegexpTokenizer(r'(?::|;|=|<|>)(?:-|\.)?(?:\)|D|P|3|<)')
    emoticons = grammar.tokenize(text)
    return len(emoticons)

def prepositions(text):
    # Count tokens POS-tagged IN (prepositions/subordinating conjunctions).
    tagged = nltk.pos_tag(text)
    count = 0
    for w in tagged:
        if w[1] == 'IN':
            count += 1
    return count

def pronouns(text):
    # Count pronoun POS tags (PRP, PRP$, WP, WP$);
    # WP$ is the Penn Treebank tag, 'WPR$' does not exist.
    tagged = nltk.pos_tag(text)
    count = 0
    for w in tagged:
        if w[1] in ('PRP', 'PRP$', 'WP', 'WP$'):
            count += 1
    return count

def count_question(text):
    # Count question marks in the raw text.
    grammar = RegexpTokenizer(r'\?')
    pattern = grammar.tokenize(text)
    return len(pattern)

def long_words(text):
    # Count words of 7 or more characters in the raw text.
    grammar = RegexpTokenizer(r'\w{7,}')
    pattern = grammar.tokenize(text)
    return len(pattern)

def firs_pronouns(text):
    # All first-person pronouns, singular and plural.
    firstpronouns = ['i', 'me', 'my', 'mine', 'we', 'our', 'ours', 'us']
    count = 0
    for w in text:
        if w.lower() in firstpronouns:
            count += 1
    return count

def swears_count(text):
    # Count tokens that appear in the swears.txt lexicon (one word per line).
    with open('swears.txt') as f:
        words = f.read()
    swears = re.sub(r'[^\w+\s]+', '', words).split('\n')
    count = 0
    for w in text:
        if w.lower() in swears:
            count += 1
    return count

def typetoken_ratio(text):
    # Distinct tokens divided by total tokens.
    typed = set(text)
    return len(typed) / len(text)

def count_words(text):
    # Count word tokens in the raw text.
    grammar = RegexpTokenizer(r'\w+')
    pattern = grammar.tokenize(text)
    return len(pattern)

def firs_pluralpronouns(text):
    # First-person plural pronouns.
    pluralpronouns = ['we', 'our', 'ours', 'us']
    count = 0
    for w in text:
        if w.lower() in pluralpronouns:
            count += 1
    return count

def sec_pronouns(text):
    # Second-person pronouns.
    secpronouns = ['you', 'your', 'yours']
    count = 0
    for w in text:
        if w.lower() in secpronouns:
            count += 1
    return count

def mean_freq(text):
    # Mean word frequency: total (lower-cased) tokens per distinct word.
    words = word_tokenize(text)
    wordsl = [w.lower() for w in words]
    unique = set(wordsl)
    return len(wordsl) / len(unique)


# The 22 feature extractors in the order used throughout (f1..f22);
# some expect a token list, the rest take the raw text.
FEATURE_FUNCS = [
    lambda t: all_punctuation(word_tokenize(t)),
    lambda t: count_commas(word_tokenize(t)),
    count_pattern,
    count_exclamation,
    ex_links,
    lambda t: firs_sinpronouns(word_tokenize(t)),
    negative_particle,
    negative_emoticon,
    numbers,
    parenthesis,
    positive_emoticon,
    lambda t: prepositions(word_tokenize(t)),
    lambda t: pronouns(word_tokenize(t)),
    count_question,
    long_words,
    lambda t: firs_pronouns(word_tokenize(t)),
    lambda t: swears_count(word_tokenize(t)),
    lambda t: typetoken_ratio(word_tokenize(t)),
    count_words,
    lambda t: firs_pluralpronouns(word_tokenize(t)),
    lambda t: sec_pronouns(word_tokenize(t)),
    mean_freq,
]

def mean_std(test):
    # Mean and standard deviation of every feature over the whole corpus
    # (column 1 holds the status text; the first row is the header).
    values = [[f(row[1]) for row in test[1:]] for f in FEATURE_FUNCS]
    mean1 = [round(mean(col), 2) for col in values]
    stdev1 = [round(stdev(col), 2) for col in values]
    return (mean1, stdev1)

## Run once to cache the corpus statistics (regenerate the pickle whenever
## a feature function changes, or the z-scores below will be inconsistent):
##save_file = open('sample_value.pickle', 'wb')
##pickle.dump(mean_std(test), save_file)
##save_file.close()
with open('sample_value.pickle', 'rb') as savedfile:
    trained = pickle.load(savedfile)  # (means, stdevs), as returned by mean_std

def evaluation(test):
    # Per-trait accuracy: count predictions that match the y/n gold labels.
    ne = ns = na = nc = no = 0
    total = len(test) - 1  # number of data rows (header excluded)
    for w in test[1:]:
        # z-score every feature against the cached corpus statistics.
        z = [(f(w[1]) - trained[0][i]) / trained[1][i]
             for i, f in enumerate(FEATURE_FUNCS)]
        (z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14,
         z15, z16, z17, z18, z19, z20, z21, z22) = z
        # Correlation-weighted sums for the Big Five traits: Extraversion,
        # emotional Stability, Agreeableness, Conscientiousness, Openness.
        E = -0.08*z1-0.02*z2-0.07*z3-0.05*z5+0.05*z6-0.08*z7-0.03*z8-0.03*z9-0.06*z10+0.07*z11+0.07*z13-0.06*z14-0.06*z15+0.07*z16-0.01*z17-0.05*z18-0.01*z19+0.06*z20-0.01*z21+0.05*z22
        S = -0.04*z1+0.01*z2+0.02*z3-0.05*z4-0.02*z5-0.15*z6+0.12*z7-0.18*z8+0.05*z9+0.03*z10+0.07*z11+0.06*z12+0.12*z13-0.05*z14+0.06*z15-0.14*z16+0.1*z18+0.02*z19+0.07*z20+0.03*z21-0.06*z22
        A = -0.01*z1-0.02*z2+0.01*z3+0.06*z4-0.01*z5+0.05*z6+0.11*z7-0.11*z8-0.03*z9-0.04*z10+0.05*z11+0.04*z12+0.04*z13-0.04*z14-0.05*z15-0.06*z16-0.14*z17-0.04*z18+0.02*z19+0.04*z20-0.06*z21+0.03*z22
        C = -0.04*z1-0.01*z2+0.01*z3-0.03*z5+0.04*z6-0.07*z7-0.11*z8-0.02*z9-0.01*z10+0.02*z11+0.08*z12+0.02*z13-0.06*z14+0.02*z15-0.04*z16-0.11*z17-0.05*z18-0.02*z19+0.01*z20-0.04*z21+0.06*z22
        # z1 weight taken as -0.10: a weight of -10 would swamp every other
        # term (all other correlations are below 0.2 in magnitude).
        O = -0.10*z1+0.1*z2+0.06*z3-0.03*z4+0.09*z5-0.14*z6+0.01*z7+0.04*z8-0.06*z9+0.1*z10+0.02*z11-0.04*z12-0.06*z13+0.08*z14+0.1*z15-0.14*z16+0.08*z17+0.09*z18+0.06*z19+0.04*z20+0.11*z21-0.07*z22

        # Hand-tuned decision thresholds; columns 7-11 hold the y/n gold labels.
        if E > 0.65 and w[7] == 'y':
            ne += 1
        elif E < 0.65 and w[7] == 'n':
            ne += 1
        if S > 0.75 and w[8] == 'y':
            ns += 1
        elif S < 0.75 and w[8] == 'n':
            ns += 1
        if A > 0.005 and w[9] == 'y':
            na += 1
        elif A < 0.005 and w[9] == 'n':
            na += 1
        if C > 0.58 and w[10] == 'y':
            nc += 1
        elif C < 0.58 and w[10] == 'n':
            nc += 1
        if O > -0.05 and w[11] == 'y':
            no += 1
        elif O < -0.05 and w[11] == 'n':
            no += 1
    print(round(ne / total * 100, 2), round(ns / total * 100, 2),
          round(na / total * 100, 2), round(nc / total * 100, 2),
          round(no / total * 100, 2))

evaluation(test)

The sample data looks like this: [screenshot of the CSV omitted]
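(Screenshot aside, the column layout can be inferred from the indexing in the code: w[1] is the status text the features are computed on, and w[7] through w[11] are the 'y'/'n' gold labels. If I remember the myPersonality CSV correctly, those should be the cEXT, cNEU, cAGR, cCON and cOPN class columns, but treat that mapping as an assumption.)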

0 Answers