Word解码器

时间:2013-10-02 19:33:46

标签: python python-2.7 nltk

我正在使用NLPP的一些语料库材料。我正在努力提高代码还原被打乱单词的得分……目前我的命中率为91.250%。

练习的目的是修改represent_word函数以提高分数。该函数接收一个字符串,这个单词可能已被打乱,也可能未被打乱。该函数生成单词的“表示”,它是一个包含以下信息的列表:

  • 字长
  • 元音数量
  • 辅音数量
  • 单词的第一个和最后一个字母(这些字母始终不会被打乱)
  • 一个由语料库中最常用单词组成的元组,这些单词的所有字符都出现在给定的输入单词中。

我也试过分析被打乱单词的前缀和后缀,但与“含有共同字符的最常用单词元组”这一特征相比,它们对分数没有任何额外贡献。

我不确定为什么我不能提高分数。我甚至尝试通过从另一个语料库中导入单词来增加字典大小。

这里唯一可以改变的部分是represent_word函数及其上方的定义。但是,我包括整个来源,因为它可能会给某些人提供一些有见地的信息。

    import nltk
    import re

    def word_counts(corpus, wordcounts=None):
        """Count how often each word occurs in the corpus, ignoring case.

        Args:
            corpus: An iterable of word strings.
            wordcounts: Optional dict of existing counts to update in place.
                When omitted, a fresh dict is created for this call.

        Returns:
            The dict mapping each lower-cased word to its number of occurrences.
        """
        # A fresh dict per call: a `{}` default would be created once and then
        # shared, so counts from earlier calls would leak into later ones.
        if wordcounts is None:
            wordcounts = {}
        for word in corpus:
            key = word.lower()
            wordcounts[key] = wordcounts.get(key, 0) + 1
        return wordcounts

# Lower-cased, purely alphabetic tokens of the NLTK Gutenberg text
# 'austen-persuasion.txt'.
JA_list = [
    token
    for token in (raw.lower() for raw in
                  nltk.corpus.gutenberg.words('austen-persuasion.txt'))
    if token.isalpha()
]
JA_freqdist = nltk.FreqDist(JA_list)
# (word, count) pairs ordered by descending count.
# NOTE(review): the [:0] slice keeps no entries, so JA_toplist and
# JA_topwords are always empty -- confirm this cut-off is intended.
JA_toplist = sorted(JA_freqdist.items(), key=lambda pair: pair[1], reverse=True)[:0]
JA_topwords = [pair[0] for pair in JA_toplist]

# Lower-cased, purely alphabetic whitespace-separated tokens of
# "Pride and Prejudice.txt". The file is read inside a `with` block so the
# handle is closed as soon as the text has been consumed.
with open("Pride and Prejudice.txt") as pp_file:
    PP_list = [
        token
        for token in (raw.lower() for raw in pp_file.read().split())
        if token.isalpha()
    ]
PP_freqdist = nltk.FreqDist(PP_list)
# The seven (word, count) pairs with the highest counts.
PP_toplist = sorted(PP_freqdist.items(), key=lambda pair: pair[1], reverse=True)[:7]
PP_topwords = [pair[0] for pair in PP_toplist]

# Words from JA_topwords that are not already in PP_topwords, followed by
# every word of PP_topwords.
uniquewords = [candidate for candidate in JA_topwords
               if candidate not in PP_topwords]
uniquewords += PP_topwords

# 'y' is counted as a vowel here, so it is deliberately absent from the
# consonants. The consonant set holds 'w' and no vowel.
_VOWELS = frozenset("aeiouy")
_CONSONANTS = frozenset("bcdfghjklmnpqrstvwxz")


def _common_words(word, dictionary):
    """Return the dictionary entries whose letters all occur in *word*.

    Returns a tuple of the matching entries in dictionary order, or None
    when no entry matches.
    """
    letters = set(word)
    findings = tuple(entry for entry in dictionary if letters.issuperset(entry))
    return findings or None


def represent_word(word, dictionary=None):
    """Build a representation of *word* that ignores its inner letter order.

    Every component depends only on the first letter, the last letter and
    the multiset of letters, so a word and any reordering of its inner
    letters produce the same representation.

    Args:
        word: The (scrambled or unscrambled) word. Matching is
            case-sensitive; upper-case letters count as neither vowel nor
            consonant.
        dictionary: Optional iterable of reference words used for the
            "common words" component. Defaults to the module-level
            ``uniquewords`` list.

    Returns:
        A tuple ``(first_letter, last_letter, length, number_of_consonants,
        number_of_vowels, common_words)`` where ``common_words`` is a tuple
        of the dictionary entries spelled only with letters of *word*, or
        None if there are none. For an empty *word* the first and last
        letter are empty strings.
    """
    if dictionary is None:
        dictionary = uniquewords
    number_of_consonants = sum(1 for letter in word if letter in _CONSONANTS)
    number_of_vowels = sum(1 for letter in word if letter in _VOWELS)
    # Slicing instead of indexing keeps an empty word from raising IndexError.
    return (
        word[:1],
        word[-1:],
        len(word),
        number_of_consonants,
        number_of_vowels,
        _common_words(word, dictionary),
    )





def create_mapping(words, mapping=None):
    """Map each word representation to the most frequent word that has it.

    Args:
        words: A mapping from word to its count; iterating it yields the
            words and ``words[word]`` gives the count.
        mapping: Optional dict of existing entries to update in place.
            When omitted, a fresh dict is created for this call.

    Returns:
        The dict mapping ``represent_word(word)`` to a ``(word, count)``
        tuple for the word with the highest count seen for that
        representation. On equal counts the first word encountered is kept.
    """
    # A fresh dict per call: a `{}` default would be created once and then
    # shared between every call that omits the argument.
    if mapping is None:
        mapping = {}
    for word in words:
        count = words[word]
        representation = represent_word(word)
        current_best = mapping.setdefault(representation, ("", 0))
        if current_best[1] < count:
            mapping[representation] = (word, count)
    return mapping

def _load_lowercase_words(path):
    """Return every whitespace-separated token of the file at *path*, lower-cased."""
    with open(path) as text_file:
        # Blank lines split into no tokens, so they need no separate filter.
        return [token.lower() for line in text_file for token in line.split()]


if __name__ == '__main__':
    # Create a mapping of representations of the words in Persuasion by
    # Jane Austen to use as a corpus.
    words = JA_freqdist
    mapping = create_mapping(words)

    # Load the words in the scrambled file.
    scrambled_words = _load_lowercase_words("Pdrie and Puicejdre.txt")

    # Descramble each word using the best mapping; a word whose
    # representation is unknown is kept as it is.
    descrambled_words = []
    for scrambled_word in scrambled_words:
        best_match = mapping.get(represent_word(scrambled_word))
        if best_match is not None:
            descrambled_words.append(best_match[0])
        else:
            descrambled_words.append(scrambled_word)

    # Load the original words.
    original_words = _load_lowercase_words("Pride and Prejudice.txt")

    # Compare the descrambled words with the originals position by position.
    judgements = [
        descrambled_word == original_word
        for descrambled_word, original_word in zip(descrambled_words, original_words)
    ]
    # Print the results. The parenthesised form prints the same text on
    # Python 2.7 and is also valid syntax on Python 3.
    print("Correct: {0:.3%}".format(float(judgements.count(True)) / len(judgements)))

0 个答案:

没有答案