更 Pythonic 的写法:收集任意长度的字符串(词组)- 索引器

时间:2016-01-01 13:21:21

标签: python python-2.x



import sys
# Maps every phrase of one to four consecutive words to its occurrence count.
wordcount = {}
# The three words that precede the current one; "" means "not seen yet".
last_word = ""
last_last_word = ""
last_last_last_word = ""

# NOTE(review): `file` is not bound anywhere in this snippet -- presumably it
# is an already-open file object (e.g. opened from sys.argv[1]); confirm
# before running.
for word in file.read().split():
    # dict.get(..., 0) + 1 both creates and increments the entry.  The
    # previous "if not in: = 1" followed by an unconditional "+= 1" stored 2
    # on first sight and never counted later occurrences.
    wordcount[word] = wordcount.get(word, 0) + 1

    # Four-word phrase ending at the current word.
    if last_last_last_word != "":
        phrase = " ".join((last_last_last_word, last_last_word, last_word, word))
        wordcount[phrase] = wordcount.get(phrase, 0) + 1
    last_last_last_word = last_last_word

    # Three-word phrase ending at the current word.
    if last_last_word != "":
        phrase = " ".join((last_last_word, last_word, word))
        wordcount[phrase] = wordcount.get(phrase, 0) + 1
    last_last_word = last_word

    # Two-word phrase ending at the current word.
    if last_word != "":
        phrase = " ".join((last_word, word))
        wordcount[phrase] = wordcount.get(phrase, 0) + 1
    last_word = word

# Most frequent phrases first.  A single pre-formatted argument prints the
# same "phrase count" text under both Python 2 and Python 3.
for k, v in sorted(wordcount.items(), key=lambda x: x[1], reverse=True):
    print("{} {}".format(k, v))



this is a sample input file an input file will always be all lower case with no punctuation


file 2
input 2
input file 2
an input file 1
all 1
lower case 1
be 1
is 1
file will always 1
an 1
sample 1
case 1
always be all lower 1
this is a 1
will always be 1
sample input file 1
will always 1
is a sample 1
all lower 1
lower case with no 1
no 1
with 1
with no 1
file will always be 1
with no punctuation 1
lower 1
be all lower case 1
no punctuation 1
an input file will 1
input file an 1
file an 1
input file an input 1
always be 1
file an input file 1
be all 1
is a 1
input file will 1
file will 1
an input 1
input file will always 1
will always be all 1
always be all 1
lower case with 1
a sample 1
a sample input file 1
a sample input 1
is a sample input 1
be all lower 1
a 1
sample input file an 1
sample input 1
case with no punctuation 1
all lower case with 1
this 1
always 1
file an input 1
case with 1
case with no 1
will 1
all lower case 1
punctuation 1
this is 1
this is a sample 1

6 个答案:

答案 0 :(得分:3)


import re
import mmap
from itertools import islice, izip, tee
from collections import Counter
from pprint import pprint

def word_grouper(filename, size):
    """Count every phrase of 1..size consecutive words in a text file.

    Words are maximal runs of lowercase ASCII letters; everything else is a
    separator.  Phrases may span line breaks.

    Args:
        filename: path of the text file to read.
        size: maximum number of words in a phrase.

    Returns:
        A Counter mapping tuples of words (length 1..size) to the number of
        times that phrase occurs in the file.  An empty file gives an empty
        Counter.
    """
    counts = Counter()
    word_pattern = re.compile(r'[a-z]+')
    # The most recent `size` words, oldest first.
    window = []
    with open(filename) as fin:
        # A word can never contain a newline, so scanning line by line finds
        # the same words as scanning the whole file while still streaming it.
        for line in fin:
            for match in word_pattern.finditer(line):
                window.append(match.group())
                if len(window) > size:
                    del window[0]
                # Count each phrase once, at the word where it ends.  Counting
                # by end position (rather than taking prefixes of fixed-size
                # windows) keeps the phrases at the very end of the file.
                counts.update(
                    tuple(window[start:]) for start in range(len(window))
                )

    return counts

# Build the phrase counts for the file named 'input filename', passing 4 as
# the size argument.
counts = word_grouper('input filename', 4)
# do appropriate formatting instead of just `pprint`ing


[(('file',), 2),
 (('input', 'file'), 2),
 (('input',), 2),
 (('a', 'sample', 'input'), 1),
 (('file', 'will', 'always', 'be'), 1),
 (('sample', 'input', 'file', 'an'), 1),
 (('this', 'is', 'a', 'sample'), 1),
 (('this', 'is'), 1),
 (('will',), 1),
 (('lower', 'case', 'with'), 1),
 (('an', 'input', 'file', 'will'), 1),
 (('sample', 'input'), 1),
 (('is', 'a'), 1),
 (('all', 'lower', 'case', 'with'), 1),
 (('input', 'file', 'will'), 1),
 (('an',), 1),
 (('always', 'be'), 1),
 (('lower', 'case', 'with', 'no'), 1),
 (('an', 'input'), 1),
 (('be', 'all', 'lower'), 1),
 (('this',), 1),
 (('be', 'all', 'lower', 'case'), 1),
 (('this', 'is', 'a'), 1),
 (('sample',), 1),
 (('sample', 'input', 'file'), 1),
 (('will', 'always', 'be', 'all'), 1),
 (('a',), 1),
 (('a', 'sample'), 1),
 (('is', 'a', 'sample'), 1),
 (('will', 'always'), 1),
 (('lower',), 1),
 (('lower', 'case'), 1),
 (('file', 'an'), 1),
 (('file', 'an', 'input'), 1),
 (('file', 'will'), 1),
 (('is',), 1),
 (('all', 'lower'), 1),
 (('input', 'file', 'an', 'input'), 1),
 (('always', 'be', 'all', 'lower'), 1),
 (('an', 'input', 'file'), 1),
 (('input', 'file', 'an'), 1),
 (('be', 'all'), 1),
 (('input', 'file', 'will', 'always'), 1),
 (('be',), 1),
 (('all',), 1),
 (('always', 'be', 'all'), 1),
 (('is', 'a', 'sample', 'input'), 1),
 (('always',), 1),
 (('all', 'lower', 'case'), 1),
 (('file', 'an', 'input', 'file'), 1),
 (('file', 'will', 'always'), 1),
 (('a', 'sample', 'input', 'file'), 1),
 (('will', 'always', 'be'), 1)]

答案 1 :(得分:0)



import sys
from collections import defaultdict


# Phrase -> occurrence count; a phrase that has not been seen yet reads as 0.
wordcount = defaultdict(int)
# One empty-string placeholder per slot; the slot count is the integer given
# as the second command-line argument.
wordlist = [""] * int(sys.argv[2])

def check(wordcount, wordlist, word):
    """Slide the word window forward by one word and count the new phrases.

    Args:
        wordcount: mapping of phrase -> count, updated in place.  Missing
            phrases must read as 0 (e.g. a defaultdict(int)).
        wordlist: the current window, oldest word first; slots that have not
            been filled yet hold "".
        word: the word that was just read.

    Returns:
        The new window: the old one without its oldest slot and with `word`
        appended.  It has the same length as `wordlist`, which is not
        modified.
    """
    # Previously `word` was shadowed by the loop variable and never entered
    # the window, so nothing was counted and the window shrank to nothing.
    window = wordlist[1:] + [word]

    for i, first in enumerate(window):
        # The window fills from the right, so once a slot is non-empty every
        # later slot is too.
        if first != "":
            # Join with single spaces; the old per-word ' ' suffix left a
            # trailing space on every key.
            wordcount[" ".join(window[i:])] += 1

    return window

# NOTE(review): `file` is not bound anywhere in this snippet -- presumably an
# already-open file object; confirm before running.
for word in file.read().split():
    # check() returns the list to use for the next word, so rebind it here.
    wordlist = check(wordcount, wordlist, word)

# Print the phrases sorted by count, highest count first.
for k,v in sorted(wordcount.items(), key=lambda x:x[1], reverse=True):
    print k,v

答案 2 :(得分:0)


from collections import Counter
import itertools
import operator as op

def count_phrases(words, phrase_len):
    """Count the phrases of the requested lengths in a sequence of words.

    Args:
        words: sequence of words (must support len() and slicing).
        phrase_len: iterable of phrase lengths to count.

    Returns:
        A Counter mapping each phrase (a tuple of words) to its number of
        occurrences.  An empty `phrase_len` gives an empty Counter; the
        previous reduce() call raised TypeError in that case because it had
        no initial value.
    """
    counts = Counter()
    for length in phrase_len:
        # A length longer than `words` gives an empty range and adds nothing.
        counts.update(
            tuple(words[start:start + length])
            for start in range(len(words) - length + 1)
        )
    return counts


# Demo: count every one- and two-word phrase of a small word list.
words = "a b c a a".split()
for phrase, count in count_phrases(words, [1, 2]).items():
    # Print the loop variable `count`; the name `counts` used here before is
    # not defined in this snippet and raised NameError.  A single
    # pre-formatted argument prints identically on Python 2 and Python 3.
    print("{} {}".format(" ".join(phrase), count))


b c 1
a 3
c 1
b 1
c a 1
a a 1
a b 1

答案 3 :(得分:0)


def parser(data,size):
    """Return every phrase of exactly `size` consecutive words in `data`.

    Args:
        data: text to split on whitespace.
        size: number of words per phrase.

    Returns:
        A list of space-joined phrases in order of appearance, duplicates
        included.  Empty when `size` is below 1 or `data` has fewer than
        `size` words.
    """
    if size < 1:
        return []
    chunked = data.split()
    # The "+ 1" keeps the phrase that ends on the last word; each phrase is
    # now actually collected instead of being built and thrown away.
    return [' '.join(chunked[i:size + i])
            for i in range(len(chunked) - size + 1)]

def parse_file(fname,size):
    """Count the phrases of 1..size words found on each line of a file.

    Phrases never span a line break because every line is parsed on its own.

    Args:
        fname: path of the text file to read.
        size: maximum number of words per phrase (inclusive).

    Returns:
        A Counter mapping each space-joined phrase to its occurrence count.
    """
    result = []
    with open(fname,'r') as f:
        for data in f:
            # Inclusive upper bound so phrases of exactly `size` words are
            # counted as well.
            for i in range(1, size + 1):
                result.extend(parser(data, i))

    return Counter(result)

# Run parse_file on 'file.txt' with size 4, then print the (phrase, count)
# pairs sorted by count, highest count first.
result= parse_file('file.txt',4) 
print sorted(result.items(),key=lambda x:x[1],reverse=True)

[('file', 2),
 ('input', 2),
 ('input file', 2),
 ('an input file', 1),
 ('all', 1),
 ('always be all', 1),
 ('is', 1),
 ('an', 1),
 ('sample', 1),
 ('this is a', 1),
 ('will always be', 1),
 ('sample input file', 1),
 ('will always', 1),
 ('is a sample', 1),
 ('all lower', 1),
 ('no', 1),
 ('with no', 1),
 ('lower case', 1),
 ('case', 1),
 ('input file will', 1),
 ('case with no', 1),
 ('input file an', 1),
 ('file an', 1),
 ('be', 1),
 ('always be', 1),
 ('be all lower', 1),
 ('be all', 1),
 ('lower', 1),
 ('is a', 1),
 ('an input', 1),
 ('a sample input', 1),
 ('lower case with', 1),
 ('a sample', 1),
 ('file will', 1),
 ('with', 1),
 ('a', 1),
 ('file will always', 1),
 ('sample input', 1),
 ('this', 1),
 ('always', 1),
 ('file an input', 1),
 ('case with', 1),
 ('will', 1),
 ('all lower case', 1),
 ('this is', 1)]

答案 4 :(得分:0)


string="this is a sample input file an input file will always be all lower case with no punctuation"

def words(count):
    """Return every run of `count` consecutive words of the module-level `string`.

    Args:
        count: number of words per phrase.

    Returns:
        A list of space-joined phrases in order of appearance, duplicates
        included.  Empty when `count` is below 1 or larger than the number of
        words in `string`.
    """
    if count < 1:
        return []
    # Split once instead of re-splitting the text for every candidate slice.
    tokens = string.split()
    # Exactly one phrase starts at each position that has `count` words left.
    return [" ".join(tokens[start:start + count])
            for start in range(len(tokens) - count + 1)]



# Result of words(3), kept as a list so duplicates stay countable.
lst = words(3)


# Print each distinct entry once, followed by how often it occurs in lst.
for word in set(lst):
    print word, lst.count(word)

an input file 1
file will always 1
is a sample 1
be all lower 1
file an input 1
with no punctuation 1
input file will 1
lower case with 1
this is a 1
always be all 1
will always be 1
sample input file 1
a sample input 1
all lower case 1
case with no 1
input file an 1




# Count every phrase (any run of consecutive words, of any length) in `string`.
words_list = string.split()
# Phrase -> occurrence count.
words_dict = {}

# `a` is the exclusive end of the slice, so it has to reach len(words_list);
# stopping one short left out every phrase that ends on the last word.
for a in range(1, len(words_list) + 1):
    for b in range(a):
        phrase = " ".join(words_list[b:a])
        # get(..., 0) + 1 inserts new phrases and increments known ones;
        # without an insert path the dictionary stayed empty.
        words_dict[phrase] = words_dict.get(phrase, 0) + 1

# A single pre-formatted argument prints identically on Python 2 and 3.
for i in words_dict:
    print("{} {}".format(i, words_dict[i]))


答案 5 :(得分:0)


import sys
# Phrase -> occurrence count.
wordcount = {}
# Maximum number of words in a counted phrase.
nb_words = 4
# The most recent words, oldest first, never longer than nb_words.
last_words = []

# NOTE(review): `file` is not bound anywhere in this snippet -- presumably an
# already-open file object; confirm before running.
for word in file.read().split():
    # Keep the window oldest-first so joined phrases read in text order (a
    # newest-first window produced reversed phrases).
    last_words.append(word)
    if len(last_words) > nb_words:
        del last_words[0]
    # Count every phrase that ends at the current word, longest first.
    for i in range(len(last_words)):
        key = ' '.join(last_words[i:])
        # get(..., 0) + 1 inserts new phrases and increments known ones.
        wordcount[key] = wordcount.get(key, 0) + 1

# Most frequent phrases first; a single pre-formatted argument prints the
# same text under both Python 2 and Python 3.
for k, v in sorted(wordcount.items(), key=lambda x: x[1], reverse=True):
    print("{} {}".format(k, v))

我用一个列表和一个循环替换了原来那几个单独的变量,因此现在可以通过参数 nb_words 指定词组的最大单词数,不再局限于 4 个。编辑:修正了一些错误之后,我现在确信它能产生相同的输出。