假设我有一个像
这样的文字text="I came from the moon. He went to the other room. She went to the drawing room."
这里最常见的3个单词组是"went to the"
我知道如何找到最常见的 bigram 或 trigram，但在这里陷入了困境。我想在不使用 NLTK 库的情况下找到解决方案。
答案 0（得分：1）
nltk
使这个问题变得微不足道,但看到你不想要这样的依赖,我已经包含了一个只使用核心库的简单实现。该代码适用于python2.7和python3.x,并使用collections.Counter来计算n-gram的频率。在计算上,它是O(NM),其中N是文本中的单词数,M是计算的n-gram的数量(因此,如果计算uni和bigrams,则M = 2)。
import collections
import re
import sys
import time
# Convert a string to lowercase and split into words (w/o punctuation)
def tokenize(string):
    """Lower-case *string* and return its list of word tokens (\\w+ runs)."""
    lowered = string.lower()
    return re.findall(r'\w+', lowered)
def count_ngrams(lines, min_length=2, max_length=4):
    """Count word n-grams of every length in [min_length, max_length].

    lines: iterable of strings; each line is split into words via tokenize().
    Returns a dict mapping n-gram length -> collections.Counter whose keys
    are word tuples of that length.
    """
    lengths = range(min_length, max_length + 1)
    ngrams = {length: collections.Counter() for length in lengths}
    # Sliding window of at most max_length words; maxlen makes append()
    # drop the oldest word automatically once the window is full.
    queue = collections.deque(maxlen=max_length)

    # Record, for every tracked length, the n-gram anchored at the head
    # of the current window.
    def add_queue():
        current = tuple(queue)
        for length in lengths:
            if len(current) >= length:
                ngrams[length][current[:length]] += 1

    # Slide the window over all words; once it is full, each new word
    # produces one set of head-anchored n-grams.
    for line in lines:
        for word in tokenize(line):
            queue.append(word)
            if len(queue) >= max_length:
                add_queue()

    # Flush the tail.  BUG FIX: if the whole text had fewer than
    # max_length words, the loop above never fired, and the original
    # tail loop popped the first word before counting anything — losing
    # every n-gram anchored at word 0.  Count them first in that case.
    if 0 < len(queue) < max_length:
        add_queue()
    while len(queue) > min_length:
        queue.popleft()
        add_queue()
    return ngrams
def print_most_frequent(ngrams, num=10):
for n in sorted(ngrams):
print('----- {} most common {}-grams -----'.format(num, n))
for gram, count in ngrams[n].most_common(num):
print('{0}: {1}'.format(' '.join(gram), count))
print('')
if __name__ == '__main__':
    # Expect exactly one argument: the text file to analyse.
    if len(sys.argv) < 2:
        print('Usage: python ngrams.py filename')
        sys.exit(1)

    t_start = time.time()
    with open(sys.argv[1]) as infile:
        ngrams = count_ngrams(infile)
    print_most_frequent(ngrams)
    elapsed = time.time() - t_start
    print('Took {:.03f} seconds'.format(elapsed))
答案 1（得分：1）
import string

# Sample text; goal: the most common 3-word sequence ("went to the").
text = "I came from the moon. He went to the other room. She went to the drawing room."

# Replace every punctuation character with a space.
for character in string.punctuation:
    text = text.replace(character, " ")

# split() with no argument collapses runs of whitespace, so no empty
# tokens appear around removed punctuation (the original's double-space
# squeezing loop was garbled into a no-op and left "" tokens behind).
text = text.split()

wordlist = []
frequency_dict = dict()

# Every consecutive 3-word window.  BUG FIX: range(len(text) - 2), not
# - 3 — the original silently dropped the final trigram.
for i in range(len(text) - 2):
    wordlist.append([text[i], text[i + 1], text[i + 2]])

# Count each trigram (lists are unhashable, so a dict keyed on the
# joined string stands in for a Counter; O(n^2) via .count, fine here).
for three_words in wordlist:
    frequency = wordlist.count(three_words)
    frequency_dict[", ".join(three_words)] = frequency

# Most common trigram and its count (print() for Python 3).
best = max(frequency_dict, key=frequency_dict.get)
print(best, frequency_dict[best])
输出:went, to, the 2
不幸的是，列表不可哈希（hashable）。否则，用 three_words 项构建一个 Counter/集合会更方便。
答案 2（得分：0）
# Needs `import re` and `import collections` (present at the top of answer 0).
text = "I came from the moon. He went to the other room. She went to the drawing room."
# Replace every character that is not a letter or a space with a space.
fixed_text = re.sub("[^a-zA-Z ]", " ", text)
text_list = fixed_text.split()
# Count every consecutive 3-word window.  BUG FIX: range(len(text_list) - 2)
# (the original -3 dropped the final trigram); print() for Python 3.
print(collections.Counter(
    " ".join(text_list[i:i + 3]) for i in range(len(text_list) - 2)
).most_common(1))
我想......也许?
>>> text="I came from the moon. He went to the other room. She went to the drawing room."
>>> fixed_text = re.sub("[^a-zA-Z ]"," ",text)
>>> text_list = fixed_text.split()
>>> print Counter(" ".join(text_list[i:i+3]) for i in range(len(text_list)-3)).most_common(1)
[('went to the', 2)]
>>>