def ngram(n, k, document):
    """Print the whitespace-separated words of each line of *document*.

    Work-in-progress helper: ``n`` and ``k`` are accepted but not used, and
    no n-grams are built -- the function only echoes the words it read.

    Args:
        n: Intended n-gram size (currently unused).
        k: Currently unused.
        document: Path of the text file to read.

    Returns:
        An empty dict (placeholder result).
    """
    # The context manager closes the file even if reading fails; the
    # original opened the file and never closed it.
    with open(document, 'r') as f:
        for line in f:
            # NOTE(review): the word list is concatenated with itself, so
            # every word appears twice per printed line. Kept as written,
            # but this looks unintentional -- confirm.
            words = line.split() + line.split()
            # The parenthesised form prints the same list on Python 2 and 3.
            print(words)
    return {}
例如,对于 “I love the Python programming language” 且 n = 2,结果应为 “I love”、“love the”、“the Python”、“Python programming” 和 “programming language”;
我想把它们存储在一个列表中,然后比较其中有多少是相同的。
答案 0 :(得分:3)
你想要返回什么并不完全清楚。假设某一行的内容是:
I love the Python programming language
并且你不需要做任何跨行处理(每一行单独处理)。
from collections import deque
def linesplitter(line, n):
    """Print every run of *n* consecutive words in *line*, one run per row.

    Args:
        line: Text to split on whitespace.
        n: Number of words in each printed window.
    """
    # Bounded deque: once it holds n words, appending another one drops the
    # oldest automatically, which gives a sliding window.
    window = deque(maxlen=n)
    for word in line.split():
        window.append(word)
        # Nothing is printed until the window first holds n words.
        if len(window) == n:
            # The parenthesised form behaves the same on Python 2 and 3.
            print(" ".join(window))
# Driver: feed each line of the file to linesplitter separately, so the
# printed word windows never span a line break.
with open(document) as f:  # 'r' is implied
    for line in f:
        linesplitter(line, 2)  # or any other length!
输出:
I love
love the
the Python
Python programming
programming language
答案 1 :(得分:2)
你可以改编 itertools recipes 中的一个:
import itertools
def ngrams(N, k, filepath):
    """Return all N-grams of the words in a file as a list of tuples.

    Words are whitespace-separated and the file is read as one continuous
    stream of words, so an N-gram may span a line break.

    Args:
        N: Number of words per N-gram.
        k: Unused; kept so existing callers keep working.
        filepath: Path of the text file to read.

    Returns:
        A list of N-tuples of consecutive words, in file order.
    """
    with open(filepath) as infile:
        words = (word for line in infile for word in line.split())
        ts = itertools.tee(words, N)
        # Advance the i-th copy by i words so that zipping the copies
        # yields sliding windows of N consecutive words.
        for i in range(1, len(ts)):
            for t in ts[i:]:
                next(t, None)
        # Materialise while the file is still open: on Python 3 zip() is
        # lazy, and consuming it after the ``with`` block exits would read
        # from a closed file.
        return list(zip(*ts))
使用如下所示的测试文件:
I love
the
python programming language
这是输出:
In [21]: ngrams(2, '', 'blah')
Out[21]:
[('I', 'love'),
('love', 'the'),
('the', 'python'),
('python', 'programming'),
('programming', 'language')]
In [22]: ngrams(3, '', 'blah')
Out[22]:
[('I', 'love', 'the'),
('love', 'the', 'python'),
('the', 'python', 'programming'),
('python', 'programming', 'language')]
答案 2 :(得分:1)
嗯,你可以通过列表推导式来实现这个目标:
>>> [s1 + " " + s2 for s1, s2 in zip(s.split(), s.split()[1:])]
['I love', 'love the', 'the Python', 'Python programming', 'programming language']
您还可以使用 str.format 函数:
>>> ["{} {}".format(s1, s2) for s1, s2 in zip(s.split(), s.split()[1:])]
['I love', 'love the', 'the Python', 'Python programming', 'programming language']
该函数的最终版本:
from itertools import tee, islice
def ngram(n, s):
    """Return the n-grams of the whitespace-separated words in *s*.

    Each n-gram is a single string in which every word is followed by one
    space, so each result ends with a trailing space (e.g. ``'I love '``).

    Args:
        n: Number of words per n-gram.
        s: Text to split on whitespace.

    Returns:
        A list of strings, one per window of n consecutive words.
    """
    # The i-th copy of the word iterator skips its first i words; zipping
    # the copies then lines up n consecutive words per tuple.
    shifted = [islice(it, i, None) for i, it in enumerate(tee(s.split(), n))]
    # Build the format template once instead of once per n-gram.
    template = "{} " * n
    return [template.format(*group) for group in zip(*shifted)]
演示:
>>> from splitting import ngram
>>> thing = 'I love the Python programming language'
>>> ngram(2, thing)
['I love ', 'love the ', 'the Python ', 'Python programming ', 'programming language ']
>>> ngram(3, thing)
['I love the ', 'love the Python ', 'the Python programming ', 'Python programming language ']
>>> ngram(4, thing)
['I love the Python ', 'love the Python programming ', 'the Python programming language ']
>>> ngram(1, thing)
['I ', 'love ', 'the ', 'Python ', 'programming ', 'language ']
答案 3 :(得分:0)
这是“one-line”解决方案,使用 list comprehension:
s = "I love the Python programming language"


def ngram(s, n):
    """Return the n-grams of the whitespace-separated words in *s*.

    Args:
        s: Text to split on whitespace.
        n: Number of words per n-gram.

    Returns:
        A list of strings, each made of n consecutive words joined by a
        single space. The list is empty when *s* has fewer than n words.
    """
    # Split once; the e-th slice drops the first e words, so zipping the
    # slices lines up n consecutive words per tuple.
    words = s.split()
    return [" ".join(gram) for gram in zip(*(words[e:] for e in range(n)))]
# Test: print the n-grams of the sample sentence for n = 1 .. 6.
for i in range(1, 7):
    # The parenthesised form prints the same list on Python 2 and 3.
    print(ngram(s, i))
输出:
['I', 'love', 'the', 'Python', 'programming', 'language']
['I love', 'love the', 'the Python', 'Python programming', 'programming language']
['I love the', 'love the Python', 'the Python programming', 'Python programming language']
['I love the Python', 'love the Python programming', 'the Python programming language']
['I love the Python programming', 'love the Python programming language']
['I love the Python programming language']
注意,不需要 k 参数。
适应您的情况:
def ngram(document, n):
    """Print the list of n-grams of every line in a text file.

    Each line is processed on its own, so n-grams never span a line break.
    One list is printed per line of the file.

    Args:
        document: Path of the text file to read.
        n: Number of words per n-gram.
    """
    with open(document) as f:
        for line in f:
            # Split once; the e-th slice drops the first e words, so
            # zipping the slices lines up n consecutive words per tuple.
            words = line.split()
            grams = zip(*(words[e:] for e in range(n)))
            # The parenthesised form prints the same list on Python 2 and 3.
            print([" ".join(gram) for gram in grams])