我用SMT的短语提取算法编写了以下代码。
# -*- coding: utf-8 -*-
def phrase_extraction(srctext, trgtext, alignment):
"""
Phrase extraction algorithm.
"""
def extract(f_start, f_end, e_start, e_end):
phrases = set()
# return { } if f end == 0
if f_end == 0:
return
# for all (e,f) ∈ A do
for e,f in alignment:
# return { } if e < e start or e > e end
if e < e_start or e > e_end:
return
fs = f_start
# repeat-
while True:
fe = f_end
# repeat-
while True:
# add phrase pair ( e start .. e end , f s .. f e ) to set E
trg_phrase = " ".join(trgtext[i] for i in range(fs,fe))
src_phrase = " ".join(srctext[i] for i in range(e_start,e_end))
phrases.add("\t".join([src_phrase, trg_phrase]))
fe+=1 # fe++
# -until fe aligned
if fe in f_aligned or fe > trglen:
break
fs-=1 # fe--
# -until fs aligned
if fs in f_aligned or fs < 0:
break
return phrases
# Calculate no. of tokens in source and target texts.
srctext = srctext.split()
trgtext = trgtext.split()
srclen = len(srctext)
trglen = len(trgtext)
# Keeps an index of which source/target words are aligned.
e_aligned = [i for i,_ in alignment]
f_aligned = [j for _,j in alignment]
bp = set() # set of phrase pairs BP
# for e start = 1 ... length(e) do
for e_start in range(srclen):
# for e end = e start ... length(e) do
for e_end in range(e_start, srclen):
# // find the minimally matching foreign phrase
# (f start , f end ) = ( length(f), 0 )
f_start, f_end = trglen, 0
# for all (e,f) ∈ A do
for e,f in alignment:
# if e start ≤ e ≤ e end then
if e_start <= e <= e_end:
f_start = min(f, f_start)
f_end = max(f, f_end)
# add extract (f start , f end , e start , e end ) to set BP
phrases = extract(f_start, f_end, e_start, e_end)
if phrases:
bp.update(phrases)
return bp
srctext = "michael assumes that he will stay in the house"
trgtext = "michael geht davon aus , dass er im haus bleibt"
alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9), (5,9), (6,7), (7,7), (8,8)]
phrases = phrase_extraction(srctext, trgtext, alignment)
for i in phrases:
print i
Philip Koehn的统计机器翻译一书中的短语提取算法,第133页是这样的:
所需的输出应为:
但是使用我的代码,我只能获得这些输出:
迈克尔认为他将继续留在 - 迈克尔·戴顿, dass er im haus 迈克尔认为他将继续留在 - 迈克尔·戴顿, dass er im haus bleibt
是否有人发现我的实施有什么问题?
代码确实提取了短语,但它并不是完整的所需输出,如上面的转换表所示:
答案 0 :(得分:6)
所以我已经看过这个问题,现在可以重现所需的输出。事实证明存在许多问题:
示例输出(匹配19个短语,其中一些匹配重复5次,总共提取24个):
$ python2.7 phrase_extract_new.py
( 1) (0, 1) michael — michael
( 2) (0, 2) michael assumes — michael geht davon aus ; michael geht davon aus ,
( 3) (0, 3) michael assumes that — michael geht davon aus , dass
( 4) (0, 4) michael assumes that he — michael geht davon aus , dass er
( 5) (0, 9) michael assumes that he will stay in the house — michael geht davon aus , dass er im haus bleibt
( 6) (1, 2) assumes — geht davon aus ; geht davon aus ,
( 7) (1, 3) assumes that — geht davon aus , dass
( 8) (1, 4) assumes that he — geht davon aus , dass er
( 9) (1, 9) assumes that he will stay in the house — geht davon aus , dass er im haus bleibt
(10) (2, 3) that — dass ; , dass
(11) (2, 4) that he — dass er ; , dass er
(12) (2, 9) that he will stay in the house — dass er im haus bleibt ; , dass er im haus bleibt
(13) (3, 4) he — er
(14) (3, 9) he will stay in the house — er im haus bleibt
(15) (4, 6) will stay — bleibt
(16) (4, 9) will stay in the house — im haus bleibt
(17) (6, 8) in the — im
(18) (6, 9) in the house — im haus
(19) (8, 9) house — haus
$ python2.7 phrase_extract_new.py | grep -c ';'
5
下面是对算法的建议解释。该算法需要进行相当多的重构,但在此之前需要使用不同的示例进行更多测试。再现书中的例子只是一个开始:
# -*- coding: utf-8 -*-
def phrase_extraction(srctext, trgtext, alignment):
"""
Phrase extraction algorithm.
"""
def extract(f_start, f_end, e_start, e_end):
if f_end < 0: # 0-based indexing.
return {}
# Check if alignement points are consistent.
for e,f in alignment:
if ((f_start <= f <= f_end) and
(e < e_start or e > e_end)):
return {}
# Add phrase pairs (incl. additional unaligned f)
# Remark: how to interpret "additional unaligned f"?
phrases = set()
fs = f_start
# repeat-
while True:
fe = f_end
# repeat-
while True:
# add phrase pair ([e_start, e_end], [fs, fe]) to set E
# Need to +1 in range to include the end-point.
src_phrase = " ".join(srctext[i] for i in range(e_start,e_end+1))
trg_phrase = " ".join(trgtext[i] for i in range(fs,fe+1))
# Include more data for later ordering.
phrases.add(((e_start, e_end+1), src_phrase, trg_phrase))
fe += 1 # fe++
# -until fe aligned or out-of-bounds
if fe in f_aligned or fe == trglen:
break
fs -=1 # fe--
# -until fs aligned or out-of- bounds
if fs in f_aligned or fs < 0:
break
return phrases
# Calculate no. of tokens in source and target texts.
srctext = srctext.split() # e
trgtext = trgtext.split() # f
srclen = len(srctext) # len(e)
trglen = len(trgtext) # len(f)
# Keeps an index of which source/target words are aligned.
e_aligned = [i for i,_ in alignment]
f_aligned = [j for _,j in alignment]
bp = set() # set of phrase pairs BP
# for e start = 1 ... length(e) do
# Index e_start from 0 to len(e) - 1
for e_start in range(srclen):
# for e end = e start ... length(e) do
# Index e_end from e_start to len(e) - 1
for e_end in range(e_start, srclen):
# // find the minimally matching foreign phrase
# (f start , f end ) = ( length(f), 0 )
# f_start ∈ [0, len(f) - 1]; f_end ∈ [0, len(f) - 1]
f_start, f_end = trglen-1 , -1 # 0-based indexing
# for all (e,f) ∈ A do
for e,f in alignment:
# if e start ≤ e ≤ e end then
if e_start <= e <= e_end:
f_start = min(f, f_start)
f_end = max(f, f_end)
# add extract (f start , f end , e start , e end ) to set BP
phrases = extract(f_start, f_end, e_start, e_end)
if phrases:
bp.update(phrases)
return bp
# Refer to match matrix.
# 0 1 2 3 4 5 6 7 8
srctext = "michael assumes that he will stay in the house"
# 0 1 2 3 4 5 6 7 8 9
trgtext = "michael geht davon aus , dass er im haus bleibt"
alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9), (5,9), (6,7), (7,7), (8,8)]
phrases = phrase_extraction(srctext, trgtext, alignment)
# Keep track of translations of each phrase in srctext and its
# alignement using a dictionary with keys as phrases and values being
# a list [e_alignement pair, [f_extractions, ...] ]
dlist = {}
for p, a, b in phrases:
if a in dlist:
dlist[a][1].append(b)
else:
dlist[a] = [p, [b]]
# Sort the list of translations based on their length. Shorter phrases first.
for v in dlist.values():
v[1].sort(key=lambda x: len(x))
# Function to help sort according to book example.
def ordering(p):
k,v = p
return v[0]
#
for i, p in enumerate(sorted(dlist.items(), key = ordering), 1):
k, v = p
print "({0:2}) {1} {2} — {3}".format( i, v[0], k, " ; ".join(v[1]))
准备输出的最后部分可能会得到改进......而且算法代码可以更加苛刻。