I want to write a grammar and get parses for the following kinds of sentences.
I wrote this:
from __future__ import division
import sys
from collections import defaultdict
import nltk
from nltk import ConditionalFreqDist, Nonterminal, FreqDist
from fixesNLTK3 import *
from BetterICP import *
from nltk import InsideChartParser
from nltk import induce_pcfg
from nltk import PCFG
from nltk import ProbabilisticProduction
from nltk import Tree, DictionaryProbDist
from nltk.grammar import Production
toy_grammar = parse_pgrammar("""
# Grammatical productions.
S -> NP VP [1.0]
NP -> Pro [0.1] | Det N [0.3] | N [0.5] | NP PP [0.1]
VP -> Vi [0.05] | Vt NP [0.9] | VP PP [0.05]
Det -> Art [1.0]
PP -> Prep NP [1.0]
PRP -> Prp_Obj [0.5] | vbd VBD [0.3] | to TO [0.2]
# Lexical productions.
Pro -> "i" [0.3] | "we" [0.1] | "you" [0.1] | "he" [0.3] | "she" [0.2]
Art -> "a" [0.4] | "an" [0.1] | "the" [0.4] | "The" [0.1]
Prep -> "with" [0.7] | "in" [0.3]
N -> "salad" [0.3] | "fork" [0.3] | "mushrooms" [0.2] | "book" [0.2]
Vi -> "sneezed" [0.4] | "ran" [0.4] | "read" [0.2]
Vt -> "eat" [0.2] | "eats" [0.2] | "ate" [0.2] | "see" [0.2] | "saw" [0.2]
Prp_Obj -> "her" [0.5] | "I" [0.5]
vbd -> "was" [1.0]
to -> "to" [1.0]
""")
def input_file():
    sppc = BetterICP(toy_grammar)
    with open("input.txt", "r") as ins:
        array = []
        for line in ins:
            array.append(line)
    for a in array:
        sppc.parse(a.split())

input_file()
My output is "0 total parses found".
Is my grammar defined correctly?
Update (contents of input.txt):
eat the salad
I read her the book
The book was read to her
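For reference, a minimal coverage check (a sketch, assuming the toy_grammar object and imports defined above): check_coverage raises an exception when some word in the input has no lexical production, whereas a grammar that covers every word but cannot derive the sentence structure just reports zero parses.

for sent in ["eat the salad", "I read her the book", "The book was read to her"]:
    try:
        toy_grammar.check_coverage(sent.split())
        print("lexically covered: " + sent)
    except ValueError as e:
        print("not covered: " + sent + " -- " + str(e))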
BetterICP.py
from __future__ import division
import sys
from pprint import pprint
from collections import defaultdict
import nltk
from nltk.corpus import treebank
from nltk import ConditionalFreqDist, Nonterminal, FreqDist
from fixesNLTK3 import *
from nltk import InsideChartParser
from nltk.parse.chart import Chart,AbstractChartRule
from nltk.tree import Tree,ProbabilisticTree,_child_names
from nltk.parse.pchart import ProbabilisticFundamentalRule,ProbabilisticBottomUpInitRule,ProbabilisticTreeEdge,ProbabilisticLeafEdge
from nltk.parse.pchart import SingleEdgeProbabilisticFundamentalRule
from math import log
# Renamed between 3.0 and 3.0.4 :-(
if not hasattr(Chart, 'pretty_format_edge'):
    Chart.pretty_format_edge = Chart.pp_edge
# nltk.parse.pchart is fundamentally broken, because it adds edges directly
# into the chart, where the fundamental rule (fr) can see them whether or not
# they have come off the agenda yet.
# The least-bad fix from outside I can come up with is implemented here:
# add a boolean var called 'pending' which is true by default, only set to
# false when the edge comes off the agenda, and when true causes it
# to be ignored by fr
# Possible bug? Even pending edges _are_ checked for when testing for
# redundancy (i.e. Chart.insert is _not_ changed), but that means any
# failure of best-first might cause a cheaper edge to be discarded
# because an earlier, but still pending, identical-but-more expensive
# edge is in the chart.
nltk.chart.EdgeI.pending=True
def productions_with_left_context(self, lpos=0, leaves=None):
    """
    Generate the productions that correspond to the non-terminal nodes of the
    tree, together with the word to their left (or None), as (word, Production)
    pairs.  For each subtree of the form (P: C1 C2 ... Cn) this produces a
    production of the form P -> C1 C2 ... Cn, paired with the word to the left
    of C1.

    >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    >>> t.productions_with_left_context()
    [(None, S -> NP VP),
     (None, NP -> D N),
     (None, D -> 'the'),
     ('the', N -> 'dog'),
     ('dog', VP -> V NP),
     ('dog', V -> 'chased'),
     ('chased', NP -> D N),
     ('chased', D -> 'the'),
     ('the', N -> 'cat')]

    :rtype: list(tuple(str or None, Production))
    """
    if leaves is None:
        leaves = self.leaves()
    # if not isinstance(self._label, string_types):
    #     raise TypeError('Productions can only be generated from trees having node labels that are strings')
    if lpos > 0:
        lc = leaves[lpos - 1]
    else:
        lc = None
    prods = [(lc, Production(Nonterminal(self._label), _child_names(self)))]
    for child in self:
        if isinstance(child, Tree):
            prods += child.productions_with_left_context(lpos, leaves)
            # could be much smarter
            lpos += len(child.leaves())
        else:
            lpos += 1
    return prods
Tree.productions_with_left_context=productions_with_left_context
def production_distribution(psents):
    """Creates a frequency distribution of lexical and non-lexical (grammatical) productions."""
    prod_dict = defaultdict(int)
    for psent in psents:
        for production in psent.productions():
            prod_dict[production] += 1
    return prod_dict
def nt_counts(prod_dict):
    '''Create a dictionary of non-terminals and their counts.'''
    nt_dict = defaultdict(int)
    for (rule, count) in prod_dict.items():
        nt_dict[rule.lhs()] += count
    return nt_dict
def cost(prob):
    return 0.0 if prob == 1.0 else -log(prob, 2)
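# For example, cost(1.0) == 0.0, cost(0.5) == 1.0 and cost(0.25) == 2.0: a
# production's cost is -log2 of its probability, so lower cost means more likely.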
def production_cost(production, lhs_counts, production_counts):
    pcount = production_counts[production]
    ntcount = lhs_counts[production.lhs()]
    return cost(float(pcount) / float(ntcount))
def get_costed_productions(psents):
    """Creates costed/weighted productions from a given list of parsed sentences."""
    prods_dict = production_distribution(psents)
    prods_nt_counts = nt_counts(prods_dict)
    costed_prods = [CostedProduction(p.lhs(), p.rhs(), production_cost(p, prods_nt_counts, prods_dict))
                    for p in prods_dict.keys()]
    return costed_prods
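# Usage sketch (an assumption, not part of the original file): estimate costed
# productions from a handful of treebank trees (the treebank reader is imported
# above) and print the five lowest-cost ones, i.e. the productions that are most
# probable given their left-hand side.
def _demo_costed_productions(n=50):
    costed = get_costed_productions(treebank.parsed_sents()[:n])
    for p in sorted(costed, key=lambda cp: cp.cost())[:5]:
        print(p)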
class BetterPBPR(AbstractChartRule):
    NUM_EDGES = 1

    def apply(self, chart, grammar, edge):
        if edge.is_incomplete():
            return
        for prod in grammar.productions():
            if edge.lhs() == prod.rhs()[0]:
                # check for X -> X
                if prod.lhs() == edge.lhs() and len(prod.rhs()) == 1:
                    continue
                new_edge = ProbabilisticTreeEdge.from_production(prod, edge.start(), prod.prob())
                if chart.insert(new_edge, ()):
                    yield new_edge
class BetterSEPFR(AbstractChartRule):
    NUM_EDGES = 1
    _fundamental_rule = ProbabilisticFundamentalRule()

    def apply(self, chart, grammar, edge1):
        fr = self._fundamental_rule
        if edge1.is_incomplete():
            # edge1 = left_edge; edge2 = right_edge
            for edge2 in chart.select(start=edge1.end(), is_complete=True,
                                      lhs=edge1.nextsym()):
                if edge2.pending:
                    continue
                for new_edge in fr.apply(chart, grammar, edge1, edge2):
                    yield new_edge
        else:
            # edge2 = left_edge; edge1 = right_edge
            for edge2 in chart.select(end=edge1.start(), is_complete=False,
                                      nextsym=edge1.lhs()):
                if edge2.pending:
                    continue
                for new_edge in fr.apply(chart, grammar, edge2, edge1):
                    yield new_edge
class BetterICP(InsideChartParser):
    '''Implement a more user-friendly InsideChartParser,
    which will show intermediate results, and quit after
    finding a specified number of parses.'''

    def parse(self, tokens, notify=True, max=0):
        '''Run a probabilistic parse of tokens.
        If notify is true, display each complete parse as it is found.
        If max>0, quit after finding that many parses.'''
        self._grammar.check_coverage(tokens)
        chart = Chart(list(tokens))
        chart._trace = self._trace  # Bad form. . .
        grammar = self._grammar
        start = grammar.start()
        prod_probs = {}

        # Chart parser rules.
        bu_init = ProbabilisticBottomUpInitRule()
        bu = BetterPBPR()   # avoid infinite numbers of parses :-(
        fr = BetterSEPFR()  # don't look at pending edges

        # Our queue
        queue = []

        # Initialize the chart.
        for edge in bu_init.apply(chart, grammar):
            if self._trace > 1:
                print(' %-50s [%.4g]' % (chart.pretty_format_edge(edge, width=2),
                                         cost(edge.prob())))
            queue.append(edge)

        found = 0
        while len(queue) > 0 and (max < 1 or found < max):
            # Re-sort the queue.
            self.sort_queue(queue, chart)

            # Prune the queue to the correct size if a beam was defined
            if self.beam_size:
                self._prune(queue, chart)

            # Get the best edge.
            edge = queue.pop()
            edge.pending = False
            if self._trace > 0:
                print(' %-50s [%.4g]' % (chart.pretty_format_edge(edge, width=2),
                                         cost(edge.prob())))

            if (edge.start() == 0 and
                edge.end() == chart._num_leaves and
                edge.lhs() == start and
                edge.is_complete()):
                if len(prod_probs) == 0:
                    for prod in grammar.productions():
                        prod_probs[prod.lhs(), prod.rhs()] = prod.prob()
                if notify:
                    print "****"
                    for tree in chart.trees(edge, tree_class=ProbabilisticTree,
                                            complete=True):
                        self._setprob(tree, prod_probs)
                        print tree, '%.4g (%.4g)' % (cost(tree.prob()), cost(edge.prob()))
                        #print tree
                    print "****"
                found += 1

            # Apply BU & FR to it.
            queue.extend(fr.apply(chart, grammar, edge))
            queue.extend(bu.apply(chart, grammar, edge))

        # Get a list of complete parses.
        parses = list(chart.parses(grammar.start(), ProbabilisticTree))
        if not notify:
            for parse in parses:
                self._setprob(parse, prod_probs)

        # Sort by probability
        parses.sort(reverse=True, key=lambda tree: tree.prob())
        if notify:
            print "%s total parses found" % found
        return iter(parses)

    def _prune(self, queue, chart):
        """Discard items in the queue if the queue is longer than the beam."""
        if len(queue) > self.beam_size:
            split = len(queue) - self.beam_size
            if self._trace > 2:
                for edge in queue[:split]:
                    print(' %-50s [%.4g DISCARDED]' % (chart.pretty_format_edge(edge, 2),
                                                       cost(edge.prob())))
            del queue[:split]

    def beam(self, width):
        self.beam_size = width
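# Usage sketch (an assumption, not part of the original file): how BetterICP is
# typically driven -- enable tracing and a beam, then parse a sentence and watch
# which edges come off the agenda.  trace() is inherited from NLTK's
# BottomUpProbabilisticChartParser; toy_grammar here stands for any PCFG object.
#
#   sppc = BetterICP(toy_grammar)
#   sppc.trace(2)
#   sppc.beam(100)
#   sppc.parse("eat the salad".split())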
fixesNLTK3.py
# fix buggy NLTK 3 :-(
# different fixes for different versions :-((((
import re, sys
import nltk
from nltk.grammar import _ARROW_RE, _PROBABILITY_RE, _DISJUNCTION_RE, Production
from nltk.draw import CFGEditor
from nltk.probability import ImmutableProbabilisticMixIn
ARROW = u'\u2192'
TOKEN = u'([\\w ]|\\\\((x[0-9a-f][0-9a-f])|(u[0-9a-f][0-9a-f][0-9a-f][0-9a-f])))+'
CFGEditor.ARROW = ARROW
CFGEditor._TOKEN_RE=re.compile(u"->|u?'"+TOKEN+u"'|u?\""+TOKEN+u"\"|\\w+|("+ARROW+u")")
CFGEditor._PRODUCTION_RE=re.compile(ur"(^\s*\w+\s*)" +
ur"(->|("+ARROW+"))\s*" +
ur"((u?'"+TOKEN+"'|u?\""+TOKEN+"\"|''|\"\"|\w+|\|)\s*)*$")
nltk.grammar._TERMINAL_RE = re.compile(ur'( u?"[^"]+" | u?\'[^\']+\' ) \s*', re.VERBOSE)
nltk.grammar._ARROW_RE = re.compile(ur'\s* (->|'+ARROW+') \s*', re.VERBOSE)
from nltk.grammar import _TERMINAL_RE
if sys.version_info[0] > 2 or sys.version_info[1] > 6:
    from nltk.grammar import PCFG, CFG, ProbabilisticProduction as FixPP
    parse_grammar = CFG.fromstring
    parse_pgrammar = PCFG.fromstring
    from nltk import InsideChartParser

    def nbest_parse(self, tokens, n=None):
        parses = self.parse(tokens)
        if n is None:
            return [parse for parse in parses]
        else:
            return [next(parses) for i in range(n)]
    InsideChartParser.nbest_parse = nbest_parse
else:
    from nltk.grammar import WeightedGrammar as PCFG, WeightedProduction as FixPP
    from nltk import parse_cfg, parse_pcfg
    parse_grammar = parse_cfg
    parse_pgrammar = parse_pcfg
def fix_parse_production(line, nonterm_parser, probabilistic=False):
    """
    Parse a grammar rule, given as a string, and return
    a list of productions.
    """
    pos = 0

    # Parse the left-hand side.
    lhs, pos = nonterm_parser(line, pos)

    # Skip over the arrow.
    m = _ARROW_RE.match(line, pos)
    if not m: raise ValueError('Expected an arrow')
    pos = m.end()

    # Parse the right hand side.
    probabilities = [0.0]
    rhsides = [[]]
    while pos < len(line):
        # Probability.
        m = _PROBABILITY_RE.match(line, pos)
        if probabilistic and m:
            pos = m.end()
            probabilities[-1] = float(m.group(1)[1:-1])
            if probabilities[-1] > 1.0:
                raise ValueError('Production probability %f, '
                                 'should not be greater than 1.0' %
                                 (probabilities[-1],))

        # String -- add terminal.
        elif (line[pos] in "\'\"" or line[pos:pos+2] in ('u"', "u'")):
            m = _TERMINAL_RE.match(line, pos)
            if not m: raise ValueError('Unterminated string')
            rhsides[-1].append(eval(m.group(1)))
            pos = m.end()

        # Vertical bar -- start new rhside.
        elif line[pos] == '|':
            m = _DISJUNCTION_RE.match(line, pos)
            probabilities.append(0.0)
            rhsides.append([])
            pos = m.end()

        # Anything else -- nonterminal.
        else:
            nonterm, pos = nonterm_parser(line, pos)
            rhsides[-1].append(nonterm)

    if probabilistic:
        return [FixPP(lhs, rhs, prob=probability)
                for (rhs, probability) in zip(rhsides, probabilities)]
    else:
        return [Production(lhs, rhs) for rhs in rhsides]
if sys.version_info[0] > 2 or sys.version_info[1] > 6:
    nltk.grammar._read_production = fix_parse_production
else:
    nltk.grammar.parse_production = fix_parse_production
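# Quick check sketch (an assumption, not part of the original file): with the
# monkey-patch in place, a rule with alternatives should come back as one
# probabilistic production per alternative.
#
#   g = parse_pgrammar('S -> "a" [0.4] | "b" [0.6]')
#   print(g.productions())   # expect: [S -> 'a' [0.4], S -> 'b' [0.6]]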
class CostedProduction(FixPP):
    """
    A probabilistic context free grammar production using costs.

    A PCFG ``ProbabilisticProduction`` is essentially just a ``Production`` that
    has an associated probability, which represents how likely it is that
    this production will be used.  In particular, the probability of a
    ``ProbabilisticProduction`` records the likelihood that its right-hand side is
    the correct instantiation for any given occurrence of its left-hand side.

    :see: ``Production``
    """
    def __init__(self, lhs, rhs, cost):
        """
        Construct a new ``CostedProduction``.

        :param lhs: The left-hand side of the new ``CostedProduction``.
        :type lhs: Nonterminal
        :param rhs: The right-hand side of the new ``CostedProduction``.
        :type rhs: sequence(Nonterminal and terminal)
        :param cost: Cost (negative log probability) of the new ``CostedProduction``.
        """
        ImmutableProbabilisticMixIn.__init__(self, logprob=-cost)
        Production.__init__(self, lhs, rhs)

    def __str__(self):
        return Production.__unicode__(self) + \
            (' [0.0]' if (self.logprob() == 0.0) else ' [%g]' % -self.logprob())

    def __repr__(self):
        return '%s' % str(self)

    def cost(self):
        return 0.0 if self.logprob() == 0.0 else -self.logprob()
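# Example sketch (an assumption, not part of the original file): a production used
# for a quarter of its left-hand side's occurrences gets cost -log2(0.25) == 2.
#
#   from nltk.grammar import Nonterminal
#   p = CostedProduction(Nonterminal('NP'), [Nonterminal('Det'), Nonterminal('N')], 2.0)
#   print(p)         # NP -> Det N [2]
#   print(p.cost())  # 2.0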