如何减少冗余文本?例如,我有2个输入,我需要将它们减少到以下输出
输入1:丰田是红色的。本田是红色的。宝马是红色的。梅赛德斯是绿色的。
输出1:丰田,本田和宝马都是红色的,但梅赛德斯是绿色的。
输入2:丰田是红色的。本田是红色的。宝马是红色的。梅赛德斯是红色的。
输出2:所有车辆都是红色的。
我猜这是一个NLP问题。理想情况下我希望用Python来实现(不过其他语言也可以,只要开发起来简单易用)。
答案 0 :(得分:3)
正如我在您问题下的评论中所说:我认为首先应该定义某种语法。例如,一个简单的赋值语句:The <variable> is <value>.
我已经为您的第一行输入写了一个小例子,相信您能理解其中的思路,这样您也可以用同样的方法处理下一行输入:
import re
def reducer(text):  # Catch statements and add them to a dictionary
    """Group the subjects of ``The <variable> is <value>`` statements by value.

    Args:
        text: Text containing statements such as ``The Toyota is red.``

    Returns:
        A dict mapping each value (e.g. ``'red'``) to the list of variables
        stated to have that value, in the order they appear in ``text``.
        Empty when no statement matches.
    """
    catched = dict()
    # The pattern has two groups, so findall yields (variable, value) tuples.
    for variable, value in re.findall(
            r'The\s(?P<variable>\w+)\sis\s(?P<value>\w+)', text):
        # setdefault creates the list the first time a value is seen.
        catched.setdefault(value, []).append(variable)
    return catched
def comma_and(ls):  # Create human-like enumeration with words
    """Join strings into an English enumeration such as ``'a, b and c'``.

    Args:
        ls: Sequence of strings.

    Returns:
        ``''`` for an empty sequence, the sole item for a single-element
        sequence, otherwise the items separated by ``', '`` with ``' and '``
        before the last one.
    """
    if not ls:
        # The original indexed ls[0] unconditionally and raised IndexError.
        return ''
    if len(ls) == 1:
        return ls[0]
    # Avoids xrange, which does not exist on Python 3.
    return ', '.join(ls[:-1]) + ' and ' + ls[-1]
def rephrase(text):  # Rephrase separated statements into one sentence
    """Print one sentence summarising the ``The X is Y`` statements in ``text``.

    Statements are grouped by value with ``reducer``. A value shared by
    several subjects becomes ``The A, B and C are <value>``; a value with a
    single subject becomes ``but the A is <value>.``.

    Args:
        text: Text containing statements such as ``The Toyota is red.``

    Returns:
        None. The sentence is written to standard output.
    """
    stmnts = reducer(text)
    part1 = str()
    part2 = str()
    # NOTE: each branch overwrites its part, so when several values fall in
    # the same branch only the last one iterated appears in the output.
    for key in stmnts:
        if len(stmnts[key]) <= 1:
            part2 = 'but the {variable} is {value}.'.format(
                variable=stmnts[key][0], value=key)
        else:
            part1 = 'The {variables} are {value}'.format(
                variables=comma_and(stmnts[key]), value=key)
    # Join only the non-empty parts so a missing part leaves no stray space.
    # The parenthesised single-argument print works on Python 2 and 3.
    print(' '.join(part for part in (part1, part2) if part))
演示:
rephrase('The Toyota is red. The Honda is red. The BMW is red. The Mercedes is green.')
输出是:
# The Toyota, Honda and BMW are red but the Mercedes is green.
对于您的下一行输入,可以这样处理:检查字典catched
是否只有一个键,如果该键对应的值都是汽车,就使用All <type> are <value>.
答案 1 :(得分:2)
有趣的问题!我写了类似下面的代码:
import re, sys, random
def main():
    """Read a sentence, merge its ``The X is Y.`` statements and print the result.

    The sentence comes from ``get_sentence`` and is grouped by colour with
    ``map_colors``. One colour group yields a single statement, two groups are
    joined with "but", and any other number of groups yields one sentence per
    group.

    Returns:
        The simplified sentence, which is also printed.
    """
    sentence = get_sentence()
    print("Simplifying sentence:")
    print(""" \" %s \" """ % sentence )
    # Materialise as a list so the groups can be indexed below; a dict items
    # view (Python 3) does not support indexing.
    mapped_elements = list(map_colors(sentence))
    # 3 possible cases
    number_of_parts = len(mapped_elements)
    if number_of_parts == 1:
        color, elements = mapped_elements[0]
        # Compare the number of nouns, not the list itself, with 1; the
        # original `elements == 1` was always False.
        if len(elements) == 1:
            simplified = "The %s is %s." % (elements[0], color)
        else:
            simplified = "All cars are %s." % (color)
    elif number_of_parts == 2:
        part1 = mapped_elements[0]
        part2 = mapped_elements[1]
        part1_sentence = "The %s %s %s" % (
            gather_elements(part1[1]), pluralize(len(part1[1])), part1[0])
        part2_sentence = "the %s %s %s" % (
            gather_elements(part2[1]), pluralize(len(part2[1])), part2[0])
        simplified = "%s but %s." % (part1_sentence, part2_sentence)
    else:
        # Also reached when nothing matched; the result is then ''.
        all_parts = []
        for color, nouns in mapped_elements:
            part_sentence = "The %s %s %s. " % (
                gather_elements(nouns), pluralize(len(nouns)), color)
            all_parts.append(part_sentence)
        simplified = ''.join(all_parts)
    print("Result: %s" % simplified)
    return simplified
def pluralize(n):
    """Return the verb form for a subject count: "is" for 1, otherwise "are"."""
    return "is" if n == 1 else "are"
def get_sentence():
    """Return the sentence to simplify.

    The first command-line argument is used when present. Otherwise, if
    ``test_mode`` is truthy, a random entry of ``sample_test`` is returned.
    Otherwise the user is prompted on standard input.

    NOTE(review): ``test_mode`` and ``sample_test`` are not defined in the
    visible code; presumably they are module-level names defined elsewhere.

    Returns:
        The chosen sentence as a string.
    """
    if len(sys.argv) > 1:
        sentence = sys.argv[1]
    elif test_mode:
        sentence = random.choice(sample_test)
    else:
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input  # Python 3 renamed raw_input to input
        sentence = read_line("Please enter a sentence to simplify: ")
    return sentence
def gather_elements(elements):
    """Join nouns into an English enumeration such as ``'a, b and c'``.

    Args:
        elements: Sequence of strings.

    Returns:
        ``''`` for an empty sequence, the sole item for a single-element
        sequence, otherwise the items separated by ``', '`` with ``' and '``
        before the last one. The result has no trailing whitespace.
    """
    if not elements:
        # The original reached elements[-1] here and raised IndexError.
        return ''
    if len(elements) == 1:
        return elements[0]
    # No trailing space: the original appended one in this branch only,
    # which put a double space into sentences built from the result.
    return "%s and %s" % (', '.join(elements[:-1]), elements[-1])
def map_colors(sentence):
    """Group the nouns of ``The <noun> is <color>.`` statements by colour.

    Only statements ending in a period are matched.

    Args:
        sentence: Text containing statements such as ``The Toyota is red.``

    Returns:
        A list of ``(color, nouns)`` tuples, where ``nouns`` is the list of
        nouns stated to have that colour in order of appearance. A list is
        returned so the result can be indexed on Python 3, where
        ``dict.items()`` is a view.
    """
    colors = {}
    r = re.compile(r'The (\w+) is (\w+)\.')
    for noun, color in re.findall(r, sentence):
        colors.setdefault(color, []).append(noun)
    return list(colors.items())