I am building a unit annotator based on the ideas behind MedaCy. First, all basic unit types are set up, and these are then used to build more complex units. For example:
m/s² → m (distance) / s (duration) ² → m/s (speed) ² → m/s² (acceleration)
To make this work, I changed parts of the tokenization so that numbers are always split from alphabetic characters and similar symbols:
'<2.0 m /s²' → ['<', '2.0', 'm', '/', 's', '²']
My current problem, however, is that whenever an entity is recognized, I can only achieve the last step (acceleration) by merging tokens. That merge loses the features and entity information of the underlying tokens, which is exactly what I want to avoid. So I disabled the merging of tokens, but now I cannot complete the last step of annotating an entity such as acceleration, because the matcher works on tokens. As shown below, the matcher cannot detect the entity because it spans multiple tokens. (Note that speed is correctly annotated across multiple tokens.)
[{'ENT_TYPE': 'speed'}, {'TEXT': {'REGEX': r'(^)?2|²'}}]
['<', '2.0', 'm', '/', 's', '²']
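(To illustrate the problem, a minimal standalone sketch, assuming spaCy 2.x and a hand-built doc in which 'm / s' is already annotated as speed: the single-token pattern never covers the full 'm / s ²' span, so the acceleration entity cannot be annotated over all three speed tokens.)

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span

nlp = spacy.blank('en')
doc = Doc(nlp.vocab, words=['<', '2.0', 'm', '/', 's', '²'])
doc.ents = [Span(doc, 2, 5, label='speed')]  # 'm / s' tagged as speed

matcher = Matcher(nlp.vocab)
matcher.add('UNIT_OF_ACCELERATION', None,
    [{'ENT_TYPE': 'speed'}, {'TEXT': {'REGEX': r'(^)?2|²'}}]
)

for match_id, start, end in matcher(doc):
    print(doc[start:end])  # only 's ²' is matched, never the full 'm / s ²' span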
Adding every possible token combination to the acceleration matcher is not an option for me, because that would defeat the idea of building all units bottom-up.
Another solution I considered is using multiple entity rulers, since the basic units have to be tagged first and the more complex units afterwards. However, this seems to run into the same tokenization problem, and on top of that I get the error that there can only be one entity ruler: 'entity_ruler' already exists in pipeline. Existing names: ['entity_ruler']
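(Side note: the name collision itself could probably be avoided by registering each ruler under its own name, roughly as in the sketch below, assuming spaCy 2.x; this would not solve the tokenization issue, though.)

from spacy.pipeline import EntityRuler

base_ruler = EntityRuler(nlp)      # patterns for the basic units
complex_ruler = EntityRuler(nlp)   # patterns that build on the basic units
nlp.add_pipe(base_ruler, name='entity_ruler_base')
nlp.add_pipe(complex_ruler, name='entity_ruler_complex', after='entity_ruler_base')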
In summary, I want to annotate entities using entities that span multiple tokens, and purely token-based matching does not work for that.
The following is called right after creating the blank spaCy model.
import spacy

def remove_units(nlp):
    suffixes = list(nlp.Defaults.suffixes)
    UNITS = '(?<=[0-9])(?:km|km²|km³|m|m²|m³|dm|dm²|dm³|cm|cm²|cm³|mm|mm²|mm³|ha|µm|nm|yd|in|ft|kg|g|mg|µg|t|lb|oz|m/s|km/h|kmh|mph|hPa|Pa|mbar|mb|MB|kb|KB|gb|GB|tb|TB|T|G|M|K|%|км|км²|км³|м|м²|м³|дм|дм²|дм³|см|см²|см³|мм|мм²|мм³|нм|кг|г|мг|м/с|км/ч|кПа|Па|мбар|Кб|КБ|кб|Мб|МБ|мб|Гб|ГБ|гб|Тб|ТБ|тб|كم|كم²|كم³|م|م²|م³|سم|سم²|سم³|مم|مم²|مم³|كم|غرام|جرام|جم|كغ|ملغ|كوب|اكواب)'
    suffixes.remove(UNITS)  # drop spaCy's default unit suffix rule
    # Use the filtered list (without the default UNITS entry) plus the custom rules
    suffixes = tuple(suffixes) + (
        r'(?<=[0-9])(?:[A-Za-z]+[^.,:;]*)',  # split numbers from alphabetic characters
        r'/',                                # split slashes from other characters: 'm/' -> 'm', '/'
    )
    suffix_regex = spacy.util.compile_suffix_regex(suffixes)
    nlp.tokenizer.suffix_search = suffix_regex.search

    prefixes = tuple(nlp.Defaults.prefixes) + (
        r'/',                                # split slashes from other characters: '/s' -> '/', 's'
    )
    prefix_regex = spacy.util.compile_prefix_regex(prefixes)
    nlp.tokenizer.prefix_search = prefix_regex.search
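For context, a minimal usage sketch of the function above (assuming spaCy 2.x, a blank English model, and that the UNITS string matches the default suffix entry of the installed spaCy version):

import spacy

nlp = spacy.blank('en')
remove_units(nlp)
doc = nlp('<2.0 m /s²')
print([t.text for t in doc])  # goal: numbers split from letters, '/' split from its neighbours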
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.tokens import Token
import re

class UnitAnnotator(object):
    name = "unit_annotator"
    dependencies = []

    def __init__(self, nlp):
        self.nlp = nlp
        Token.set_extension('is_duration_unit', default=False, force=True)
        Token.set_extension('is_memory_unit', default=False, force=True)
        Token.set_extension('is_fraction_unit', default=False, force=True)
        Token.set_extension('is_angle_unit', default=False, force=True)
        Token.set_extension('is_distance_unit', default=False, force=True)
        Token.set_extension('is_pressure_unit', default=False, force=True)
        Token.set_extension('is_voltage_unit', default=False, force=True)
        Token.set_extension('is_speed_unit', default=False, force=True)
        Token.set_extension('is_acceleration_unit', default=False, force=True)
        Token.set_extension('is_frequency_unit', default=False, force=True)
        Token.set_extension('is_volume_unit', default=False, force=True)
        Token.set_extension('is_torque_unit', default=False, force=True)
        Token.set_extension('is_operator', default=False, force=True)
        Token.set_extension('is_measurement', default=False, force=True)

        # For splitting equations first
        self.split_matcher1 = Matcher(nlp.vocab)
        self.split_matcher1.add('SPLIT1', None,
            [{'TEXT': {'REGEX': r'[/\>\<]'}}],  # 'km/h' -> 'km', '/', 'h'
        )
        self.split_matcher2 = Matcher(nlp.vocab)
        self.split_matcher2.add('SPLIT2', None,
            [{'TEXT': {'REGEX': r'[²³]'}}],  # 'm²' -> 'm', '²'
            [{'TEXT': {'REGEX': r'\)'}}],    # '8)' -> '8', ')'
            # TODO: FIX splitting of '(mis)interventions'
        )
        self.duration_matcher = Matcher(nlp.vocab)
        self.duration_matcher.add('UNIT_OF_duration', None,
            [{'ORTH': 'ms'}],
            [{'LOWER': 'msec'}],
            [{'LOWER': 'milisecond'}],
            [{'LOWER': 'miliseconds'}],
            [{'ORTH': 's'}],
            [{'LOWER': 'sec'}],
            [{'LOWER': 'second'}],
            [{'LOWER': 'seconds'}],
            [{'LOWER': 'min'}],
            [{'LOWER': 'mins'}],
            [{'LOWER': 'minute'}],
            [{'LOWER': 'minutes'}],
            [{'ORTH': 'h'}],
            [{'LOWER': 'hour'}],
            [{'LOWER': 'hours'}]
        )
        self.memory_matcher = Matcher(nlp.vocab)
        self.memory_matcher.add('UNIT_OF_MEMORY', None,
            [{'LOWER': 'kb'}],
            [{'LOWER': 'kbs'}],
            [{'LOWER': 'kbit'}],
            [{'LOWER': 'kbits'}],
            [{'LOWER': 'mb'}],
            [{'LOWER': 'mbs'}],
            [{'LOWER': 'mbit'}],
            [{'LOWER': 'mbits'}],
            [{'LOWER': 'gb'}],
            [{'LOWER': 'gbs'}],
            [{'LOWER': 'gbit'}],
            [{'LOWER': 'gbits'}],
            [{'LOWER': 'tb'}],
            [{'LOWER': 'tbs'}],
            [{'LOWER': 'bit'}],
            [{'LOWER': 'bits'}],
            [{'LOWER': 'byte'}],
            [{'LOWER': 'bytes'}],
            [{'LOWER': 'kilobyte'}],
            [{'LOWER': 'kilobytes'}],
            [{'LOWER': 'megabyte'}],
            [{'LOWER': 'megabytes'}],
            [{'LOWER': 'gigabyte'}],
            [{'LOWER': 'gigabytes'}],
            [{'LOWER': 'terrabyte'}],
            [{'LOWER': 'terrabytes'}],
        )
        self.fraction_matcher = Matcher(nlp.vocab)
        self.fraction_matcher.add('UNIT_OF_FRACTION', None,
            [{'ORTH': '%'}],
            [{'LOWER': 'percent'}],
            [{'LOWER': 'per'}, {'LOWER': 'cent'}]
        )
        self.angle_matcher = Matcher(nlp.vocab)
        self.angle_matcher.add('UNIT_OF_ANGLE', None,
            [{'LOWER': '°'}],
            [{'LOWER': '°c'}],
            [{'LOWER': 'deg'}],
            [{'LOWER': 'degs'}],
            [{'LOWER': 'degree'}],
            [{'LOWER': 'degrees'}],
        )
        self.distance_matcher = Matcher(nlp.vocab)
        self.distance_matcher.add('UNIT_OF_DISTANCE', None,
            [{'ORTH': 'nm'}],
            [{'LOWER': 'nanometer'}],
            [{'LOWER': 'nanometers'}],
            [{'ORTH': 'µm'}],
            [{'LOWER': 'micrometer'}],
            [{'LOWER': 'micrometers'}],
            [{'ORTH': 'mm'}],
            [{'LOWER': 'milimeter'}],
            [{'LOWER': 'milimeters'}],
            [{'ORTH': 'cm'}],
            [{'LOWER': 'centimeter'}],
            [{'LOWER': 'centimeters'}],
            [{'ORTH': 'm'}],
            [{'LOWER': 'meter'}],
            [{'LOWER': 'meters'}],
            [{'ORTH': 'km'}],
            [{'LOWER': 'kilometer'}],
            [{'LOWER': 'kilometers'}],
            [{'LOWER': 'zoll'}],
        )
        self.pressure_matcher = Matcher(nlp.vocab)
        self.pressure_matcher.add('UNIT_OF_PRESSURE', None,
            [{'LOWER': 'bar'}]  # Maybe add F/A
        )
        self.voltage_matcher = Matcher(nlp.vocab)
        self.voltage_matcher.add('UNIT_OF_VOLTAGE', None,
            [{'ORTH': 'V'}],
            [{'LOWER': 'volt'}],
        )
        self.speed_matcher = Matcher(nlp.vocab)
        self.speed_matcher.add('UNIT_OF_SPEED', None,
            [{'ENT_TYPE': 'distance'}, {'LOWER': {'REGEX': r'/|p'}}, {'ENT_TYPE': 'duration'}]
        )
        self.acceleration_matcher = Matcher(nlp.vocab)
        self.acceleration_matcher.add('UNIT_OF_ACCELERATION', None,
            [{'ENT_TYPE': 'speed'}, {'TEXT': {'REGEX': r'(^)?2|²'}}]
        )
        self.frequency_matcher = Matcher(nlp.vocab)
        self.frequency_matcher.add('UNIT_OF_FREQUENCY', None,
            [{'LOWER': 'hz'}],
            [{'LOWER': 'herz'}],  # common misspelling
            [{'LOWER': 'hertz'}],
            [{'LOWER': '1'}, {'ORTH': '/'}, {'ENT_TYPE': 'duration'}]
        )
        self.volume_matcher = Matcher(nlp.vocab)
        self.volume_matcher.add('UNIT_OF_VOLUME', None,
            [{'LOWER': 'l'}],
            [{'LOWER': 'liter'}],
            [{'ENT_TYPE': 'distance'}, {'TEXT': {'REGEX': r'(^)?3|³'}}]
        )
        self.torque_matcher = Matcher(nlp.vocab)
        self.torque_matcher.add('UNIT_OF_TORQUE', None,
            [{'ORTH': 'Nm'}],
            [{'LOWER': 'newtonmeter'}]
        )
        # TODO: RPM MATCHER
        self.operator_matcher = Matcher(nlp.vocab)
        self.operator_matcher.add('OPERATOR', None,  # For now only < and >
            [{'ORTH': '<'}, {'LIKE_NUM': True}],
            [{'ORTH': '>'}, {'LIKE_NUM': True}],
            [{'ORTH': '<'}, {'ORTH': '='}, {'LIKE_NUM': True}],
            [{'ORTH': '>'}, {'ORTH': '='}, {'LIKE_NUM': True}],
            [{'ORTH': '+'}, {'ORTH': '/'}, {'LIKE_NUM': True}],  # LIKE_NUM already includes + and -
        )
        self.measurement_matcher = Matcher(nlp.vocab)
        self.measurement_matcher.add('MEASUREMENT', None,
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'duration'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'memory'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'fraction'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'angle'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'distance'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'pressure'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'voltage'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'speed'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'acceleration'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'frequency'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'volume'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'torque'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'duration'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'memory'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'fraction'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'angle'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'distance'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'pressure'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'voltage'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'speed'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'acceleration'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'frequency'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'volume'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'torque'}],
            # TODO: 20 ... 30 UNIT, 20 to 30 UNIT, 20 of 60 UNIT
        )
    def __call__(self, doc):
        nlp = self.nlp
        # Split tokens containing a slash: 'km/h' -> 'km', '/', 'h'
        with doc.retokenize() as retokenizer:
            matches1 = self.split_matcher1(doc)
            for match_id, start, end in matches1:
                span = Span(doc, start, end)
                if len(span.text) > 1:
                    if '/' in span.text:
                        split = re.split(r'(/)', span.text)
                    if '>' in span.text:
                        split = re.split(r'(>)', span.text)
                    if '<' in span.text:
                        split = re.split(r'(<)', span.text)
                    heads = [(doc[start], i) for i, _ in enumerate(split)]
                    retokenizer.split(doc[start], split, heads=heads)
        # Split tokens containing ')', '²' or '³'
        with doc.retokenize() as retokenizer:
            matches2 = self.split_matcher2(doc)
            for match_id, start, end in matches2:
                span = Span(doc, start, end)
                if len(span.text) > 1:
                    split = [x for x in span.text]
                    heads = [(doc[start], i) for i, _ in enumerate(split)]
                    retokenizer.split(doc[start], split, heads=heads)

        def annotate(matcher, unit_type: str, attribute):
            with doc.retokenize() as retokenizer:
                # match and tag units
                matches = matcher(doc)
                entities = list(doc.ents)
                add_flag = True
                for match_id, start, end in matches:
                    span = Span(doc, start, end, label=unit_type)
                    for token in span:
                        setattr(token._, attribute, True)
                    try:
                        if len(span) > 1:
                            # retokenizer.merge(span)
                            pass
                    except ValueError:
                        pass
                    for e in entities[:]:
                        r_e = range(e.start + 1, e.end + 1)
                        r_n = range(start + 1, end + 1)
                        # Remove smaller entities which would overlap with the new one
                        if (end - start > e.end - e.start and (start + 1 in r_e or end in r_e)) or (start < e.start and end > e.end):
                            entities.remove(e)
                            continue
                        # Check if the entity to be added would overlap with an existing bigger one
                        if (e.end - e.start > end - start and (e.start + 1 in r_n or e.end in r_n)) or (e.start < start and e.end > end):
                            add_flag = False
                    if add_flag:
                        entities.append(span)
                    add_flag = True
                doc.ents = entities

        annotate(self.duration_matcher, 'duration', 'is_duration_unit')
        annotate(self.memory_matcher, 'memory', 'is_memory_unit')
        annotate(self.fraction_matcher, 'fraction', 'is_fraction_unit')
        annotate(self.angle_matcher, 'angle', 'is_angle_unit')
        annotate(self.distance_matcher, 'distance', 'is_distance_unit')
        annotate(self.pressure_matcher, 'pressure', 'is_pressure_unit')
        annotate(self.voltage_matcher, 'voltage', 'is_voltage_unit')
        annotate(self.speed_matcher, 'speed', 'is_speed_unit')
        annotate(self.acceleration_matcher, 'acceleration', 'is_acceleration_unit')
        annotate(self.frequency_matcher, 'frequency', 'is_frequency_unit')
        annotate(self.volume_matcher, 'volume', 'is_volume_unit')
        annotate(self.torque_matcher, 'torque', 'is_torque_unit')
        annotate(self.operator_matcher, 'operator', 'is_operator')
        annotate(self.measurement_matcher, 'measurement', 'is_measurement')
        return doc
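For completeness, this is roughly how the component is wired into the pipeline (a minimal sketch assuming spaCy 2.x; the example sentence is made up for illustration):

nlp = spacy.blank('en')
remove_units(nlp)
nlp.add_pipe(UnitAnnotator(nlp), name='unit_annotator', last=True)

doc = nlp('The sensor is specified for <2.0 m/s².')
print([(ent.text, ent.label_) for ent in doc.ents])                           # annotated unit/measurement spans
print([(t.text, t._.is_speed_unit, t._.is_acceleration_unit) for t in doc])   # per-token unit flags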
Answer (score: 0):
Since then I have found a solution that, in hindsight, is fairly obvious.
['<', '2.0', 'm', '/', 's', '²']
             SPEED SPEED SPEED
These are three tokens with entity type SPEED. It is therefore sufficient to use the "one or more" quantifier:
[{'ENT_TYPE': 'speed', 'OP': '+'}, {'TEXT': {'REGEX': r'(^)?2|²'}}]
With this solution the entity type is still overwritten, but the underlying units remain stored as features on each token.
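To illustrate, a minimal standalone check of the quantified pattern (assuming spaCy 2.x and repeating the hand-built doc that mirrors the tokenization and speed entity from the question):

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span

nlp = spacy.blank('en')
doc = Doc(nlp.vocab, words=['<', '2.0', 'm', '/', 's', '²'])
doc.ents = [Span(doc, 2, 5, label='speed')]  # 'm / s' already annotated as speed

matcher = Matcher(nlp.vocab)
matcher.add('UNIT_OF_ACCELERATION', None,
    [{'ENT_TYPE': 'speed', 'OP': '+'}, {'TEXT': {'REGEX': r'(^)?2|²'}}]
)

for match_id, start, end in matcher(doc):
    print(doc[start:end])  # the full 'm / s ²' span is now among the matches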