I am building a unit annotator based on the ideas behind MedaCy. First, all basic unit types are set up, and these are then used to build more complex units. For example:
m/s² → m (distance) / s (duration) ² → m/s (speed) ² → m/s² (acceleration)
To make this work, I changed parts of the tokenization so that numbers are always split from alphabetic characters and similar symbols:
'<2.0 m /s²' → ['<', '2.0', 'm', '/', 's', '²']
My current problem, however, is that whenever an entity is recognized, I can only achieve the last step (acceleration) by merging tokens. That merge loses the features and entity information of the underlying tokens, which is exactly what I want to avoid. So I disabled the merging of tokens, but now I cannot complete the last step of annotating an entity such as acceleration, because the matcher works on tokens. As shown below, the matcher cannot detect the entity because it spans multiple tokens. (Note that speed is correctly annotated across multiple tokens.)
[{'ENT_TYPE': 'speed'}, {'TEXT': {'REGEX': r'(^)?2|²'}}]
['<', '2.0', 'm', '/', 's', '²']
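(To illustrate the problem, a minimal standalone sketch, assuming spaCy 2.x and a hand-built doc in which 'm / s' is already annotated as speed: the single-token pattern never covers the full 'm / s ²' span, so the acceleration entity cannot be annotated over all three speed tokens.)

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span

nlp = spacy.blank('en')
doc = Doc(nlp.vocab, words=['<', '2.0', 'm', '/', 's', '²'])
doc.ents = [Span(doc, 2, 5, label='speed')]  # 'm / s' tagged as speed

matcher = Matcher(nlp.vocab)
matcher.add('UNIT_OF_ACCELERATION', None,
    [{'ENT_TYPE': 'speed'}, {'TEXT': {'REGEX': r'(^)?2|²'}}]
)

for match_id, start, end in matcher(doc):
    print(doc[start:end])  # only 's ²' is matched, never the full 'm / s ²' span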
Adding every possible token combination to the acceleration matcher is not an option for me, because that would defeat the idea of building all units bottom-up.
Another solution I considered is using multiple entity rulers, since the basic units have to be tagged first and the more complex units afterwards. However, this seems to run into the same tokenization problem, and on top of that I get the error that there can only be one entity ruler: 'entity_ruler' already exists in pipeline. Existing names: ['entity_ruler']
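(Side note: the name collision itself could probably be avoided by registering each ruler under its own name, roughly as in the sketch below, assuming spaCy 2.x; this would not solve the tokenization issue, though.)

from spacy.pipeline import EntityRuler

base_ruler = EntityRuler(nlp)      # patterns for the basic units
complex_ruler = EntityRuler(nlp)   # patterns that build on the basic units
nlp.add_pipe(base_ruler, name='entity_ruler_base')
nlp.add_pipe(complex_ruler, name='entity_ruler_complex', after='entity_ruler_base')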
In summary, I want to annotate entities using entities that span multiple tokens, and purely token-based matching does not work for that.
The following is called right after creating the blank spaCy model.
import spacy

def remove_units(nlp):
    suffixes = list(nlp.Defaults.suffixes)
    UNITS = '(?<=[0-9])(?:km|km²|km³|m|m²|m³|dm|dm²|dm³|cm|cm²|cm³|mm|mm²|mm³|ha|µm|nm|yd|in|ft|kg|g|mg|µg|t|lb|oz|m/s|km/h|kmh|mph|hPa|Pa|mbar|mb|MB|kb|KB|gb|GB|tb|TB|T|G|M|K|%|км|км²|км³|м|м²|м³|дм|дм²|дм³|см|см²|см³|мм|мм²|мм³|нм|кг|г|мг|м/с|км/ч|кПа|Па|мбар|Кб|КБ|кб|Мб|МБ|мб|Гб|ГБ|гб|Тб|ТБ|тб|كم|كم²|كم³|م|م²|م³|سم|سم²|سم³|مم|مم²|مم³|كم|غرام|جرام|جم|كغ|ملغ|كوب|اكواب)'
    suffixes.remove(UNITS)  # drop spaCy's default unit suffix rule
    # Use the filtered list (without the default UNITS entry) plus the custom rules
    suffixes = tuple(suffixes) + (
        r'(?<=[0-9])(?:[A-Za-z]+[^.,:;]*)',  # split numbers from alphabetic characters
        r'/',                                # split slashes from other characters: 'm/' -> 'm', '/'
    )
    suffix_regex = spacy.util.compile_suffix_regex(suffixes)
    nlp.tokenizer.suffix_search = suffix_regex.search

    prefixes = tuple(nlp.Defaults.prefixes) + (
        r'/',                                # split slashes from other characters: '/s' -> '/', 's'
    )
    prefix_regex = spacy.util.compile_prefix_regex(prefixes)
    nlp.tokenizer.prefix_search = prefix_regex.search
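For context, a minimal usage sketch of the function above (assuming spaCy 2.x, a blank English model, and that the UNITS string matches the default suffix entry of the installed spaCy version):

import spacy

nlp = spacy.blank('en')
remove_units(nlp)
doc = nlp('<2.0 m /s²')
print([t.text for t in doc])  # goal: numbers split from letters, '/' split from its neighbours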
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.tokens import Token
import re

class UnitAnnotator(object):
    name = "unit_annotator"
    dependencies = []

    def __init__(self, nlp):
        self.nlp = nlp
        Token.set_extension('is_duration_unit', default=False, force=True)
        Token.set_extension('is_memory_unit', default=False, force=True)
        Token.set_extension('is_fraction_unit', default=False, force=True)
        Token.set_extension('is_angle_unit', default=False, force=True)
        Token.set_extension('is_distance_unit', default=False, force=True)
        Token.set_extension('is_pressure_unit', default=False, force=True)
        Token.set_extension('is_voltage_unit', default=False, force=True)
        Token.set_extension('is_speed_unit', default=False, force=True)
        Token.set_extension('is_acceleration_unit', default=False, force=True)
        Token.set_extension('is_frequency_unit', default=False, force=True)
        Token.set_extension('is_volume_unit', default=False, force=True)
        Token.set_extension('is_torque_unit', default=False, force=True)
        Token.set_extension('is_operator', default=False, force=True)
        Token.set_extension('is_measurement', default=False, force=True)

        # For splitting equations first
        self.split_matcher1 = Matcher(nlp.vocab)
        self.split_matcher1.add('SPLIT1', None,
            [{'TEXT': {'REGEX': r'[/\>\<]'}}],  # 'km/h' -> 'km', '/', 'h'
        )
        self.split_matcher2 = Matcher(nlp.vocab)
        self.split_matcher2.add('SPLIT2', None,
            [{'TEXT': {'REGEX': r'[²³]'}}],  # 'm²' -> 'm', '²'
            [{'TEXT': {'REGEX': r'\)'}}],    # '8)' -> '8', ')'
            # TODO: FIX splitting of '(mis)interventions'
        )
        self.duration_matcher = Matcher(nlp.vocab)
        self.duration_matcher.add('UNIT_OF_duration', None,
            [{'ORTH': 'ms'}],
            [{'LOWER': 'msec'}],
            [{'LOWER': 'milisecond'}],
            [{'LOWER': 'miliseconds'}],
            [{'ORTH': 's'}],
            [{'LOWER': 'sec'}],
            [{'LOWER': 'second'}],
            [{'LOWER': 'seconds'}],
            [{'LOWER': 'min'}],
            [{'LOWER': 'mins'}],
            [{'LOWER': 'minute'}],
            [{'LOWER': 'minutes'}],
            [{'ORTH': 'h'}],
            [{'LOWER': 'hour'}],
            [{'LOWER': 'hours'}]
        )
        self.memory_matcher = Matcher(nlp.vocab)
        self.memory_matcher.add('UNIT_OF_MEMORY', None,
            [{'LOWER': 'kb'}],
            [{'LOWER': 'kbs'}],
            [{'LOWER': 'kbit'}],
            [{'LOWER': 'kbits'}],
            [{'LOWER': 'mb'}],
            [{'LOWER': 'mbs'}],
            [{'LOWER': 'mbit'}],
            [{'LOWER': 'mbits'}],
            [{'LOWER': 'gb'}],
            [{'LOWER': 'gbs'}],
            [{'LOWER': 'gbit'}],
            [{'LOWER': 'gbits'}],
            [{'LOWER': 'tb'}],
            [{'LOWER': 'tbs'}],
            [{'LOWER': 'bit'}],
            [{'LOWER': 'bits'}],
            [{'LOWER': 'byte'}],
            [{'LOWER': 'bytes'}],
            [{'LOWER': 'kilobyte'}],
            [{'LOWER': 'kilobytes'}],
            [{'LOWER': 'megabyte'}],
            [{'LOWER': 'megabytes'}],
            [{'LOWER': 'gigabyte'}],
            [{'LOWER': 'gigabytes'}],
            [{'LOWER': 'terrabyte'}],
            [{'LOWER': 'terrabytes'}],
        )
        self.fraction_matcher = Matcher(nlp.vocab)
        self.fraction_matcher.add('UNIT_OF_FRACTION', None,
            [{'ORTH': '%'}],
            [{'LOWER': 'percent'}],
            [{'LOWER': 'per'}, {'LOWER': 'cent'}]
        )
        self.angle_matcher = Matcher(nlp.vocab)
        self.angle_matcher.add('UNIT_OF_ANGLE', None,
            [{'LOWER': '°'}],
            [{'LOWER': '°c'}],
            [{'LOWER': 'deg'}],
            [{'LOWER': 'degs'}],
            [{'LOWER': 'degree'}],
            [{'LOWER': 'degrees'}],
        )
        self.distance_matcher = Matcher(nlp.vocab)
        self.distance_matcher.add('UNIT_OF_DISTANCE', None,
            [{'ORTH': 'nm'}],
            [{'LOWER': 'nanometer'}],
            [{'LOWER': 'nanometers'}],
            [{'ORTH': 'µm'}],
            [{'LOWER': 'micrometer'}],
            [{'LOWER': 'micrometers'}],
            [{'ORTH': 'mm'}],
            [{'LOWER': 'milimeter'}],
            [{'LOWER': 'milimeters'}],
            [{'ORTH': 'cm'}],
            [{'LOWER': 'centimeter'}],
            [{'LOWER': 'centimeters'}],
            [{'ORTH': 'm'}],
            [{'LOWER': 'meter'}],
            [{'LOWER': 'meters'}],
            [{'ORTH': 'km'}],
            [{'LOWER': 'kilometer'}],
            [{'LOWER': 'kilometers'}],
            [{'LOWER': 'zoll'}],
        )
        self.pressure_matcher = Matcher(nlp.vocab)
        self.pressure_matcher.add('UNIT_OF_PRESSURE', None,
            [{'LOWER': 'bar'}]  # Maybe add F/A
        )
        self.voltage_matcher = Matcher(nlp.vocab)
        self.voltage_matcher.add('UNIT_OF_VOLTAGE', None,
            [{'ORTH': 'V'}],
            [{'LOWER': 'volt'}],
        )
        self.speed_matcher = Matcher(nlp.vocab)
        self.speed_matcher.add('UNIT_OF_SPEED', None,
            [{'ENT_TYPE': 'distance'}, {'LOWER': {'REGEX': r'/|p'}}, {'ENT_TYPE': 'duration'}]
        )
        self.acceleration_matcher = Matcher(nlp.vocab)
        self.acceleration_matcher.add('UNIT_OF_ACCELERATION', None,
            [{'ENT_TYPE': 'speed'}, {'TEXT': {'REGEX': r'(^)?2|²'}}]
        )
        self.frequency_matcher = Matcher(nlp.vocab)
        self.frequency_matcher.add('UNIT_OF_FREQUENCY', None,
            [{'LOWER': 'hz'}],
            [{'LOWER': 'herz'}],  # common misspelling
            [{'LOWER': 'hertz'}],
            [{'LOWER': '1'}, {'ORTH': '/'}, {'ENT_TYPE': 'duration'}]
        )
        self.volume_matcher = Matcher(nlp.vocab)
        self.volume_matcher.add('UNIT_OF_VOLUME', None,
            [{'LOWER': 'l'}],
            [{'LOWER': 'liter'}],
            [{'ENT_TYPE': 'distance'}, {'TEXT': {'REGEX': r'(^)?3|³'}}]
        )
        self.torque_matcher = Matcher(nlp.vocab)
        self.torque_matcher.add('UNIT_OF_TORQUE', None,
            [{'ORTH': 'Nm'}],
            [{'LOWER': 'newtonmeter'}]
        )
        # TODO: RPM MATCHER
        self.operator_matcher = Matcher(nlp.vocab)
        self.operator_matcher.add('OPERATOR', None,  # For now only < and >
            [{'ORTH': '<'}, {'LIKE_NUM': True}],
            [{'ORTH': '>'}, {'LIKE_NUM': True}],
            [{'ORTH': '<'}, {'ORTH': '='}, {'LIKE_NUM': True}],
            [{'ORTH': '>'}, {'ORTH': '='}, {'LIKE_NUM': True}],
            [{'ORTH': '+'}, {'ORTH': '/'}, {'LIKE_NUM': True}],  # LIKE_NUM already includes + and -
        )
        self.measurement_matcher = Matcher(nlp.vocab)
        self.measurement_matcher.add('MEASUREMENT', None,
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'duration'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'memory'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'fraction'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'angle'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'distance'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'pressure'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'voltage'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'speed'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'acceleration'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'frequency'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'volume'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'torque'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'duration'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'memory'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'fraction'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'angle'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'distance'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'pressure'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'voltage'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'speed'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'acceleration'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'frequency'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'volume'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'torque'}],
            # TODO: 20 ... 30 UNIT, 20 to 30 UNIT, 20 of 60 UNIT
        )
    def __call__(self, doc):
        nlp = self.nlp
        # Split tokens containing a slash: 'km/h' -> 'km', '/', 'h'
        with doc.retokenize() as retokenizer:
            matches1 = self.split_matcher1(doc)
            for match_id, start, end in matches1:
                span = Span(doc, start, end)
                if len(span.text) > 1:
                    if '/' in span.text:
                        split = re.split(r'(/)', span.text)
                    if '>' in span.text:
                        split = re.split(r'(>)', span.text)
                    if '<' in span.text:
                        split = re.split(r'(<)', span.text)
                    heads = [(doc[start], i) for i, _ in enumerate(split)]
                    retokenizer.split(doc[start], split, heads=heads)
        # Split tokens containing ')', '²' or '³'
        with doc.retokenize() as retokenizer:
            matches2 = self.split_matcher2(doc)
            for match_id, start, end in matches2:
                span = Span(doc, start, end)
                if len(span.text) > 1:
                    split = [x for x in span.text]
                    heads = [(doc[start], i) for i, _ in enumerate(split)]
                    retokenizer.split(doc[start], split, heads=heads)

        def annotate(matcher, unit_type: str, attribute):
            with doc.retokenize() as retokenizer:
                # match and tag units
                matches = matcher(doc)
                entities = list(doc.ents)
                add_flag = True
                for match_id, start, end in matches:
                    span = Span(doc, start, end, label=unit_type)
                    for token in span:
                        setattr(token._, attribute, True)
                    try:
                        if len(span) > 1:
                            # retokenizer.merge(span)
                            pass
                    except ValueError:
                        pass
                    for e in entities[:]:
                        r_e = range(e.start + 1, e.end + 1)
                        r_n = range(start + 1, end + 1)
                        # Remove smaller entities which would overlap with the new one
                        if (end - start > e.end - e.start and (start + 1 in r_e or end in r_e)) or (start < e.start and end > e.end):
                            entities.remove(e)
                            continue
                        # Check if the entity to be added would overlap with an existing bigger one
                        if (e.end - e.start > end - start and (e.start + 1 in r_n or e.end in r_n)) or (e.start < start and e.end > end):
                            add_flag = False
                    if add_flag:
                        entities.append(span)
                    add_flag = True
                doc.ents = entities

        annotate(self.duration_matcher, 'duration', 'is_duration_unit')
        annotate(self.memory_matcher, 'memory', 'is_memory_unit')
        annotate(self.fraction_matcher, 'fraction', 'is_fraction_unit')
        annotate(self.angle_matcher, 'angle', 'is_angle_unit')
        annotate(self.distance_matcher, 'distance', 'is_distance_unit')
        annotate(self.pressure_matcher, 'pressure', 'is_pressure_unit')
        annotate(self.voltage_matcher, 'voltage', 'is_voltage_unit')
        annotate(self.speed_matcher, 'speed', 'is_speed_unit')
        annotate(self.acceleration_matcher, 'acceleration', 'is_acceleration_unit')
        annotate(self.frequency_matcher, 'frequency', 'is_frequency_unit')
        annotate(self.volume_matcher, 'volume', 'is_volume_unit')
        annotate(self.torque_matcher, 'torque', 'is_torque_unit')
        annotate(self.operator_matcher, 'operator', 'is_operator')
        annotate(self.measurement_matcher, 'measurement', 'is_measurement')
        return doc
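For completeness, this is roughly how the component is wired into the pipeline (a minimal sketch assuming spaCy 2.x; the example sentence is made up for illustration):

nlp = spacy.blank('en')
remove_units(nlp)
nlp.add_pipe(UnitAnnotator(nlp), name='unit_annotator', last=True)

doc = nlp('The sensor is specified for <2.0 m/s².')
print([(ent.text, ent.label_) for ent in doc.ents])                           # annotated unit/measurement spans
print([(t.text, t._.is_speed_unit, t._.is_acceleration_unit) for t in doc])   # per-token unit flags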
Answer (score: 0):
Since then I have found a solution that, in hindsight, is fairly obvious.
['<', '2.0', 'm', '/', 's', '²']
             SPEED SPEED SPEED
These are three tokens with entity type SPEED. It is therefore sufficient to use the "one or more" quantifier:
[{'ENT_TYPE': 'speed', 'OP': '+'}, {'TEXT': {'REGEX': r'(^)?2|²'}}]
With this solution the entity type is still overwritten, but the underlying units remain stored as features on each token.
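To illustrate, a minimal standalone check of the quantified pattern (assuming spaCy 2.x and repeating the hand-built doc that mirrors the tokenization and speed entity from the question):

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span

nlp = spacy.blank('en')
doc = Doc(nlp.vocab, words=['<', '2.0', 'm', '/', 's', '²'])
doc.ents = [Span(doc, 2, 5, label='speed')]  # 'm / s' already annotated as speed

matcher = Matcher(nlp.vocab)
matcher.add('UNIT_OF_ACCELERATION', None,
    [{'ENT_TYPE': 'speed', 'OP': '+'}, {'TEXT': {'REGEX': r'(^)?2|²'}}]
)

for match_id, start, end in matcher(doc):
    print(doc[start:end])  # the full 'm / s ²' span is now among the matches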