如何将注释ann文件转换为xml?

时间:2014-11-02 23:17:29

标签: xml annotations brat

我有一份使用 brat 标注的数据,想把 .ann 文件转换为 .xml 文件,以便简化后续处理。

有没有工具?

谢谢

1 个答案:

答案 0 :(得分:1)

我遇到了同样的问题,我创建了自己的转换器。

def find_sentence(ary, cond):
    """Return the sentence span that encloses the annotation span ``cond``.

    ary:  sequence of (start, end) sentence offsets. The forward scan below
          presumes they are listed in ascending order -- NOTE(review): confirm
          against the caller.
    cond: (start, end) offsets of the annotation (integer offsets).

    Returns:
      * the matching element of ``ary`` when a single sentence encloses the
        whole annotation;
      * a new ``(start, end)`` tuple running from the start of the sentence in
        which the annotation begins to the end of the first later sentence that
        reaches the annotation's end, when the annotation crosses sentences;
      * ``None`` when no enclosing span exists (no sentence matches, or the
        annotation runs past the end of the last sentence).
    """
    for index, item in enumerate(ary):
        # Offsets are integers, so "a - 1 < b" is written as "a <= b".
        if item[0] <= cond[0] and item[1] >= cond[1]:
            return item
        if item[1] < cond[0]:
            # Sentence ends before the annotation starts: keep looking.
            continue
        if item[0] <= cond[0] and item[1] < cond[1]:
            # The annotation starts here but ends in a later sentence. Scan
            # only the sentences that actually exist, so that an annotation
            # running past the last sentence yields None instead of walking
            # off the end of the list.
            for later in ary[index + 1:]:
                if later[1] >= cond[1]:
                    return (item[0], later[1])
            return None
    return None

def isPossible(errors, err):
    """Return True when ``err`` is one of the entries of ``errors``.

    errors: container of accepted values (any object supporting ``in``).
    err:    value to look for.

    Never raises for a non-container ``errors``; such input simply yields
    False, as the earlier implementation did.
    """
    try:
        return err in errors
    except TypeError:
        # ``errors`` does not support membership tests (e.g. None).
        return False


def compose(collection, document, filters='all'):
    """Render the annotated entities of a document as XML sentence fragments.

    collection, document: passed to ``get_document`` to load the annotation
        data; ``document`` is also passed to ``get_version`` and ``get_lang``.
    filters: ``'all'`` to keep every entity, otherwise a ``'::'``-separated
        list of entity types to keep.

    The loaded document is read through these keys:
      * ``'text'``             -- the full document text;
      * ``'sentence_offsets'`` -- handed to ``find_sentence``;
      * ``'entities'``         -- item[0] is the id, item[1] the type and
                                  item[2][0] the (start, end) offsets; only
                                  this first offset pair is used;
      * ``'attributes'``       -- item[1] is the name, item[2] the id of the
                                  entity it belongs to, item[3] the value;
      * ``'comments'``         -- item[0] is the id of the entity it belongs
                                  to, item[2] the note text.

    Returns a string with one line per kept entity, each line preceded by a
    newline and shaped as
    ``<sentence>...<error type="T" name="value" ><incorrect>...</incorrect>
    <note>...</note></error>...</sentence>``; an empty string when nothing is
    kept. Text and attribute values are XML-escaped.
    """
    from xml.sax.saxutils import escape

    def _xml_text(value):
        # Format first so that non-string values are accepted too.
        return escape(u'{0}'.format(value))

    def _xml_attr(value):
        # Attribute values are wrapped in double quotes, so escape those too.
        return escape(u'{0}'.format(value), {u'"': u'&quot;'})

    # NOTE(review): the results of these two calls were never used; the calls
    # are kept in case the helpers validate the document -- confirm and drop.
    get_version(document)
    get_lang(document)

    document = get_document(collection, document)
    text = document['text']
    sentences = document['sentence_offsets']

    # Later duplicates of an entity id replace earlier ones.
    entities = {}
    for entity in document['entities']:
        entities[entity[0]] = entity

    # Group per entity id so that an entity keeps all of its attributes and
    # comments rather than only the last one seen.
    attributes = {}
    for attr in document['attributes']:
        attributes.setdefault(attr[2], []).append(attr)
    comments = {}
    for note in document['comments']:
        comments.setdefault(note[0], []).append(note)

    allowed_types = None if filters == 'all' else filters.split('::')

    lines = []
    for key in entities:
        entity = entities[key]
        error_type = entity[1]
        if allowed_types is not None and not isPossible(allowed_types, error_type):
            continue
        start, end = entity[2][0][0], entity[2][0][1]

        tag = u'<error type="{0}"'.format(_xml_attr(error_type))
        for attr in attributes.get(key, ()):
            tag += u' {0}="{1}" '.format(attr[1], _xml_attr(attr[3]))
        tag += u'><incorrect>{0}</incorrect>'.format(_xml_text(text[start:end]))
        for note in comments.get(key, ()):
            tag += u'<note>{0}</note>'.format(_xml_text(note[2]))
        tag += u'</error>'

        span = find_sentence(sentences, (start, end))
        if span is None:
            # No enclosing sentence was found: wrap the annotated span alone
            # instead of failing on the missing offsets.
            span = (start, end)
        sentence = text[span[0]:span[1]]
        # Positions of the annotation relative to the sentence slice.
        head_end = start - span[0]
        tail_start = len(sentence) - (span[1] - end)
        lines.append(
            u'\n' + '<sentence>' + _xml_text(sentence[:head_end]) + tag
            + _xml_text(sentence[tail_start:]) + '</sentence>'
        )
    return ''.join(lines)

我把这些函数放在 brat/server/document.py 中,并在 brat/server/download.py 中生成最终有效的 XML 文件。