Can't parse YAML correctly

时间:2015-10-06 09:03:15

标签: python yaml pyyaml

I parse the following YAML data in python:

>>> import yaml
>>> yaml.load("""
... ---
... categories: {1: Yes, 2: No}
... increasing: [00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10]
... ...
... """)

And get this as output:

{'increasing': [0, 1, 2, 3, 4, 5, 6, 7, '08', '09', 10], 'categories': {1: True, 2: False}}
  • Why are "Yes" and "No" converted to True and False?
  • Why are "08" and "09" parsed as strings whereas the other digits are parsed as numbers with leading zeros truncated?

2 个答案:

答案 0 :(得分:5)

您关于 00 到 07 的前导零被截断的推断是不正确的。由于带有前导 0，它们都被当作八进制数来解释。

由于八进制数不能包含数字 8 或 9，08 和 09 只能是字符串，您的 YAML 解析器也正是这样加载它们的。

这实际上是 YAML 1.1 的遗留行为（向后兼容）；在 YAML 1.2 中，八进制数应该以 0o 开头。

Yes 和 No 分别被加载为 True 和 False，这同样是 YAML 1.1 的做法。1.2 规范不再提及这些替代写法。如果给这些字符串加上引号，它们就不会被转换。

通过添加以下规则，您可以相对轻松地构建一个不接受 True/False 的 Yes/No/On/Off 变体的解析器（resolver）：

MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:bool',
    re.compile(u'''^(?:true|True|TRUE|false|False|FALSE)$''', re.X),
    list(u'tTfF'))

或者使用普通的 Resolver 并删除相应的起始字符条目，如下面的代码所示：

import ruamel.yaml as yaml
from ruamel.yaml.resolver import Resolver

# Input document: two plain scalars (Yes / No) inside a flow mapping.
yaml_str = "categories: {1: Yes, 2: No}\n"

# Remove the lookup-table entries keyed by the first characters of
# yes/no/on/off so those words are no longer implicitly resolved.
# NOTE(review): `del` drops *every* implicit resolver stored under these
# characters, not just the boolean one -- confirm nothing else registered
# under 'n'/'N'/'o'/'O' is still needed. It also raises KeyError if the
# entries have already been removed earlier in the same process.
for ch in u'yYnNoO':
    del Resolver.yaml_implicit_resolvers[ch]


data = yaml.load(yaml_str, Loader=yaml.Loader)
print(data)

上面的代码会给你：{'categories': {1: 'Yes', 2: 'No'}}

将所有以 0 开头的纯数字字符串识别为普通整数并不那么简单，因为如果只更改 int 的隐式解析器并放行以 0 开头的字符串，就会遇到解析问题：08 会按八进制进行转换¹。

下面的代码同时替换了 int 的构造函数，它会打印 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]：

import re
import ruamel.yaml as yaml
from ruamel.yaml.reader import Reader
from ruamel.yaml.resolver import BaseResolver, Resolver
from ruamel.yaml.scanner import RoundTripScanner
from ruamel.yaml.parser_ import Parser
from ruamel.yaml.composer import Composer
from ruamel.yaml.constructor import RoundTripConstructor
from ruamel.yaml import RoundTripLoader
from ruamel.yaml.compat import to_str


# Sample document: Yes/No plain scalars plus zero-padded numbers, including
# 08 and 09, which are not valid octal literals.
yaml_str = (
    "categories: {1: Yes, 2: No}\n"
    "increasing: [00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10]\n"
)


class MyResolver(BaseResolver):
    """Resolver subclass whose implicit rules are registered explicitly.

    The class body adds nothing itself; the tag patterns are attached
    afterwards through ``MyResolver.add_implicit_resolver(...)`` calls.
    """

# Each call below registers (tag, pattern, first characters) on MyResolver.
# The last argument lists the characters a plain scalar may start with for
# the pattern to be considered.

# Booleans: only the true/false spellings are listed. Yes/No/On/Off are
# deliberately absent, so those words stay plain strings.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:bool',
    re.compile(u'''^(?:true|True|TRUE|false|False|FALSE)$''', re.X),
    list(u'tTfF'))

# Floats: decimal with optional exponent, sexagesimal (a:b.c), and the
# .inf / .nan spellings.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:float',
    re.compile(u'''^(?:
     [-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+]?[0-9]+)?
    |[-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+)
    |\\.[0-9_]+(?:[eE][-+][0-9]+)?
    |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*
    |[-+]?\\.(?:inf|Inf|INF)
    |\\.(?:nan|NaN|NAN))$''', re.X),
    list(u'-+0123456789.'))

# Integers: binary (0b), octal (0o, or a bare leading 0 followed by octal
# digits), hexadecimal (0x), sexagesimal, and plain digit runs. The
# `[-+]?[0-9]+` alternative is what lets zero-padded values such as 08 and
# 09 match as integers.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:int',
    re.compile(u'''^(?:[-+]?0b[0-1_]+
    |[-+]?[0-9]+
    |[-+]?0o?[0-7_]+
    |[-+]?(?:0|[1-9][0-9_]*)
    |[-+]?0x[0-9a-fA-F_]+
    |[-+]?[1-9][0-9_]*(?::[0-5]?[0-9])+)$''', re.X),
    list(u'-+0123456789'))

# Merge key: the literal scalar `<<`.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:merge',
    re.compile(u'^(?:<<)$'),
    [u'<'])

# Null: `~`, the null/Null/NULL spellings, or an empty scalar (the pattern
# is compiled in verbose mode, so the spaces in it are not significant).
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:null',
    re.compile(u'''^(?: ~
    |null|Null|NULL
    | )$''', re.X),
    [u'~', u'n', u'N', u''])

# Timestamps: a plain YYYY-MM-DD date, or a date followed by a time with
# optional fractional seconds and an optional Z / +-hh[:mm] offset.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:timestamp',
    re.compile(u'''^(?:[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]
    |[0-9][0-9][0-9][0-9] -[0-9][0-9]? -[0-9][0-9]?
    (?:[Tt]|[ \\t]+)[0-9][0-9]?
    :[0-9][0-9] :[0-9][0-9] (?:\\.[0-9]*)?
    (?:[ \\t]*(?:Z|[-+][0-9][0-9]?(?::[0-9][0-9])?))?)$''', re.X),
    list(u'0123456789'))

# Value key: the literal scalar `=`.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:value',
    re.compile(u'^(?:=)$'),
    [u'='])

# The following resolver is only for documentation purposes. It cannot work
# because plain scalars cannot start with '!', '&', or '*'.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:yaml',
    re.compile(u'^(?:!|&|\\*)$'),
    list(u'!&*'))


class MyRoundTripConstructor(RoundTripConstructor):
    """Round-trip constructor with a decimal-friendly integer builder."""

    def construct_yaml_int(self, node):
        """Convert an int-tagged scalar node into a Python ``int``.

        Underscores are stripped and an optional leading sign is honoured.
        ``0b``, ``0x`` and ``0o`` prefixes select base 2, 16 and 8;
        colon-separated values are read as base-60 digits; everything else
        is parsed as decimal.
        """
        text = to_str(self.construct_scalar(node)).replace('_', '')
        sign = -1 if text[0] == '-' else +1
        if text[0] in '+-':
            text = text[1:]

        if text == '0':
            return 0

        for prefix, radix in (('0b', 2), ('0x', 16), ('0o', 8)):
            if text.startswith(prefix):
                return sign * int(text[2:], radix)

        # Deliberately no branch treating a bare leading '0' as octal:
        # zero-padded values such as 08 and 09 must parse as decimal.
        if ':' in text:
            total = 0
            for part in text.split(':'):
                # Most significant base-60 digit first.
                total = total * 60 + int(part)
            return sign * total

        return sign * int(text)

# Make the class's own construct_yaml_int the handler for the YAML int tag.
MyRoundTripConstructor.add_constructor(
    u'tag:yaml.org,2002:int', MyRoundTripConstructor.construct_yaml_int)


class MyRoundTripLoader(Reader, RoundTripScanner, Parser,
                        Composer, MyRoundTripConstructor, MyResolver):
    """Loader combining the round-trip pipeline with the custom
    constructor and resolver classes."""

    def __init__(self, stream):
        """Initialise every component; only the reader receives ``stream``."""
        Reader.__init__(self, stream)
        # The remaining components are initialised without arguments, in
        # the same order as the base-class list.
        for component in (RoundTripScanner, Parser, Composer,
                          MyRoundTripConstructor, MyResolver):
            component.__init__(self)

# Drop the y/Y/n/N/o/O first-character entries from the stock Resolver table.
# NOTE(review): MyRoundTripLoader is built on MyResolver, not Resolver, so
# this deletion looks unnecessary for the load below -- confirm. It raises
# KeyError if the entries were already removed in the same process.
for ch in u'yYnNoO':
    del Resolver.yaml_implicit_resolvers[ch]

# Load with the custom loader and show the zero-padded sequence.
data = yaml.load(yaml_str, Loader=MyRoundTripLoader)
print(data['increasing'])

（它同样会把 Yes/No 保留为字符串，而且无需先把识别模式插入内部查找表。）

¹ 我使用了 ruamel.yaml，我是它的作者。ruamel.yaml 所基于的 PyYAML 应该也能支持类似的派生方式。

答案 1 :(得分:0)

Yes 和 No 在 YAML 中具有特殊含义，请参阅 Wikipedia 上的相关文章。为了避免这种情况，您可以修改 YAML，为它们加上引号，如下所示：

>>> yaml.load("""
... ---
... categories: {1: "Yes", 2: "No"}
... increasing: [00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10]
... ...
... """)

关于 08 和 09 的前导零，我不太清楚为什么会这样，但这看起来确实是 Python 方面的问题。