Question

假设我有一个node.js应用程序，该应用程序以一种奇怪的格式接收输入：任意插入JSON的字符串，如下所示：

This is a string {"with":"json","in":"it"} followed by more text {"and":{"some":["more","json"]}} and more text

对于此输入文本，我有一些保证：

JSON对象之间的文字文本始终没有花括号。
推入文本中的顶级JSON对象始终是对象文字，而不是数组。

我的目标是将其拆分为一个数组，使文字文本保留下来，并解析JSON，如下所示：

[
    "This is a string ",
    {"with":"json","in":"it"},
    " followed by more text ",
    {"and":{"some":["more","json"]}},
    " and more text"
]

到目前为止，我已经写了一个naive solution，它只靠花括号来决定JSON的起始和终止位置。但是，如果JSON中含有带花括号的字符串（例如{"like":"this one } right here"}），则此方法将失效。我可以尝试通过统计引号之类的办法来解决该问题，但随后我还必须考虑转义的引号。那时，我感觉像是在重做JSON.parse的工作。有解决这个问题的更好方法吗？

Answer 1

您可以检查JSON.parse是否引发错误，以确定该块是否为有效的JSON对象。如果抛出错误，则不带引号的}不平衡：

// Sample inputs for the demo: pure JSON, pure text, braces inside JSON string
// values, an empty object, and the mixed text/JSON example from the question.
const tests = [
  '{"just":"json }}{}{}{{[]}}}}","x":[1,2,3]}',
  'Just a string',
  'This string has a tricky case: {"like":"this one } right here"}',
  'This string {} has a tiny JSON object in it.',
  '.{}.',
  'This is a string {"with":"json","in":"it"} followed by more text {"and":{"some":["more","json"]}} and more text',
];

// Print the chunk list produced for every sample, in order.
for ( const test of tests ) {
  console.log( parse_json_interleaved_string( test ) );
}

/**
 * Split a string made of plain text interleaved with top-level JSON object
 * literals into an ordered list of chunks.
 *
 * The end of each JSON object is found by trying every following `}` in turn
 * and keeping the first candidate that `JSON.parse` accepts, so braces inside
 * JSON string values do not terminate the object early.
 *
 * @param {string} str - Input text; plain-text parts are expected to contain
 *   no curly braces.
 * @returns {string[]} Chunks in input order. JSON chunks are returned as their
 *   raw source substrings (they are validated but not parsed). When the input
 *   contains no `{` at all, the result is a one-element array holding `str`.
 * @throws {Error} 'Unterminated JSON object in string' when a `{` is never
 *   closed by a `}` that yields valid JSON.
 */
function parse_json_interleaved_string ( str ) {
  const chunks = [ ];

  // Index of the first character that has not been emitted yet.
  let cursor = 0;
  let json_index = str.indexOf( '{', cursor );

  while ( json_index !== -1 ) {

    // Push the plain string before the JSON (may be a single character)
    if ( json_index > cursor )
      chunks.push( str.substring( cursor, json_index ) );

    // Find the end of the JSON: advance from '}' to '}' until it parses.
    // The -1 check comes first so that substring() is never called with a
    // negative end (which would silently swap its arguments).
    let json_end_index = str.indexOf( '}', json_index + 1 );
    while ( json_end_index !== -1 ) {
      try {
        JSON.parse( str.substring( json_index, json_end_index + 1 ) );
        break;
      } catch ( e ) {
        json_end_index = str.indexOf( '}', json_end_index + 1 );
      }
    }
    if ( json_end_index === -1 )
      throw new Error( 'Unterminated JSON object in string' );

    // Push JSON
    chunks.push( str.substring( json_index, json_end_index + 1 ) );

    // Resume right after the closing brace so that an immediately adjacent
    // JSON object or a one-character text gap is not skipped.
    cursor = json_end_index + 1;
    json_index = str.indexOf( '{', cursor );
  }

  // Push final plain string if any
  if ( chunks.length === 0 )
    chunks.push( str );
  else if ( cursor < str.length )
    chunks.push( str.substring( cursor ) );

  return chunks;
}

Answer 2

这是一种相对简单的暴力方法：按花括号拆分整个输入字符串，然后按顺序遍历得到的数组。每当遇到左花括号时，就从该起点出发，在数组中找出能成功解析为JSON的最长块。如此反复。

如果输入包含无效的JSON和/或不平衡的括号，这将不起作用（请参见下面的最后两个测试用例。）

/**
 * Attempt to parse `input` as JSON without throwing.
 *
 * @param {string} input - Candidate JSON text.
 * @returns {*} The parsed value, or `false` when `JSON.parse` rejects the input.
 */
const tryJSON = function (input) {
  let result = false;
  try {
    result = JSON.parse(input);
  } catch (err) {
    // Not valid JSON: keep the `false` sentinel.
  }
  return result;
};

/**
 * Split text interleaved with JSON objects into an array of plain strings and
 * parsed objects, logging the result before returning it.
 *
 * The input is tokenised on every curly brace. For each opening brace, the
 * candidate closing braces are tried from the last one in the input backwards,
 * and the first span that parses (i.e. the longest one) is taken.
 *
 * Inputs with invalid JSON or unbalanced braces are not handled: an opening
 * brace for which no span parses is simply not emitted.
 *
 * @param {string} input - Text with embedded JSON objects.
 * @returns {Array} Plain-text strings and parsed JSON values, in input order.
 */
const parse = input => {
  // The capture group keeps the braces themselves as separate tokens.
  const tokens = input.split(/([{}])/);
  const output = [];
  let pos = 0;

  while (pos < tokens.length) {
    const token = tokens[pos];

    if (token === '{') {
      // Possible JSON start: walk closing-brace candidates from the end back.
      let close = tokens.lastIndexOf('}');
      while (close > pos) {
        if (tokens[close] === '}') {
          const parsed = tryJSON(tokens.slice(pos, close + 1).join(""));
          if (parsed) {
            // Consume the whole span; moving `pos` up to `close` also ends
            // this backward scan on the next condition check.
            output.push(parsed);
            pos = close;
          }
        }
        close--;
      }
    } else if (token) {
      // Non-empty plain text (or a brace that is not an opening one).
      output.push(token);
    }

    pos++;
  }

  console.log(output);
  return output;
};

// Demo inputs; per the answer's own caveat, the last two contain invalid JSON
// and are not expected to be handled correctly.
for (const sample of [
  `{"foo": "bar"}`,
  `test{"foo": "b}ar{{[[[{}}}}{}{}}"}`,
  `this {"is": "a st}ri{ng"} with {"json": ["in", "i{t"]}`,
  `{}`,
  `this {"i{s": invalid}`,
  `So is {this: "one"}`,
]) {
  parse(sample);
}

Answer 3

我可以尝试通过统计引号之类的办法来解决该问题，但随后我还必须考虑转义的引号。到那时，感觉就像我在重做JSON.parse的工作。有解决这个问题的更好方法吗？

我不这么认为。您的输入与JSON相差甚远。但是考虑所有这些事情并不难。

以下代码段应该起作用：

/**
 * Split text interleaved with top-level JSON objects into an array of plain
 * strings and parsed objects.
 *
 * Scans the input once, tracking brace depth, whether the cursor is inside a
 * JSON string literal, and whether the previous character was an unescaped
 * backslash, so that braces and escaped quotes inside JSON strings are ignored.
 *
 * @param {string} str - Text with embedded JSON objects.
 * @returns {Array} Non-empty plain-text strings and parsed JSON objects, in
 *   input order. Text after an unterminated object is returned as one string.
 * @throws {SyntaxError} If a brace-balanced span is not valid JSON
 *   (propagated from JSON.parse).
 */
function construct(str) {
  const len = str.length
  let lastSavedIndex = -1
  let bracketLevel = 0
  let inJsonString = false
  let lastCharWasEscapeChar = false

  const result = []

  for (let i = 0; i < len; ++i) {
    const ch = str[i]

    if (bracketLevel !== 0 && !lastCharWasEscapeChar && ch === '"') {
      // Unescaped quote inside an object: enters or leaves a string literal.
      inJsonString = !inJsonString
    }
    else if (!inJsonString && ch === '{') {
      if (bracketLevel === 0) {
        // Start of a top-level object: flush the preceding text, but only if
        // there is any (avoids emitting "" for leading or adjacent objects).
        if (i > lastSavedIndex + 1) {
          result.push(str.substring(lastSavedIndex + 1, i))
        }
        lastSavedIndex = i - 1
      }

      ++bracketLevel
    }
    else if (!inJsonString && ch === '}' && bracketLevel > 0) {
      // The depth guard leaves a stray '}' in plain text untouched instead of
      // driving the depth negative.
      --bracketLevel

      if (bracketLevel === 0) {
        result.push(JSON.parse(str.substring(lastSavedIndex + 1, i + 1)))
        lastSavedIndex = i
      }
    }
    else if (inJsonString && ch === '\\') {
      // Toggling makes a double backslash cancel itself out.
      lastCharWasEscapeChar = !lastCharWasEscapeChar
    }
    else {
      lastCharWasEscapeChar = false
    }
  }

  // Trailing plain text (or the remainder of an unterminated object).
  if (lastSavedIndex !== len - 1) {
    result.push(str.substring(lastSavedIndex + 1, len))
  }
  return result
}


// Initial content shown in the textarea.
const standardText = 'This is a string {"with":"json","in":"it"} followed by more text {"and":{"some":["more","json"]}} and more text. {"foo": "bar}"}'

const inputTA = document.getElementById('input')
const outputDiv = document.getElementById('output')

// Re-run construct() on the textarea content and show the pretty-printed result.
function updateOutput() {
  const pieces = construct(inputTA.value)
  const rendered = JSON.stringify(pieces, null, 2)
  outputDiv.innerText = rendered
}

// Seed the textarea, refresh on every edit, and render once up front.
inputTA.value = standardText
inputTA.oninput = updateOutput
updateOutput()

<textarea id="input" rows="5" cols="50"></textarea>

<pre id="output"></pre>

Answer 4

这里是一种逐字符迭代的方法。首先，我们从输入创建一个数组，然后在其上使用reduce()。当我们检测到左花括号{时，将当前累积的chunk推送到检测到的结果数组中，然后在reduce所使用的accumulator对象上设置标志。当此标志为true时，我们将尝试解析JSON，只有解析成功后，我们才将表示JSON的chunk放入检测到的结果数组，并把标志重新设置为false。

reduce()方法的accumulator将保存以下数据：

res：保存检测结果的数组，其元素为strings或jsons。
chunk：一个字符串，代表当前累积的字符。
isJson：一个布尔值，指示当前的chunk是否为json。

json

Answer 5

您可以使用RegExp /(\s(?=[{]))|\s(?=[\w\s]+[{])/ig到.split()空格字符，然后打开大括号{或空格字符，然后是一个或多个单词或空格字符，然后再打开大括号大括号，.filter()从结果数组中删除undefined个值，创建一个新数组，然后while得到的拆分数组中有.length个索引，其中值仅包含空格字符，如果数组.splice()为1 .length空字符串0，否则为空格字符，则为索引加.push()的匹配数组的开头''，其中匹配项' '由空格字符.join() ' '的最后一个空格字符和.replace()匹配的数组.shift()组成，然后是数组的下一个元素匹配的数组。

JSON

Answer 6

强制性回答：这是不正确的格式（由于这种复杂性，如果解析器的设计不当，则保证存在安全漏洞）；理想情况下应重新设计。（很抱歉，必须这么说。）

除此之外，您可以使用自己喜欢的、能以JavaScript作为目标语言输出的解析器生成器来生成解析器。它甚至可能自带一份作为demo的JSON语法。

但是，明显的安全性问题令人难以置信（如果有任何JSON超过了“保证”，那么它就是一个向量）。数组散布的表示似乎更好，但有assert(text.length == markup.length+1)的约束：

'{
    "text": ['Hello', 'this is red text!'],
    "markup": [{"text":"everyone", "color":"red"}]
}'

甚至更好：

'[
    {"type":"text", "text":"Hello"},
    {"type":"markup", "text":"everyone", "color":"red"}    # or  ,"val":{"text":.., "color":..}}
    {"type":"text", "text":"this is red text!"},
    ...
]'

理想情况下应压缩存储。反序列化时直接使用JSON.parse即可，无需担心。

如何将不可预测的JSON解析为字符串？

6 个答案: