将转译的代码映射回原始标记脚本

时间:2019-06-13 01:36:10

标签: javascript algorithm transpiler

最近有人问是否有如下简单的方法来转换自定义标记,包括嵌套标记。示例包括...

  • 对于\k[hello],输出将为<b>hello</b>
  • 对于\i[world],输出将为<em>world</em>
  • 对于hello \k[dear \i[world]],输出将为hello <b>dear <em>world</em></b>
  • 对于\b[some text](url),输出将为<a href="url">some text</a>
  • 对于\r[some text](url),输出将为<img alt="some text" src="url" />

有趣的是,把上述标记(包括嵌套的情况)转译成javascript非常简单,尤其是在标记语法保持一致的情况下。

//
// Define the syntax and translation to javascript.
//
// Each `syntax` entry pairs a markup token (`markUp`) with the JavaScript
// fragment (`javascript`) that textually replaces it.  Opening tokens start a
// call to `grammar.oneArg` / `grammar.twoArgs` and open a string literal; the
// closing tokens end the string literal and either start the next argument
// (`close0`) or close the call (`close1`, `close2`).
//
// Entry order matters: `close0` ("](") must be listed before `close1` (")")
// and `close2` ("]") so the two-character token is consumed first.
//
// NOTE: the markUp tokens are untagged template literals, so the backslash is
// consumed as an escape: `\k[` is the two characters "k[", `\i[` is "i[",
// `\b[` is a backspace (U+0008) followed by "[", and `\r[` is a carriage
// return followed by "[".  Markup that is itself written in a template
// literal goes through the same escapes, which is why the tokens still match.
//
const grammar = {

  syntax: {
    k:      {markUp: `\k[`, javascript: `"+grammar.oneArg("k","`,  pre: `<b>`,  post: `</b>`},
    i:      {markUp: `\i[`, javascript: `"+grammar.oneArg("i","`,  pre: `<em>`, post: `</em>`},
    b:      {markUp: `\b[`, javascript: `"+grammar.twoArgs("b","`, pattern: `<a href="$2">$1</a>`},
    r:      {markUp: `\r[`, javascript: `"+grammar.twoArgs("r","`, pattern: `<img alt="$1" src="$2"/>`},
    close0: {markUp: `](`,   javascript: `","`},
    close1: {markUp: `)`,    javascript: `")+"`},
    close2: {markUp: `]`,    javascript: `")+"`}
  },

  /**
   * Render a one-argument command by wrapping the argument in the command's
   * `pre` and `post` strings.
   *
   * @param {string} command - Key of a `syntax` entry that defines `pre`/`post`.
   * @param {string} arg1 - Already-rendered content of the command.
   * @returns {string} `pre + arg1 + post`.
   */
  oneArg: function( command, arg1 ) {
    return grammar.syntax[ command ].pre + arg1 + grammar.syntax[ command ].post;
  },

  /**
   * Render a two-argument command by filling the `$1` and `$2` placeholders
   * of the command's `pattern`.
   *
   * Both placeholders are substituted in a single pass, so a literal "$1" or
   * "$2" occurring inside an argument is left untouched instead of being
   * replaced by the other argument.
   *
   * @param {string} command - Key of a `syntax` entry that defines `pattern`.
   * @param {string} arg1 - Value substituted for every `$1`.
   * @param {string} arg2 - Value substituted for every `$2`.
   * @returns {string} The pattern with both placeholders filled in.
   */
  twoArgs: function( command, arg1, arg2 ) {
    const args = { 1: arg1, 2: arg2 };
    // A replacer function is used so "$"-sequences in the arguments are not
    // interpreted as replacement patterns.
    return grammar.syntax[ command ].pattern.replace( /\$([12])/g, ( match, n ) => args[ n ] );
  }
};


/**
 * Transpile a markup string into a JavaScript string expression and evaluate it.
 *
 * Every markup token in `grammar.syntax` is textually replaced by its
 * `javascript` fragment, in the order the entries are declared; the result is
 * wrapped in double quotes and compiled with `new Function`.
 *
 * NOTE(review): the markup is spliced into source code unescaped, so a `"`,
 * a backslash or a line break in the text breaks the generated code, and
 * crafted markup can execute arbitrary JavaScript.  Do not feed untrusted
 * markup to this function without escaping it first.
 *
 * @param {string} markUpString - The markup to convert.
 * @returns {string} The rendered output.
 * @throws {SyntaxError} If the markup is unbalanced, because the generated
 *   code then fails to compile.
 */
function transpileAndExecute( markUpString ) {
  // Convert the markUp to javascript.  The loop variable is declared locally;
  // an undeclared one would be an implicit global and a ReferenceError in
  // strict mode.
  let transpiled = markUpString;
  for ( const { markUp, javascript } of Object.values( grammar.syntax ) ) {
    transpiled = transpiled.split( markUp ).join( javascript );
  }

  // With the markUp now converted to javascript, let's execute it!
  // `grammar` is handed in as a parameter because functions created with
  // `new Function` only see the global scope, not the scope they are created in.
  return new Function( `grammar`, `return "${ transpiled }"` )( grammar );
}

// Sample markup covering nesting and both two-argument commands.  It is a
// template literal, so `\k`/`\i` evaluate to plain "k"/"i" and `\b`/`\r` to a
// backspace / carriage return.
var markUpTest = `Hello \k[dear \i[world!]] \b[\i[Search:] \k[Engine 1]](http://www.google.com) \r[\i[Search:] \k[Engine 2]](http://www.yahoo.com)`;

// Transpile the sample and print what the transpiler returns.
const renderedSample = transpileAndExecute( markUpTest );
console.log( renderedSample );

请注意,显然还有一些预处理问题需要解决,例如普通文本中本身包含标记字符的情况。例如,文本字符串中出现的“]”会让转译器出错;可以规定用“\]”表示字面的“]”,在转译之前把所有“\]”替换成无害的占位文本,转译之后再替换回来,这样就能简单地解决这个问题...

在转码方面,使用上面定义的语法,以下标记...

Hello \k[dear \i[world!]] \b[\i[Search:] \k[Engine 1]](http://www.google.com) \r[\i[Search:] \k[Engine 2]](http://www.yahoo.com)

...被翻译为...

"Hello "+grammar.oneArg("k","dear "+grammar.oneArg("i","world!")+"")+" "+grammar.twoArgs("b",""+grammar.oneArg("i","Search:")+" "+grammar.oneArg("k","Engine 1")+"","http://www.google.com")+" "+grammar.twoArgs("r",""+grammar.oneArg("i","Search:")+" "+grammar.oneArg("k","Engine 2")+"","http://www.yahoo.com")+""

...并且一旦作为javascript函数执行,就会导致...

Hello <b>dear <em>world!</em></b> <a href="http://www.google.com"><em>Search:</em> <b>Engine 1</b></a> <img alt="<em>Search:</em> <b>Engine 2</b>" src="http://www.yahoo.com"/>

不过,真正的挑战在于语法错误的处理,尤其是在需要转换大量标记的情况下。CertainPerformance的回答(参见Find details of SyntaxError thrown by javascript new Function() constructor)清晰明了,提供了一种从动态编译的javascript函数中获取语法错误的行号和列号的方法,但我还不确定把转译后代码中的语法错误映射回原始标记的最佳方法。

例如,如果(在“Goodbye”之后)误加了一个多余的“]”...

Hello World! \b[\i[Goodbye]]] \k[World!]]

...这转成...

"Hello World! "+grammar.twoArgs("b",""+grammar.oneArg("i","Goodbye")+"")+"")+" "+grammar.oneArg("k","World!")+"")+""
                                                                           ^

......而CertainPerformance的checkSyntax函数会如预期返回“抛出错误:1:76”,即上面用“^”标出的位置。

问题是,如何把这个位置映射回原始标记,以帮助缩小标记中错误的范围?(显然,在这个例子里很容易看出标记中的错误,但如果要转换的是整页的标记,就必须有办法缩小语法错误的范围。)维护标记与转译后代码之间的映射似乎很棘手,因为转译器在遍历语法转换表时,是逐步把标记改写成javascript代码的。我的直觉告诉我,应该有一种更简单的方法...感谢您的关注。

2 个答案:

答案 0 :(得分:1)

我建议您编写一个语法检查器,类似于jsonlint或jslint等,在真正把文本编译成人类可读的输出之前,先检查所有标记是否都已正确打开和关闭。

这样便于调试,可以防止格式错误的代码失控运行,还可以让您提供一个在编辑文本时高亮显示错误的文档编辑器。

下面的概念验证仅检查括号是否正确闭合。

/**
 * Check that the command brackets in a piece of markup are balanced.
 *
 * A command opens where a backslash is followed, two characters later, by
 * "[" (for example `\k[`); every "]" closes the most recently opened command.
 * Only bracket nesting is checked, nothing else about the markup.
 *
 * @param {string} text - Markup to check; may span several lines.
 * @returns {void}
 * @throws {Error} If a "]" has no matching opening command (the message gives
 *   the 1-based line and character of that "]"), or if commands are still
 *   open at the end of the text (the message gives their count and where
 *   each was opened).
 */
function grammarLint(text) {
  const isCommand = (ch) => ch === '\\';
  const isOpen = (ch) => ch === '[';
  const isClose = (ch) => ch === ']';

  // Positions of the commands that are currently open, innermost last.
  const openCommands = [];

  const lines = text.split('\n');
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    for (let c = 0; c < line.length; c++) {
      const ch = line.charAt(c);
      if (isCommand(ch) && isOpen(line.charAt(c + 2))) {
        openCommands.push({ line: i + 1, char: c + 1 });
        // Skip the command letter and its "[" so they are not re-examined.
        c += 2;
        continue;
      }
      if (isClose(ch)) {
        if (openCommands.length === 0) {
          throw new Error(`Command closed but not opened on line ${i + 1} char ${c + 1}`);
        }
        openCommands.pop();
      }
    }
  }

  if (openCommands.length > 0) {
    const where = openCommands
      .map((pos) => `line ${pos.line} char ${pos.char}`)
      .join(', ');
    throw new Error(`${openCommands.length} Unclosed command brackets found (opened at ${where})`);
  }
}
// Run the linter over two malformed samples and print the message of each
// error it throws: the first has an extra "]", the second is missing one.
// The samples are held in a local constant; assigning to an undeclared
// variable would create an implicit global and fails in strict mode.
const lintSamples = [
  'Hello World! \\b[\\i[Goodbye]]] \\k[World!]]',
  'Hello World! \\b[\\i[Goodbye \\k[World!]]'
];
for (const sample of lintSamples) {
  try {
    grammarLint(sample);
  }
  catch(e) {
    console.error(e.message);
  }
}

答案 1 :(得分:1)

最终我利用了javascript编译器捕获转译代码中语法错误的能力,并把错误位置引用回原始标记。简而言之,该方案在转译后的代码中嵌入注释,用来回指标记中的位置,从而提供一种缩小标记错误范围的方法。(错误消息本身确实是转译后代码的语法错误,不一定与标记错误完全对应,但至少能让人有机会找出标记问题所在的地方。)

该算法还借用了CertainPerformance的技术(Find details of SyntaxError thrown by javascript new Function() constructor),即使用setTimeout来捕获转译后代码的语法错误。我还加入了一个javascript Promise,使流程更顺畅。

"use strict";

//
// Define the syntax and translation to javascript.
//
/**
 * Transpiles the custom markup to a JavaScript string expression, syntax
 * checks the generated code, and maps a compile error back to a character
 * range in the original markup.
 *
 * The generated code is interleaved with comments of the form
 * `/*●<offset>*\/`, where <offset> is a character position in the markup.
 * When compiling the code raises an error, the offsets in the comments
 * surrounding the error column give the markup range to report.
 *
 * Relies on `window` error events, so it is meant to run in a browser.
 */
class Transpiler {

  /** Counter used to give every syntax check its own id. */
  static _syntaxCheckCounter = 0;
  /** Pending syntax checks by id; each holds `markUp`, `transpiledFunction` and `rejectFunction`. */
  static _syntaxCheck = {};
  /** Id of the syntax check whose compile was started last; read by `transpilerSyntaxChecker`. */
  static _currentSyntaxCheck = null;

  constructor() {
    this.grammar = {

      // Markup token -> JavaScript fragment.  `►` and `◄` mark where the
      // position comments for the start and end of the token are inserted.
      // The markUp tokens are template literals, so `\k[` is "k[", `\i[` is
      // "i[", and `\b[` / `\r[` start with a backspace / carriage return.
      syntax: {
        k:      {markUp: `\k[`, javascript: `"►+grammar.oneArg("k",◄"`,  pre: `<b>`,  post: `</b>`},
        i:      {markUp: `\i[`, javascript: `"►+grammar.oneArg("i",◄"`,  pre: `<em>`, post: `</em>`},
        b:      {markUp: `\b[`, javascript: `"►+grammar.twoArgs("b",◄"`, pattern: `<a href="$2">$1</a>`},
        r:      {markUp: `\r[`, javascript: `"►+grammar.twoArgs("r",◄"`, pattern: `<img alt="$1" src="$2"/>`},
        close0: {markUp: `](`,   javascript: `"►,◄"`},
        close1: {markUp: `)`,    javascript: `"►)+◄"`},
        close2: {markUp: `]`,    javascript: `"►)+◄"`}
      },

      marker: {           // https://www.w3schools.com/charsets/ref_utf_geometric.asp
        begMarker: `►`,   // 25ba
        endMarker: `◄`,   // 25c4
        begComment: `◆`,  // 25c6
        endComment: `◇`,  // 25c7
        fillerChar: `●`   // 25cf
      },

      /**
       * Render a one-argument command as `pre + arg1 + post`.
       * Must be called as a method of the grammar object (it uses `this`).
       */
      oneArg: function( command, arg1 ) {
        return this.syntax[ command ].pre + arg1 + this.syntax[ command ].post;
      },

      /**
       * Render a two-argument command by filling `$1` and `$2` in its pattern.
       * Both placeholders are filled in one pass, so a literal "$2" inside
       * arg1 is not overwritten by arg2.
       * Must be called as a method of the grammar object (it uses `this`).
       */
      twoArgs: function( command, arg1, arg2 ) {
        const args = { 1: arg1, 2: arg2 };
        return this.syntax[ command ].pattern.replace( /\$([12])/g, ( match, n ) => args[ n ] );
      }
    };
  }

  /**
   * `error` event listener that turns a compile error in the transpiled code
   * into a rejection of the pending syntax check.
   *
   * Reads the markup offsets from the position comments on either side of
   * `err.colno`, builds an excerpt of the markup with a caret line below it,
   * rejects the pending promise with an Error describing the range, then
   * unregisters itself and discards the pending check.
   *
   * @param {ErrorEvent} err - Event carrying `colno` and `message`.
   */
  static transpilerSyntaxChecker(err) {
    // Uncomment the following line to disable default console error message.
    //err.preventDefault();

    const pending = Transpiler._syntaxCheck[ Transpiler._currentSyntaxCheck ];

    // The transpiled expression is deliberately placed on the second line of
    // the generated function body.
    const transpiledLine = pending.transpiledFunction.split(`\n`)[1];

    // Offset in the nearest position comment before / after the error column.
    const lo = Number.parseInt( transpiledLine.slice( transpiledLine.slice( 0, err.colno ).lastIndexOf( `●` ) + 1 ), 10 );
    const hi = Number.parseInt( transpiledLine.slice( transpiledLine.slice( err.colno ).indexOf( `●` ) + err.colno + 1 ), 10 );

    // Show up to 40 characters of context on each side; line breaks are shown
    // as "↵" so the caret line stays aligned.
    const markUpLine = pending.markUp;
    let errString = markUpLine.substring( lo - 40, hi + 40 ).split(`\n`).join(`↵`) + `\n`;
    errString += ( `.`.repeat( lo ) + `^`.repeat( hi - lo ) ).substring( lo - 40, hi + 40 );

    pending.rejectFunction( new Error(`'${ err.message }' in transpiled code, corresponding to character range ${ lo }:${ hi } in the markup.\n${ errString }`) );

    window.removeEventListener('error', Transpiler.transpilerSyntaxChecker);
    delete Transpiler._syntaxCheck[Transpiler._currentSyntaxCheck];
  }

  /**
   * Transpile the markup, syntax check the generated code and execute it.
   *
   * Logs the markup, the filler-masked markup and the transpiled code to the
   * console along the way.
   *
   * @param {string} markUpString - The markup to convert.
   * @returns {Promise<string>} Resolves with the rendered output; rejects with
   *   an Error naming the markup character range when the transpiled code
   *   does not compile, or when a grammar token is ambiguous.
   */
  async transpileAndExecute( markUpString ) {
    // Convert the markUp to javascript.

    console.log( markUpString );

    const gm = this.grammar.marker;

    // `markUpIndex` mirrors the markup with every processed token replaced by
    // filler characters of the same length, so its length is always the
    // markup offset reached so far.  `transpiled` accumulates the code.
    let markUpIndex = markUpString;
    let transpiled = markUpString;
    for ( const command of Object.values( this.grammar.syntax ) ) {
      const markUpIndexSplit = markUpIndex.split( command.markUp );
      const transpiledSplit = transpiled.split( command.markUp );

      // Differing counts mean the token also occurs inside code generated for
      // an earlier token, so the two strings can no longer be kept in step.
      if ( markUpIndexSplit.length !== transpiledSplit.length ) {
        throw new Error( `Ambiguous grammar when searching for "${ command.markUp }" to replace with "${ command.javascript }".` );
      }

      for ( let i = 0; i < markUpIndexSplit.length; i++ ) {
        if ( i === 0 ) {
          markUpIndex = markUpIndexSplit[ 0 ];
          transpiled = transpiledSplit[ 0 ];
        } else {
          // Stamp the offsets of the token's start and end into the fragment.
          let js = command.javascript.replace( gm.begMarker, gm.begComment + gm.fillerChar + markUpIndex.length + gm.endComment );
          markUpIndex += gm.fillerChar.repeat( command.markUp.length );
          js = js.replace( gm.endMarker, gm.begComment + gm.fillerChar + markUpIndex.length + gm.endComment );
          markUpIndex += markUpIndexSplit[ i ];
          transpiled += js + transpiledSplit[ i ];
        }
      }
    }

    // Turn the placeholder comment delimiters into real ones and wrap the
    // whole expression in a string literal bracketed by position comments.
    transpiled = transpiled.split( gm.begComment ).join( `/*` );
    transpiled = transpiled.split( gm.endComment ).join( `*/` );
    transpiled = `/*${ gm.fillerChar }0*/"${ transpiled }"/*${ gm.fillerChar }${ markUpIndex.length + 1 }*/`;

    console.log( markUpIndex );
    console.log( transpiled );

    const id = ++Transpiler._syntaxCheckCounter;
    Transpiler._syntaxCheck[id] = {};

    // The line break after `return` is intentional: it puts the transpiled
    // expression alone on line 2 (where transpilerSyntaxChecker looks for it)
    // and, through automatic semicolon insertion, makes the `return` bare.
    const transpiledFunction = `"use strict"; if ( run ) return\n${ transpiled.split(`\n`).join(` `) }`;
    Transpiler._syntaxCheck[id].markUp = markUpString;
    Transpiler._syntaxCheck[id].transpiledFunction = transpiledFunction;

    //
    // Here's where it gets tricky.  (See "CertainPerformance's" post at
    // https://stackoverflow.com/questions/35252731
    // for details behind the concept.)  In this implementation a Promise
    // is created, which on success of the JS compiler syntax check, is resolved
    // immediately.  Otherwise, if there is a syntax error, the transpilerSyntaxChecker
    // routine, which has access to a reference to the Promise reject function,
    // calls the reject function to resolve the promise, returning the error back
    // to the calling process.
    //
    // The compile is intentionally NOT wrapped in try/catch/finally: the error
    // has to escape the timer callback uncaught so that the `error` event
    // listener receives it, and the listener has to stay registered until then.
    //
    const checkSyntaxPromise = new Promise((resolve, reject) => {
      setTimeout( () => {
        Transpiler._currentSyntaxCheck = id;
        window.addEventListener('error', Transpiler.transpilerSyntaxChecker);

        // Perform the syntax check by attempting to compile the transpiled function.
        new Function( `grammar`, `run`, transpiledFunction )( this.grammar );

        resolve( null );
        window.removeEventListener('error', Transpiler.transpilerSyntaxChecker);
        delete Transpiler._syntaxCheck[id];
      });
      Transpiler._syntaxCheck[id].rejectFunction = reject;
    });

    await checkSyntaxPromise;

    // With the markUp now converted to javascript and syntax checked, let's execute it!
    // Joining `return` to the expression makes the function return its value.
    return ( new Function( `grammar`, `run`, transpiledFunction.replace(`return\n`,`return `) )( this.grammar, true ) );

  }

}

这里有一些带有错误标记的示例运行,以及相应的控制台输出。以下标记具有额外的] ...

// Markup with one "]" too many after "Goodbye"; log whatever the transpiler
// resolves or rejects with.
let markUp = `Hello World \k[Goodbye]] World`;
new Transpiler()
  .transpileAndExecute( markUp )
  .then( ( result ) => { console.log( result ); } )
  .catch( ( err ) => { console.log( err ); } );

...转译后的代码为...

/*●0*/"Hello World "/*●12*/+grammar.oneArg("k",/*●14*/"Goodbye"/*●21*/)+/*●22*/""/*●22*/)+/*●23*/" World"/*●30*/

请注意穿插其中的注释,它们指向原始标记中的字符位置。当javascript编译器抛出错误时,该错误会被transpilerSyntaxChecker捕获,后者利用这些嵌入的注释确定标记中的位置,并把以下结果输出到控制台。

Uncaught SyntaxError: Unexpected token )
    at new Function (<anonymous>)
    at markUp.html:127
Error: 'Uncaught SyntaxError: Unexpected token )' in transpiled code, corresponding to character range 22:23 in the markup.
Hello World k[Goodbye]] World
......................^
    at transpilerSyntaxChecker (markUp.html:59)

请注意,Unexpected token )消息引用的是已转码的代码,而不是标记脚本,但是输出指向有问题的]

这是另一个示例运行,在这种情况下,缺少一个] ...

// Markup in which the `\k[` command is never closed; log whatever the
// transpiler resolves or rejects with.
let markUp = `\i[Hello World] \k[\i[Goodbye] World`;
new Transpiler()
  .transpileAndExecute( markUp )
  .then( ( result ) => { console.log( result ); } )
  .catch( ( err ) => { console.log( err ); } );

...会产生以下转译后的代码...

/*●0*/""/*●0*/+grammar.oneArg("i",/*●2*/"Hello World"/*●13*/)+/*●14*/" "/*●15*/+grammar.oneArg("k",/*●17*/""/*●17*/+grammar.oneArg("i",/*●19*/"Goodbye"/*●26*/)+/*●27*/" World"/*●34*/

...引发以下错误...

Uncaught SyntaxError: missing ) after argument list
    at new Function (<anonymous>)
    at markUp.html:127
Error: 'Uncaught SyntaxError: missing ) after argument list' in transpiled code, corresponding to character range 27:34 in the markup.
i[Hello World] k[i[Goodbye] World
...........................^^^^^^^
    at transpilerSyntaxChecker (markUp.html:59)

这也许不是最好的解决方案,只是一个省事的办法。Tschallacka的回答也有其优点:它对标记执行真正的语法检查(即使用自定义语法检查器,或使用Jison之类的工具),既没有setTimeout / Promise带来的复杂性,也不必依赖用转译器的错误消息去回指原始标记这种不太精确的方法...