恢复:我目前正在编写一个将源代码转换为令牌的ActionScript 3词法分析器。我选择通过代码点解释输入,一个包含在类UString
中的可选代理对的String。在引擎盖下,我使用UStringPos
类缓存最后一个读取位置。
我已经测试了它如何使用...
扫描标识符"huehuehue"
'use strict';
import {Lexer} from 'core/Lexer';
import {UString} from 'utils/UString';
import ErrorHandler from 'core/ErrorHandler';
const errorHandler = new ErrorHandler(true);
// Tell the length to the `Lexer` manually.
const lexer = new Lexer(
new UString('huehuehue'), 9, errorHandler);
// Scan first token
lexer.next();
const id = lexer.lookahead.value;
console.log(
id,
id.length
);
它应该记录"huehuehue", 9
,但是另一个故事......
为什么它丢失了最后一个'e'
?与扫描相关的最内层方法是Lexer#getCommonIdentifier
。我已经测试了我的UString
部分,顺便说一下,它的工作正常。
Lexer相关定义
/*
* Class that turns AS3 code into tokens.
*/
export class Lexer
{
/*
* @param {UString} source
* @param {Number} length
* @param {ErrorHandler} errorHandler
*/
constructor(source, length, errorHandler)
{
this.source = source;
this.length = length;
this.index = 0;
this.lineStart = 0;
this.lineNumber = 1;
this.comments = [];
this.errorHandler = errorHandler;
this.previousToken = null;
this.token = null;
this.lookahead = null;
this._special = [];
}
/*
* Verifies the end of file.
*/
eof()
{
return this.index >= this.length;
}
/*
* Advance the previous, current and lookahead tokens.
* The lexer however does not depend on these tokens.
*/
next()
{
this.previousToken = this.token;
this.token = this.lookahead;
this.lookahead = this.lex();
}
/*
* Consumes the next token and return it.
*/
lex()
{
this.consumeWhiteSpaces();
while (this.consumeComment())
this.consumeWhiteSpaces();
let cp = this.source.codePointAt(this.index);
let pureIdentifier =
Character.isIdentifierStart(cp);
if (pureIdentifier || (cp === 0x5C))
return this.scanIdentifierOrKeyword(!pureIdentifier);
if (this.eof())
{
let loc = [ this.index, this.lineNumber ];
return new Token(TokenType.EOF, loc, loc, '<end>');
}
}
/*
* Scan an identifier, keyword or boolean literal.
*/
scanIdentifierOrKeyword(usingEscape)
{
const start = this.index;
let id;
/* Like Esprima does: only identifiers containing
* escapes need some overheads. */
if (usingEscape)
{
id = this.getEscapedIdentifier(
String.fromCodePoint(this.scanUnicodeEscapeSequence()));
}
else
id = this.getCommonIdentifier();
return new Token(
TokenType.IDENTIFIER,
[ start , this.lineNumber ],
[ this.index, this.lineNumber ],
id
);
}
/*
* Interprets an identifier. If any escape appears, switches to
* getEscapedIdentifier().
*/
getCommonIdentifier()
{
const start = this.source.position.offset;
let cp = 0;
// Jump the starting symbol.
++this.index;
while (!this.eof())
{
cp = this.source.codePointAt(this.index);
if (Character.isIdentifierPart(cp))
++this.index;
// Switches to escape-minded task...
else if (cp === 0x5C)
return this.getUnicodeEscapedIdentifier(
this.source.string.slice(
start, this.source.position.offset
)
);
else break;
}
return this.source.string.slice(
start, this.source.position.offset
);
}
/* ... */
}
utils的/ UString.js
'use strict';
/*
* String wrapper with methods _based_ on code points.
*/
export class UString
{
/*
* Constructs the {UString}.
*
* @param {String} s String to be wrapped.
*/
constructor(s)
{
/*
* @type {String}
*/
this.string = s;
/*
* Tracks the last accessed position.
*
* @type {UStringPos}
*/
this.position = new UStringPos(0, 0);
}
/*
* Reads a code point at specific index.
*
* @param {Number} index
* @return {Number}
*/
codePointAt(index)
{
this.position.walk(this.string, index);
return this.string.codePointAt(this.position.offset);
}
/*
* Slices the internal string by code point indices.
*
* @param {Number} i
* @param {Number} j
* @return {String}
*/
slice(i, j)
{
this.position.walk(this.string, i);
i = this.position.offset;
this.position.walk(this.string, j);
j = this.position.offset;
return this.string.slice(i, j);
}
};
/*
* Class that tracks the position of a code point on a string.
*/
export class UStringPos
{
/*
* Constructs the {UStringPos}.
*
* @param {Number} index The initial index.
* @param {Number} offset The initial offset.
*/
constructor(index, offset)
{
/*
* @type {Number}
*/
this.index = index;
/*
* @type {Number}
*/
this.offset = offset;
}
/*
* Walks to the given index.
*
* @param {String} s
* @param {Number} index
* @note No backward. Track the previous position instead.
* @return {void}
*/
walk(s, index)
{
for (; this.index < index; ++this.index)
this.offset += (
this._usingSurrogates(
s.charCodeAt(this.offset)
) ? 2 : 1
);
}
/*
* @private
*/
_usingSurrogates(ch)
{
return (ch >= 0xD800) && (ch <= 0xDBFF);
}
};
什么吗?
答案 0 :(得分:1)
好。因此this.source.position.offset
存在问题:当我执行++this.index
时,UStringPos
的偏移量不会更新。问题出在切片上。
this.source.string.slice(
start, this.source.position.offset
);
此切片基于偏移量,因为我必须跟踪标识符开始的先前偏移量。
<强>解决方案强>
我可以使用我自己的UString
类的切片,并将第一个参数用作偏移量,将最后一个参数用作普通索引。
'use strict';
export class UString
{
// ...
/*
* Slices the internal string by using a pair of
* offset and code point indices.
*
* @param {Number} i Offset
* @param {Number} j
* @return {String}
*/
slice(i, j)
{
this.position.walk(this.string, j);
j = this.position.offset;
return this.string.slice(i, j);
}
};