在 Superpower 中进行分词(tokenize)时,如何仅当某个字符串位于一行的开头时才匹配它?(注意:这与 this one 是不同的问题。)
例如,假设我使用的语言仅包含以下 4 个字符(' '、':'、'X'、'Y'),每个字符都是一个标记。另外还有一个 “Header” 标记,用于捕获正则表达式模式 /^[XY]+:/(任意数量的 X 和 Y,后跟一个冒号,且仅当它们位于行首时)。
这是一个快速测试类(第四个测试用例失败):
using System;
using System.Collections.Generic;
using System.Linq;
using Superpower;
using Superpower.Model;
using Superpower.Parsers;
using Superpower.Tokenizers;
/// <summary>
/// Token kinds of the tiny X/Y language used in this example.
/// </summary>
public enum Tokens
{
    /// <summary>A single space character.</summary>
    Space,

    /// <summary>A single ':' character.</summary>
    Colon,

    /// <summary>A run of X/Y characters followed by ':' at the start of a line.</summary>
    Header,

    /// <summary>A single 'X' character.</summary>
    X,

    /// <summary>A single 'Y' character.</summary>
    Y
}
/// <summary>
/// Quick console harness that tokenizes a few sample inputs with a
/// builder-based tokenizer and prints whether the produced token kinds
/// match the expected sequence.
/// </summary>
public class XYTokenizer
{
    static void Main(string[] args)
    {
        Test("X", Tokens.X);
        Test("XY", Tokens.X, Tokens.Y);
        Test("X Y:", Tokens.X, Tokens.Space, Tokens.Y, Tokens.Colon);
        // This case fails with the tokenizer below: no rule produces
        // Tokens.Header, so the leading "X:" comes out as X followed by Colon.
        Test("X: X", Tokens.Header, Tokens.Space, Tokens.X);
    }

    /// <summary>
    /// Tokenizer with one single-character rule per token kind
    /// (X, Y, Colon, Space). It has no rule for <see cref="Tokens.Header"/>.
    /// </summary>
    public static readonly Tokenizer<Tokens> tokenizer = new TokenizerBuilder<Tokens>()
        .Match(Character.EqualTo('X'), Tokens.X)
        .Match(Character.EqualTo('Y'), Tokens.Y)
        .Match(Character.EqualTo(':'), Tokens.Colon)
        .Match(Character.EqualTo(' '), Tokens.Space)
        .Build();

    /// <summary>
    /// Tokenizes <paramref name="input"/> and compares the resulting token
    /// kinds against <paramref name="expected"/>, writing "OK" or a
    /// description of the first mismatch to the console.
    /// </summary>
    /// <param name="input">Text to tokenize.</param>
    /// <param name="expected">Token kinds expected, in order.</param>
    static void Test(string input, params Tokens[] expected)
    {
        var tokens = tokenizer.Tokenize(input);
        var i = 0;
        foreach (var t in tokens)
        {
            // More tokens than expected: report it instead of indexing past
            // the end of the expected array.
            if (i >= expected.Length)
            {
                Console.WriteLine($"tokens[{i}] was Tokens.{t.Kind} but only {expected.Length} token(s) were expected for '{input}'");
                return;
            }

            if (t.Kind != expected[i])
            {
                Console.WriteLine($"tokens[{i}] was Tokens.{t.Kind} not Tokens.{expected[i]} for '{input}'");
                return;
            }

            i++;
        }

        // Fewer tokens than expected: a matching prefix is not a pass.
        if (i < expected.Length)
        {
            Console.WriteLine($"tokens[{i}] was missing, expected Tokens.{expected[i]} for '{input}'");
            return;
        }

        Console.WriteLine("OK");
    }
}
答案 0 :(得分:1)
我基于 the example found here 编写了一个自定义 Tokenizer。
我在代码中添加了注释,以便您可以了解每一步发生了什么。
/// <summary>
/// Hand-written tokenizer for the X/Y language. It emits one token per
/// character (X, Y, Colon, Space), except that a run of X/Y characters
/// immediately followed by ':' at the start of a line is emitted as a
/// single <see cref="Tokens.Header"/> token. Carriage returns and line
/// feeds produce no tokens.
/// </summary>
public class MyTokenizer : Tokenizer<Tokens>
{
    /// <summary>
    /// Walks <paramref name="input"/> one character at a time and yields a
    /// result per token, or an empty (error) result for any character that
    /// is not part of the language.
    /// </summary>
    /// <param name="input">The text span to tokenize.</param>
    /// <returns>The lazily produced sequence of token results.</returns>
    protected override IEnumerable<Result<Tokens>> Tokenize(TextSpan input)
    {
        Result<char> next = input.ConsumeChar();

        // True at the start of the input and right after every '\n'.
        bool atLineStart = true;

        while (next.HasValue)
        {
            if (atLineStart)
            {
                atLineStart = false;

                // Speculatively read a run of X/Y characters. The individual
                // tokens are buffered because we only learn whether the run
                // is a header once we see the character that follows it.
                var headerStart = next.Location;
                var pending = new List<Result<Tokens>>();
                while (next.HasValue && (next.Value == 'X' || next.Value == 'Y'))
                {
                    pending.Add(Result.Value(next.Value == 'X' ? Tokens.X : Tokens.Y, next.Location, next.Remainder));
                    next = next.Remainder.ConsumeChar();
                }

                if (pending.Count > 0)
                {
                    if (next.HasValue && next.Value == ':')
                    {
                        // Header: a single token starting at the first X/Y
                        // and ending just past the colon.
                        yield return Result.Value(Tokens.Header, headerStart, next.Remainder);
                        next = next.Remainder.ConsumeChar();
                    }
                    else
                    {
                        // Not a header: flush the buffered X/Y tokens as-is.
                        foreach (Result<Tokens> queued in pending)
                        {
                            yield return queued;
                        }
                    }
                }

                if (!next.HasValue)
                {
                    yield break;
                }
            }

            switch (next.Value)
            {
                case '\r':
                    // Carriage return: skipped, produces no token.
                    break;
                case '\n':
                    // Line break: the next character starts a new line, so
                    // header detection is re-armed.
                    atLineStart = true;
                    break;
                case 'X':
                    yield return Result.Value(Tokens.X, next.Location, next.Remainder);
                    break;
                case 'Y':
                    yield return Result.Value(Tokens.Y, next.Location, next.Remainder);
                    break;
                case ':':
                    yield return Result.Value(Tokens.Colon, next.Location, next.Remainder);
                    break;
                case ' ':
                    yield return Result.Value(Tokens.Space, next.Location, next.Remainder);
                    break;
                default:
                    yield return Result.Empty<Tokens>(next.Location, $"unrecognized `{next.Value}`");
                    break;
            }

            // Every branch consumes exactly one character (the unrecognized
            // one is skipped as well).
            next = next.Remainder.ConsumeChar();
        }
    }
}
您可以这样调用它:
// Create the custom tokenizer, then run it over the input text.
var myTokenizer = new MyTokenizer();
var tokens = myTokenizer.Tokenize(input);