I'm following along with Destroy All Software's recent screencasts on compilers. All of the example code is written in Ruby, but I'm trying to achieve the same output in Elixir.
In plain English, my approach is this:

- Call call_tokenize with a code string and an empty accumulator.
- Inside a reduce_while loop, try to match each token type's regex against the start of the code string.
- If a regex matches, build a token and call call_tokenize again with the rest of the string and the token appended to the accumulator. Also, :halt the reduce_while loop and return the accumulated tokens.

I've written some Elixir functions that work, in the sense that I get the expected output from my input. However, I think my code has at least one problem: I'm using Enum.reduce_while, but I don't think I'm using it the way it's intended to be used. My assumption is that there's a better way to rewrite the reduce_while as a recursive function.

If I had to sum it up in one question: how can I achieve the same result without relying on the halt/continue behaviour that reduce_while gives me? And should I be aware of any other problems with this code example?
Here's my code and the expected output:
// Code
defmodule Compiler.Token do
  defstruct [:type, :value]
end

defmodule Compiler.Tokenizer do
  @token_types [
    {:def, "\\bdef\\b"},
    {:end, "\\bend\\b"},
    {:identifier, "\\b[a-zA-Z]+\\b"},
    {:integer, "\\b[0-9]+\\b"},
    {:oparen, "\\("},
    {:cparen, "\\)"}
  ]

  def tokenize() do
    code = "def f()
1
end"
    IO.inspect call_tokenize(code, [])
  end

  def call_tokenize("", accumulator) do
    accumulator
  end

  def call_tokenize(code, accumulator) do
    Enum.reduce_while(@token_types, "", fn {type, re}, acc ->
      result = Regex.run(~r/\A#{re}/, code)

      if result do
        value = hd(result)
        base = byte_size(value)
        token = %Compiler.Token{type: type, value: value}

        tokens =
          binary_part(code, base, byte_size(code) - base)
          |> String.trim()
          |> call_tokenize(accumulator ++ [token])

        {:halt, tokens}
      else
        {:cont, acc}
      end
    end)
  end
end
// Expected output
[%Compiler.Token{type: :def, value: "def"},
%Compiler.Token{type: :identifier, value: "f"},
%Compiler.Token{type: :oparen, value: "("},
%Compiler.Token{type: :cparen, value: ")"},
%Compiler.Token{type: :integer, value: "1"},
%Compiler.Token{type: :end, value: "end"}]
Answer 0 (score: 3)
I see you've already figured out how to replace the reduce_while with explicit recursion in your own answer. Here's a more idiomatic approach, and the one you'll see used by most hand-written tokenizers in Elixir and Erlang. This approach can be significantly faster than a naive regex-based tokenizer, and it also allows adding logic that a purely regex-based tokenizer cannot (although you don't need that in this case).

Here's the code with some inline comments:
defmodule Compiler.Tokenizer.Dogbert do
  def tokenize(code), do: tokenize(code, [])

  # We're done. Reverse the tokens since we collected them in reverse order.
  defp tokenize("", acc), do: Enum.reverse(acc)

  # Remove leading whitespace.
  defp tokenize(<<h, rest::binary>>, acc) when h in ' \t\r\n', do: tokenize(rest, acc)

  # Identifier
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?a..?z do
    {value, rest} = take_while(binary, fn b -> b in ?a..?z end)

    type =
      case value do
        "def" -> :def
        "end" -> :end
        _ -> :identifier
      end

    tokenize(rest, [%Compiler.Token{type: type, value: value} | acc])
  end

  # Number
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?0..?9 do
    {value, rest} = take_while(binary, fn b -> b in ?0..?9 end)
    tokenize(rest, [%Compiler.Token{type: :integer, value: value} | acc])
  end

  # (
  defp tokenize("(" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :oparen, value: "("} | acc])

  # )
  defp tokenize(")" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :cparen, value: ")"} | acc])

  # A simple helper that extracts the leading part of the binary for as long as
  # `fun` returns `true` when called with each byte, starting from the first
  # byte of the binary. It returns the extracted binary and the remaining binary.
  #
  # We use byte offsets to track the position for efficiency: accumulating into
  # a list would allocate memory for every byte, which we avoid by tracking an
  # offset and doing a single binary split at the end.
  defp take_while(binary, fun), do: take_while(binary, fun, 0)

  defp take_while(binary, fun, byte) do
    if byte < byte_size(binary) && fun.(:binary.at(binary, byte)) do
      take_while(binary, fun, byte + 1)
    else
      <<value::binary-size(byte), rest::binary>> = binary
      {value, rest}
    end
  end
end
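To illustrate what take_while does, here's a hypothetical standalone copy of it (a TakeWhileDemo module of my own naming, with the function made public purely for demonstration). It splits a binary at the first byte for which the predicate returns false:

defmodule TakeWhileDemo do
  # Same algorithm as take_while/3 above, exposed publicly for demonstration.
  def take_while(binary, fun), do: take_while(binary, fun, 0)

  defp take_while(binary, fun, byte) do
    if byte < byte_size(binary) && fun.(:binary.at(binary, byte)) do
      take_while(binary, fun, byte + 1)
    else
      <<value::binary-size(byte), rest::binary>> = binary
      {value, rest}
    end
  end
end

IO.inspect TakeWhileDemo.take_while("foo123", fn b -> b in ?a..?z end)
#=> {"foo", "123"}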
Test:
code = "def f()
1
end"
IO.inspect Compiler.Tokenizer.Dogbert.tokenize(code)
Output:
[%Compiler.Token{type: :def, value: "def"},
%Compiler.Token{type: :identifier, value: "f"},
%Compiler.Token{type: :oparen, value: "("},
%Compiler.Token{type: :cparen, value: ")"},
%Compiler.Token{type: :integer, value: "1"},
%Compiler.Token{type: :end, value: "end"}]
Here's a benchmark comparing your implementation to mine, using benchee. Your implementation has some easily fixed inefficiencies (such as building each Regex on every run, and appending to the accumulator with ++), but I'd expect a regex-based approach to always be slower than the approach I used. (A sketch of those two easy fixes follows the benchmark results below.)
defmodule Compiler.Token do
  defstruct [:type, :value]
end

defmodule Compiler.Tokenizer.Dogbert do
  def tokenize(code), do: tokenize(code, [])

  # We're done. Reverse the tokens since we collected them in reverse order.
  defp tokenize("", acc), do: Enum.reverse(acc)

  # Remove leading whitespace.
  defp tokenize(<<h, rest::binary>>, acc) when h in ' \t\r\n', do: tokenize(rest, acc)

  # Identifier
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?a..?z do
    {value, rest} = take_while(binary, fn b -> b in ?a..?z end)

    type =
      case value do
        "def" -> :def
        "end" -> :end
        _ -> :identifier
      end

    tokenize(rest, [%Compiler.Token{type: type, value: value} | acc])
  end

  # Number
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?0..?9 do
    {value, rest} = take_while(binary, fn b -> b in ?0..?9 end)
    tokenize(rest, [%Compiler.Token{type: :integer, value: value} | acc])
  end

  # (
  defp tokenize("(" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :oparen, value: "("} | acc])

  # )
  defp tokenize(")" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :cparen, value: ")"} | acc])

  # A simple helper that extracts the leading part of the binary for as long as
  # `fun` returns `true` when called with each byte, starting from the first
  # byte of the binary. It returns the extracted binary and the remaining binary.
  #
  # We use byte offsets to track the position for efficiency: accumulating into
  # a list would allocate memory for every byte, which we avoid by tracking an
  # offset and doing a single binary split at the end.
  defp take_while(binary, fun), do: take_while(binary, fun, 0)

  defp take_while(binary, fun, byte) do
    if byte < byte_size(binary) && fun.(:binary.at(binary, byte)) do
      take_while(binary, fun, byte + 1)
    else
      <<value::binary-size(byte), rest::binary>> = binary
      {value, rest}
    end
  end
end
defmodule Compiler.Tokenizer.PaulRuescher do
  @token_types [
    {:def, "\\bdef\\b"},
    {:end, "\\bend\\b"},
    {:identifier, "\\b[a-zA-Z]+\\b"},
    {:integer, "\\b[0-9]+\\b"},
    {:oparen, "\\("},
    {:cparen, "\\)"}
  ]

  def tokenize(code_string) do
    call_tokenize(code_string, [])
  end

  def call_tokenize("", accumulator) do
    accumulator
  end

  def call_tokenize(code_string, accumulator) do
    {type, value} = attempt_tokenize(@token_types, code_string)
    base = byte_size(value)
    token = %Compiler.Token{type: type, value: value}

    binary_part(code_string, base, byte_size(code_string) - base)
    |> String.trim()
    |> call_tokenize(accumulator ++ [token])
  end

  def attempt_tokenize(token_types, code_string, index \\ 0) do
    {type, re} = Enum.at(token_types, index)

    case Regex.run(~r/\A#{re}/, code_string) do
      nil -> attempt_tokenize(token_types, code_string, index + 1)
      value -> {type, hd(value)}
    end
  end
end
code = String.duplicate("def f()
1
end", 1000)

IO.inspect Compiler.Tokenizer.PaulRuescher.tokenize(code) == Compiler.Tokenizer.Dogbert.tokenize(code)

Benchee.run(%{
  "@paulruescher" => fn -> Compiler.Tokenizer.PaulRuescher.tokenize(code) end,
  "@Dogbert" => fn -> Compiler.Tokenizer.Dogbert.tokenize(code) end,
})
Results:
true
...
Name                    ips        average    deviation    median
@Dogbert             442.18        2.26 ms      ±17.03%    2.43 ms
@paulruescher         11.78       84.92 ms       ±8.37%   83.67 ms

Comparison:
@Dogbert             442.18
@paulruescher         11.78 - 37.55x slower
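For illustration, here's a sketch of those two easy fixes, in a hypothetical Compiler.Tokenizer.PaulRuescherFaster module of my own naming (it wasn't part of the benchmark above, so the exact speedup is untested). It compiles every regex once per tokenize call and prepends to the accumulator instead of appending:

defmodule Compiler.Tokenizer.PaulRuescherFaster do
  @token_types [
    {:def, "\\bdef\\b"},
    {:end, "\\bend\\b"},
    {:identifier, "\\b[a-zA-Z]+\\b"},
    {:integer, "\\b[0-9]+\\b"},
    {:oparen, "\\("},
    {:cparen, "\\)"}
  ]

  def tokenize(code_string) do
    # Compile every regex once up front instead of on each matching attempt.
    token_types = Enum.map(@token_types, fn {type, re} -> {type, Regex.compile!("\\A" <> re)} end)
    call_tokenize(code_string, token_types, [])
  end

  # Prepending to a list is O(1), while appending with ++ copies the whole
  # accumulator on every token, so we prepend and reverse once at the end.
  defp call_tokenize("", _token_types, accumulator), do: Enum.reverse(accumulator)

  defp call_tokenize(code_string, token_types, accumulator) do
    {type, value} = attempt_tokenize(token_types, code_string)
    base = byte_size(value)
    token = %Compiler.Token{type: type, value: value}

    binary_part(code_string, base, byte_size(code_string) - base)
    |> String.trim()
    |> call_tokenize(token_types, [token | accumulator])
  end

  defp attempt_tokenize([{type, re} | rest], code_string) do
    case Regex.run(re, code_string) do
      nil -> attempt_tokenize(rest, code_string)
      [value | _] -> {type, value}
    end
  end
end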
Answer 1 (score: 0)
I've refactored my earlier code example and I'm pretty happy with it now. I can still see some edge cases, such as the tokenizer never matching any regex, but I think I can work with this for now. (A sketch of guarding against that no-match case follows the code below.)
defmodule Compiler.Token do
  defstruct [:type, :value]
end

defmodule Compiler.Tokenizer do
  @token_types [
    {:def, "\\bdef\\b"},
    {:end, "\\bend\\b"},
    {:identifier, "\\b[a-zA-Z]+\\b"},
    {:integer, "\\b[0-9]+\\b"},
    {:oparen, "\\("},
    {:cparen, "\\)"}
  ]

  def tokenize(code_string) do
    call_tokenize(code_string, [])
  end

  def call_tokenize("", accumulator) do
    accumulator
  end

  def call_tokenize(code_string, accumulator) do
    {type, value} = attempt_tokenize(@token_types, code_string)
    base = byte_size(value)
    token = %Compiler.Token{type: type, value: value}

    binary_part(code_string, base, byte_size(code_string) - base)
    |> String.trim()
    |> call_tokenize(accumulator ++ [token])
  end

  def attempt_tokenize(token_types, code_string, index \\ 0) do
    {type, re} = Enum.at(token_types, index)

    case Regex.run(~r/\A#{re}/, code_string) do
      nil -> attempt_tokenize(token_types, code_string, index + 1)
      value -> {type, hd(value)}
    end
  end
end
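As a sketch of guarding against the no-match edge case mentioned above (assuming raising a descriptive error is the desired behaviour; the error message is my own invention), attempt_tokenize could check whether the token types have been exhausted before pattern matching:

def attempt_tokenize(token_types, code_string, index \\ 0) do
  case Enum.at(token_types, index) do
    # We ran out of token types without a match; fail loudly instead of
    # crashing with a confusing MatchError on nil.
    nil ->
      raise ArgumentError,
            "no token type matched at: #{inspect(binary_part(code_string, 0, min(10, byte_size(code_string))))}"

    {type, re} ->
      case Regex.run(~r/\A#{re}/, code_string) do
        nil -> attempt_tokenize(token_types, code_string, index + 1)
        value -> {type, hd(value)}
      end
  end
end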