I'm following along with Destroy All Software's recent screencasts on compilers. All of the example code is written in Ruby, but I'm trying to achieve the same output in Elixir.
In plain English, my approach is this:

- Call call_tokenize with a code string and an empty accumulator.
- Inside a reduce_while loop, try to match each token type's regex against the start of the code string.
- If a regex matches, build a token and call call_tokenize again with the rest of the string and the token appended to the accumulator. Also, :halt the reduce_while loop and return the accumulated tokens.

I've written some Elixir functions that work, in the sense that I get the expected output from my input. However, I think my code has at least one problem: I'm using Enum.reduce_while, but I don't think I'm using it the way it's intended to be used. My assumption is that there's a better way to rewrite the reduce_while as a recursive function.

If I had to sum it up in one question: how can I achieve the same result without relying on the halt/continue behaviour that reduce_while gives me? And should I be aware of any other problems with this code example?
Here's my code and the expected output:
// Code
defmodule Compiler.Token do
  defstruct [:type, :value]
end

defmodule Compiler.Tokenizer do
  @token_types [
    {:def, "\\bdef\\b"},
    {:end, "\\bend\\b"},
    {:identifier, "\\b[a-zA-Z]+\\b"},
    {:integer, "\\b[0-9]+\\b"},
    {:oparen, "\\("},
    {:cparen, "\\)"}
  ]

  def tokenize() do
    code = "def f()
1
end"
    IO.inspect call_tokenize(code, [])
  end

  def call_tokenize("", accumulator) do
    accumulator
  end

  def call_tokenize(code, accumulator) do
    Enum.reduce_while(@token_types, "", fn {type, re}, acc ->
      result = Regex.run(~r/\A#{re}/, code)

      if result do
        value = hd(result)
        base = byte_size(value)
        token = %Compiler.Token{type: type, value: value}

        tokens =
          binary_part(code, base, byte_size(code) - base)
          |> String.trim()
          |> call_tokenize(accumulator ++ [token])

        {:halt, tokens}
      else
        {:cont, acc}
      end
    end)
  end
end
// Expected output
[%Compiler.Token{type: :def, value: "def"},
%Compiler.Token{type: :identifier, value: "f"},
%Compiler.Token{type: :oparen, value: "("},
%Compiler.Token{type: :cparen, value: ")"},
%Compiler.Token{type: :integer, value: "1"},
%Compiler.Token{type: :end, value: "end"}]
Answer 0 (score: 3)
I see you've already figured out how to replace the reduce_while with explicit recursion in your own answer. Here's a more idiomatic approach, and the one you'll see used by most hand-written tokenizers in Elixir and Erlang. This approach can be significantly faster than a naive regex-based tokenizer, and it also allows adding logic that a purely regex-based tokenizer cannot (although you don't need that in this case).

Here's the code with some inline comments:
defmodule Compiler.Tokenizer.Dogbert do
  def tokenize(code), do: tokenize(code, [])

  # We're done. Reverse the tokens since we collected them in reverse order.
  defp tokenize("", acc), do: Enum.reverse(acc)

  # Remove leading whitespace.
  defp tokenize(<<h, rest::binary>>, acc) when h in ' \t\r\n', do: tokenize(rest, acc)

  # Identifier
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?a..?z do
    {value, rest} = take_while(binary, fn b -> b in ?a..?z end)

    type =
      case value do
        "def" -> :def
        "end" -> :end
        _ -> :identifier
      end

    tokenize(rest, [%Compiler.Token{type: type, value: value} | acc])
  end

  # Number
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?0..?9 do
    {value, rest} = take_while(binary, fn b -> b in ?0..?9 end)
    tokenize(rest, [%Compiler.Token{type: :integer, value: value} | acc])
  end

  # (
  defp tokenize("(" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :oparen, value: "("} | acc])

  # )
  defp tokenize(")" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :cparen, value: ")"} | acc])

  # A simple helper that extracts the leading part of the binary for as long as
  # `fun` returns `true` when called with each byte, starting from the first
  # byte of the binary. It returns the extracted binary and the remaining binary.
  #
  # We use byte offsets to track the position for efficiency: accumulating into
  # a list would allocate memory for every byte, which we avoid by tracking an
  # offset and doing a single binary split at the end.
  defp take_while(binary, fun), do: take_while(binary, fun, 0)

  defp take_while(binary, fun, byte) do
    if byte < byte_size(binary) && fun.(:binary.at(binary, byte)) do
      take_while(binary, fun, byte + 1)
    else
      <<value::binary-size(byte), rest::binary>> = binary
      {value, rest}
    end
  end
end
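To illustrate what take_while does, here's a hypothetical standalone copy of it (a TakeWhileDemo module of my own naming, with the function made public purely for demonstration). It splits a binary at the first byte for which the predicate returns false:

defmodule TakeWhileDemo do
  # Same algorithm as take_while/3 above, exposed publicly for demonstration.
  def take_while(binary, fun), do: take_while(binary, fun, 0)

  defp take_while(binary, fun, byte) do
    if byte < byte_size(binary) && fun.(:binary.at(binary, byte)) do
      take_while(binary, fun, byte + 1)
    else
      <<value::binary-size(byte), rest::binary>> = binary
      {value, rest}
    end
  end
end

IO.inspect TakeWhileDemo.take_while("foo123", fn b -> b in ?a..?z end)
#=> {"foo", "123"}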
Test:
code = "def f()
1
end"
IO.inspect Compiler.Tokenizer.Dogbert.tokenize(code)
Output:
[%Compiler.Token{type: :def, value: "def"},
%Compiler.Token{type: :identifier, value: "f"},
%Compiler.Token{type: :oparen, value: "("},
%Compiler.Token{type: :cparen, value: ")"},
%Compiler.Token{type: :integer, value: "1"},
%Compiler.Token{type: :end, value: "end"}]
Here's a benchmark comparing your implementation to mine, using benchee. Your implementation has some easily fixed inefficiencies (such as building each Regex on every run, and appending to the accumulator with ++), but I'd expect a regex-based approach to always be slower than the approach I used. (A sketch of those two easy fixes follows the benchmark results below.)
defmodule Compiler.Token do
  defstruct [:type, :value]
end

defmodule Compiler.Tokenizer.Dogbert do
  def tokenize(code), do: tokenize(code, [])

  # We're done. Reverse the tokens since we collected them in reverse order.
  defp tokenize("", acc), do: Enum.reverse(acc)

  # Remove leading whitespace.
  defp tokenize(<<h, rest::binary>>, acc) when h in ' \t\r\n', do: tokenize(rest, acc)

  # Identifier
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?a..?z do
    {value, rest} = take_while(binary, fn b -> b in ?a..?z end)

    type =
      case value do
        "def" -> :def
        "end" -> :end
        _ -> :identifier
      end

    tokenize(rest, [%Compiler.Token{type: type, value: value} | acc])
  end

  # Number
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?0..?9 do
    {value, rest} = take_while(binary, fn b -> b in ?0..?9 end)
    tokenize(rest, [%Compiler.Token{type: :integer, value: value} | acc])
  end

  # (
  defp tokenize("(" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :oparen, value: "("} | acc])

  # )
  defp tokenize(")" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :cparen, value: ")"} | acc])

  # A simple helper that extracts the leading part of the binary for as long as
  # `fun` returns `true` when called with each byte, starting from the first
  # byte of the binary. It returns the extracted binary and the remaining binary.
  #
  # We use byte offsets to track the position for efficiency: accumulating into
  # a list would allocate memory for every byte, which we avoid by tracking an
  # offset and doing a single binary split at the end.
  defp take_while(binary, fun), do: take_while(binary, fun, 0)

  defp take_while(binary, fun, byte) do
    if byte < byte_size(binary) && fun.(:binary.at(binary, byte)) do
      take_while(binary, fun, byte + 1)
    else
      <<value::binary-size(byte), rest::binary>> = binary
      {value, rest}
    end
  end
end
defmodule Compiler.Tokenizer.PaulRuescher do
  @token_types [
    {:def, "\\bdef\\b"},
    {:end, "\\bend\\b"},
    {:identifier, "\\b[a-zA-Z]+\\b"},
    {:integer, "\\b[0-9]+\\b"},
    {:oparen, "\\("},
    {:cparen, "\\)"}
  ]

  def tokenize(code_string) do
    call_tokenize(code_string, [])
  end

  def call_tokenize("", accumulator) do
    accumulator
  end

  def call_tokenize(code_string, accumulator) do
    {type, value} = attempt_tokenize(@token_types, code_string)
    base = byte_size(value)
    token = %Compiler.Token{type: type, value: value}

    binary_part(code_string, base, byte_size(code_string) - base)
    |> String.trim()
    |> call_tokenize(accumulator ++ [token])
  end

  def attempt_tokenize(token_types, code_string, index \\ 0) do
    {type, re} = Enum.at(token_types, index)

    case Regex.run(~r/\A#{re}/, code_string) do
      nil -> attempt_tokenize(token_types, code_string, index + 1)
      value -> {type, hd(value)}
    end
  end
end
code = String.duplicate("def f()
1
end", 1000)

IO.inspect Compiler.Tokenizer.PaulRuescher.tokenize(code) == Compiler.Tokenizer.Dogbert.tokenize(code)

Benchee.run(%{
  "@paulruescher" => fn -> Compiler.Tokenizer.PaulRuescher.tokenize(code) end,
  "@Dogbert" => fn -> Compiler.Tokenizer.Dogbert.tokenize(code) end,
})
Results:
true
...
Name                    ips        average    deviation    median
@Dogbert             442.18        2.26 ms      ±17.03%    2.43 ms
@paulruescher         11.78       84.92 ms       ±8.37%   83.67 ms

Comparison:
@Dogbert             442.18
@paulruescher         11.78 - 37.55x slower
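For illustration, here's a sketch of those two easy fixes, in a hypothetical Compiler.Tokenizer.PaulRuescherFaster module of my own naming (it wasn't part of the benchmark above, so the exact speedup is untested). It compiles every regex once per tokenize call and prepends to the accumulator instead of appending:

defmodule Compiler.Tokenizer.PaulRuescherFaster do
  @token_types [
    {:def, "\\bdef\\b"},
    {:end, "\\bend\\b"},
    {:identifier, "\\b[a-zA-Z]+\\b"},
    {:integer, "\\b[0-9]+\\b"},
    {:oparen, "\\("},
    {:cparen, "\\)"}
  ]

  def tokenize(code_string) do
    # Compile every regex once up front instead of on each matching attempt.
    token_types = Enum.map(@token_types, fn {type, re} -> {type, Regex.compile!("\\A" <> re)} end)
    call_tokenize(code_string, token_types, [])
  end

  # Prepending to a list is O(1), while appending with ++ copies the whole
  # accumulator on every token, so we prepend and reverse once at the end.
  defp call_tokenize("", _token_types, accumulator), do: Enum.reverse(accumulator)

  defp call_tokenize(code_string, token_types, accumulator) do
    {type, value} = attempt_tokenize(token_types, code_string)
    base = byte_size(value)
    token = %Compiler.Token{type: type, value: value}

    binary_part(code_string, base, byte_size(code_string) - base)
    |> String.trim()
    |> call_tokenize(token_types, [token | accumulator])
  end

  defp attempt_tokenize([{type, re} | rest], code_string) do
    case Regex.run(re, code_string) do
      nil -> attempt_tokenize(rest, code_string)
      [value | _] -> {type, value}
    end
  end
end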
Answer 1 (score: 0)
I've refactored my earlier code example and I'm pretty happy with it now. I can still see some edge cases, such as the tokenizer never matching any regex, but I think I can work with this for now. (A sketch of guarding against that no-match case follows the code below.)
defmodule Compiler.Token do
  defstruct [:type, :value]
end

defmodule Compiler.Tokenizer do
  @token_types [
    {:def, "\\bdef\\b"},
    {:end, "\\bend\\b"},
    {:identifier, "\\b[a-zA-Z]+\\b"},
    {:integer, "\\b[0-9]+\\b"},
    {:oparen, "\\("},
    {:cparen, "\\)"}
  ]

  def tokenize(code_string) do
    call_tokenize(code_string, [])
  end

  def call_tokenize("", accumulator) do
    accumulator
  end

  def call_tokenize(code_string, accumulator) do
    {type, value} = attempt_tokenize(@token_types, code_string)
    base = byte_size(value)
    token = %Compiler.Token{type: type, value: value}

    binary_part(code_string, base, byte_size(code_string) - base)
    |> String.trim()
    |> call_tokenize(accumulator ++ [token])
  end

  def attempt_tokenize(token_types, code_string, index \\ 0) do
    {type, re} = Enum.at(token_types, index)

    case Regex.run(~r/\A#{re}/, code_string) do
      nil -> attempt_tokenize(token_types, code_string, index + 1)
      value -> {type, hd(value)}
    end
  end
end
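As a sketch of guarding against the no-match edge case mentioned above (assuming raising a descriptive error is the desired behaviour; the error message is my own invention), attempt_tokenize could check whether the token types have been exhausted before pattern matching:

def attempt_tokenize(token_types, code_string, index \\ 0) do
  case Enum.at(token_types, index) do
    # We ran out of token types without a match; fail loudly instead of
    # crashing with a confusing MatchError on nil.
    nil ->
      raise ArgumentError,
            "no token type matched at: #{inspect(binary_part(code_string, 0, min(10, byte_size(code_string))))}"

    {type, re} ->
      case Regex.run(~r/\A#{re}/, code_string) do
        nil -> attempt_tokenize(token_types, code_string, index + 1)
        value -> {type, hd(value)}
      end
  end
end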