Tokenizer in Elixir

Time: 2017-08-14 03:35:46

Tags: elixir

I'm following along with the recent Destroy All Software screencast on compilers. All of the example code is written in Ruby, but I'm trying to achieve the same output in Elixir.

In plain English, my approach is this:

  1. Pass a string of code to a recursive function (call_tokenizer)
  2. Loop through the token types and, for the current token type,
  3. run a regular expression against the string
  4. If a match is found, append the result to the accumulator, remove the match from the beginning of the string, and pass the remaining string and the accumulator back to call_tokenizer. Also, `:halt` the reduce_while loop and return the accumulated tokens.
  5. If no match is found, continue looping
I've written some Elixir code that works, in the sense that I get the expected output for my input. However, I think there's at least one problem with it.

I'm using Enum.reduce_while, but I don't think I'm using it the way it's intended to be used. My assumption is that there's a better way to rewrite the reduce_while as a recursive function.

If I had to sum it up in one question: how can I achieve the same result without relying on the halt/continue behaviour that reduce_while gives me? And should I be aware of any other problems in this code example?
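
For context, here's a minimal, standalone illustration (not from my tokenizer) of the halt/continue behaviour I'm relying on:

    # Enum.reduce_while keeps folding while the function returns {:cont, acc}
    # and stops as soon as it returns {:halt, value}.
    Enum.reduce_while([1, 2, 3, 4], 0, fn x, acc ->
      if x < 3, do: {:cont, acc + x}, else: {:halt, acc}
    end)
    #=> 3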

Here are my code and the expected output:

    # Code
    defmodule Compiler.Token do
      defstruct [:type, :value]
    end
    
    defmodule Compiler.Tokenizer do
      @token_types [
        {:def, "\\bdef\\b"},
        {:end, "\\bend\\b"},
        {:identifier, "\\b[a-zA-Z]+\\b"},
        {:integer, "\\b[0-9]+\\b"},
        {:oparen, "\\("},
        {:cparen, "\\)"}
      ]
    
      def tokenize() do
        code = "def f()
          1
        end"
    
        IO.inspect call_tokenize(code, [])
      end
    
      def call_tokenize("", accumulator) do
        accumulator
      end
    
      def call_tokenize(code, accumulator) do
        Enum.reduce_while(@token_types, "", fn {type, re}, acc ->
          result = Regex.run(~r/\A#{re}/, code)
    
          if result do
            value = hd(result)
            base = byte_size(value)
            token = %Compiler.Token{type: type, value: value}
            tokens = binary_part(code, base, byte_size(code) - base)
              |> String.trim()
              |> call_tokenize(accumulator ++ [token])
            {:halt, tokens}
          else
            {:cont, acc}
          end
        end)
      end
    end
    
    
    # Expected output
    [%Compiler.Token{type: :def, value: "def"},
     %Compiler.Token{type: :identifier, value: "f"},
     %Compiler.Token{type: :oparen, value: "("},
     %Compiler.Token{type: :cparen, value: ")"},
     %Compiler.Token{type: :integer, value: "1"},
     %Compiler.Token{type: :end, value: "end"}]    
    

2 answers:

Answer 0 (score: 3):

I know you've already figured out how to replace reduce_while with explicit recursion in your own answer. Here's a more idiomatic approach that you'll see in most hand-written tokenizers for Elixir and Erlang. It can be much faster than a naive regex-based tokenizer, and it also allows adding logic that a purely regex-based tokenizer can't handle (although you don't need that in this case).

Here's the code with some inline comments:

defmodule Compiler.Tokenizer.Dogbert do
  def tokenize(code), do: tokenize(code, [])

  # We're done. Reverse the tokens since we collected them in reverse order.
  defp tokenize("", acc), do: Enum.reverse(acc)
  # Remove leading whitespace.
  defp tokenize(<<h, rest::binary>>, acc) when h in ' \t\r\n', do: tokenize(rest, acc)
  # Identifier
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?a..?z do
    {value, rest} = take_while(binary, fn b -> b in ?a..?z end)
    type = case value do
      "def" -> :def
      "end" -> :end
      _ -> :identifier
    end
    tokenize(rest, [%Compiler.Token{type: type, value: value} | acc])
  end
  # Number
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?0..?9 do
    {value, rest} = take_while(binary, fn b -> b in ?0..?9 end)
    tokenize(rest, [%Compiler.Token{type: :integer, value: value} | acc])
  end
  # (
  defp tokenize("(" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :oparen, value: "("} | acc])
  # )
  defp tokenize(")" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :cparen, value: ")"} | acc])

  # A simple helper that extracts the leading part of the binary for as long as
  # `fun` returns `true` for each byte, starting from the first byte. It
  # returns the extracted part and the remaining binary.
  # We use a byte index to track the position for efficiency: accumulating
  # bytes into a list would allocate memory, which we can avoid by working
  # with byte offsets and doing a single `:binary.part`-style split at the end.
  defp take_while(binary, fun), do: take_while(binary, fun, 0)
  defp take_while(binary, fun, byte) do
    if byte < byte_size(binary) && fun.(:binary.at(binary, byte)) do
      take_while(binary, fun, byte + 1)
    else
      <<value::binary-size(byte), rest::binary>> = binary
      {value, rest}
    end
  end
end

Test:

code = "def f()
  1
end"

IO.inspect Compiler.Tokenizer.Dogbert.tokenize(code)

Output:

[%Compiler.Token{type: :def, value: "def"},
 %Compiler.Token{type: :identifier, value: "f"},
 %Compiler.Token{type: :oparen, value: "("},
 %Compiler.Token{type: :cparen, value: ")"},
 %Compiler.Token{type: :integer, value: "1"},
 %Compiler.Token{type: :end, value: "end"}]

Here's a benchmark, using benchee, comparing your implementation to mine. Your implementation has some easily fixed inefficiencies (such as rebuilding the Regex on every run and appending with ++; see the sketch after the benchmark results), but I'd expect a regex-based approach to always be slower than the one I used.

defmodule Compiler.Token do
  defstruct [:type, :value]
end

defmodule Compiler.Tokenizer.Dogbert do
  def tokenize(code), do: tokenize(code, [])

  # We're done. Reverse the tokens since we collected them in reverse order.
  defp tokenize("", acc), do: Enum.reverse(acc)
  # Remove leading whitespace.
  defp tokenize(<<h, rest::binary>>, acc) when h in ' \t\r\n', do: tokenize(rest, acc)
  # Identifier
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?a..?z do
    {value, rest} = take_while(binary, fn b -> b in ?a..?z end)
    type = case value do
      "def" -> :def
      "end" -> :end
      _ -> :identifier
    end
    tokenize(rest, [%Compiler.Token{type: type, value: value} | acc])
  end
  # Number
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?0..?9 do
    {value, rest} = take_while(binary, fn b -> b in ?0..?9 end)
    tokenize(rest, [%Compiler.Token{type: :integer, value: value} | acc])
  end
  # (
  defp tokenize("(" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :oparen, value: "("} | acc])
  # )
  defp tokenize(")" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :cparen, value: ")"} | acc])

  # A simple helper that extracts the leading part of the binary for as long as
  # `fun` returns `true` for each byte, starting from the first byte. It
  # returns the extracted part and the remaining binary.
  # We use a byte index to track the position for efficiency: accumulating
  # bytes into a list would allocate memory, which we can avoid by working
  # with byte offsets and doing a single `:binary.part`-style split at the end.
  defp take_while(binary, fun), do: take_while(binary, fun, 0)
  defp take_while(binary, fun, byte) do
    if byte < byte_size(binary) && fun.(:binary.at(binary, byte)) do
      take_while(binary, fun, byte + 1)
    else
      <<value::binary-size(byte), rest::binary>> = binary
      {value, rest}
    end
  end
end

defmodule Compiler.Tokenizer.PaulRuescher do
  @token_types [
    {:def, "\\bdef\\b"},
    {:end, "\\bend\\b"},
    {:identifier, "\\b[a-zA-Z]+\\b"},
    {:integer, "\\b[0-9]+\\b"},
    {:oparen, "\\("},
    {:cparen, "\\)"}
  ]

  def tokenize(code_string) do
    call_tokenize(code_string, [])
  end

  def call_tokenize("", accumulator) do
    accumulator
  end

  def call_tokenize(code_string, accumulator) do
    {type, value} = attempt_tokenize(@token_types, code_string)
    base = byte_size(value)
    token = %Compiler.Token{type: type, value: value}
    binary_part(code_string, base, byte_size(code_string) - base)
      |> String.trim()
      |> call_tokenize(accumulator ++ [token])
  end

  def attempt_tokenize(token_types, code_string, index \\ 0) do
    {type, re} = Enum.at(token_types, index)

    case Regex.run(~r/\A#{re}/, code_string) do
      nil -> attempt_tokenize(token_types, code_string, index + 1)
      value -> {type, hd(value)}
    end
  end
end

code = String.duplicate("def f()
  1
end", 1000)

IO.inspect Compiler.Tokenizer.PaulRuescher.tokenize(code) == Compiler.Tokenizer.Dogbert.tokenize(code)

Benchee.run(%{
  "@paulruescher" => fn -> Compiler.Tokenizer.PaulRuescher.tokenize(code) end,
  "@Dogbert" => fn -> Compiler.Tokenizer.Dogbert.tokenize(code) end,
})

Results:

true
...

Name                    ips        average  deviation         median
@Dogbert             442.18        2.26 ms    ±17.03%        2.43 ms
@paulruescher         11.78       84.92 ms     ±8.37%       83.67 ms

Comparison:
@Dogbert             442.18
@paulruescher         11.78 - 37.55x slower
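
A hypothetical sketch of those two fixes applied to the regex-based version (the module name Compiler.Tokenizer.Faster and its structure are mine, for illustration only): compile each regex once per tokenize call, and prepend tokens to the accumulator, reversing once at the end.

defmodule Compiler.Tokenizer.Faster do
  @token_types [
    {:def, "\\bdef\\b"},
    {:end, "\\bend\\b"},
    {:identifier, "\\b[a-zA-Z]+\\b"},
    {:integer, "\\b[0-9]+\\b"},
    {:oparen, "\\("},
    {:cparen, "\\)"}
  ]

  def tokenize(code_string) do
    # Compile each regex once, instead of on every attempt_tokenize step.
    compiled = Enum.map(@token_types, fn {type, re} -> {type, Regex.compile!("\\A" <> re)} end)
    call_tokenize(code_string, compiled, [])
  end

  # Tokens were prepended, so reverse once at the end.
  defp call_tokenize("", _compiled, accumulator), do: Enum.reverse(accumulator)

  defp call_tokenize(code_string, compiled, accumulator) do
    {type, value} = attempt_tokenize(compiled, code_string)
    base = byte_size(value)
    token = %Compiler.Token{type: type, value: value}

    binary_part(code_string, base, byte_size(code_string) - base)
    |> String.trim()
    # [token | accumulator] is O(1); accumulator ++ [token] is O(length).
    |> call_tokenize(compiled, [token | accumulator])
  end

  defp attempt_tokenize([{type, re} | rest], code_string) do
    case Regex.run(re, code_string) do
      nil -> attempt_tokenize(rest, code_string)
      [value | _] -> {type, value}
    end
  end
end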

Answer 1 (score: 0):

I refactored my earlier code example and I'm a lot happier with this version. I can see some edge cases, such as the tokenizer never matching any regex, but I think I can work with it for now. (One way to handle that edge case is sketched after the code.)

defmodule Compiler.Token do
  defstruct [:type, :value]
end

defmodule Compiler.Tokenizer do
  @token_types [
    {:def, "\\bdef\\b"},
    {:end, "\\bend\\b"},
    {:identifier, "\\b[a-zA-Z]+\\b"},
    {:integer, "\\b[0-9]+\\b"},
    {:oparen, "\\("},
    {:cparen, "\\)"}
  ]

  def tokenize(code_string) do
    call_tokenize(code_string, [])
  end

  def call_tokenize("", accumulator) do
    accumulator
  end

  def call_tokenize(code_string, accumulator) do
    {type, value} = attempt_tokenize(@token_types, code_string)
    base = byte_size(value)
    token = %Compiler.Token{type: type, value: value}
    binary_part(code_string, base, byte_size(code_string) - base)
      |> String.trim()
      |> call_tokenize(accumulator ++ [token])
  end

  def attempt_tokenize(token_types, code_string, index \\ 0) do
    {type, re} = Enum.at(token_types, index)

    case Regex.run(~r/\A#{re}/, code_string) do
      nil -> attempt_tokenize(token_types, code_string, index + 1)
      value -> {type, hd(value)}
    end
  end
end
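
A hypothetical way to handle that no-match edge case (these clauses are mine, not part of the answer): give attempt_tokenize an explicit head for its default argument, then add a clause that raises a descriptive error once every token type has been tried, instead of letting Enum.at/2 return nil and the {type, re} pattern fail with a MatchError:

def attempt_tokenize(token_types, code_string, index \\ 0)

# All token types have been tried without a match: fail loudly, with context.
def attempt_tokenize(token_types, code_string, index) when index >= length(token_types) do
  raise ArgumentError, "no token type matches input starting at #{inspect(String.slice(code_string, 0, 10))}"
end

def attempt_tokenize(token_types, code_string, index) do
  {type, re} = Enum.at(token_types, index)

  case Regex.run(~r/\A#{re}/, code_string) do
    nil -> attempt_tokenize(token_types, code_string, index + 1)
    value -> {type, hd(value)}
  end
end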