在clojure中缓慢的词法分析器

时间:2016-11-16 14:14:16

标签: clojure

我正在尝试用clojure编写一个简单的词法分析器。目前,它仅识别以空格分隔的标识符。

(refer 'clojure.set :only '[union])

(defn char-range-set
  "Returns the set of every character whose code lies in the inclusive
  range [from, to]. Yields an empty set when `from` sorts after `to`."
  [from to]
  (let [lo (int from)
        hi (int to)]
    (into #{} (map char) (range lo (inc hi)))))

;; Characters that may start an identifier: A-Z, a-z and the underscore.
(def ident-initial
  (conj (union (char-range-set \A \Z) (char-range-set \a \z)) \_))

;; Characters that may follow the first one: the initial set plus 0-9.
(def ident-subseq
  (into ident-initial (char-range-set \0 \9)))

(defn update-lex
  "Returns `lex` with `token` conj'ed onto its :tokens collection and its
  :source entry replaced by `source`."
  [lex token source]
  (-> lex
      (update :tokens conj token)
      (assoc :source source)))

(defn scan-identifier
  "Reads one identifier from the front of (:source lex).

  The first character must belong to `ident-initial` (checked with an
  assert). Following characters are collected for as long as they belong
  to `ident-subseq`. Returns `lex` with an {:type :identifier :value [chars]}
  token appended and :source set to the characters that were not consumed."
  [lex]
  (assert (ident-initial (first (:source lex))))
  (let [chars (:source lex)]
    (loop [remaining (rest chars)
           value     [(first chars)]]
      (let [c (first remaining)]
        (if (ident-subseq c)
          ;; `next` (not `rest`) so an exhausted input becomes nil
          (recur (next remaining) (conj value c))
          (update-lex lex {:type :identifier :value value} remaining))))))

(defn scan
  "Performs a single lexing step on `lex`, a map with :tokens and :source,
  and returns the updated map.

  * a whitespace character is dropped;
  * a character from `ident-initial` hands over to `scan-identifier`;
  * any other character is skipped, so the result is always a lex map and
    the tokens collected so far are never lost.

  Expects :source to be non-empty."
  [{[c & cs] :source :as lex}]
  (cond
    ;; `(char c)` yields a primitive char, letting the compiler pick
    ;; Character.isWhitespace(char) statically instead of via reflection.
    (Character/isWhitespace (char c)) (assoc lex :source cs)
    (ident-initial c)                 (scan-identifier lex)
    ;; Without this clause `cond` returns nil for an unrecognised character.
    :else                             (assoc lex :source cs)))

(defn tokenize
  "Lexes `source` by applying `scan` repeatedly, starting from an empty
  token vector, until no source is left. Returns the accumulated :tokens."
  [source]
  (loop [state {:tokens [] :source source}]
    (if (seq (:source state))
      (recur (scan state))
      (:tokens state))))

(defn measure-tokenizer
  "Builds an input of `n` copies of \"abcde \", tokenizes it under `time`
  (which prints the elapsed time) and returns the input length in
  characters."
  [n]
  (let [word  "abcde "
        input (clojure.string/join (repeat n word))]
    (time (tokenize input))
    (* n (count word))))

词法分析器处理约 600 万个字符,耗时约 15 秒。

=> (measure-tokenizer 1000000)
"Elapsed time: 15865.909399 msecs"

之后,我把所有的 map 和 vector 都换成了 transient(瞬态)版本,但性能没有任何改善。

另外,我用 C++ 实现了一个类似的算法,处理同样的输入只需 0.2 秒。

我的问题是:如何改进我的代码?也许我错误地使用了clojure数据结构?

更新

所以这是我的C ++代码。

#include <iostream>
#include <vector>
#include <chrono>
#include <unordered_set>
#include <cstdlib>
#include <string>
#include <cctype>
using namespace std;

/// A lexical token produced by Lexer.
struct Token
{
   /// Token categories; IDENTIFIER is the only one defined.
   enum { IDENTIFIER = 1 };
   int type;           ///< Category of the token (Token::IDENTIFIER).
   std::string value;  ///< Exact source text of the token.
};

/// Splits a string into identifier tokens.
///
/// An identifier starts with an ASCII letter or '_' and continues with ASCII
/// letters, digits or '_'. Whitespace separates tokens. Any other character
/// is skipped one at a time, so lexing always makes progress and terminates.
class Lexer
{
public:
   /// Keeps a private copy of `source`; lexing starts at its first character.
   Lexer(const std::string& source)
      : mSource(source)
      , mIndex(0)
   {
   }

   /// Consumes whatever input is left and returns a copy of all tokens
   /// collected so far. Calling it again returns the same tokens.
   std::vector<Token> tokenize()
   {
      while (mIndex < mSource.size())
      {
         scan();
      }

      return mResult;
   }

private:

   /// True for characters that may start an identifier: a-z, A-Z and '_'.
   static bool isIdentifierInitial(char c)
   {
      return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
   }

   /// True for characters that may continue an identifier: the initial
   /// characters plus 0-9.
   static bool isIdentifierSubsequent(char c)
   {
      return isIdentifierInitial(c) || (c >= '0' && c <= '9');
   }

   /// Performs one lexing step: skips whitespace, then either reads an
   /// identifier or steps over a single unrecognised character.
   void scan()
   {
      skipSpaces();

      if (mIndex >= mSource.size())
         return;

      if (isIdentifierInitial(mSource[mIndex]))
      {
         scanIdentifier();
      }
      else
      {
         // Must consume the character: leaving mIndex untouched here would
         // make tokenize() spin forever on input such as "1" or "+".
         ++mIndex;
      }
   }

   /// Reads the identifier starting at mIndex, appends it to the result and
   /// moves mIndex just past it.
   void scanIdentifier()
   {
      std::size_t end = mIndex;

      while ((end < mSource.size()) && isIdentifierSubsequent(mSource[end]))
         ++end;

      mResult.push_back(Token{Token::IDENTIFIER, mSource.substr(mIndex, end - mIndex)});
      mIndex = end;
   }

   /// Advances mIndex past any run of whitespace.
   void skipSpaces()
   {
      // std::isspace requires a value representable as unsigned char (or
      // EOF); a plain char may be negative, hence the cast.
      while ((mIndex < mSource.size()) &&
             std::isspace(static_cast<unsigned char>(mSource[mIndex])))
         ++mIndex;
   }

   std::string mSource;         // Copy of the text being lexed.
   std::size_t mIndex;          // Position of the next unread character.
   std::vector<Token> mResult;  // Tokens collected so far.
};

/// Builds a string of `n` copies of "jobbi ", tokenizes it with Lexer and
/// prints `n` followed by the elapsed tokenizing time in milliseconds.
/// Only the tokenize() call is timed; building the input and constructing
/// the Lexer are not.
void measureBigString(int n)
{
   using Clock = std::chrono::steady_clock;

   const std::string word = "jobbi ";
   std::string input;
   for (int remaining = n; remaining > 0; --remaining)
      input += word;

   Lexer lexer(input);

   const Clock::time_point started = Clock::now();
   lexer.tokenize();
   const Clock::time_point finished = Clock::now();

   const auto elapsedMs =
      std::chrono::duration_cast<std::chrono::milliseconds>(finished - started).count();

   std::cout << n << std::endl
             << "Time difference = " << elapsedMs << std::endl
             << "\n\n\n";
}



/// Runs the benchmark once with one million repetitions of the sample word.
int main()
{
   const int repetitions = 1000000;
   measureBigString(repetitions);
   return 0;
}

2 个答案:

答案 0 :(得分:2)

我没有看到这段代码有任何明显的错误。我认为 transient 不会对你有太大帮助,因为你并不是在批量加载数据,而是每次循环只更新一次(而且我怀疑那并不是真正最慢的部分)。

我猜测以下几处比较慢:

  • 检查字符是否属于某个集合(需要计算哈希并遍历内部的哈希树)。与其构建集合,不如编写直接对字符范围做基于 int 比较的函数(> 这个、< 那个,等等);这样写不那么漂亮,但几乎肯定会更快,尤其是当你注意使用原始类型提示并避免装箱成对象时。
  • 每次循环都要更新一个嵌套在 hashmap 中的值,这不会是最快的操作。如果把它单独保存为一个 transient 向量,会更快,还能避免重建树的上层节点。根据你愿意在多大程度上偏离惯用的 Clojure、转向 Java 的做法,你也可以使用可变的 ArrayList。这样做不够干净,但速度很快——只要限制好能接触到该可变状态的范围,我会考虑这种方案。从概念上讲,它和 transient 向量是一回事。

答案 1 :(得分:2)

**更新**

另一个重要的调整是矢量解构。通过替换这样的代码:

(let [[c & cs] xs] ...)

使用:

(let [c  (first xs)
      cs (rest xs)] ...)

还能再带来约 2 倍的性能提升。总计可获得约 26 倍的加速——这应该与 C++ 实现相当。

简而言之:

  1. 类型提示可以避免所有反射调用
  2. Record为您提供了对属性的优化访问/更新
  3. 避免用向量解构来取 first 和 rest——它会调用 nth / nthFrom,并对 seq 进行顺序访问。
  4. 希望矢量解构可以优化,以避免nthFrom这样的常见情况(在绑定中只有第一个和其余部分)。

    FIRST TUNING - 使用类型提示和记录:

    您还可以使用记录(record)而不是通用的 map:

    (refer 'clojure.set :only '[union])
    
    (defn char-range-set
      "Builds the set of characters from `from` through `to`, both inclusive."
      [from to]
      (->> (range (int from) (inc (int to)))
           (map char)
           set))
    
    ;; Identifier start characters: underscore plus the letters A-Z and a-z.
    (def ident-initial
      (union #{\_} (char-range-set \A \Z) (char-range-set \a \z)))
    
    ;; Identifier continuation characters: the start characters plus 0-9.
    (def ident-subseq
      (union (char-range-set \0 \9) ident-initial))
    
    ;; A lexed token: `type` is its category keyword (e.g. :identifier) and
    ;; `value` holds the characters that make it up.
    (defrecord Token [type value])
    ;; Lexer state: `tokens` collected so far and the `source` still to read.
    (defrecord Lex [tokens source])
    
    (defn update-lex
      "Returns `lex` with `token` conj'ed onto :tokens and :source replaced
      by `source`."
      [^Lex lex ^Token token source]
      (let [appended (update lex :tokens conj token)]
        (assoc appended :source source)))
    
    (defn scan-identifier
      "Reads one identifier from the front of (:source lex): takes the first
      character unconditionally, then keeps collecting while the next
      character is in `ident-subseq`. Returns `lex` with an :identifier Token
      appended and :source set to the unread remainder."
      [^Lex lex]
      (let [chars (seq (:source lex))]
        (loop [pending (next chars)
               value   [(first chars)]]
          (let [c (first pending)]
            (if (ident-subseq c)
              (recur (next pending) (conj value c))
              (update-lex lex (Token. :identifier value) pending))))))
    
    (defn scan
      "Performs a single lexing step on `lex` and returns the updated Lex.
    
      * a whitespace character is dropped;
      * a character from `ident-initial` hands over to `scan-identifier`;
      * any other character is skipped, so the result is always a Lex and
        the tokens collected so far are never lost.
    
      Expects :source to be non-empty."
      [^Lex lex]
      (let [[c & cs] (:source lex)]
        (cond
          (Character/isWhitespace ^char c) (assoc lex :source cs)
          (ident-initial c)                (scan-identifier lex)
          ;; Without this clause `cond` returns nil for an unrecognised
          ;; character.
          :else                            (assoc lex :source cs))))
    
    (defn tokenize
      "Lexes `source` by applying `scan` to a fresh Lex until its :source is
      exhausted, then returns the accumulated :tokens."
      [source]
      (loop [state (Lex. [] source)]
        (if (seq (:source state))
          (recur (scan state))
          (:tokens state))))
    
    (use 'criterium.core)
    
    (defn measure-tokenizer
      "Builds an input of `n` copies of \"abcde \", benchmarks `tokenize` on
      it with criterium's `bench` and returns the input length in characters."
      [n]
      (let [word  "abcde "
            input (clojure.string/join (repeat n word))]
        (bench (tokenize input))
        (* n (count word))))
    
    (measure-tokenizer 1000)
    

    使用 criterium 测得的结果:

    Evaluation count : 128700 in 60 samples of 2145 calls.
                 Execution time mean : 467.378916 µs
        Execution time std-deviation : 329.455994 ns
       Execution time lower quantile : 466.867909 µs ( 2.5%)
       Execution time upper quantile : 467.984646 µs (97.5%)
                       Overhead used : 1.502982 ns
    

    与原始代码相比:

    Evaluation count : 9960 in 60 samples of 166 calls.
                 Execution time mean : 6.040209 ms
        Execution time std-deviation : 6.630519 µs
       Execution time lower quantile : 6.028470 ms ( 2.5%)
       Execution time upper quantile : 6.049443 ms (97.5%)
                       Overhead used : 1.502982 ns
    

    优化版本大致 x13 加速。当n = 1,000,000时,现在需要约0.5秒。