提高java解析器性能

时间:2016-07-18 12:13:20

标签: java parsing javacc

我有一个用.jjt文件编写的解析器,它与this非常相似。在我的情况下,唯一的改动是加入了我自己的表达式求值方法。目前,解析1个表达式大约需要1毫秒,我需要提高这个解析器的性能。我使用VisualVM进行了性能分析:我的代码总共运行了44.5秒,其中包括将文件的各行读入ArrayList<String>,并使用我的解析器对93个表达式求值(表达式中的参数值取自ArrayList<String>中的文件行);其中大约43秒花在了parseStream方法上。我已经参考了this link来改进我的解析器,也尝试过将ERROR_REPORTING选项设置为FALSE,但都没有帮助。

编辑1:

这是parser.jjt文件。我用一个包含1000行的文件对应用程序进行了性能分析,耗时最多的方法是Start()。

// JavaCC / JJTree generation options for this grammar.
options {
    JAVA_UNICODE_ESCAPE = true;             // pre-process \uXXXX escapes in the input before tokenizing
    MULTI = true;                           // JJTree: one node class per node name (e.g. ASTStart)
    VISITOR = true;                         // JJTree: generate the visitor interface and accept hooks
    VISITOR_EXCEPTION = "ParseException";   // exception type declared on the generated visitor methods
    NODE_DEFAULT_VOID = true;               // productions build no node unless annotated with #Name
//  NODE_PACKAGE = "org.nfunk.jep.parser";
//  BUILD_NODE_FILES=false;
    STATIC = false;                         // generate an instance-based (non-static) parser
    // Debug switches, left disabled:
//  DEBUG_TOKEN_MANAGER = true;
//  DEBUG_PARSER = true;
//  DEBUG_LOOKAHEAD = true;
}


/***************************************************************
PARSER BEGIN
***************************************************************/

PARSER_BEGIN(Parser)
package org.nfunk.jep;

import java.util.Vector;
import org.nfunk.jep.function.*;
import org.nfunk.jep.type.*;

/**
 * User-code section of the generated expression parser.
 *
 * Holds the JEP instance the parser works for, plus the symbol table and
 * operator set taken from it, and offers entry points to parse a whole stream
 * or to continue parsing a stream statement by statement.
 */
public class Parser {
    /** Letters that may follow a backslash in a string literal. */
    private static final String ESCAPE_LETTERS = "tnrbf\\\"'";
    /** The character each escape letter stands for, position-matched with ESCAPE_LETTERS. */
    private static final String ESCAPE_VALUES = "\t\n\r\b\f\\\"'";

    private JEP     jep;
    private SymbolTable symTab;
    private OperatorSet opSet;
    private int initialTokenManagerState = DEFAULT;

    /**
     * Parses one expression from the given stream.
     *
     * @param stream source of the expression text
     * @param jep_in JEP instance supplying the symbol table, operators and functions
     * @return the first child of the Start node, i.e. the root of the expression tree
     * @throws ParseException on a syntax error, or if Start() returned null
     *         because no expression was entered
     */
    public Node parseStream(java.io.Reader stream, JEP jep_in)
                            throws ParseException {
        restart(stream, jep_in);
        // Parse the expression and return the node below the Start node.
        // NOTE(review): JavaCC normally generates enable_tracing() as an empty
        // method unless the grammar is built with DEBUG_PARSER - confirm before removing.
        enable_tracing();
        Node node = Start();
        if (node == null) throw new ParseException("No expression entered");
        return node.jjtGetChild(0);
    }

    /**
     * Restart the parse with the given stream: re-initialises the parser on
     * the stream, switches the token manager to the configured initial state
     * and picks up the symbol table and operator set of the given JEP instance.
     *
     * @param stream source of the text to parse
     * @param jep_in JEP instance to parse for
     * @since 2.3.0 beta 1
     */
    public void restart(java.io.Reader stream, JEP jep_in)
    {
        ReInit(stream);
        this.token_source.SwitchTo(initialTokenManagerState);
        jep = jep_in;
        symTab = jep.getSymbolTable();
        opSet = jep.getOperatorSet();
    }

    /**
     * Continue parsing without re-initialising the stream.
     * Allows reentrancy of the parser so that strings like
     * "x=1; y=2; z=3;" can be parsed.
     * When a semicolon is encountered parsing finishes, leaving the rest of the string unparsed.
     * Parsing can be resumed from the current position by using this method.
     * For example
     * <pre>
     * XJep j = new XJep();
     * Parser parse = j.getParse();
     * StringReader sr = new StringReader("x=1; y=2; z=3;");
     * parse.restart(sr,j);
     * Node node;
     * try {
     * while((node = j.continueParse())!=null) {
     *    j.println(node);
     * } }catch(ParseException e) {}
     * </pre>
     *
     * @return the root of the next expression, or null if Start() found no expression
     * @throws ParseException on a syntax error
     */
    public Node continueParse() throws ParseException
    {
        ASTStart node = Start();
        if (node == null) return null;
        return node.jjtGetChild(0);
    }

    /** Appends a message to the error list of the current JEP instance. */
    private void addToErrorList(String errorStr) {
        jep.errorList.addElement(errorStr);
    }

    /**
     * Sets the initial state that the token manager is in.
     * Can be used to change how x.x is interpreted, either as a single
     * identifier (DEFAULT) or as x <DOT> x (NO_DOT_IN_IDENTIFIERS)
     * @param state the state to be in. Currently the only legal values are DEFAULT and NO_DOT_IN_IDENTIFIERS
     */
    public void setInitialTokenManagerState(int state)
    {
        initialTokenManagerState = state;
    }

    /**
     * Translate all escape sequences to characters. Inspired by Rob Millar's
     * unescape() method in rcm.util.Str from the Web Sphinx project.
     *
     * A backslash followed by an unknown character is copied through unchanged,
     * and so is a lone backslash at the very end of the input.
     *
     * @param inputStr String containing escape characters.
     * @return String with all escape sequences replaced.
     */
    private String replaceEscape(String inputStr) {
        int i = inputStr.indexOf('\\');
        if (i == -1) {
            // Fast path: nothing to translate, so no copy is needed.
            return inputStr;
        }

        int len = inputStr.length();
        int p = 0; // start of the text not yet copied to the output
        StringBuilder output = new StringBuilder(len);

        // Stop at a trailing lone backslash (i + 1 == len); it is copied with the tail below.
        while (i != -1 && i + 1 < len) {
            output.append(inputStr, p, i);

            char metac = inputStr.charAt(i + 1);
            int k = ESCAPE_LETTERS.indexOf(metac);
            if (k == -1) {
                // Unknown escape: leave the sequence as found. Should be
                // unreachable when the lexer only accepts known escapes.
                output.append('\\').append(metac);
            } else {
                output.append(ESCAPE_VALUES.charAt(k));
            }

            // skip over both the backslash and the character after it
            p = i + 2;
            i = inputStr.indexOf('\\', p);
        }

        // Copy the remaining tail exactly once.
        output.append(inputStr, p, len);
        return output.toString();
    }
}

PARSER_END(Parser)

/***************************************************************
SKIP
***************************************************************/

// Input discarded in every lexical state (<*>): whitespace and both comment forms.
<*> SKIP :
{
  " "
  | "\t"
  | "\n"
  | "\r"

  // Single-line comment: "//" up to and including the line terminator.
  // The terminator is mandatory, so a "//" comment that ends at EOF is not matched here.
  | <"//" (~["\n","\r"])* ("\n"|"\r"|"\r\n")>
  // Block comment: "/*" ... "*/".
  | <"/*" (~["*"])* "*" (~["/"] (~["*"])* "*")* "/">
}


/***************************************************************
TOKENS
***************************************************************/

// Literal tokens, recognised in every lexical state.
// Names prefixed with '#' are private helper expressions, not tokens of their own.
<*> TOKEN : /* LITERALS */
{
    // Unsigned integer: one or more decimal digits.
    < INTEGER_LITERAL:
        <DECIMAL_LITERAL>
    >
|
    < #DECIMAL_LITERAL: ["0"-"9"] (["0"-"9"])* >
|
    // Unsigned floating point: "1.", "1.5", ".5", "1e3", each with an optional exponent
    // (the exponent is required for the form without a decimal point).
    < FLOATING_POINT_LITERAL:
        (["0"-"9"])+ "." (["0"-"9"])* (<EXPONENT>)?
        | "." (["0"-"9"])+ (<EXPONENT>)?
        | (["0"-"9"])+ <EXPONENT>
    >
|
    < #EXPONENT: ["e","E"] (["+","-"])? (["0"-"9"])+ >
|
    // Double-quoted string on a single line; a backslash must be followed by one of
    // n t b r f \ ' " (any other escape makes the token fail to match).
    < STRING_LITERAL:
        "\""
        ( (~["\"","\\","\n","\r"])
        | ("\\" ["n","t","b","r","f","\\","'","\""] )
        )*
        "\""
    >
}

/* IDENTIFIERS 

    Letters before version 2.22
    < #LETTER: ["_","a"-"z","A"-"Z"] >

    In Ver 2.3.0.1 presence of . in an identifier is switchable.
    In the DEFAULT lexical state identifiers can contain a .
    In the NO_DOT_IN_IDENTIFIERS state identifiers cannot contain a .
    the state can be set by using
    Parser.setInitialTokenManagerState
*/

// Identifiers in the DEFAULT lexical state: a letter followed by letters,
// digits or dots, so "x.y" is a single identifier token in this state.
<DEFAULT> TOKEN:
{
    <INDENTIFIER1: <LETTER1>(<LETTER1>|<DIGIT1>|".")*>
    |
    // Characters allowed to start (and continue) an identifier.
    < #LETTER1:
    [
        "\u0024",           // $
        "\u0041"-"\u005a",  // A - Z
        "\u005f",           // _
        "\u0061"-"\u007a",  // a - z
        "\u00c0"-"\u00d6",  // Upper case symbols of Latin-1 Supplement
        "\u00d8"-"\u00f6",  // Lower case symbols of Latin-1 Supplement
        "\u00f8"-"\u00ff",  // More lower case symbols of Latin-1 Supplement
        "\u0100"-"\u1fff",  // Many languages (including Greek)
        "\u3040"-"\u318f",  // Hiragana, Katakana, Bopomofo, Hangul Compatibility Jamo
        "\u3300"-"\u337f",  // CJK Compatibility
        "\u3400"-"\u3d2d",  // CJK Unified Ideographs Extension A
        "\u4e00"-"\u9fff",  // CJK Unified Ideographs
        "\uf900"-"\ufaff"   // CJK Compatibility Ideographs
    ]
    >   
|
    < #DIGIT1: ["0"-"9"] >
}

// Identifiers in the NO_DOT_IN_IDENTIFIERS lexical state: same alphabet as
// INDENTIFIER1 but without ".", so "x.y" is lexed as x <DOT> y in this state.
<NO_DOT_IN_IDENTIFIERS> TOKEN:
{
    <INDENTIFIER2: <LETTER2>(<LETTER2>|<DIGIT2>)*>
    |
    // Characters allowed to start (and continue) an identifier.
    < #LETTER2:
    [
        "\u0024",           // $
        "\u0041"-"\u005a",  // A - Z
        "\u005f",           // _
        "\u0061"-"\u007a",  // a - z
        "\u00c0"-"\u00d6",  // Upper case symbols of Latin-1 Supplement
        "\u00d8"-"\u00f6",  // Lower case symbols of Latin-1 Supplement
        "\u00f8"-"\u00ff",  // More lower case symbols of Latin-1 Supplement
        "\u0100"-"\u1fff",  // Many languages (including Greek)
        "\u3040"-"\u318f",  // Hiragana, Katakana, Bopomofo, Hangul Compatibility Jamo
        "\u3300"-"\u337f",  // CJK Compatibility
        "\u3400"-"\u3d2d",  // CJK Unified Ideographs Extension A
        "\u4e00"-"\u9fff",  // CJK Unified Ideographs
        "\uf900"-"\ufaff"   // CJK Compatibility Ideographs
    ]
    >   
    |
    < #DIGIT2: ["0"-"9"] >
}

/* OPERATORS */
// Operator and punctuation tokens, recognised in every lexical state.
<*> TOKEN:
{
    < ASSIGN:"="  > // rjm
|   < SEMI: ";" >   // rjm - ends an expression, see Start()
|   < COMMA: "," >  // rjm - separates list elements and function arguments
|   < GT:   ">"  >
|   < LT:   "<"  >
|   < EQ:   "==" >
|   < LE:   "<=" >
|   < GE:   ">=" >
|   < NE:   "!=" >
|   < AND:  "&&" >
|   < OR:   "||" >
|   < PLUS: "+"  >
|   < MINUS:"-"  >
|   < MUL:  "*"  >
|   < DOT:  "."  >  // rjm - dot operator, see MultiplicativeExpression()
|   < DIV:  "/"  >
|   < MOD:  "%"  >
|   < NOT:  "!"  >
|   < POWER:"^"  >
|   < CROSS:"^^" > // rjm - cross operator, see MultiplicativeExpression()
|   < LSQ:  "["  >  // rjm
|   < RSQ:  "]"  >  // rjm
|   < LRND: "("  >  // rjm
|   < RRND: ")"  >  // rjm
|   < COLON: ":" >  // rjm - only referenced by the commented-out RangeExpression
}


/***************************************************************
GRAMMAR START
***************************************************************/

/*
 * Entry production. Parses one expression terminated by end of input or a
 * semicolon and returns the Start node that wraps it; returns null when the
 * terminator is found with no expression in front of it.
 */
ASTStart Start() #Start :
{
}
{
    Expression() ( <EOF> | <SEMI> ) { return jjtThis; }
    |  ( <EOF> | <SEMI> )
    {
        // njf - The next line is commented out in 2.3.0 since
        //       two "No expression entered" errors are reported
        //       in EvaluatorVisitor and Console (one from here
        //       the other from ParseStream() )
        //       Decided to just return null, and handle the error
        //       in ParseStream.
        // addToErrorList("No expression entered");
        return null;
    }
}

// Expressions can be like
// x=3
// x=y=3 parsed as x=(y=3)
//
// An assignment is chosen only when the input starts with an LValue followed
// by "="; everything else is parsed as a RightExpression.

void Expression() : {}
{
    LOOKAHEAD(LValue() <ASSIGN>)        // need to prevent javacc warning with left recursion
    AssignExpression() // rjm changes from OrExpression
    |
    RightExpression()
}

// Assignment: LValue "=" Expression, built as a two-child FunNode carrying the
// assign operator. The right-hand side is a full Expression, which is what
// makes x=y=3 parse as x=(y=3).
void AssignExpression() : {} // rjm addition
{

    ( LValue() <ASSIGN> Expression()
        {
            // Assignment is syntactically accepted but rejected here when the
            // JEP instance has it switched off.
            if (!jep.getAllowAssignment()) throw new ParseException(
            "Syntax Error (assignment not enabled)");

            jjtThis.setOperator(opSet.getAssign());
        }
      )
      #FunNode(2)
}

// Any expression that is not an assignment; simply delegates to the
// lowest-precedence operator level (logical OR).
void RightExpression() :
{
}
{
    OrExpression()
}

// Logical OR level: AndExpression ( "||" AndExpression )*.
// Each "||" wraps the two most recent nodes in a FunNode, so chains
// associate to the left.
void OrExpression() :
{
}
{
    AndExpression()
    (
      ( <OR> AndExpression()
        {
            jjtThis.setOperator(opSet.getOr());
        }
      ) #FunNode(2)
    )*
}


// Logical AND level: EqualExpression ( "&&" EqualExpression )*.
// Each "&&" wraps the two most recent nodes in a FunNode (left-associative).
void AndExpression() :
{
}
{
    EqualExpression()
    (
      ( <AND> EqualExpression()
        {
            jjtThis.setOperator(opSet.getAnd());
        }
      ) #FunNode(2)
    )*
}



// Equality level: RelationalExpression ( ("!=" | "==") RelationalExpression )*.
// Each operator wraps the two most recent nodes in a FunNode (left-associative).
void EqualExpression() :
{
}
{
    RelationalExpression()
    (
      ( <NE> RelationalExpression()
        {
        jjtThis.setOperator(opSet.getNE());
        }
      ) #FunNode(2)
    |
      ( <EQ> RelationalExpression()
        {
          jjtThis.setOperator(opSet.getEQ());
        }
      ) #FunNode(2)
    )*
}



// Relational level: AdditiveExpression ( ("<" | ">" | "<=" | ">=") AdditiveExpression )*.
// Each operator wraps the two most recent nodes in a FunNode (left-associative).
void RelationalExpression() :
{
}
{
  AdditiveExpression()
  (
    ( <LT> AdditiveExpression()
      {
        jjtThis.setOperator(opSet.getLT());
      }
    ) #FunNode(2)
    |
    ( <GT> AdditiveExpression()
      {
        jjtThis.setOperator(opSet.getGT());
      }
    ) #FunNode(2)
    |
    ( <LE> AdditiveExpression()
      {
        jjtThis.setOperator(opSet.getLE());
      }
    ) #FunNode(2)
    |
    ( <GE> AdditiveExpression()
      {
        jjtThis.setOperator(opSet.getGE());
      }
    ) #FunNode(2)
  )*
}


// Additive level: MultiplicativeExpression ( ("+" | "-") MultiplicativeExpression )*.
// Each operator wraps the two most recent nodes in a FunNode (left-associative).
void AdditiveExpression() :
{
}
{
  MultiplicativeExpression()
  (
    ( <PLUS> MultiplicativeExpression()
      {
        jjtThis.setOperator(opSet.getAdd());
      }
    ) #FunNode(2)
    |
    ( <MINUS> MultiplicativeExpression()
      {
        jjtThis.setOperator(opSet.getSubtract());
      }
    ) #FunNode(2)
  )*
}


// Multiplicative level: a UnaryExpression followed by any number of
//   - an adjacent PowerExpression (implicit multiplication, e.g. "2 x"),
//   - "*", ".", "^^", "/" or "%" and another UnaryExpression.
// Each alternative wraps the two most recent nodes in a FunNode (left-associative).
void MultiplicativeExpression() :
{
}
{
  UnaryExpression()
  (
    (       
      PowerExpression()
      {
        // Juxtaposition is always accepted by the grammar; it is rejected
        // here when implicit multiplication is switched off.
        if (!jep.implicitMul) throw new ParseException(
            "Syntax Error (implicit multiplication not enabled)");

        jjtThis.setOperator(opSet.getMultiply());
      }
    ) #FunNode(2)
    |
    ( <MUL> UnaryExpression()
      {
        jjtThis.setOperator(opSet.getMultiply());
      }
    ) #FunNode(2)
    |
    ( <DOT> UnaryExpression()
      {
        jjtThis.setOperator(opSet.getDot());
      }
    ) #FunNode(2)
    |
    ( <CROSS> UnaryExpression()
      {
        jjtThis.setOperator(opSet.getCross());
      }
    ) #FunNode(2)
    |
    ( <DIV> UnaryExpression()
      {
        jjtThis.setOperator(opSet.getDivide());
      }
    ) #FunNode(2)
    |
    ( <MOD> UnaryExpression()
      {
        jjtThis.setOperator(opSet.getMod());
      }
    ) #FunNode(2)
  )*
}


// Prefix operators. Unary "+" is accepted but builds no node of its own;
// unary "-" and "!" each wrap their operand in a one-child FunNode.
// Anything else is a PowerExpression.
void UnaryExpression() :
{
}
{
  ( <PLUS> UnaryExpression())
|
  ( <MINUS> UnaryExpression()
    {
      jjtThis.setOperator(opSet.getUMinus());
    }
  ) #FunNode(1)
|
  ( <NOT> UnaryExpression()
    {
      jjtThis.setOperator(opSet.getNot());
    }
  ) #FunNode(1)
|
  PowerExpression()
}


// Power level: a primary optionally followed by "^" and a UnaryExpression.
// The exponent may carry a sign ("2^-3"), and because UnaryExpression leads
// back to PowerExpression, "a^b^c" groups as a^(b^c).
void PowerExpression() :
{
}
{
  UnaryExpressionNotPlusMinus()
  [
  ( <POWER> UnaryExpression()
    {
      jjtThis.setOperator(opSet.getPower());
    }
  ) #FunNode(2)
  ]
}


// Primary expressions, tried in this order:
//   1. a string or numeric constant,
//   2. an array access (identifier followed by a bracketed list),
//   3. a function call - only when the identifier is a key of jep.funTab,
//   4. a plain variable,
//   5. a parenthesised expression,
//   6. a bracketed list.
// The order matters: alternatives 2-4 all start with an identifier token.
void UnaryExpressionNotPlusMinus() :
{
}
{
    AnyConstant()
    |
    // NOTE(review): this is an unbounded syntactic lookahead that scans a whole
    // ArrayAccess before committing; it runs for every identifier. If profiling
    // points here, a bounded lookahead on the first two tokens may be enough -
    // verify error behaviour on malformed input before changing it.
    LOOKAHEAD(ArrayAccess())
    ArrayAccess()
    |
    // Semantic lookahead: treat the identifier as a function name only when it
    // is registered in the function table.
    LOOKAHEAD({ (getToken(1).kind == INDENTIFIER1 || getToken(1).kind == INDENTIFIER2) &&
                  jep.funTab.containsKey(getToken(1).image) })
    Function()
    |
    Variable()
    |
    <LRND> Expression() <RRND>
    |
//  LOOKAHEAD(<LSQ> Expression() <COLON>)
//  RangeExpression()
//  |
    ListExpression()
}

// Bracketed list "[e1, e2, ...]" with at least one element. The FunNode is
// opened for the whole production, so the list operator is set up front and
// every element expression becomes a child.
void ListExpression() #FunNode:
{
    jjtThis.setOperator(opSet.getList());
}
{
    <LSQ> Expression() ( <COMMA> Expression() )* <RSQ> 
}

/*
void RangeExpression()  #FunNode:
{
    jjtThis.setOperator(opSet.getRange());
}
{
    <LSQ> Expression() ( <COLON> Expression() )+ <RSQ>
}
*/

// Left-hand side of an assignment: an array access if one can be scanned
// completely, otherwise a plain variable.
void LValue() :
{
}
{
    LOOKAHEAD(ArrayAccess())
    ArrayAccess()
    |   Variable()
}

// Element access "name[...]": a variable followed by a bracketed list,
// combined into a two-child FunNode carrying the element operator.
void ArrayAccess() : 
{
}
{
    Variable() ListExpression()
    {
     jjtThis.setOperator(opSet.getElement());
    } #FunNode(2)

}
// Variable reference, built as a VarNode.
// A known symbol is bound directly; an unknown one is created on the fly when
// undeclared variables are allowed, and reported as an error otherwise (in
// which case no variable is attached to the node).
void Variable() :
{
    String name = "";
}
{
    (
        name = Identifier()
        {
            if (symTab.containsKey(name)) {
                jjtThis.setVar(symTab.getVar(name));
            } else if (jep.allowUndeclared) {
                jjtThis.setVar(symTab.makeVarIfNeeded(name));
            } else {
                addToErrorList("Unrecognized symbol \"" + name +"\"");
            }
        }
    ) #VarNode
}



// Function call "name(arg, ...)", built as a FunNode whose children are the
// argument expressions. The function is resolved through jep.funTab; an
// unknown name is reported to the error list and the arguments are still
// parsed (checked against a required count of 0).
void Function() :
{
    int reqArguments = 0;
    String identString = "";
    PostfixMathCommandI pfmc = null;
}
{
    ( identString = Identifier()
        {
            // One table lookup instead of containsKey() followed by two get() calls.
            pfmc = (PostfixMathCommandI) jep.funTab.get(identString);
            if (pfmc != null) {
                //Set number of required arguments
                reqArguments = pfmc.getNumberOfParameters();
                jjtThis.setFunction(identString, pfmc);
            } else {
                addToErrorList("!!! Unrecognized function \"" + identString +"\"");
            }
        }

        <LRND> ArgumentList(reqArguments, identString) <RRND>

    ) #FunNode
}

// Optional comma-separated argument list of a function call.
// Counts the arguments and reports a mismatch to the error list:
//   reqArguments == -1  -> the function itself decides whether the count is legal,
//   otherwise           -> the count must equal reqArguments.
void ArgumentList(int reqArguments, String functionName) :
{
    int argCount = 0;
}
{
    [
        Expression() { argCount++; }
        ( <COMMA> Expression() { argCount++; } )*
    ]
    {
        if (reqArguments == -1) {
            PostfixMathCommandI command = (PostfixMathCommandI) jep.funTab.get(functionName);
            if (!command.checkNumberOfParameters(argCount)) {
                addToErrorList("Function \"" + functionName +"\" illegal number of arguments " + argCount);
            }
        } else if (reqArguments != argCount) {
            String plural = (reqArguments != 1) ? "s" : "";
            addToErrorList("Function \"" + functionName +"\" requires "
                           + reqArguments + " parameter" + plural);
        }
    }
}



// Matches an identifier token of either lexical state and returns its text.
String Identifier() :
{
  Token ident;
}
{
    (
        ident = <INDENTIFIER1>
      |
        ident = <INDENTIFIER2>
    )
    { return ident.image; }
}


// Literal constant, built as a Constant node whose value is either the
// unescaped text of a string literal or the number produced by RealConstant().
void AnyConstant() #Constant:
{
    Token t;
    Object value;
}
{
    t=<STRING_LITERAL> {
        // strip away double quotes at end of string
        String temp = (t.image).substring(1,t.image.length()-1);

        // replace escape characters
        temp = replaceEscape(temp);

        jjtThis.setValue(temp);
    }
    |
    value = RealConstant() {
        // value is null when the number could not be created (see RealConstant)
        jjtThis.setValue(value);
//  }
//  |
//  value = Array() {
//      jjtThis.setValue(value);
    }
}

/*
Vector Array() :
{
    Object value;
    Vector result = new Vector();
}
{
    <LSQ>
    value = RealConstant()
    {
        result.addElement(value);
    }
    (
        <COMMA>
        value = RealConstant()
        {
            result.addElement(value);
        }
    )* 
    <RSQ>
    {
        return result;
    }
}
*/




// Numeric literal (integer or floating point). The token text is handed to the
// number factory of the JEP instance; if that throws, the failure is added to
// the error list and null is returned instead of a number.
Object RealConstant() :
{
  Token literal;
}
{
    ( literal = <INTEGER_LITERAL> | literal = <FLOATING_POINT_LITERAL> )
    {
        try {
            return jep.getNumberFactory().createNumber(literal.image);
        } catch (Exception e) {
            addToErrorList("Can't parse \"" + literal.image + "\"");
            return null;
        }
    }
}

1 个答案:

答案 0 :(得分:1)

在分析时应该找出瓶颈所在:parseStream花费的时间最多,但具体是其中的哪个函数呢?有些分析器会以图形方式把这些关键热点用红色标出。

引起我注意的是replaceEscape

private static final String METACHARS = "tnrbf\\\"'";
private static final String CHARS = "\t\n\r\b\f\\\"'";

private String replaceEscape(String inputStr) {
    int i = inputStr.indexOf('\\');
    if (i == -1) { // 1. Heuristic strings without backslash
        return inputStr;
    }
    int len = inputStr.length();
    int p = 0;
    StringBuilder output = new StringBuilder(); // 2. Faster StringBuilder

    while (i != -1) {
        if (i + 1 == len) break;

        if (p < i) output.append(inputStr.substring(p, i));
        p = i + 1;
        char metac = inputStr.charAt(i+1);

        // find the index of the metac
        int k = METACHARS.indexOf(metac);
        if (k != -1) {
            // its corresponding true char
            metac = CHARS.charAt(k));   
            ++p; // Start copying after metachar
        }
        output.append(metac);
    }

    // add the end of the input string to the output
    if (p < len)
        output.append(inputStr.substring(p));
    return output.toString();
}