使用Antlr4导致奇怪的输入不匹配

时间:2014-07-21 08:44:15

标签: sql parsing antlr

我目前正在尝试实现一个能够解析db2 create table语句的DB2解析器。因此,我使用了官方的IBM语法DB2-Syntax (IBM)

到目前为止,这是我的语法:

grammar db2; 

// Start rule: the input is a single sql_expression.
list
    : sql_expression
    ;

// One or more CREATE TABLE statements, each terminated by a semicolon.
sql_expression
    : (create_statement SEMICOLON)+
    ;


/* ------------------------------------------------------------------------------------------------
                                                TABLES MAIN
-------------------------------------------------------------------------------------------------*/

// CREATE TABLE <name> <body>, followed by optional trailing clauses that must
// appear in exactly this order when present.
create_statement
    : 'CREATE' 'TABLE'
      tableName=table_name
      tableMainBlock=table_main_block
      initialLoggin=initial_loggin?
      volatilityLabel=volatility?
      rcdfmtLabel=rcdfmt?
      clauseLabel=clause?
      inTS=in_ts?                                   #createTable
    ;

// IN "<name>": the double-quoted name is captured as tsName.
in_ts
    : 'IN' '"' tsName=NAME '"'
    ;

table_main_block:
    LEFT_PAREN tableParens=table_parens  RIGHT_PAREN    #tableMainParens
    | tableLike=table_like  #createTableLike
    | asSubqueryClause=as_subquery_clause   #tableAsSubqueryClause  
;

// Contents of the parentheses: one or more column lists, LIKE clauses or
// table-level constraints, in any order and with no separator between them.
table_parens
    : ( column_defs=column_definitions
      | tableLike=table_like
      | tableUnique=table_unique
      | tableRef=table_ref
      | tableCheck=table_check
      )+                                            #tableParens
    ;

// NOTE(review): the single literal looks like a stub for the real
// AS (subquery) syntax; confirm before relying on it.
as_subquery_clause
    : 'TABLESUBQUERYCLAUSE'
    ;
// Table name with an optional double-quoted qualifier followed by a dot;
// the table part itself is either double-quoted or bare.
table_name
    : ('"' dbName=NAME '"' '.')? '"' tableName=NAME '"'    #tableNameBunnys
    | ('"' dbName=NAME '"' '.')? tableName=NAME            #tableName
    ;

// Name of the table referenced by LIKE, either double-quoted or bare.
like_table_name
    : '"' tableName=NAME '"'    #likeTableNameBunnys
    | tableName=NAME            #likeTableName
    ;

// LIKE <table> or LIKE <view> <copy options>. The two name rules accept the
// same tokens, so only the trailing copy_options tells the alternatives apart.
table_like
    : 'LIKE' likeTableName=like_table_name                          #tableLikeTable
    | 'LIKE' likeViewName=like_view_name copyOptions=copy_options   #tableLikeView
    ;

// Name of the view referenced by LIKE, either double-quoted or bare.
like_view_name
    : '"' viewName=NAME '"'     #viewNameBunnys
    | viewName=NAME             #viewName
    ;

// The whole phrase is one literal token, so it only matches when the three
// words are separated by exactly one space each.
initial_loggin
    : 'NOT LOGGED INITIALLY'    #initialLogging
    ;

// VOLATILE or NOT VOLATILE, each optionally followed by cardinality_volatility.
// 'NOT VOLATILE' is a single literal token (exactly one space between the words).
volatility
    : 'NOT VOLATILE'                            #nVolatile
    | 'NOT VOLATILE' cardinality_volatility     #NVolatileCard
    | 'VOLATILE'                                #Volatile
    | 'VOLATILE' cardinality_volatility         #VolatileCard
    ;

// NOTE(review): the single literal looks like a stub for the real
// cardinality clause; confirm.
cardinality_volatility
    : 'CARDVOL'
    ;

// RCDFMT followed by a format name.
rcdfmt
    : 'RCDFMT' rcdfmt_format_name   #rcdfmtFormatName
    ;

// NOTE(review): matches only the fixed literal below, which looks like a stub
// for a real format name; confirm.
rcdfmt_format_name
    : 'RCDFMTNAME'
    ;

// Either a distribution clause or a partitioning clause.
clause
    : distribution_clause
    | partitioning_clause
    ;

// NOTE(review): the single literal looks like a stub for the real
// distribution clause; confirm.
distribution_clause
    : 'DISTRIBUTIONCLAUSE'      #distributionClause
    ;

// NOTE(review): the single literal looks like a stub for the real
// partitioning clause; confirm.
partitioning_clause
    : 'PARTITIONINGCLAUSE'      #partitioningClause
    ;

// NOTE(review): matches only this exact literal, dots included; it looks like
// a stub for the real copy options. Confirm.
copy_options
    : 'EXCLUDING IDENTITY...'   #copyOptionsExclude
    ;

// NOTE(review): the single literal looks like a stub for the real
// table-level unique constraint; confirm.
table_unique
    : 'UNIQUECONSTRAINT'
    ;

// Table-level reference: the single literal 'FOREIGN KEY', or CONSTRAINT
// followed by a constraint name.
table_ref
    : 'FOREIGN KEY'                     #foreignKeyRef
    | 'CONSTRAINT' constraint_name      #constraintRef
    ;

// A constraint name is a bare NAME token.
constraint_name
    : NAME
    ;

// NOTE(review): the single literal looks like a stub for the real
// table-level check constraint; confirm.
table_check
    : 'CHECKCONSTRAINT'
    ;


/* -------------------------------------------------------------------------------------------------------
                                                 COLUMNS                            
--------------------------------------------------------------------------------------------------------- */

// Comma-separated list of one or more column definitions.
column_definitions
    : column (COMMA column)*    #colDefs
    ;

// A single column: name, optional FOR COLUMN clause, data type, then any
// number of column options.
column
    : colName=col_name
      forColumn=col_for_column?
      colType=col_type
      colOptions=col_options*       #col
    ;

// Column name, either double-quoted or bare.
col_name
    : '"' colName=NAME '"'      #colNameBunnys
    | colName=NAME              #colName
    ;

// FOR [COLUMN] <system column name>.
col_for_column
    : 'FOR' 'COLUMN'? systemColumnName=col_system_column_name   #colForSysColName
    ;

// NOTE(review): matches only the fixed literal below, which looks like a stub
// for a real system column name; confirm.
col_system_column_name
    : 'COLSYSTEMCOLUMNNAME'
    ;

// One column option. 'NOT NULL', 'GENERATED ALWAYS' and 'GENERATED BY DEFAULT'
// are single literal tokens, so they need exactly one space between the words.
col_options
    : 'NOT NULL'                                                                                #colUnique
    | col_default_clause                                                                        #colDefaultClause
    | ('GENERATED ALWAYS' | 'GENERATED BY DEFAULT') IdentityOptions=col_options_identity_options?   #colGenAlwaysDefault
    | col_datalink_options                                                                      #colDataLinkOptions
    | col_constraint                                                                            #colConstraint
    ;

// [WITH] DEFAULT, optionally followed by a default value: a number, a name,
// a special register keyword, or a cast function applied to one of those.
//
// A decimal written without spaces (e.g. 1.5) is tokenized as one FNUMBER,
// because that lexer rule matches more characters than NUMBER does, so FNUMBER
// is accepted here. NUMBER DOT NUMBER is kept for the spaced form (1 . 5),
// which still arrives as three separate tokens.
col_default_clause
    : 'WITH'? 'DEFAULT'
      ( NUMBER
      | FNUMBER
      | NUMBER DOT NUMBER
      | NAME
      | 'USER'
      | 'NULL'
      | 'CURRENT_DATE'
      | 'CURRENT_TIME'
      | 'CURRENT_TIMESTAMP'
      | cast_function_name LEFT_PAREN
            ( NUMBER | NAME | 'USER' | 'CURRENT_DATE' | 'CURRENT_TIME' | 'CURRENT_TIMESTAMP' )
        RIGHT_PAREN
      )?
    ;


// NOTE(review): the single literal looks like a stub for the real
// identity options; confirm.
col_options_identity_options
    : 'COLOPTIONSIDENTITYOPTIONS'
    ;

// NOTE(review): the single literal looks like a stub for the real
// datalink options; confirm.
col_datalink_options
    : 'COLDATALINKOPTIONS'
    ;

col_constraint:
    ('CONSTRAINT' constraintName=NAME)? (constrType1='PRIMARY KEY'|constrType2='UNIQUE'|constrType3=references_clause|constrType4='CHECK' LEFT_PAREN conCondition=check_condition RIGHT_PAREN)  #colConstraintDef
;

// NOTE(review): the single literal looks like a stub for the real
// REFERENCES clause; confirm.
references_clause
    : 'REFCLAU'
    ;

// NOTE(review): the single literal looks like a stub for a real
// check condition; confirm.
check_condition
    : 'CHECKCONDI'
    ;

// NOTE(review): matches only the fixed literal below, which looks like a stub
// for real cast function names; confirm.
cast_function_name
    : 'CFN'
    ;




/* -----------------------------------------------------------------------------------------------------
                                                DATATYPES
------------------------------------------------------------------------------------------------------*/
// Column data type: dispatches to the rule for each type family;
// DATALINK and ROWID are handled inline.
col_type
    : col_type_simple                                       #colTypeSimple
    | col_type_dec                                          #colTypeDec
    | col_type_float                                        #colTypeFloat
    | col_type_chars                                        #colTypeChars
    | col_type_graphic                                      #colTypeGraphic
    | col_type_binary                                       #colTypeBinary
    | col_type_date                                         #colTypeDate
    | 'DATALINK' NUMBER? allocate_clause? ccsid_clause?     #colTypeDatalink
    | 'ROWID'                                               #colTypeRowId
    ;

// ALLOCATE followed by an integer.
allocate_clause
    : 'ALLOCATE' NUMBER     #allocateClause
    ;

ccsid_clause:
'CCSID' NUMBER ('NORMALIZED'|'NOT NORMALIZED')? #ccsidClause
;

// Integer types that take no arguments.
col_type_simple
    : 'SMALLINT'    #colTypeSmallInt
    | 'INT'         #colTypeInt
    | 'BIGINT'      #colTypeBigInt
    | 'INTEGER'     #colTypeInteger
    ;

col_type_dec:
(dataType='NUMERIC'|(dataType='DECIMAL'|dataType='DEC')) (LEFT_PAREN dataTypePrecision=NUMBER (COMMA dataTypePrecision2=NUMBER)? RIGHT_PAREN)?  #colTypeDecimal
;

// Floating-point types: REAL, FLOAT[(n)] and DOUBLE [PRECISION].
col_type_float
    : 'REAL'                                                #colTypeReal
    | 'FLOAT' (LEFT_PAREN precision=NUMBER RIGHT_PAREN)?    #colTypeFloatDef
    | 'DOUBLE' 'PRECISION'?                                 #colTypeDoubleDef
    ;

col_type_chars:
('CHAR'|'CHARACTER')  (LEFT_PAREN precision=NUMBER RIGHT_PAREN)?    (col_type_chars_mixed)? #colTypeChar
| ('VARCHAR'|'CHAR' 'VARYING'|'CHARACTER' 'VARYING') LEFT_PAREN precision=NUMBER RIGHT_PAREN (allocate_clause)?     col_type_chars_mixed #colTypeVarChar
| ('CHARACTER' 'LARGE OBJECT'|'CHAR' 'LARGE OBJECT'|'CLOB') ('(1M)'|NUMBER ('K'|'M'|'G'))? (allocate_clause)? col_type_chars_mixed  #colTypeCLOB
;

// Suffix of a character type: FOR BIT|SBCS|MIXED DATA, or a CCSID clause.
col_type_chars_mixed
    : 'FOR' ('BIT' | 'SBCS' | 'MIXED') 'DATA'   #colTypeCharBitType
    | ccsid_clause                              #colTypeCharCCSID
    ;

// Graphic string types: GRAPHIC, VARGRAPHIC / GRAPHIC VARYING and DBCLOB.
// NOTE(review): '(1)' and '(1M)' are inline literals, so the lexer emits each of
// them as one token instead of LEFT_PAREN NUMBER RIGHT_PAREN; this applies
// everywhere in the input, not only after GRAPHIC or DBCLOB. Likewise 'K', 'M'
// and 'G' become their own token types and are no longer tokenized as NAME.
col_type_graphic:
'GRAPHIC' ('(1)'|LEFT_PAREN NUMBER RIGHT_PAREN)? (ccsid_clause)?    #colTypeGraphicDef
| ('VARGRAPHIC'|'GRAPHIC' 'VARYING') LEFT_PAREN NUMBER RIGHT_PAREN (allocate_clause)? (ccsid_clause)? #colTypeVarGraphic
| 'DBCLOB' ('(1M)'| LEFT_PAREN NUMBER ('K'|'M'|'G') RIGHT_PAREN)? (allocate_clause)? (ccsid_clause)? #colTypeDBCLOB
;

// Binary string types: BINARY, VARBINARY / BINARY VARYING and BLOB.
// 'BINARY LARGE OBJECT' is a single literal token (exactly one space between words).
// NOTE(review): the inline literals '(1)', '(1M)', 'K', 'M' and 'G' each become
// their own token type in the lexer.
col_type_binary
    : 'BINARY' ('(1)' | LEFT_PAREN NUMBER RIGHT_PAREN)?                                                             #colTypeBinaryDef
    | ('VARBINARY' | 'BINARY' 'VARYING') LEFT_PAREN NUMBER RIGHT_PAREN allocate_clause?                             #colTypeVarBinary
    | ('BLOB' | 'BINARY LARGE OBJECT') ('(1M)' | LEFT_PAREN NUMBER ('K' | 'M' | 'G') RIGHT_PAREN)? allocate_clause? #colTypeBLOB
    ;

// Date/time types. TIME accepts only (0) and TIMESTAMP only (6) as an argument.
// NOTE(review): '0' and '6' are inline literals and therefore get their own
// token types, separate from NUMBER.
col_type_date
    : 'DATE'                                        #colTypeDateDef
    | 'TIME' (LEFT_PAREN '0' RIGHT_PAREN)?          #colTypeTime
    | 'TIMESTAMP' (LEFT_PAREN '6' RIGHT_PAREN)?     #colTypeTimestamp
    ;





/* LITERALS */

// Unsigned integer. Declared before NAME so that a digit-only word, which both
// rules match at the same length, is tokenized as NUMBER (first rule wins a tie).
NUMBER  :   (DIGIT)+;  
// Identifier made of letters, digits and underscores.
NAME: [a-zA-Z0-9_]+ ;
// Decimal number: digits, a dot, digits. On such input it matches more
// characters than NUMBER does, so the longest-match rule selects it.
FNUMBER: (DIGIT)+ DOT (DIGIT)+;
// Helper for the number rules; a fragment never becomes a token by itself.
fragment DIGIT :   '0'..'9' ;
// Punctuation tokens.
LEFT_PAREN : '(';
RIGHT_PAREN : ')';
COMMA : ',';
SEMICOLON : ';';
DOT :    '.';
// Tabs, spaces and line breaks are dropped and never reach the parser.
WS : ( '\t' | ' ' | '\r' | '\n')+ -> skip;

当我现在尝试解析像这样的字符列时

CREATE TABLE "TEST"."TEST2"  (
                  "IBMSNAP_OPERATION" CHAR(1));

Antlr导致错误

extraneous input '(1)' expecting {'FOREIGN KEY', 'CONSTRAINT', 'LIKE', 'UNIQUECONSTRAINT', 'UNIQUE', 'COLDATALINKOPTIONS', 'CHECKCONSTRAINT', 'CHECK', 'DEFAULT', 'GENERATED BY DEFAULT', 'NOT NULL', '"', 'WITH', 'PRIMARY KEY', 'REFCLAU', 'GENERATED ALWAYS', NAME, ')', ','}

但是尝试

CREATE TABLE "TEST"."TEST2"  (
                  "IBMSNAP_OPERATION" CHAR(3));

工作得很好! 谁知道这个奇怪的问题?

THX!

1 个答案:

答案 0 :(得分:0)

问题是你的语法包含内联文字'(1)',例如:

col_type_graphic:
    'GRAPHIC' ('(1)'...

正因为如此,词法分析器会将 (1) 视为单个词法符号,于是 CHAR(1) 中的 (1) 就无法再按 ( NUMBER ) 来解析了。

这并不是你唯一的问题,你的语法中还有很多类似的问题。例如,规则 col_type_graphic 包含 'K'|'M'|'G',这会把字母 K、M 和 G 声明为特殊的词法符号(即把它们当作关键字);这样一来,例如一旦你想把某一列命名为 K,就会遇到问题。

解决方案是把这类规则作为 fragment 规则提取到词法分析器语法中,这样词法分析器就会知道 K 本身并不是一个词法符号,只有与规则的其余部分组合在一起时才构成符号。但这并不简单,因为外层的规则不是词法规则而是解析器规则,所以你必须把它拆分成若干词法子规则和一个大的解析器规则。

总而言之,在Antlr中运行此语法并生成有意义的解析树将是一项相当大的工作量。它的问题在于它包含许多不是关键字的文字,但是当你将它们定义为文字时,Antlr(以及许多其他解析器生成器)会将它们视为关键字。