疑似发现Java正则表达式的BUG

时间:2013-11-22 21:42:48

标签: java regex

我们似乎发现了java正则表达式的错误。

我们正在尝试匹配一行中出现两次的“月份+年份”模式,或者一行中只出现两个年份的模式。

但是Java似乎把年份的一部分与月份和年份之间的分隔符混为一谈。我向我的教授展示了这个问题,但我们都无法解决它。

具体来说,我们要匹配“1/2013 - 2014年1月”,以及“2013 - 2014”。对于“2013”,其中的0被匹配成了月份和年份之间的分隔符,尽管0并不在分隔符模式中。所以“2013”最终得到了与“2/13”相同的匹配结果(月份为2,年份为13)。代码如下:

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.GregorianCalendar;
    import java.util.HashMap;
    import java.util.concurrent.CountDownLatch;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    import org.w3c.dom.css.Counter;

    public class DatePattens {
        //private ArrayList<MatchedDateObject> arryLstOfDates = new ArrayList<MatchedDateObject>();
        private ArrayList<String> matchedString = new ArrayList<String>();
        private HashMap<String,Integer> map ;


        private String monthPattern = "((0[1-9]|1[012]|[1-9])|(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sept|Sep|Oct|Nov|Dec)[a-z]*)";  // 3 groups
        private String monthAndYearSeperator="\\s*(\\s*|,|;|~|--|-|.|\\/)\\s*";      // 1 group
        private String twoOrFourDigitYearPattern="(19[0-9]{2}|[2-9][0-9]{3}|[0-9]{2})\\s*";         // 1 group  
        private String presentPattern = "(Current|Present|Now|Currently|Presently|Till Date|Todate|Today)";
        private String twoDatesSeperator = "\\s*(\\s*|-|~|--|,|to|til|till|until)\\s*";    // 1 group
        private String twoOrFourDigitOrPresentYearPattern = presentPattern + "|" + twoOrFourDigitYearPattern;  // 2 groups
        private String secondIdenticalMonthPattern="(([1-9]|0[1-9]|1[012])|(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sept|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December))";  // 3 groups
        private String dateToDateCompletePatternOne=
        monthPattern + monthAndYearSeperator + twoOrFourDigitYearPattern + twoDatesSeperator +  
        "((" + secondIdenticalMonthPattern +
        monthAndYearSeperator +
        twoOrFourDigitYearPattern +")|" +
        presentPattern +")" 
    ;               
        private Pattern patternAry = null;
        private Matcher matcher = null;
        public DatePattens() {
            map = new HashMap<String,Integer>();
            patternAry = Pattern.compile(dateToDateCompletePatternOne, Pattern.CASE_INSENSITIVE);
            matcher = patternAry.matcher("");   
        }
        //
        // extract the two dates to look for duration afterwards
        // 1. check if the a year pattern exists
        //    1.1 if not skip to else at the end and return false
        // 2. if yes get the rest of the line past year 1
        // 3. check for year 2 or CURRENT/Present/...

        public boolean matchTwoYearPattern(String inputLine){
            String fname="matchTwoYearPattern";
            Pattern firstYearPattern = Pattern
                    .compile(twoOrFourDigitYearPattern,Pattern.CASE_INSENSITIVE);
            Matcher matcher1 = firstYearPattern.matcher("");


            Pattern secondPattern = Pattern.compile(twoOrFourDigitOrPresentYearPattern,
                    Pattern.CASE_INSENSITIVE);
            Matcher matcher2 = secondPattern.matcher("");
            //long startTime = System.currentTimeMillis();

            matcher1.reset(inputLine);
            if (matcher1.find()) {  // 1
                String remaingString = inputLine.substring(matcher1.end(),
                        inputLine.length());   // 2
                matcher2.reset(remaingString);
                if (matcher2.find()) {  // 3
                    return true;
                }

            }       
            return false;   // 1.1 and end 

        }
        public String matchAllDatePatterns(String line, int lineNum){
            String fname = "matchAllPatterns:: ";
             if (matchTwoYearPattern(line) == false) {  // check if two years (or year and CURRENT/today...) present, if not return false
                 return("false:" + line);
             }
             else {
             }
            String matched = "";
            int i = 0;
                matcher.reset(line);
                if (matcher.find()) {// here we are matching the pattern dateToDateCompletePatternOne
                 System.out.println(fname + "line: " +line);
                    System.out.println("group count "+matcher.groupCount());                
                    System.out.println("group1 " +matcher.group(1));
                    System.out.println("group2 " +matcher.group(2));
                    System.out.println("group3 " +matcher.group(3));
                    System.out.println("group4 " +matcher.group(4));//so for 2013 - Jan 2013 input
                    //here matcher.group(4) is matching to 0 which we dont have in the pattern
                    System.out.println("group5 " +matcher.group(5));
                    System.out.println("group6 " +matcher.group(6));
                    System.out.println("group7 " +matcher.group(7));
                    System.out.println("group8 " +matcher.group(8));
                    System.out.println("group9 " +matcher.group(9));
                    System.out.println("group10 " +matcher.group(10));
                    System.out.println("group11 " +matcher.group(11));
                    System.out.println("group12 " +matcher.group(12));
                    System.out.println("group13 " +matcher.group(13));
                    System.out.println("group14 " + matcher.group(14));        
            }

                return matched;

        }
        public static void main(String args[]){
            DatePattens dp= new DatePattens();
            String fileName = "Resume.txt";

            try {
                ReadFile file = new ReadFile(fileName);
                String[] aryLines = file.openFile();
                int i=0;
                 long startTime =System.currentTimeMillis();


                    for (String input : aryLines) {
                        String output = dp.matchAllDatePatterns(input, i);
                        i++;
                    }

                long endTime =System.currentTimeMillis();
                System.out.println("Time required for this operation :" + ((endTime-startTime)*0.001));

            } catch (IOException e) {
                System.out.println(e);
            }

        }

    }

这个程序应该匹配由两个日期组成的模式,即先是一个带月份的日期,然后是另一个带月份的日期。但是当我给出像2013年1月到2014年1月这样的输入时,这个模式却把2匹配为月份,把0匹配为分隔符,把13匹配为年份。所以我的问题是:为什么它会把0匹配为分隔符?我的模式里并没有包含0。这是正则表达式的一个BUG吗?请帮帮我。

2 个答案:

答案 0 :(得分:8)

问题出在这一行,不是吗?

private String monthAndYearSeperator="\\s*(\\s*|,|;|~|--|-|.|\\/)\\s*";

您添加了一个点(.),它匹配任意字符——也包括“2013”中的数字0。只有转义后的点才只匹配点本身!

private String monthAndYearSeperator="\\s*(\\s*|,|;|~|--|-|\\.|\\/)\\s*";

答案 1 :(得分:2)

private String monthAndYearSeperator="\\s*(\\s*|,|;|~|--|-|.|\\/)\\s*";

问题1: .匹配任何字符而不是文字字符'.'。将其转换为\\.

问题2:括号内的 \\s*允许分隔符为空(匹配零个字符)。它应该是\\s+,这样至少需要一个空白字符。

修复.之后,0就不会再被当作分隔符。但是括号内的\\s*仍然允许空的分隔符,所以你也需要修复它,才能使分隔符模式正确匹配。

private String twoDatesSeperator = "\\s*(\\s*|-|~|--|,|to|til|till|until)\\s*";

这有同样的空白问题:中间\\s*应为\\s+