文本序列检测

时间:2016-05-19 22:21:30

标签: java regex

我正在尝试编写一个程序来识别给定输入文本中的文本模式。例如,我的文本将是这样的:

This is a test xa .. blah blah
This is a test xd .. blah blah.. 
This is a test x3 .. blah blah..
This is a test xa .. blah blah
This is a test xd .. blah blah.. 
This is a test x3 .. blah blah..
This is a test xa .. blah blah
This is a test xd .. blah blah.. 
This is a test x3 .. blah blah..
This is a test bc .. blah blah.. 
This is a test some more useless text..
This is a test x3 .. blah blah..
This is a test some more useless text..
This is a test xa .. blah blah
This is a test some more useless text..

我需要找出在连续各行中依次出现的 x'数字/字母' 序列,并且该序列整体会反复重复。因此,在上述例子中,序列是 xa, xd, x3,并且它重复了 3 次。在另一种情况下,可能是 x1, x2, x3, x4 重复 5 次。使用正则表达式可以解决这个问题吗?如果我编写 Java 程序,该如何高效地检测这种序列?

2 个答案:

答案 0 :(得分:2)

肯定可以。可以先从这个正则表达式开始尝试:.*xa.*\n.*xd.*\n.*x3.*\n

修改:

或者您可以尝试使用这个正则表达式:(.*x[0-9a-z].*\n)+

答案 1 :(得分:2)

我知道,为没有付出任何努力的人(你甚至没有回答我关于需求的提问)编写完整的解决方案并不符合 Stack Overflow 的风格。尽管如此,下面还是给出一个实现。

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

/**
 * Command-line entry point: feeds every line of a text file to a
 * {@link LineParser} and then tells the parser that the input is complete.
 */
public class RepeatingPatternMain {

    LineParser parser = new LineParser();

    /**
     * Reads the named file line by line, passes each line to the parser, and
     * calls {@code done()} on the parser after the reader has been closed.
     *
     * @param fileName path of the file to read
     * @throws IOException if the file cannot be opened or read
     */
    public RepeatingPatternMain(String fileName) throws IOException {
        try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
            for (String current = reader.readLine(); current != null; current = reader.readLine()) {
                parser.acceptLine(current);
            }
        }
        parser.done();
    }

    /**
     * Processes the file named by the single command-line argument, or prints
     * a usage message when the argument count is not exactly one.
     *
     * @param args command-line arguments; expected to hold one file name
     * @throws IOException if reading the file fails
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 1) {
            System.out.println("Usage: java RepeatingPatternMain <file>");
            return;
        }
        new RepeatingPatternMain(args[0]);
    }

}

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Classifies input lines by whether they contain an {@code x} followed by a
 * digit or lowercase letter, and forwards them to a
 * {@link RepeatingPatternRecognizer}.
 */
public class LineParser {

    // The leading ".*" is greedy, so when a line holds several "x<char>"
    // occurrences the captured group is the character after the last one.
    Pattern xPat = Pattern.compile(".*x([0-9a-z]).*");
    RepeatingPatternRecognizer rpr = new RepeatingPatternRecognizer();

    /**
     * Hands one line to the recognizer, either as a line with an x-marker
     * (together with the captured character) or as a line without one.
     *
     * @param line the line to classify
     */
    public void acceptLine(String line) {
        Matcher matcher = xPat.matcher(line);
        if (!matcher.matches()) {
            rpr.lineWithoutX(line);
            return;
        }
        String captured = matcher.group(1);
        // The capture group is a single character class, so exactly one char is expected.
        assert captured.length() == 1 : captured;
        rpr.lineWithX(captured.charAt(0), line);
    }

    /** Signals the end of input to the recognizer. */
    public void done() {
        rpr.finish();
    }

}

import java.util.ArrayList;
import java.util.List;

/**
 * Collects consecutive lines that carry an x-marker character and, whenever
 * the run is interrupted (or the input ends), searches the collected run for
 * a sequence of marker characters that repeats back-to-back at least
 * {@link #MIN_REPEATS} times. Each pattern found is printed to
 * {@code System.out}.
 */
public class RepeatingPatternRecognizer {

    /** Minimum number of complete repetitions required to report a pattern. */
    private static final int MIN_REPEATS = 3;

    /** Lines of the current uninterrupted run of x-marked lines. */
    List<LineForAnalysis> lines = new ArrayList<>();

    /** One buffered input line together with its marker character. */
    static class LineForAnalysis {
        final char charAfterX;
        final String line;

        public LineForAnalysis(char charAfterX, String line) {
            this.charAfterX = charAfterX;
            this.line = line;
        }
    }

    /**
     * Buffers a line that carries an x-marker.
     *
     * @param charAfterX the marker character that followed the {@code x}
     * @param line       the full text of the line
     */
    public void lineWithX(char charAfterX, String line) {
        lines.add(new LineForAnalysis(charAfterX, line));
    }

    /**
     * Ends the current run: the buffered lines are analyzed and discarded.
     *
     * @param line the line without a marker; it is not stored
     */
    public void lineWithoutX(String line) {
        analyzeAndClear();
    }

    /** Ends the input: the buffered lines are analyzed and discarded. */
    public void finish() {
        analyzeAndClear();
    }

    /**
     * Scans the buffered run from left to right, reporting every repeating
     * pattern found, then empties the buffer.
     */
    private void analyzeAndClear() {
        int start = 0;
        while (start < lines.size()) {
            int consumed = reportPatternAt(start);
            // Skip past a reported pattern; otherwise try the next index.
            start += (consumed > 0) ? consumed : 1;
        }
        lines.clear();
    }

    /**
     * Looks for a repeating pattern that begins at {@code start}, trying the
     * shortest candidate period first, and prints the first one that
     * qualifies.
     *
     * @param start index in {@code lines} where the pattern must begin
     * @return the number of lines covered by the reported repetitions, or
     *         {@code 0} if no pattern starts at {@code start}
     */
    private int reportPatternAt(int start) {
        for (int next = start + 1; next < lines.size(); next++) {
            if (markerAt(start) != markerAt(next)) {
                continue;
            }
            int patternLength = next - start;
            // Extend the run for as long as each line repeats the one a period earlier.
            int end = next + 1;
            while (end < lines.size() && markerAt(end) == markerAt(end - patternLength)) {
                end++;
            }
            int runLength = end - start;
            // ">=" so that exactly MIN_REPEATS complete repetitions qualify.
            if (runLength >= MIN_REPEATS * patternLength) {
                int repeats = runLength / patternLength;
                // a more elaborate solution may return the repeating pattern to the caller
                System.out.println("Found a pattern repeated " + repeats + " times");
                // Only whole repetitions are printed; a trailing partial one is left for rescanning.
                int reportedLength = repeats * patternLength;
                for (int i = start; i < start + reportedLength; i++) {
                    System.out.println(lines.get(i).line);
                }
                System.out.println();
                return reportedLength;
            }
        }
        return 0;
    }

    /** Returns the marker character of the buffered line at {@code index}. */
    private char markerAt(int index) {
        return lines.get(index).charAfterX;
    }

}