拆分包含转义分隔符的字符串

时间:2014-05-03 13:34:21

标签: java parsing escaping delimiter

分隔符是|

转义字符是\

字符串例如为"A|B\|C\\|D\\\|E|\\\\F"

我想获得数组: {"A", "B|C\", "D\|E", "\\F"}

因此可以转义分隔符,但也可以转义转义字符。有人知道如何用Java解析它吗?

感谢。

修改:我写了下面这个解决方案,至少它工作得很好,而且可以很方便地定义转义字符、分隔符以及是否删除空字符串。

解决方案(Eggyal 发布的方案更好,请见下文):

/**
 * Splits {@code string} at every unescaped occurrence of {@code delimiter} and removes one
 * level of backslash escaping from each resulting part.
 *
 * <p>A backslash makes the character that follows it literal, so an escaped delimiter (or an
 * escaped backslash) ends up in the part as a plain character. A backslash that is the very
 * last character of the input has nothing to escape and is kept as is.
 *
 * <p>The input is scanned once from left to right; the delimiter may be longer than one
 * character. A {@code null} or empty delimiter never matches, so the whole unescaped input is
 * returned as a single part.
 *
 * @param string      the text to split; must not be {@code null}
 * @param delimiter   the separator between parts
 * @param removeEmpty when {@code true}, empty parts are left out of the result
 * @return the unescaped parts, in the order they appear in {@code string}
 * @throws NullPointerException if {@code string} is {@code null}
 */
private List<String> parseString(String string, String delimiter, boolean removeEmpty) {
    final char escapingChar = '\\';
    final boolean canSplit = delimiter != null && delimiter.length() > 0;
    final int length = string.length();
    final List<String> parsed = new ArrayList<String>();
    final StringBuilder part = new StringBuilder();

    int i = 0;
    while (i < length) {
        final char current = string.charAt(i);
        if (current == escapingChar && i + 1 < length) {
            // Escape sequence: take the next character literally, whatever it is
            // (delimiter character, another backslash, or a line terminator).
            part.append(string.charAt(i + 1));
            i += 2;
        } else if (canSplit && string.startsWith(delimiter, i)) {
            // Unescaped delimiter: the current part is complete.
            if (!removeEmpty || part.length() > 0) {
                parsed.add(part.toString());
            }
            part.setLength(0);
            i += delimiter.length();
        } else {
            // Ordinary character, or a dangling backslash at the end of the input.
            part.append(current);
            i++;
        }
    }

    // The end of the input always terminates the last part.
    if (!removeEmpty || part.length() > 0) {
        parsed.add(part.toString());
    }
    return parsed;
}

3 个答案:

答案 0 :(得分:2)

因为您既要拆分字符串,又要去除转义(unescape),所以这两个过程需要分成单独的步骤:

// Step 1: split at each '|' that the lookbehind accepts, i.e. one preceded either by a
// non-backslash character, or by a non-backslash character followed by exactly two backslashes.
// NOTE: a '|' at the very start of the input, or one preceded by four or more backslashes,
// does not satisfy the lookbehind and is therefore not treated as a separator.
String[] terms = input.split("(?<=[^\\\\]|[^\\\\]\\\\\\\\)\\|");
// Step 2: drop one level of escaping in every term (backslash + character -> character).
for (int k = terms.length - 1; k >= 0; k--) {
    terms[k] = terms[k].replaceAll("\\\\(.)", "$1");
}

这是一些测试代码:

/**
 * Demo: splits a sample string on unescaped '|' characters, removes one level of backslash
 * escaping from each field, and prints the raw input followed by the resulting fields.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String raw = "A|B\\|C\\\\|D\\\\\\|E|\\\\\\\\F";

    // Split at each '|' preceded by a non-backslash, or by a non-backslash plus two backslashes.
    final String[] fields = raw.split("(?<=[^\\\\]|[^\\\\]\\\\\\\\)\\|");

    // Replace every backslash + character pair by the character alone.
    int slot = 0;
    for (String field : fields) {
        fields[slot++] = field.replaceAll("\\\\(.)", "$1");
    }

    System.out.println(raw);
    System.out.println(Arrays.toString(fields));
}

输出:

A|B\|C\\|D\\\|E|\\\\F
[A, B|C\, D\|E, \\F]

答案 1 :(得分:2)

/** Character that makes the character following it literal. */
static final char ESCAPING_CHAR = '\\';

/**
 * Splits {@code str} into parts at each unescaped {@code delimiter}, dropping the escape
 * character in front of every escaped character.
 *
 * <p>The delimiter test comes before the escape test. An escape character that is the last
 * character of the input has nothing to escape and is discarded. The end of the input always
 * closes the current part.
 *
 * @param str         the text to split
 * @param delimiter   the character separating the parts
 * @param removeEmpty when {@code true}, empty parts are not added to the result
 * @return the parts in input order
 * @throws IOException declared for compatibility with existing callers
 */
private List<String> parseString(final String  str,
                                 final char    delimiter,
                                 final boolean removeEmpty)
  throws IOException
{
  final List<String>  pieces  = new ArrayList<String>();
  final StringBuilder current = new StringBuilder();
  final int           length  = str.length();

  int pos = 0;
  while (pos < length) {
    char ch = str.charAt(pos++);

    if (ch == delimiter) {             // real delimiter => piece complete
      closePiece(current, pieces, removeEmpty);
      continue;
    }

    if (ch == ESCAPING_CHAR) {
      if (pos == length) {             // dangling escape at end of input: discard it
        break;
      }
      ch = str.charAt(pos++);          // escaped character is taken literally
    }

    current.append(ch);
  }

  closePiece(current, pieces, removeEmpty); // end of input closes the last piece
  return pieces;
}

/** Adds the collected piece to {@code pieces} (unless it is empty and unwanted) and resets it. */
private static void closePiece(final StringBuilder current,
                               final List<String>  pieces,
                               final boolean       removeEmpty)
{
  if (current.length() > 0 || !removeEmpty) {
    pieces.add(current.toString());
    current.setLength(0);
  }
}

答案 2 :(得分:0)

在 Java 中并不存在你提到的转义序列“\|”,在字符串字面量中这样写会导致编译时错误。