在自由格式的文本中,出现用引号括起的引文并不少见。假设我们想识别这些引文,即使它们嵌套在另一段引文之中。例如,假设有如下带嵌套引文的字符串:
一 二 “三 四” 五 六 “七 八 “九” 十 十一”
是否有一个 Java 正则表达式可以找出以下 3 组:[三 四]、[九]、[七 八 “九” 十 十一]?
答案 0 :(得分:0)
根据 Wiktor 的建议,我写出了下面的方案。虽然谈不上优雅,但似乎可以解决问题:
/**
 * Collects the spans delimited by single quotes ({@code '}) in {@code text}.
 *
 * <p>Delegates to {@code parseNestedQuotes}, passing {@code '\''} as the quote
 * character and <code>'{'</code> as the mark character.
 *
 * @param text the text to scan
 * @return whatever {@code parseNestedQuotes} returns for these arguments
 */
public List<String> parseNestedSingleQuotes(String text) {
    final char singleQuote = '\'';
    final char openMarker = '{';
    return parseNestedQuotes(text, singleQuote, openMarker);
}
/**
 * Collects the spans delimited by double quotes ({@code "}) in {@code text}.
 *
 * <p>Delegates to {@code parseNestedQuotes}, passing {@code '"'} as the quote
 * character and <code>'{'</code> as the mark character.
 *
 * @param text the text to scan
 * @return whatever {@code parseNestedQuotes} returns for these arguments
 */
public List<String> parseNestedDoubleQuotes(String text) {
    final char doubleQuote = '"';
    final char openMarker = '{';
    return parseNestedQuotes(text, doubleQuote, openMarker);
}
/**
 * Extracts every quoted span from {@code text}, including spans nested inside
 * other quoted spans.
 *
 * <p>Because the opening and closing delimiter are the same character, the two
 * are told apart by their neighbours: a quote immediately followed by a word
 * character ({@code \w}) is treated as an opening quote, and a quote
 * immediately preceded by a word character is treated as a closing quote.
 * Each closing quote is paired with the nearest still-unpaired opening quote
 * to its left, so inner spans are reported before the span that encloses them.
 * Closing quotes with no available opener are skipped, and opening quotes that
 * are never closed contribute nothing.
 *
 * <p>Note that {@code \w} is used without Unicode character-class support, so
 * only ASCII letters, digits and underscore count as word characters.
 *
 * @param text      the text to scan; {@code null} or empty yields an empty list
 * @param quoteChar the quote delimiter; it is matched literally even when it is
 *                  a regex metacharacter
 * @param markChar  retained only so existing callers keep compiling; it is not
 *                  used, because openers are tracked by index rather than by
 *                  overwriting characters (which misfired when the text itself
 *                  contained the mark character)
 * @return the contents of each matched span, without the surrounding quotes,
 *         in the order their closing quotes appear in {@code text}
 */
public List<String> parseNestedQuotes(String text, char quoteChar, char markChar) {
    List<String> groups = new ArrayList<String>();
    if (text == null || text.isEmpty()) {
        return groups;
    }

    // Quote the delimiter so characters such as '*' or '(' cannot alter the pattern.
    String quote = Pattern.quote(String.valueOf(quoteChar));

    // Pass 1: record the index of every opening quote, in ascending order.
    List<Integer> openers = new ArrayList<Integer>();
    Matcher m = Pattern.compile("(" + quote + ")\\w", Pattern.CASE_INSENSITIVE).matcher(text);
    while (m.find()) {
        openers.add(m.start());
    }

    // Pass 2: walk the closing quotes left to right. 'pending' is used as a stack of
    // openers that lie before the current closer and have not been paired yet.
    List<Integer> pending = new ArrayList<Integer>();
    int nextOpener = 0;
    m = Pattern.compile("\\w(" + quote + ")", Pattern.CASE_INSENSITIVE).matcher(text);
    while (m.find()) {
        // The match starts on the word character; the quote itself is one position later.
        int endIdx = m.start() + 1;

        // Only openers strictly before the closer are candidates. A quote that is both an
        // opener and a closer (e.g. the apostrophe in "don't") must not pair with itself.
        while (nextOpener < openers.size() && openers.get(nextOpener) < endIdx) {
            pending.add(openers.get(nextOpener));
            nextOpener++;
        }

        if (!pending.isEmpty()) {
            int startIdx = pending.remove(pending.size() - 1);
            groups.add(text.substring(startIdx + 1, endIdx));
        }
    }
    return groups;
}
/**
 * Finds the last occurrence of {@code markChar} strictly before {@code endIdx},
 * replaces it in place with {@code quoteChar}, and returns its index.
 *
 * <p>The array is scanned directly rather than being copied into a new
 * {@code String} on every call. The start position is clamped the same way
 * {@code String.lastIndexOf(int, int)} clamps its {@code fromIndex}: an
 * {@code endIdx} beyond the array searches the whole array, and an
 * {@code endIdx} of zero or less finds nothing.
 *
 * @param charArray the working buffer; modified when a mark is found
 * @param endIdx    exclusive upper bound of the search
 * @param quoteChar the character written over the mark that is found
 * @param markChar  the character to search for
 * @return the index of the mark that was replaced, or {@code -1} if there is none
 */
int unmarkLastIndexOf(char[] charArray, int endIdx, char quoteChar, char markChar) {
    for (int idx = Math.min(endIdx, charArray.length) - 1; idx >= 0; idx--) {
        if (charArray[idx] == markChar) {
            charArray[idx] = quoteChar;
            return idx;
        }
    }
    return -1;
}
以下是一些测试用例
/**
 * Exercises the single- and double-quote parsers on nested, unmatched and
 * quote-free inputs, printing each input and the groups found.
 *
 * <p>Each case verifies the exact groups returned (not just how many there
 * are) and fails with an {@link AssertionError} regardless of whether the JVM
 * was started with {@code -ea}.
 */
void test_parseNestedQuotes() {
    String input = "zero 'one two' three 'four five 'six seven' eight' nine";
    System.out.println("nested singleQuote input: " + input);
    List<String> groups = parseNestedSingleQuotes(input);
    System.out.println("nested singleQuote groups:");
    printListOfString(groups);
    // The inner span closes first, so it is reported before the span enclosing it.
    assertGroups(groups, "one two", "six seven", "four five 'six seven' eight");
    System.out.println("--------");

    input = "one two \"three four\" five six \"seven eight \"nine\" ten eleven\" twelve";
    System.out.println("nested doubleQuote input: " + input);
    groups = parseNestedDoubleQuotes(input);
    System.out.println("nested doubleQuote groups:");
    printListOfString(groups);
    assertGroups(groups, "three four", "nine", "seven eight \"nine\" ten eleven");
    System.out.println("--------");

    // The quote opened before "seven" is never closed, so it yields no group.
    input = "one two \"three four\" five six \"seven eight \"nine\" ten eleven twelve";
    System.out.println("nested doubleQuote input with unmatched pairs: " + input);
    groups = parseNestedDoubleQuotes(input);
    System.out.println("nested doubleQuote groups from unmatched pairs:");
    printListOfString(groups);
    assertGroups(groups, "three four", "nine");
    System.out.println("--------");

    input = "one two (three four) five six";
    System.out.println("no doubleQuote input with parens: " + input);
    groups = parseNestedDoubleQuotes(input);
    System.out.println("no doubleQuote groups from paren pairs:");
    printListOfString(groups);
    assertGroups(groups);
    System.out.println("--------");
}

/** Fails with an AssertionError unless {@code actual} holds exactly {@code expected}, in order. */
private void assertGroups(List<String> actual, String... expected) {
    if (actual.size() != expected.length) {
        throw new AssertionError(
                "expected " + expected.length + " groups but got " + actual.size() + ": " + actual);
    }
    for (int i = 0; i < expected.length; i++) {
        if (!expected[i].equals(actual.get(i))) {
            throw new AssertionError(
                    "group " + i + ": expected <" + expected[i] + "> but got <" + actual.get(i) + ">");
        }
    }
}
/**
 * Writes each element of {@code list} to standard output on its own line,
 * in list order.
 *
 * @param list the strings to print
 */
void printListOfString(List<String> list) {
    final int count = list.size();
    for (int i = 0; i < count; i++) {
        String entry = list.get(i);
        System.out.println(entry);
    }
}
输出
nested singleQuote input: zero 'one two' three 'four five 'six seven' eight' nine
nested singleQuote groups:
one two
six seven
four five 'six seven' eight
--------
nested doubleQuote input: one two "three four" five six "seven eight "nine" ten eleven" twelve
nested doubleQuote groups:
three four
nine
seven eight "nine" ten eleven
--------
nested doubleQuote input with unmatched pairs: one two "three four" five six "seven eight "nine" ten eleven twelve
nested doubleQuote groups from unmatched pairs:
three four
nine
--------
no doubleQuote input with parens: one two (three four) five six
no doubleQuote groups from paren pairs:
--------
有任何改进建议吗?