我的正则表达在最好的时候并不好,而且我已经在这几个小时里一直在努力。我想将一个句子解析为多数是单词的部分,但包括带小数和/或引用文本的数字。
我有一个试验台:
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Test {
public static void main(String[] args) {
String Ihave = "a.X='Foo123. != was here' and "
+ " T!= v or "
+ " cat <> dog and "
+ " x>-15 and "
+ " \"Peter and Paul\"=\"Mary\" and "
+ " y< 15.23 bah ";
String[] Iwant = {"a.X"
, "="
, "'Foo123. != was here'"
, "and" // accidently left off on the first stackoverflow posting
, "T"
, "!="
, "v"
, "or"
, "cat"
, "<>"
, "dog"
, "and"
, "x"
, ">"
, "-15"
, "and"
, "\"Peter and Paul\""
, "="
, "\"Mary\""
, "and"
, "y"
, "<"
, "15.23"
, "bah"};
String quotedtext = "((\"[^\"]*\"|'[^']*'))";
String nospaces = "[^\\s]";
String alphanumerics = "\\w";
String trythis = quotedtext + ""
+"|(<>)|(!=)" // group these pairs together
+"|("+nospaces+alphanumerics+"*)"
+"|(-\\.|[0-9])" // quoted blocks are ok - but the rest are individual characters
;
Pattern regex = Pattern.compile(trythis);
Matcher regexMatcher = regex.matcher(Ihave);
int x=0;
while (regexMatcher.find()) {
String parsed = regexMatcher.group();
if ( x<Iwant.length ) {
if ( Iwant[x].equals(parsed)) {
System.out.println(parsed);
}
else {
System.out.println(parsed+" but not as expected ("+Iwant[x]+")");
}
}
else {
System.out.println(parsed+" but not as expected");
}
x++;
}
System.out.println("\ndone");
}
}
当我运行它时,我得到以下内容:
a but not as expected (a.X)
.X but not as expected (=)
= but not as expected ('Foo123. != was here')
'Foo123. != was here' but not as expected (and)
and but not as expected (T)
T but not as expected (!=)
!= but not as expected (v)
v but not as expected (or)
or but not as expected (cat)
cat but not as expected (<>)
<> but not as expected (dog)
dog but not as expected (and)
and but not as expected (x)
x but not as expected (>)
> but not as expected (-15)
-15 but not as expected (and)
and but not as expected ("Peter and Paul")
"Peter and Paul" but not as expected (=)
= but not as expected ("Mary")
"Mary" but not as expected (and)
and but not as expected (y)
y but not as expected (<)
< but not as expected (15.23)
15 but not as expected (bah)
.23 but not as expected
bah but not as expected
done
虽然我对模式的最后一部分持怀疑态度,但除了全停/小数点(一直被视为单独的单词)之外,一切看起来都很好 - 如何解决这个问题(即如何获得“aX”)和“15.23”待在一起)?
我想我的正则表达式的重点在于不应将点视为分组中断。
任何帮助都将非常感激。 谢谢 甲