标题非常明确,下面是几个样本输入/输出。请注意,使用的正则表达式应该从字符串的开头到结尾匹配。
'abc' =~ 'abc' (match)
'abc' =~ 'a*bc' (match)
'aaaaaaabc' =~ 'c*bc' (no match)
'aaaaaaabc' =~ 'a.*bc' (match)
'abbbbaaaaaabc' =~ 'ab*a*b*c' (match)
'abbbbaaaaaabc' =~ 'ab*a*h*bc' (match)
'bbd' =~ 'b*bbd' (match)
'bbd' =~ '.*bbd' (match)
'bbd' =~ '.*cbd' (no match)
'' =~ '.*' (match)
我的实现位于:
https://github.com/jpbillaud/piexposed/blob/master/string/string_match_regexp.c
现在我想知道是否有人会考虑使用DP,有限自动机或其他任何方法来解决这个问题。
答案 0 :(得分:8)
请看一下Rob Pike在《The Practice of Programming》一书中给出的正则表达式匹配器实现。这段代码非常漂亮,只用35行C代码就能满足问题中的所有要求(甚至更多!)。以下代码引自上面提到的文章:
/*
 * Minimal backtracking regular-expression matcher.
 *
 * Supported syntax:
 *   c    any literal character c
 *   .    any single character
 *   ^    anchors the match at the beginning of the text (only as first char)
 *   $    anchors the match at the end of the text (only as last char)
 *   *    zero or more occurrences of the preceding character (or '.')
 *
 * The three functions below are mutually recursive, so matchhere and
 * matchstar are declared up front; without these prototypes the calls in
 * match/matchhere refer to undeclared functions. The strings are only read,
 * never written, hence the const-qualified pointers (which also lets string
 * literals be passed directly).
 */
int matchhere(const char *regexp, const char *text);
int matchstar(int c, const char *regexp, const char *text);

/*
 * match: search for regexp anywhere in text.
 * Returns 1 if some substring of text matches regexp, 0 otherwise.
 * Use a leading '^' and a trailing '$' to require a whole-string match.
 */
int match(const char *regexp, const char *text)
{
    if (regexp[0] == '^')
        return matchhere(regexp + 1, text);
    do {    /* must look even if string is empty */
        if (matchhere(regexp, text))
            return 1;
    } while (*text++ != '\0');
    return 0;
}

/*
 * matchhere: search for regexp at beginning of text.
 * Returns 1 if a prefix of text matches the whole of regexp, 0 otherwise.
 */
int matchhere(const char *regexp, const char *text)
{
    if (regexp[0] == '\0')
        return 1;                       /* pattern exhausted: success */
    if (regexp[1] == '*')
        return matchstar(regexp[0], regexp + 2, text);
    if (regexp[0] == '$' && regexp[1] == '\0')
        return *text == '\0';           /* '$' only matches at end of text */
    if (*text != '\0' && (regexp[0] == '.' || regexp[0] == *text))
        return matchhere(regexp + 1, text + 1);
    return 0;
}

/*
 * matchstar: search for c*regexp at beginning of text.
 * Tries the shortest run of c first and extends it one character at a time
 * until the remainder of the pattern matches or the run ends.
 */
int matchstar(int c, const char *regexp, const char *text)
{
    do {    /* a * matches zero or more instances */
        if (matchhere(regexp, text))
            return 1;
    } while (*text != '\0' && (*text++ == c || c == '.'));
    return 0;
}
答案 1 :(得分:0)
我之前从未尝试过写一个正则表达式,所以我想我会试一试。我省略了一些无聊的东西。这是我的(完全未经测试或编译)版本:
#include <algorithm>
#include <iterator>
#include <stdexcept>
#include <string>
#include <vector>

// Whole-string matcher for a tiny regular-expression dialect:
//   c   a literal character
//   .   any single character
//   *   zero or more repetitions of the preceding character (or '.')
// A pattern matches only if it consumes the entire input string.
class Regex {
 private:
  // One folded pattern element: `count` mandatory occurrences of `value`
  // ('.' meaning "any character"), optionally followed by any number of
  // additional occurrences when `repeats` is set.
  // Declared first because the private helper signatures below name it.
  struct Node {
    Node(char v, int c, bool r) : value(v), count(c), repeats(r) {}
    char value;
    int count;
    bool repeats;
  };

 public:
  // Compiles `pattern` into a list of Nodes.
  // Throws std::invalid_argument if the pattern starts with '*' or contains
  // two consecutive '*' characters.
  Regex(const std::string& pattern) {
    // Sanity check pattern:
    if ((!pattern.empty() && pattern[0] == '*') ||
        std::adjacent_find(pattern.begin(), pattern.end(), both_are_repeats) != pattern.end()) {
      throw std::invalid_argument("Regex: '*' must follow a character and must not be repeated");
    }
    for (std::string::const_iterator curr(pattern.begin()), end(pattern.end()); curr != end;) {
      const char current_match = *curr;
      ++curr;
      // Fold any number of the following characters that are current_match or '*'
      // into a single Node: e.g. "a*aa*" is equivalent to "a" followed by "a*".
      // Every '*' cancels one mandatory occurrence of the character before it.
      int stars = 0;
      int count = 1;
      for (; curr != end; ++curr) {
        if (*curr == current_match) {
          ++count;
        } else if (*curr == '*') {
          ++stars;
        } else {
          break;
        }
      }
      rewritten_pattern_.push_back(Node(current_match, count - stars, stars > 0));
    }
  }

  // Returns true if the whole of `value` matches the compiled pattern.
  bool matches(const std::string& value) const {
    return matches_internal(value.begin(), value.end(),
                            rewritten_pattern_.begin(), rewritten_pattern_.end());
  }

 private:
  // Matches [pattern_curr, pattern_end) against [value_curr, value_end).
  // Non-repeating Nodes are handled iteratively; a repeating Node recurses
  // once per candidate split point (backtracking).
  static bool matches_internal(std::string::const_iterator value_curr,
                               std::string::const_iterator value_end,
                               std::vector<Node>::const_iterator pattern_curr,
                               std::vector<Node>::const_iterator pattern_end) {
    for (; pattern_curr != pattern_end; ++pattern_curr) {
      const Node& node = *pattern_curr;
      const bool wildcard = node.value == '.';

      // First verify that the mandatory occurrences are present.
      if (std::distance(value_curr, value_end) < node.count) return false;
      std::string::const_iterator required_end = value_curr;
      std::advance(required_end, node.count);
      if (!wildcard) {
        for (; value_curr != required_end; ++value_curr) {
          if (*value_curr != node.value) return false;
        }
      }
      value_curr = required_end;

      if (!node.repeats) continue;

      // Find the furthest position the repetition could extend to.
      std::string::const_iterator longest = value_end;
      if (!wildcard) {
        longest = value_curr;
        while (longest != value_end && *longest == node.value) ++longest;
      }

      // Try every split from the longest run down to the empty run. The
      // empty run (split == value_curr) must be included, otherwise ".*"
      // could never match zero characters. Literal repeats backtrack too,
      // because a following '.' Node may need to take back one of the
      // characters (e.g. "a*." against "a").
      std::vector<Node>::const_iterator next_pattern = pattern_curr;
      ++next_pattern;
      for (std::string::const_iterator split = longest;; --split) {
        if (matches_internal(split, value_end, next_pattern, pattern_end)) return true;
        if (split == value_curr) return false;
      }
    }
    // After all the patterns are consumed, we only match if the value is consumed too.
    return value_curr == value_end;
  }

  // Predicate for adjacent_find: true when two neighbouring characters are both '*'.
  static bool both_are_repeats(char i, char j) {
    return i == '*' && j == '*';
  }

  std::vector<Node> rewritten_pattern_;
};