编写一个正则表达式字符串匹配函数,支持 '.'、'*' 和 '.*'

时间:2012-02-18 23:58:58

标签: c regex algorithm

标题非常明确,下面是几个样本输入/输出。请注意,使用的正则表达式应该从字符串的开头到结尾匹配。

'abc' =~ 'abc' (match)
'abc' =~ 'a*bc' (match)
'aaaaaaabc' =~ 'c*bc' (no match)
'aaaaaaabc' =~ 'a.*bc' (match)
'abbbbaaaaaabc' =~ 'ab*a*b*c' (match)
'abbbbaaaaaabc' =~ 'ab*a*h*bc' (match)
'bbd' =~ 'b*bbd' (match)
'bbd' =~ '.*bbd' (match)
'bbd' =~ '.*cbd' (no match)
'' =~ '.*' (match)

我的实现位于:

https://github.com/jpbillaud/piexposed/blob/master/string/string_match_regexp.c

现在我想知道是否有人会考虑使用DP,有限自动机或其他任何方法来解决这个问题。

2 个答案:

答案 0 :(得分:8)

请看 Rob Pike 编写的这个正则表达式匹配器实现,它出自《The Practice of Programming》一书。这是绝对漂亮的代码,只需 35 行 C 就可以满足问题中的所有要求(还有更多!)。以下引自上面提到的文章:

/*
 * Forward declarations. match() calls matchhere(), and matchhere() and
 * matchstar() call each other, so the callees must be declared before their
 * first use (implicit function declarations are invalid in C99 and in C++).
 */
int matchhere(const char *regexp, const char *text);
int matchstar(int c, const char *regexp, const char *text);

/*
 * match: search for regexp anywhere in text.
 *
 * Supported syntax: a literal character, '.' (any single character),
 * 'c*' (zero or more of the preceding character, including '.*'),
 * a leading '^' (anchor at the start of text) and a trailing '$'
 * (anchor at the end of text).
 *
 * Because the search is unanchored by default, wrap the pattern as
 * "^...$" to require that it matches the whole of text.
 *
 * Neither string is modified. Returns 1 on a match, 0 otherwise.
 */
int match(const char *regexp, const char *text)
{
    if (regexp[0] == '^')
        return matchhere(regexp+1, text);
    do {    /* must look even if string is empty */
        if (matchhere(regexp, text))
            return 1;
    } while (*text++ != '\0');
    return 0;
}

/*
 * matchhere: search for regexp at beginning of text.
 *
 * Returns 1 if regexp matches a prefix of text (or all of it, when regexp
 * ends in '$'), 0 otherwise.
 */
int matchhere(const char *regexp, const char *text)
{
    /* An exhausted pattern matches whatever text remains. */
    if (regexp[0] == '\0')
        return 1;
    /* Safe to read regexp[1]: regexp[0] is not the terminator here. */
    if (regexp[1] == '*')
        return matchstar(regexp[0], regexp+2, text);
    /* '$' is only special as the very last pattern character. */
    if (regexp[0] == '$' && regexp[1] == '\0')
        return *text == '\0';
    if (*text!='\0' && (regexp[0]=='.' || regexp[0]==*text))
        return matchhere(regexp+1, text+1);
    return 0;
}

/*
 * matchstar: search for c*regexp at beginning of text.
 *
 * Tries the shortest expansion of c* first: the rest of the pattern is
 * attempted before each additional character is consumed. c == '.' lets the
 * star consume any character. Returns 1 on a match, 0 otherwise.
 */
int matchstar(int c, const char *regexp, const char *text)
{
    do {    /* a * matches zero or more instances */
        if (matchhere(regexp, text))
            return 1;
    } while (*text != '\0' && (*text++ == c || c == '.'));
    return 0;
}

答案 1 :(得分:0)

我之前从未尝试过写一个正则表达式,所以我想我会试一试。我省略了一些无聊的东西。这是我的(完全未经测试或编译)版本:

#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

// Whole-string matcher for a tiny regex dialect:
//   c    matches the literal character c
//   .    matches any single character
//   x*   matches zero or more repetitions of x, where x is a literal or '.'
// A pattern only matches when it consumes the entire input string.
class Regex {
 public:
  // Compiles `pattern` into a list of Nodes. A run made only of one character
  // and '*' (for example "aa*a") is folded into a single Node holding the
  // number of mandatory occurrences and whether extra ones are allowed.
  //
  // Throws std::invalid_argument if the pattern starts with '*' or contains
  // two consecutive '*', since such a star has nothing to repeat.
  Regex(const std::string& pattern) {
    const std::size_t length = pattern.size();
    std::size_t index = 0;
    while (index < length) {
      const char current = pattern[index];
      // The folding loop below swallows every '*' that follows a character,
      // so a Node can only start with '*' at the very start of the pattern.
      if (current == '*') {
        throw std::invalid_argument("Regex: '*' has nothing to repeat");
      }
      ++index;

      std::size_t occurrences = 1;
      std::size_t stars = 0;
      bool previous_was_star = false;
      for (; index < length; ++index) {
        const char next = pattern[index];
        if (next == current) {
          ++occurrences;
          previous_was_star = false;
        } else if (next == '*') {
          if (previous_was_star) {
            throw std::invalid_argument("Regex: consecutive '*' in pattern");
          }
          ++stars;
          previous_was_star = true;
        } else {
          break;
        }
      }
      // Every star makes one occurrence optional, and every star is preceded
      // by an occurrence, so stars <= occurrences and this cannot underflow.
      rewritten_pattern_.push_back(Node(current, occurrences - stars, stars > 0));
    }
  }

  // Returns true when the compiled pattern matches all of `value`.
  //
  // Works iteratively: `reachable[p]` records that the Nodes processed so far
  // can consume exactly value[0, p). Tracking every reachable position (rather
  // than consuming greedily) is what lets a starred Node give characters back
  // to later Nodes, e.g. "a*." against "a", and it needs no recursion.
  bool matches(const std::string& value) const {
    const std::size_t length = value.size();
    std::vector<char> reachable(length + 1, 0);
    reachable[0] = 1;

    for (std::size_t n = 0; n < rewritten_pattern_.size(); ++n) {
      const Node& node = rewritten_pattern_[n];
      std::vector<char> next(length + 1, 0);

      // Mandatory part: from each reachable start, consume node.count
      // characters that the Node accepts.
      for (std::size_t start = 0; start + node.count <= length; ++start) {
        if (!reachable[start]) continue;
        std::size_t matched = 0;
        while (matched < node.count && accepts(node.value, value[start + matched])) {
          ++matched;
        }
        if (matched == node.count) next[start + node.count] = 1;
      }

      // Optional part: a starred Node may keep consuming accepted characters.
      // A single left-to-right sweep propagates this, including the
      // zero-extra-characters case, which stays marked from the step above.
      if (node.repeats) {
        for (std::size_t pos = 0; pos < length; ++pos) {
          if (next[pos] && accepts(node.value, value[pos])) next[pos + 1] = 1;
        }
      }

      reachable.swap(next);
    }

    // After all Nodes are consumed, we only match if the value is consumed too.
    return reachable[length] != 0;
  }

 private:
  // One folded pattern element: `count` mandatory occurrences of `value`
  // ('.' meaning any character), followed by any number more if `repeats`.
  struct Node {
    Node(char v, std::size_t c, bool r) : value(v), count(c), repeats(r) {}
    char value;
    std::size_t count;
    bool repeats;
  };

  // True when a pattern character accepts the input character `c`.
  static bool accepts(char pattern_char, char c) {
    return pattern_char == '.' || pattern_char == c;
  }

  std::vector<Node> rewritten_pattern_;
};