试图用空格和引号来标记字符串

时间:2014-05-10 23:34:02

标签: c string pointers

我研究这个问题有一段时间了,遇到了一些奇怪的现象。最终目标是按空格和引号分割输入字符串(例如,this "is a" very "very complex" example 应拆分为 {this, is a, very, very complex, example})。现在看起来除了第一个字符串之外,其余部分都能被正确拆分。

这里是(使用来自getline的值传入buff):

// Fragment from the question: splits the line held in `buff` in place into
// tokens separated by spaces or enclosed in double quotes, then prints them.
// `buff`, `command`, `running` and `WORD_SIZE` are declared outside this
// fragment and are not visible here.

// NOTE(review): this allocates sizeof(char) (one byte) for an array that is
// used to hold char * entries.
char **tokens = (char **)malloc(sizeof(char));
// NOTE(review): declared as char *, but below it receives (char **) results
// and is dereferenced as if it pointed at a token slot.
char *temp;
int count = 0;   // number of tokens recorded so far
int prev = 0;    // index in buff where the current token starts
// Get tokens
// NOTE(review): the bound is strlen(command) while the body indexes buff;
// presumably both refer to the same text - confirm. `<=` makes the loop
// visit the terminating NUL as well.
for (int i = 0; i <= strlen(command) && running; i++) {
    // A space after at least one character of the current token ends it.
    if (i > prev && strncmp((buff + i), " ", 1) == 0) {
        // NOTE(review): the size is computed in bytes (sizeof(char) * WORD_SIZE
        // * count), whereas the slot address below is computed in pointer-sized
        // steps (tokens + WORD_SIZE * (count - 1)); the two units differ.
        temp = (char **)realloc(tokens, (sizeof(char)) * WORD_SIZE * (++count));
        if (temp == NULL) {
            fprintf(stderr, "Error in parsing: ran out of memory\n");
            running = false;
            free(tokens);
        }
        else {
            tokens = temp;
            // temp equals tokens here, so this stores buff + i through the
            // start of the token array and then writes a NUL at buff[i].
            // NOTE(review): that overwrites the first slot on every token,
            // which would match the empty token 0 reported in the question.
            *(temp) = (buff + i);
            strncpy(*(temp), "\0", 1);
            // Record the start of the token that just ended.
            temp = tokens + WORD_SIZE * (count - 1);
            *(temp) = buff+prev;
            prev = i+1;
        }
    }
    // An opening quote: the token runs up to the next quote.
    else if (strncmp((buff + i), "\"", 1) == 0) {
        // NOTE(review): temp is used here with whatever value a previous
        // iteration left in it; on the first iteration it is uninitialized.
        *(temp) = (buff + i);
            strncpy(*(temp), "\0", 1);
        i++;
        prev = i;
        // Scan to the closing quote.
        // NOTE(review): there is no end-of-string check, so an unmatched
        // quote makes this scan run past the terminating NUL.
        for (; strncmp((buff + i), "\"", 1) != 0; i++) { }
        temp = (char **)realloc(tokens, (sizeof(char)) * WORD_SIZE * (++count));
        if (temp == NULL) {
            fprintf(stderr, "Error in parsing: ran out of memory\n");
            running = false;
            free(tokens);
        }
        else {
            tokens = temp;
            // Same sequence as in the space branch: NUL out the closing
            // quote, then record the start of the quoted text.
            *(temp) = (buff + i);
            strncpy(*(temp), "\0", 1);
            temp = tokens + WORD_SIZE * (count - 1);
            *(temp) = buff+prev;
            prev = i+1;
        }
    }
    // End of string: record the final token.
    // NOTE(review): comparing one byte against "\0" is a test for
    // buff[i] == '\0'. A token is recorded here even when prev == i.
    else if (strncmp((buff + i), "\0", 1) == 0) {
        temp = (char **)realloc(tokens, (sizeof(char)) * WORD_SIZE * (++count));
        if (temp == NULL) {
            fprintf(stderr, "Error in parsing: ran out of memory\n");
            running = false;
            free(tokens);
        }
        else {
            tokens = temp;
            temp = tokens + WORD_SIZE * (count - 1);
            *(temp) = buff+prev;
            prev = i+1;
        }
    }
}
// Print the tokens.
// NOTE(review): `*tokens + ...` dereferences the first slot and then advances
// that char * by WORD_SIZE * i characters; it does not read the slots written
// above at tokens + WORD_SIZE * (count - 1). If an allocation failed, tokens
// has already been freed by the time this loop runs.
for (int i = 0; i < count; i++)
     printf("\t%i: %s\n", i, *tokens + sizeof(char) * WORD_SIZE * i);

现在,如果我输入 this is a test(不带引号),得到:
0:
1: is
2: a
3: test

带引号的情况更混乱一些:对于 this "is a" very "very complex" test,我得到:
0:
1: is a
2:
3: very complex
4: test

3 个答案:

答案 0 :(得分:3)

你说替代代码没问题。如果使用确定性有限自动机模型来思考它们,简单的字符串解析算法几乎总是更容易并产生更易于维护的代码。网上有很多关于DFA的免费参考资料。

这是解决您问题的DFA。

dfa

[any]的含义是“其他所有字符”。换句话说,如果没有其他转换匹配,就选择这一转换。它对应 C 语言 switch 语句中的 default 分支。[eos]的含义是“字符串结尾”,即空字符。

请注意,DFA可以让您系统地考虑所有情况,例如出现在单词中间的引号。在这里,我把它视为当前单词的结束和一个新的带引号单词的开始。如果需求发生变化,DFA很容易修改,而且这些修改无需费力思考就能直接转化为代码。

剩下的就是添加“动作代码”:记录每个标记的起始位置,并在相应的位置写入空终止符。用 C 语言实现如下:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Split `str` in place into tokens using a small DFA.
 *
 * A token is either a run of characters delimited by space, tab or newline,
 * or the text between a pair of double quotes (which may contain whitespace).
 * A quote in the middle of a word ends that word and starts a quoted token.
 * Delimiters and closing quotes in `str` are overwritten with '\0'; the
 * returned pointers point into `str`, so `str` must outlive the result.
 *
 * str:          NUL-terminated, writable string to split.
 * n_tokens_rtn: if non-NULL, receives the number of tokens found
 *               (0 when NULL is returned).
 *
 * Returns a heap-allocated, NULL-terminated array of token pointers that the
 * caller releases with free(), or NULL if `str` is NULL or allocation fails.
 * A missing closing quote prints "Syntax error." to stderr; the tokens found
 * so far, including the unterminated one, are still returned.
 */
char **tokenize(char *str, int *n_tokens_rtn)
{
  // State of the DFA.
  enum { Error = -1, Start, InQuoted, InWord } state = Start;

  size_t cp = 0;  // Index of the current character in str.
  size_t tp = 0;  // Number of tokens stored so far.
  char **tokens;

  if (n_tokens_rtn) *n_tokens_rtn = 0;
  if (str == NULL) return NULL;

  // Every token start consumes at least one distinct input character (the
  // first character of a word, or an opening quote), so there can be at most
  // strlen(str) tokens; one more slot holds the NULL sentinel.
  tokens = malloc((strlen(str) + 1) * sizeof *tokens);
  if (tokens == NULL) return NULL;

  // Each iteration is one DFA transition.
  for (;;) {
    switch (state) {
    case Start:
      switch (str[cp]) {
      case '\0':
        goto done_scanning;

      case ' ': case '\t': case '\n':
        ++cp;
        break;

      case '"':
        // The token starts after the opening quote.
        state = InQuoted;
        ++cp;
        tokens[tp++] = &str[cp];
        break;

      default:
        state = InWord;
        tokens[tp++] = &str[cp];
        ++cp;
        break;
      }
      break;

    case InQuoted:
      switch (str[cp]) {
      case '\0':
        state = Error; // Missing close quote.
        break;

      case '"':
        // The closing quote becomes the token's terminator.
        state = Start;
        str[cp] = '\0';
        ++cp;
        break;

      default:
        ++cp;
        break;
      }
      break;

    case InWord:
      switch (str[cp]) {
      case '\0':
        goto done_scanning;

      case ' ': case '\t': case '\n':
        state = Start;
        str[cp] = '\0';
        ++cp;
        break;

      case '"': // Word ended in quote, not space.
        state = InQuoted;
        str[cp] = '\0';
        ++cp;
        tokens[tp++] = &str[cp];
        break;

      default:
        ++cp;
        break;
      }
      break;

    case Error:
      fprintf(stderr, "Syntax error.\n");
      goto done_scanning;
    }
  }

 done_scanning:
  // Return number of tokens if caller is interested.
  if (n_tokens_rtn) *n_tokens_rtn = (int)tp;

  // Append a null terminator so callers can iterate without the count.
  tokens[tp++] = NULL;

  // Trim to the size actually used. If shrinking fails the original block is
  // still valid, so hand that back instead of leaking it.
  char **trimmed = realloc(tokens, tp * sizeof *tokens);
  return trimmed ? trimmed : tokens;
}

/*
 * Demo driver: tokenizes a fixed sample string and prints one token per line.
 * Returns 0 on success, EXIT_FAILURE if tokenize() yields no array.
 */
int main(void)
{
  // Must be a writable array: tokenize() writes terminators into it.
  char str[] = "this \"is a\" very \"very complex\" example";
  char **tokens = tokenize(str, NULL);

  // tokenize() hands back the result of an allocation, which may be NULL.
  if (tokens == NULL) {
    fprintf(stderr, "tokenize failed\n");
    return EXIT_FAILURE;
  }

  // The array is NULL-terminated, so no count is needed.
  for (int i = 0; tokens[i]; i++)
    printf("%s\n", tokens[i]);

  // Only the pointer array is heap-allocated; the tokens live inside str.
  free(tokens);
  return 0;
}

答案 1 :(得分:1)

这是完全从头重写的版本,因为这比修改你原来的代码更容易(如果这不是你想要的,抱歉)。几点说明:

  1. 无需测试之前的malloc。您可以安全地realloc一个NULL指针。
  2. if (strncmp((buff + i), "\"", 1) == 0) - 您可以立即测试buff[i]
  3. 为什么要反复调整prev? :)只需遍历一次字符串就足够了
  4. 我保留了用temp检查realloc是否成功的做法,因为你的代码里也有。它在我的代码中其实并非必要,因为失败时只是直接退出main
  5. 补充:即使前面没有空格,字符"也会开始一个新的“单词”。
  6. 代码:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    /*
     * Demo: splits a fixed input string into tokens and prints them.
     *
     * Tokens are separated by spaces; text between double quotes forms a single
     * token, and a quote also ends an unquoted word that precedes it. Each
     * token is copied into its own heap buffer.
     *
     * Returns 0 on success, -1 if memory runs out.
     */
    int main (void)
    {
        /* The input is a string literal and is only read, hence const. */
        const char *input = "this \"is a\" very \"very complex\" test";
        const char *iterate = input;
        char **tokens = NULL;
        int count = 0;
        int status = 0;
        int i;

        while (*iterate)
        {
            /* Skip the separators in front of the next token. */
            while (*iterate == ' ')
                iterate++;

            if (!*iterate)
                break;

            /* Grow through a temporary so the array survives a failed realloc. */
            char **temp = realloc(tokens, sizeof *tokens * (count + 1));
            if (temp == NULL)
            {
                fprintf(stderr, "Error in parsing: ran out of memory\n");
                status = -1;
                break;
            }
            tokens = temp;

            /* A leading quote switches the terminator from "space or quote"
               to "quote only". */
            int quoted = (*iterate == '\"');
            if (quoted)
                iterate++;

            size_t length = 0;
            while (iterate[length] && iterate[length] != '\"'
                   && (quoted || iterate[length] != ' '))
                length++;

            tokens[count] = malloc(length + 1);
            if (tokens[count] == NULL)
            {
                fprintf(stderr, "Error in parsing: ran out of memory\n");
                status = -1;
                break;
            }
            memcpy(tokens[count], iterate, length);
            tokens[count][length] = '\0';
            count++;
            iterate += length;

            /* Step over the closing quote; an unquoted word that stopped at a
               quote leaves it in place to open the next token. */
            if (quoted && *iterate == '\"')
                iterate++;
        }

        if (status == 0)
        {
            for (i = 0; i < count; i++)
                printf("\t%i: %s\n", i, tokens[i]);
        }

        /* Release every token copy, then the array itself. */
        for (i = 0; i < count; i++)
            free(tokens[i]);
        free(tokens);

        return status;
    }
    

    this "is a" very "very complex" test的输出:

    0: this
    1: is a
    2: very
    3: very complex
    4: test
    

答案 2 :(得分:0)

这似乎是一个相对简单的问题,所以我没有编写完整的解析器,而是用标准C库写了一个解决方案。这个方案是否合适,请自行判断。我的做法可能还有改进的余地,可以让代码更清晰一些,这就留给有兴趣的人作为练习。

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

/*
 * Demo: splits a fixed input string into tokens using the standard library
 * search functions and prints them.
 *
 * Tokens are separated by spaces; text between double quotes forms a single
 * token, and a quote also ends an unquoted word that precedes it. An
 * unterminated quote takes the rest of the string as its token.
 *
 * Returns 0 on success, -1 if memory runs out.
 */
int main(void)
{
    char input_string[] = "this \"is a\" very \"very complex\" test";
    char **tokens = NULL;
    int token_count = 0;
    int status = 0;
    const char *ptr = input_string;
    int i;

    for (;;)
    {
        while (*ptr == ' ')
            ptr++;

        /* Stop before allocating: trailing spaces must not create a token. */
        if (*ptr == '\0')
            break;

        /* Locate the token: [start, end) or, if end is NULL, the rest of
           the string. */
        int quoted = (*ptr == '"');
        const char *start = ptr + quoted;
        const char *end = quoted ? strchr(start, '"') : strpbrk(start, " \"");
        size_t length = end ? (size_t)(end - start) : strlen(start);

        /* Grow through a temporary so the array survives a failed realloc. */
        char **grown = realloc(tokens, (token_count + 1) * sizeof *tokens);
        if (grown == NULL)
        {
            status = -1;
            break;
        }
        tokens = grown;

        tokens[token_count] = malloc(length + 1);
        if (tokens[token_count] == NULL)
        {
            status = -1;
            break;
        }
        memcpy(tokens[token_count], start, length);
        tokens[token_count][length] = '\0';
        token_count++;

        if (end == NULL)
            break;

        /* Skip a closing quote; a space or an opening quote that ended an
           unquoted word is left for the next iteration. */
        ptr = end + quoted;
    }

    if (status == 0)
    {
        for (i = 0; i < token_count; ++i)
            printf("[%d]: %s\n", i, tokens[i]);
    }

    /* Release every token copy, then the array itself. */
    for (i = 0; i < token_count; ++i)
        free(tokens[i]);
    free(tokens);

    return status;
}

输出:

[0]: this
[1]: is a
[2]: very
[3]: very complex
[4]: test