在C中拆分未加引号的字符串

时间:2016-12-24 19:43:26

标签: c string split

我正在编写一个函数,把一个字符串拆分成字符串数组(char **)。当分隔符是空格时,我希望只在引号之外的空格处拆分,引号内的内容保持完整。例如,Hello world "not split" 应该返回

Hello
world
"not split"

不知为何,该函数反而拆分了引号内的单词,却没有拆分引号之外的单词。

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
 * Report the parity of the double-quote characters found in s.
 *
 * Returns 1 when s contains an odd number of '"' characters or none at
 * all, and 0 when it contains a non-zero even number of them.
 */
int is_quotes(char *s)
{
    int quotes = 0;

    for (char *p = s; *p != '\0'; ++p)
    {
        if (*p == '"')
            quotes += 1;
    }
    /* A string without any quote is reported the same way as an odd count. */
    if (quotes == 0)
        return (1);
    return (quotes % 2);
}

/*
 * Estimate how many pieces s will be split into on sep.
 *
 * Leading separators are skipped first. The result starts at 1 when anything
 * remains and is then incremented once for every separator visited, so runs
 * of separators and trailing separators each add to the count as well.
 *
 * When sep is a space, is_quotes() is consulted at every separator: if the
 * text from that separator to the end of the string holds a non-zero even
 * number of '"' characters, the scan jumps forward to a later '"' before
 * counting.
 */
int count_words(char *s, char sep)
{
    int check;
    int i;
    int count;

    /* Quote handling is only enabled when the separator is a space. */
    check = 0;
    if (sep == ' ')
      check = 1;
    i = 0;
    count = 0;
    /* Skip leading separators; from here on s points at the first other character. */
    while (*s && *s == sep)
        ++s;
    /* Anything left at all counts as a first word. */
    if (*s)
        count = 1;
    while (s[i])
    {
        if (s[i] == sep)
        {
          /* is_quotes() == 0 means a non-zero even number of '"' from here to the end. */
          if (!is_quotes(s + i) && check)
          {
            /* Step over the separator and the character after it ... */
            i += 2;
            /* ... then advance to the next '"' (ASCII 34) or the end of the string. */
            while (s[i] != 34 && s[i])
                i++;
          }
          /* Every separator visited adds one to the count. */
          count++;
        }
        i++;
    }
    return (count);
}

/*
 * Copy len bytes of s, starting at offset start, into a freshly allocated
 * NUL-terminated string.
 *
 * The caller must make sure s holds at least start + len bytes and must
 * free() the result. Returns NULL when s is NULL or the allocation fails.
 */
char    *ft_strsub(char const *s, unsigned int start, size_t len)
{
    char    *sub;

    if (s == NULL)
        return (NULL);
    sub = malloc(len + 1);
    if (sub == NULL)
        return (NULL);
    memcpy(sub, s + start, len);
    /* malloc() does not zero memory: terminate explicitly so the result is
       a valid C string for printf("%s") and the str* functions. */
    sub[len] = '\0';
    return (sub);
}

/*
 * Split s on the separator c into a NULL-terminated array of newly
 * allocated strings.
 *
 * Returns NULL when s is NULL, c is 0, or count_words() reports no words.
 * When c is a space, each separator is additionally checked with
 * is_quotes() in an attempt to keep quoted text together.
 *
 * NOTE(review): count_words() receives s before the !s test below runs.
 * NOTE(review): the malloc() result is used without a NULL check.
 * NOTE(review): the return value is computed as result - words, which is the
 * start of the array only if exactly `words` strings were stored; empty
 * pieces are skipped here but every separator is counted by count_words()
 * -- confirm.
 */
char        **ft_strsplit(char const *s, char c)
{
    int     words;
    char    *start;
    char    **result;
    int     i;

    words = count_words((char *)s, c);
    if (!s || !c || words == 0)
        return (NULL);
    i = 0;
    /* One slot per counted word plus one for the terminating NULL. */
    result = (char **)malloc(sizeof(char *) * (words + 1));
    /* start marks the beginning of the piece currently being scanned. */
    start = (char *)s;
    while (s[i])
    {
        if (s[i] == c)
        {
            /* is_quotes() == 0: a non-zero even number of '"' remain from here on. */
            if (is_quotes((char *)s + i) == 0 && c == ' ')
            {
                /* Step over the separator and the next character, advance to
                   the following '"' (or the end), then back up one character. */
                i += 2;
                while (s[i] != '"' && s[i])
                    i++;
                i -= 1;
            }
            /* Store the piece [start, s + i) unless it is empty. */
            if (start != (s + i))
                *(result++) = ft_strsub(start, 0, (s + i) - start);
            /* The next piece begins right after position i. */
            start = (char *)(s + i) + 1;
        }
        ++i;
    }
    /* Store whatever follows the last split position. */
    if (start != (s + i))
        *(result++) = ft_strsub(start, 0, (s + i) - start);
    *result = NULL;
    /* Rewind by the counted number of words to hand back the array. */
    return (result - words);
}

/*
 * Split the first command-line argument on spaces and print each piece on
 * its own line. Does nothing when no argument is given.
 */
int main(int argc, char **argv)
{
    if (argc > 1)
    {
        char **s;
        s = ft_strsplit(argv[1], ' ');
        /* ft_strsplit() returns NULL when it finds no words (for example an
           empty argument); indexing that result would dereference NULL. */
        if (s == NULL)
            return 0;
        int i = 0;
        while (s[i])
            printf("%s\n", s[i++]);
    }
    return 0;
}

当我使用hello world "hello hello"运行此代码时,我得到以下内容

hello world
"hello
hello"

2 个答案:

答案 0 :(得分:2)

你需要一个具有两种状态的状态机:“引号内”和“引号外”。遇到引号时,翻转状态。遇到空格时,如果处于引号外,就把它转换为换行符(即在此处拆分);如果处于引号内,则保持不变。(你很快就会想让它更完善,以支持字符串转义等,而状态机方法可以扩展到这些情形。)

答案 1 :(得分:0)

试试这个(已修复并精简)

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

//Half-open character range [top, end) marking one token inside a string.
typedef struct token {
    const char *top;//first character of the token; NULL means "no token"
    const char *end;//points one past the last character of the token
} Token;

/*
 * Extract the next token from the string *sp.
 *
 * Leading separators are skipped. A token then runs up to the next separator
 * or the end of the string, except that a '"' with a matching closing '"'
 * later in the string keeps everything between the two quotes (separators
 * included) inside the token. A '"' without a closing partner is treated as
 * an ordinary character.
 *
 * On return *sp points just past the token, ready for the next call.
 * Returns a token whose top is NULL when only separators (or nothing) remain.
 */
Token getToken(const char **sp, char sep){
    const char *s = *sp;
    Token token = { NULL, NULL};

    while(*s && *s == sep)//skip leading separators
        ++s;
    if(!*s){
        *sp = s;
        return token;//return null token
    }
    token.top = s;
    while(*s && *s != sep){
        if(*s == '"'){
            const char *p = strchr(s + 1, '"');//search closing '"'
            if(p)
                s = p;//jump to the closing '"' so separators in between are kept
        }
        ++s;
    }
    token.end = s;
    *sp = s;

    return token;
}

int count_words(const char *s, char sep){
    int count = 0;
    Token token = getToken(&s, sep);

    while(token.top != NULL){
        ++count;
        token = getToken(&s, sep);
    }
    return count;
}

/*
 * Duplicate the characters in [token.top, token.end) as a NUL-terminated
 * string obtained from malloc(). Returns NULL if the allocation fails.
 */
char *ft_strsub(Token token){
    const size_t length = token.end - token.top;
    char *copy = malloc(length + 1);

    if (copy == NULL)
        return NULL;
    memcpy(copy, token.top, length);
    copy[length] = 0;
    return copy;
}

/*
 * Split s on sep into a NULL-terminated array of newly allocated strings,
 * keeping text enclosed in double quotes together (see getToken()).
 *
 * Returns NULL when s is NULL, sep is 0, s contains no tokens, or an
 * allocation fails. On success the caller frees every element and then the
 * array itself.
 */
char **ft_strsplit(const char *s, char sep){
    int words;

    if (!s || !sep || !(words = count_words(s, sep)))
        return NULL;

    char **result = malloc(sizeof(char *) * (words + 1));
    if(!result){
        perror("malloc");
        return NULL;
    }

    int i = 0;
    Token token = getToken(&s, sep);

    while(token.top != NULL){
        result[i] = ft_strsub(token);
        if(!result[i]){
            //A NULL stored mid-array would look like the terminator and leak
            //every later entry, so undo the partial work and report failure.
            perror("malloc");
            while(i > 0)
                free(result[--i]);
            free(result);
            return NULL;
        }
        ++i;
        token = getToken(&s, sep);
    }
    result[i] = NULL;

    return result;
}

/*
 * Demo: split a fixed sample string on spaces, print each piece on its own
 * line and release everything that was allocated.
 */
int main(int argc, char **argv){
    const char *text = "Hello world \"not split\"";
    char **s = ft_strsplit(text, ' ');
    (void)argc;//the demo takes no command-line input
    (void)argv;
    if(!s)//ft_strsplit returns NULL on allocation failure or when there are no tokens
        return 1;
    int i = 0;
    while (s[i]){
        printf("%s\n", s[i]);
        free(s[i++]);
    }
    free(s);

    return 0;
}

支持转义字符处理的版本。

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define ESCAPE '\\' //escape character: the character that follows it is taken literally

//Half-open character range [top, end) marking one token inside a string.
typedef struct token {
    const char *top;//first character of the token; NULL means "no token"
    const char *end;//points one past the last character of the token
} Token;

/*
 * Extract the next token from the string *sp, honouring ESCAPE.
 *
 * Leading separators are skipped. A token then runs up to the next separator
 * or the end of the string, with two exceptions:
 *   - the character following ESCAPE is always part of the token, even when
 *     it is a separator or a '"';
 *   - a '"' with a matching closing '"' later in the string keeps everything
 *     in between inside the token. A '"' directly preceded by ESCAPE is not
 *     accepted as the closing quote.
 *
 * On return *sp points just past the token, ready for the next call.
 * Returns a token whose top is NULL when only separators (or nothing) remain.
 */
Token getToken(const char **sp, char sep){
    const char *s = *sp;
    Token token = { NULL, NULL};

    while(*s && *s == sep)//skip leading separators
        ++s;
    if(!*s){
        *sp = s;
        return token;//return null token
    }
    token.top = s;
    while(*s && *s != sep){
        if(*s == ESCAPE){
            //Only skip the escaped character if there is one: stepping over
            //the terminator would make the loop read past the end of the string.
            if(s[1])
                ++s;
        }
        else if(*s == '"'){
            const char *p = strchr(s + 1, '"');//search closing '"'
            while(p && p[-1] == ESCAPE)//an escaped '"' does not close the quote
                p = strchr(p + 1, '"');
            if(p)
                s = p;//jump to the closing '"' so separators in between are kept
        }
        ++s;
    }
    token.end = s;
    *sp = s;

    return token;
}

/*
 * Count the tokens getToken() yields for s with separator sep.
 * s is a local copy of the pointer, so the caller's pointer is not advanced.
 */
int count_words(const char *s, char sep){
    int total = 0;

    for(;;){
        Token t = getToken(&s, sep);
        if(t.top == NULL)
            break;
        ++total;
    }
    return total;
}

/*
 * Strip escape characters from s in place and return s.
 *
 * Each ESCAPE is removed and the character it protects is copied verbatim,
 * so an escaped ESCAPE yields one literal ESCAPE, matching the way
 * getToken() treats the character after ESCAPE as literal. A lone ESCAPE at
 * the very end of the string is dropped. A NULL s is returned unchanged.
 */
char *remove_escape(char *s){
    char *from, *to;

    if(!s)
        return s;
    from = to = s;
    while(*from){
        if(*from == ESCAPE){
            ++from;//drop the escape itself
            if(!*from)
                break;//trailing escape protects nothing
        }
        *to++ = *from++;//copy the (possibly escaped) character as-is
    }
    *to = 0;
    return s;
}

/*
 * Duplicate the characters in [token.top, token.end) as a NUL-terminated
 * string obtained from malloc(). Returns NULL if the allocation fails.
 */
char *ft_strsub(Token token){
    const size_t length = token.end - token.top;
    char *copy = malloc(length + 1);

    if (copy == NULL)
        return NULL;
    memcpy(copy, token.top, length);
    copy[length] = 0;
    return copy;
}

/*
 * Split s on sep into a NULL-terminated array of newly allocated strings.
 * Quoted text and escaped characters are kept together by getToken(), and
 * escape characters are stripped from every resulting string.
 *
 * Returns NULL when s is NULL, sep is 0, s contains no tokens, or an
 * allocation fails. On success the caller frees every element and then the
 * array itself.
 */
char **ft_strsplit(const char *s, char sep){
    int words;

    if (!s || !sep || !(words = count_words(s, sep)))
        return NULL;

    char **result = malloc(sizeof(char *) * (words + 1));
    if(!result){
        perror("malloc");
        return NULL;
    }

    Token token = getToken(&s, sep);
    int i = 0;

    while(token.top != NULL){
        result[i] = ft_strsub(token);
        if(!result[i]){
            //Never hand a failed allocation to remove_escape(), and do not
            //leave a NULL hole in the array: undo the partial work instead.
            perror("malloc");
            while(i > 0)
                free(result[--i]);
            free(result);
            return NULL;
        }
        remove_escape(result[i++]);
        token = getToken(&s, sep);
    }
    result[i] = NULL;

    return result;
}

/*
 * Print text, then each piece ft_strsplit() produces for it on spaces (one
 * per line), followed by a blank line. All pieces are freed after printing.
 */
void test(const char *text){
    printf("original:%s\n", text);
    printf("result of split:\n");
    char **s = ft_strsplit(text, ' ');
    //ft_strsplit returns NULL when there is nothing to split (or on
    //allocation failure); print no pieces in that case instead of crashing.
    if(s){
        for(int i = 0; s[i]; ++i){
            printf("%s\n", s[i]);
            free(s[i]);
        }
        free(s);
    }
    puts("");
}

/* Run the splitter demo over a fixed set of sample inputs. */
int main(int argc, char **argv){
    static const char *const samples[] = {
        "Hello world \"not split\"",
        "Hello world \"not \\\" split\"",//include " in "..."
        "Hello world not\\ split",//escape separator
    };
    const size_t sample_count = sizeof samples / sizeof samples[0];

    for(size_t k = 0; k < sample_count; ++k)
        test(samples[k]);

    return 0;
}

结果:

original:Hello world "not split"
result of split:
Hello
world
"not split"

original:Hello world "not \" split"
result of split:
Hello
world
"not " split"

original:Hello world not\ split
result of split:
Hello
world
not split