Question

我试图获取一个用户输入字符串并解析为一个名为char * entire_line [100]的数组;其中每个单词放在数组的不同索引处，但如果字符串的一部分由引号封装，则应将其放在单个索引中。所以，如果我有

/* Zero-initialised line buffer; fgets stores at most 1023 characters from
 * stdin plus the terminating NUL, keeping the newline if one was read. */
char buffer[1024]={0,};
fgets(buffer, 1024, stdin);

示例输入：word filename.txt "这是一个字符串，应该只占用输出数组中的一个索引";

/* NOTE(review): tokenizer, entire_line, i, is_string and YES are declared outside this snippet. */
tokenizer=strtok(buffer," ");// first token, splitting on spaces
        do{
            if(strchr(tokenizer,'"')){// true when the token contains a " anywhere, not only at its start
            is_string=YES;
            entire_line[i]=tokenizer;// store the token holding the quote in the current slot
            tokenizer=strtok(NULL,"\""); // next token runs up to the following " (NULL if nothing is left)
            strcat(entire_line[i],tokenizer); // append the remainder; NOTE(review): both pointers refer into buffer, so source and destination overlap

              }  
        entire_line[i]=tokenizer;// NOTE(review): runs on every iteration, so it replaces the pointer stored inside the if above
        i++;
        }while((tokenizer=strtok(NULL," \n"))!=NULL);// continue with the next space/newline-delimited token

这显然不起作用：只有当双引号括起来的字符串位于输入字符串的末尾时，结果才接近正确。但输入也可能是：word "这是用户输入的文本" filename.txt。我已经尝试解决这个问题有一段时间了，总是卡在某个地方。谢谢！

Answer 1

strtok函数是在C中进行标记化的一种糟糕方法，只有一种（诚然很常见的）情况例外：简单的、以空白分隔的单词。（即便如此，由于它不可重入、也无法嵌套使用，仍然算不上好用，这正是我们为BSD发明strsep的原因。）

在这种情况下，最好的办法是建立自己的简单状态机：

/*
 * Tokenizer sketch: walks buffer one character at a time and tracks whether
 * the scan is between tokens (DULL), inside a bare word (IN_WORD) or inside
 * a double-quoted string (IN_STRING).
 * NOTE(review): start_of_word is assigned below but not declared in this
 * snippet; since it receives p and p + 1 it needs to be a char *.
 */
char *p;
int c;
enum states { DULL, IN_WORD, IN_STRING } state = DULL;

for (p = buffer; *p != '\0'; p++) {
    c = (unsigned char) *p; /* convert to unsigned char for is* functions */
    switch (state) {
    case DULL: /* not in a word, not in a double quoted string */
        if (isspace(c)) {
            /* still not in a word, so ignore this char */
            continue;
        }
        /* not a space -- if it's a double quote we go to IN_STRING, else to IN_WORD */
        if (c == '"') {
            state = IN_STRING;
            start_of_word = p + 1; /* word starts at *next* char, not this one */
            continue;
        }
        state = IN_WORD;
        start_of_word = p; /* word starts here */
        continue;

    case IN_STRING:
        /* we're in a double quoted string, so keep going until we hit a close " */
        if (c == '"') {
            /* word goes from start_of_word to p-1 */
            ... do something with the word ...
            state = DULL; /* back to "not in word, not in string" state */
        }
        continue; /* either still IN_STRING or we handled the end above */

    case IN_WORD:
        /* we're in a word, so keep going until we get to a space */
        if (isspace(c)) {
            /* word goes from start_of_word to p-1 */
            ... do something with the word ...
            state = DULL; /* back to "not in word, not in string" state */
        }
        continue; /* either still IN_WORD or we handled the end above */
    }
}
/* NOTE: a token still open when the loop ends (state != DULL) is not handled by this sketch. */

请注意，这并未考虑单词内部双引号的可能性，例如：

"some text in quotes" plus four simple words p"lus something strange"

通过上面的状态机，您将看到"some text in quotes"变成一个令牌（忽略双引号），但p"lus也是一个令牌（包括引号）， something是单个令牌，strange"是令牌。无论您是想要这个，还是想要如何处理它，都取决于您。对于更复杂但彻底的词法标记化，您可能希望使用像flex这样的代码构建工具。

此外，当for循环退出时，如果state不是DULL，则需要处理最后一个单词（上面的代码中省略了这一步），并决定当state是IN_STRING（意味着没有结束的双引号）时该如何处理。

Answer 2

Torek解析代码的部分非常出色，但需要更多工作才能使用。

出于自己的需要，我把它补全成了一个完整的C函数。下面分享我基于Torek的代码完成的实现。

#include <stdio.h>
#include <string.h>
#include <ctype.h>
/*
 * Split buffer in place into tokens and store a pointer to each in argv.
 *
 * Tokens are separated by whitespace. A token that starts with a double
 * quote runs to the next double quote and is stored without the quotes; a
 * quote in the middle of a bare word is kept as an ordinary character. The
 * character that ends a token is overwritten with '\0'.
 *
 * buffer    - NUL-terminated, writable string to tokenise.
 * argv      - receives pointers into buffer, one per token.
 * argv_size - capacity of argv; scanning stops once it is full.
 *
 * Returns the number of tokens stored. A token still open at the end of the
 * string (including an unterminated quoted string) is stored as well.
 */
size_t split(char *buffer, char *argv[], size_t argv_size)
{
    enum { OUTSIDE, BARE_WORD, QUOTED } mode = OUTSIDE;
    char *token = NULL;
    char *cur = buffer;
    size_t count = 0;

    while (count < argv_size && *cur != '\0') {
        int ch = (unsigned char) *cur;

        if (mode == OUTSIDE) {
            /* Whitespace between tokens is skipped; anything else opens one. */
            if (!isspace(ch)) {
                if (ch == '"') {
                    mode = QUOTED;
                    token = cur + 1; /* token begins after the quote */
                } else {
                    mode = BARE_WORD;
                    token = cur;
                }
            }
        } else {
            /* A quoted token ends at a quote, a bare word at whitespace. */
            int ends_here = (mode == QUOTED) ? (ch == '"') : (isspace(ch) != 0);

            if (ends_here) {
                *cur = '\0';
                argv[count++] = token;
                mode = OUTSIDE;
            }
        }
        cur++;
    }

    /* Flush a token that was cut short by the end of the string. */
    if (mode != OUTSIDE && count < argv_size) {
        argv[count++] = token;
    }

    return count;
}
/*
 * Tokenise a private copy of s with split() and print the input followed by
 * every token, one per line.
 *
 * s - NUL-terminated input line; it is not modified. Input longer than the
 *     local 1024-byte buffer is truncated before tokenising.
 */
void test_split(const char *s)
{
    char buf[1024];
    char *argv[20];
    size_t i, argc;

    /* split() writes NULs into its buffer, so work on a copy. snprintf
     * bounds the copy and always terminates it, unlike strcpy. */
    snprintf(buf, sizeof buf, "%s", s);
    argc = split(buf, argv, sizeof argv / sizeof argv[0]);
    printf("input: '%s'\n", s);
    for (i = 0; i < argc; i++)
        printf("[%zu] '%s'\n", i, argv[i]); /* %zu is the size_t conversion */
}
/* Entry point: runs test_split on one fixed sample line. The command-line
 * arguments are not used. */
int main(int ac, char *av[])
{
    static const char sample[] =
        "\"some text in quotes\" plus four simple words p\"lus something strange\"";

    (void) ac;
    (void) av;
    test_split(sample);
    return 0;
}

见程序输出：

input: '"some text in quotes" plus four simple words p"lus something strange"'
[0] 'some text in quotes'
[1] 'plus'
[2] 'four'
[3] 'simple'
[4] 'words'
[5] 'p"lus'
[6] 'something'
[7] 'strange"'

Answer 3

前段时间我写了一个qtok函数，用来从字符串中读取带引号的单词。它不是状态机，也不会直接为你生成数组，但把得到的令牌放进数组是很容易的。它还能处理转义的引号以及前导和尾随的空白：

#include <stdio.h>
#include <ctype.h>
#include <assert.h>

// Rewrites token in place so that every backslash-quote pair (\") becomes a
// bare quote; all other characters, including lone backslashes, are kept.
// Returns token.
char *unescapeToken(char *token)
{
    char *dst = token;
    const char *src;

    for (src = token; *src != '\0'; ++src)
    {
        // The write position can never overtake the read position.
        assert(src >= dst);

        // On an escaped quote, step over the backslash so only the quote
        // itself is copied below.
        if (src[0] == '\\' && src[1] == '"')
            ++src;

        *dst++ = *src;
    }
    *dst = '\0';
    return token;
}

// Extracts the next token from str, modifying str in place.
//
// Leading whitespace is skipped. A token that begins with '"' runs to the
// next quote that is not directly preceded by a backslash (or to the end of
// the string if there is none) and has its \" sequences unescaped; any other
// token runs to the next whitespace character. The character that ends the
// token is overwritten with '\0'.
//
// str  - NUL-terminated, writable string to scan.
// next - receives the position to resume scanning from: the first
//        non-whitespace character after the token, or the terminating '\0'.
//
// Returns a pointer to the start of the token inside str (an empty string
// when str holds only whitespace).
char *qtok(char *str, char **next)
{
    char *current = str;
    char *start;
    int isQuoted = 0;

    // Eat beginning whitespace. The <ctype.h> functions require a value
    // representable as unsigned char, so plain char must be cast first.
    while (*current && isspace((unsigned char) *current)) current++;

    if (*current == '"')
    {
        // Quoted token: skip the opening quote, then advance to the closing
        // one. A quote directly preceded by a backslash is escaped and does
        // not end the token. current[-1] is never before the opening quote.
        isQuoted = 1;
        current++;
        start = current;
        while (*current && (*current != '"' || current[-1] == '\\')) current++;
    }
    else
    {
        // Not quoted so run till we see a space.
        start = current;
        while (*current && !isspace((unsigned char) *current)) current++;
    }

    if (*current)
    {
        // Terminate the token on its delimiter (closing quote or space).
        *current = 0;
        current++;
        // Eat trailing whitespace.
        while (*current && isspace((unsigned char) *current)) current++;
    }
    *next = current;

    return isQuoted ? unescapeToken(start) : start;
}

// Demo: prints the sample line, then every token qtok extracts from it.
int main()
{
    char text[] = "   \"some text in quotes\"    plus   four simple words p\"lus something strange\" \"Then some quoted \\\"words\\\", and backslashes: \\ \\ \"  Escapes only work insi\\\"de q\\\"uoted strings\\\"   ";
    char *cursor = text;

    printf("Original: '%s'\n", text);

    // qtok advances cursor past each token until only the terminator is left.
    while (*cursor != '\0')
    {
        char *token = qtok(cursor, &cursor);
        printf("'%s'\n", token);
    }

    return 0;
}

输出：

Original: '   "some text in quotes"    plus   four simple words p"lus something strange" "Then some quoted \"words\", and backslashes: \ \ "  Escapes only work insi\"de q\"uoted strings\"   '
'some text in quotes'
'plus'
'four'
'simple'
'words'
'p"lus'
'something'
'strange"'
'Then some quoted "words", and backslashes: \ \ '
'Escapes'
'only'
'work'
'insi\"de'
'q\"uoted'
'strings\"'

Answer 4

我认为你的问题的答案其实相当简单，不过我采用的假设似乎与其他答案不同。我假设你希望任何带引号的文本块都被单独分离出来，而不管它与其余文本之间是否有空格分隔。

所以举个例子：

"some text in quotes" plus four simple words p"lus something strange"

输出结果为：

[0] some text in quotes

[1] plus

[2] four

[3] simple

[4] words

[5] p

[6] lus something strange

在这种假设下，只需要一小段代码，而不需要复杂的状态机。首先检查第一个字符是否是引号；如果是，就设置一个标志并去掉该字符，同时去掉字符串末尾的引号。然后按引号对字符串进行标记化，得到若干子串。接着对这些子串每隔一个按空格再次标记化：如果没有前导引号，就从第一个子串开始；如果有前导引号，就从第二个子串开始。最后，把第一步得到的其余子串（即引号内的部分）原样加入字符串数组，并在被再次拆分的子串原来的位置上，放入它们按空格拆分出的各个单词。这样就能得到上面列出的结果。代码大致如下：

#include<string.h>
#include<stdlib.h>

/* Append a heap-allocated copy of the len bytes at start to the
 * NULL-terminated list *list, doubling its capacity when it is full.
 * Returns 0 on success, -1 if an allocation fails (the list stays valid). */
static int parser_append(char ***list, size_t *count, size_t *capacity,
                         const char *start, size_t len)
{
    char *copy;

    if (*count + 1 >= *capacity) { /* always keep a slot for the NULL terminator */
        size_t grown_capacity = *capacity * 2;
        char **grown = realloc(*list, grown_capacity * sizeof **list);

        if (grown == NULL)
            return -1;
        *list = grown;
        *capacity = grown_capacity;
    }

    copy = malloc(len + 1);
    if (copy == NULL)
        return -1;
    memcpy(copy, start, len);
    copy[len] = '\0';

    (*list)[(*count)++] = copy;
    (*list)[*count] = NULL;
    return 0;
}

/*
 * Split input into tokens around a quoting character.
 *
 * Every occurrence of delim toggles between "outside" and "inside" a quoted
 * section and always ends the current token. Text inside a quoted section
 * becomes one token verbatim. Text outside is additionally split on delim2,
 * unless delim2 is '\0', in which case it is kept as one token per section.
 * Empty tokens are dropped. An unterminated quoted section runs to the end
 * of the string.
 *
 * input  - NUL-terminated string to parse; it is not modified.
 * delim  - quoting character, e.g. '"'.
 * delim2 - separator used outside quotes, e.g. ' ', or '\0' for none.
 *
 * Returns a malloc'd, NULL-terminated array of malloc'd strings; the caller
 * frees every element and then the array. Returns NULL if input is NULL or
 * an allocation fails.
 */
char ** parser(char * input, char delim, char delim2){
    size_t count = 0;
    size_t capacity = 8;
    int in_quotes = 0;
    const char *start;
    const char *p;
    char **tokens;

    if (input == NULL)
        return NULL;

    tokens = malloc(capacity * sizeof *tokens);
    if (tokens == NULL)
        return NULL;
    tokens[0] = NULL;

    start = input;
    for (p = input; ; p++) {
        int at_end = (*p == '\0');
        int is_quote = !at_end && *p == delim;
        /* The secondary separator only applies outside quoted sections. */
        int is_separator = !at_end && !in_quotes && delim2 != '\0' && *p == delim2;

        if (at_end || is_quote || is_separator) {
            if (p > start &&
                parser_append(&tokens, &count, &capacity, start, (size_t)(p - start)) != 0)
                goto fail;
            start = p + 1;
            if (is_quote)
                in_quotes = !in_quotes;
        }
        if (at_end)
            break;
    }
    return tokens;

fail:
    while (count > 0)
        free(tokens[--count]);
    free(tokens);
    return NULL;
}

/* Entry point: runs parser on a fixed sample line, splitting on double
 * quotes first and on spaces outside them. Exits with status 1 if parser
 * returns NULL, 0 otherwise. */
int main(){
    /* A writable, initialised array: the original passed an uninitialised
     * pointer, which is undefined behaviour as soon as parser reads it. */
    char input[] = "\"some text in quotes\" plus four simple words p\"lus something strange\"";
    char ** result = parser(input, '\"', ' ');

    /* NOTE(review): result is not released here; the process exits right away. */
    return result != NULL ? 0 : 1;
}

（可能不完美，我还没有测试过）

根据空格或“双引号字符串”将字符串解析为数组

4 个答案: