我研究这个问题已经有一段时间了,但遇到了一些奇怪的现象。最终目标是按空格和引号拆分输入字符串(例如,this "is a" very "very complex" example 应拆分为 {this, is a, very, very complex, example})。现在除了第一个字符串之外,其余部分似乎都能正确拆分。
代码如下(buff 中保存的是 getline 读入的内容):
// Tokenizer fragment from the question: walks buff, cuts it in place at
// spaces and double quotes, and records where each piece starts in tokens.
// NOTE(review): command, buff, running and WORD_SIZE are defined outside
// this fragment.
//
// The initial allocation is a single byte (sizeof(char)), although tokens is
// used as an array of pointers below.
char **tokens = (char **)malloc(sizeof(char));
// NOTE(review): declared char *, yet assigned from a (char **) cast and
// dereferenced as a pointer slot below -- presumably meant to be char **.
char *temp;
int count = 0;
int prev = 0;
// Get tokens
// The bound comes from command while the body indexes buff; `<=` lets the
// last iteration reach the terminating NUL handled by the final branch.
// NOTE(review): assumes command and buff hold the same text -- confirm.
for (int i = 0; i <= strlen(command) && running; i++) {
// A space with at least one pending character before it ends a word.
if (i > prev && strncmp((buff + i), " ", 1) == 0) {
// Grow the block to WORD_SIZE * count bytes.
temp = (char **)realloc(tokens, (sizeof(char)) * WORD_SIZE * (++count));
if (temp == NULL) {
fprintf(stderr, "Error in parsing: ran out of memory\n");
running = false;
free(tokens);
}
else {
tokens = temp;
// temp still equals the start of the block here, so the separator's address
// is stored in the FIRST slot and the separator is then overwritten with
// '\0'; the first slot therefore ends up pointing at an empty string.
// NOTE(review): presumably the cause of the empty token 0 -- confirm.
*(temp) = (buff + i);
strncpy(*(temp), "\0", 1);
// Step WORD_SIZE * (count - 1) elements past tokens and store the word start.
// NOTE(review): this arithmetic scales by the element size, while the
// realloc above sized the block in chars -- verify it stays in bounds.
temp = tokens + WORD_SIZE * (count - 1);
*(temp) = buff+prev;
prev = i+1;
}
}
// Opening double quote: the token runs up to the matching closing quote.
else if (strncmp((buff + i), "\"", 1) == 0) {
// temp holds whatever an earlier branch left in it (and is uninitialized if
// a quote is the first thing matched).
*(temp) = (buff + i);
strncpy(*(temp), "\0", 1);
i++;
prev = i;
// Scan for the closing quote; the condition does not test for the end of
// the string, so an unterminated quote keeps scanning past it.
for (; strncmp((buff + i), "\"", 1) != 0; i++) { }
temp = (char **)realloc(tokens, (sizeof(char)) * WORD_SIZE * (++count));
if (temp == NULL) {
fprintf(stderr, "Error in parsing: ran out of memory\n");
running = false;
free(tokens);
}
else {
tokens = temp;
// Same first-slot overwrite as in the space branch above.
*(temp) = (buff + i);
strncpy(*(temp), "\0", 1);
temp = tokens + WORD_SIZE * (count - 1);
*(temp) = buff+prev;
prev = i+1;
}
}
// Terminating NUL: record the final piece starting at prev.
else if (strncmp((buff + i), "\0", 1) == 0) {
temp = (char **)realloc(tokens, (sizeof(char)) * WORD_SIZE * (++count));
if (temp == NULL) {
fprintf(stderr, "Error in parsing: ran out of memory\n");
running = false;
free(tokens);
}
else {
tokens = temp;
temp = tokens + WORD_SIZE * (count - 1);
*(temp) = buff+prev;
prev = i+1;
}
}
}
// NOTE(review): as written, `*tokens + ...` dereferences the first slot and
// then advances that string by WORD_SIZE * i characters; it does not read
// the slot that the loop above wrote for token i.
for (int i = 0; i < count; i++)
printf("\t%i: %s\n", i, *tokens + sizeof(char) * WORD_SIZE * i);
现在,如果我输入 this is a test(不带引号),得到的输出是:
0:
1: is
2: a
3: test
带引号时情况更混乱。输入 this "is a" very "very complex" test,得到的输出是:
0:
1: is a
2:
3: very complex
4: test
答案 0 :(得分:3)
你说过可以接受替代代码。如果用确定性有限自动机(DFA)模型来思考,简单的字符串解析算法几乎总是更容易编写,代码也更易于维护。网上有很多关于 DFA 的免费参考资料。
这是解决您问题的DFA。
[any] 的含义是“其他所有字符”。换句话说,如果没有其他转换匹配,就选择此转换。它对应 C 语言 switch 语句中的 default 分支。[eos] 的含义是“字符串结尾”,即空字符。
请注意,DFA 可以让您系统地考虑所有情况,例如引号出现在单词中间。在这里,我把它视为当前单词的结束和新的带引号单词的开始。如果规范发生变化,DFA 很容易修改,而且这些修改无需费神思考就能直接转化为代码。
剩下的就是添加“动作代码”,用来记录每个标记的起始位置,并在适当的位置写入空终止符。用 C 实现如下:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Split `str` IN PLACE into tokens using a small deterministic finite
 * automaton (DFA).
 *
 * A token is either a run of characters delimited by space, tab or newline,
 * or the text between a pair of double quotes (which may contain blanks).
 * A double quote in the middle of a word ends that word and starts a new
 * quoted token.  Delimiters and closing quotes are overwritten with '\0', so
 * the returned pointers point into `str` itself.
 *
 * Parameters:
 *   str           NUL-terminated, writable string; it is modified.
 *   n_tokens_rtn  if not NULL, receives the number of tokens found.
 *
 * Returns a heap-allocated, NULL-terminated array of pointers into `str`.
 * The caller frees the array (not the individual tokens).  Returns NULL, with
 * a count of 0, if the array cannot be allocated.
 *
 * An unterminated quote prints "Syntax error." to stderr; the tokens
 * collected so far, including the unterminated one, are still returned.
 */
char **tokenize(char *str, int *n_tokens_rtn)
{
  // State of the DFA.
  enum { Error = -1, Start, InQuoted, InWord } state = Start;

  // Index of the current character in str.
  size_t cp = 0;
#define CURRENT_CHAR (str[cp])
#define ADVANCE_TO_NEXT_CHAR do { ++cp; } while (0)
#define MARK_END_OF_TOKEN do { str[cp] = '\0'; } while (0)

  // Token count and buffer.  Every saved token consumes a distinct non-NUL
  // character of str, so there are at most strlen(str) tokens; one more slot
  // holds the trailing NULL.  The array is shrunk to fit before returning.
  size_t tp = 0;
  char **tokens = malloc((strlen(str) + 1) * sizeof *tokens);
  if (tokens == NULL) {
    if (n_tokens_rtn) *n_tokens_rtn = 0;
    return NULL;
  }
#define SAVE_TOKEN do { tokens[tp++] = &str[cp]; } while (0)

  // Each iteration is one DFA transition.
  for (;;) {
    switch (state) {

    case Start:
      switch (CURRENT_CHAR) {
      case '\0':
        goto done_scanning;
      case ' ': case '\t': case '\n':
        ADVANCE_TO_NEXT_CHAR;
        break;
      case '"':
        // The token starts after the opening quote.
        state = InQuoted;
        ADVANCE_TO_NEXT_CHAR;
        SAVE_TOKEN;
        break;
      default:
        state = InWord;
        SAVE_TOKEN;
        ADVANCE_TO_NEXT_CHAR;
        break;
      }
      break;

    case InQuoted:
      switch (CURRENT_CHAR) {
      case '\0':
        state = Error; // Missing close quote.
        break;
      case '"':
        state = Start;
        MARK_END_OF_TOKEN;
        ADVANCE_TO_NEXT_CHAR;
        break;
      default:
        ADVANCE_TO_NEXT_CHAR;
        break;
      }
      break;

    case InWord:
      switch (CURRENT_CHAR) {
      case '\0':
        goto done_scanning;
      case ' ': case '\t': case '\n':
        state = Start;
        MARK_END_OF_TOKEN;
        ADVANCE_TO_NEXT_CHAR;
        break;
      case '"': // Word ended in quote, not space.
        state = InQuoted;
        MARK_END_OF_TOKEN;
        ADVANCE_TO_NEXT_CHAR;
        SAVE_TOKEN;
        break;
      default:
        ADVANCE_TO_NEXT_CHAR;
        break;
      }
      break;

    case Error:
      fprintf(stderr, "Syntax error.\n");
      goto done_scanning;
    }
  }

done_scanning:
  // Return number of tokens if caller is interested.
  if (n_tokens_rtn) *n_tokens_rtn = (int)tp;

  // Append a null terminator so callers can iterate without the count.
  tokens[tp++] = NULL;

  // Trim the array to the size actually used.  If the shrink fails, the
  // original (larger) block is still valid, so hand that back instead.
  char **trimmed = realloc(tokens, tp * sizeof *tokens);
  return trimmed ? trimmed : tokens;

  // Keep the helper macros from leaking into the rest of the file.
#undef SAVE_TOKEN
#undef MARK_END_OF_TOKEN
#undef ADVANCE_TO_NEXT_CHAR
#undef CURRENT_CHAR
}
/*
 * Demo driver: tokenizes a fixed sample string and prints one token per line.
 * Returns 0 on success, EXIT_FAILURE if tokenize() could not provide an array.
 */
int main(void)
{
  // Must be a writable array: tokenize() cuts the string in place.
  char str[] = "this \"is a\" very \"very complex\" example";
  char **tokens = tokenize(str, NULL);

  // tokenize() hands back the result of an allocation call, which can be NULL.
  if (tokens == NULL) {
    fprintf(stderr, "Out of memory.\n");
    return EXIT_FAILURE;
  }

  // The array is NULL-terminated, so no count is needed.
  for (int i = 0; tokens[i]; i++) {
    printf("%s\n", tokens[i]);
  }

  // Only the array is heap-allocated; the tokens themselves live inside str.
  free(tokens);
  return 0;
}
答案 1 :(得分:1)
这是完全从头重写的版本,因为这比改写你的代码更容易(如果这不是你想要的,我很抱歉)。几点说明:
- 不需要事先调用 malloc。你可以安全地对 NULL 指针调用 realloc。
- if (strncmp((buff + i), "\"", 1) == 0):你可以直接测试 buff[i]。
- 为什么要反复调整 prev?:)只需遍历一次字符串就够了。
- 用 temp 检查 realloc 是否成功,是因为你的代码里也有这一步。在我的代码中其实没有必要,因为失败时只是退出 main。
- 补充:当 " 前面没有空格时,它同样会开始一个新“单词”。
代码:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Splits a fixed sample string into tokens and prints them, one per line,
 * as "\t<index>: <token>".
 *
 * A token is either the text between two double quotes or a run of
 * characters ended by a space, a double quote, or the end of the string.
 * Each token is copied into its own heap buffer; the input is not modified.
 *
 * Returns 0 on success and -1 if memory runs out.
 */
int main (void)
{
  // String literals are read-only, hence the const pointers.
  const char *input = "this \"is a\" very \"very complex\" test";
  const char *iterate = input;
  char **tokens = NULL;
  int count = 0;
  int status = 0;

  while (*iterate)
  {
    // Skip the blanks separating tokens.
    while (*iterate == ' ')
      iterate++;
    if (!*iterate)
      break;

    // Grow the pointer array by one slot.  realloc(NULL, n) behaves like
    // malloc(n), so no separate initial allocation is needed.
    char **temp = realloc(tokens, sizeof *tokens * (size_t)(count + 1));
    if (temp == NULL)
    {
      fprintf(stderr, "Error in parsing: ran out of memory\n");
      status = -1;
      break;
    }
    tokens = temp;

    // Measure the token.  A quoted token starts after the opening quote and
    // may contain spaces; a bare word also ends at a quote.
    int quoted = (*iterate == '\"');
    size_t length = 0;
    if (quoted)
    {
      iterate++;
      while (iterate[length] && iterate[length] != '\"')
        length++;
    }
    else
    {
      while (iterate[length] && iterate[length] != ' ' && iterate[length] != '\"')
        length++;
    }

    // Copy the token into its own NUL-terminated buffer.
    tokens[count] = malloc(length + 1);
    if (tokens[count] == NULL)
    {
      fprintf(stderr, "Error in parsing: ran out of memory\n");
      status = -1;
      break;
    }
    memcpy(tokens[count], iterate, length);
    tokens[count][length] = '\0';
    count++;

    // Continue after the token, stepping over a closing quote if present.
    iterate += length;
    if (quoted && *iterate == '\"')
      iterate++;
  }

  if (status == 0)
  {
    for (int i = 0; i < count; i++)
      printf("\t%i: %s\n", i, tokens[i]);
  }

  // Release every token copy, then the pointer array itself.
  for (int i = 0; i < count; i++)
    free(tokens[i]);
  free(tokens);

  return status;
}
输入 this "is a" very "very complex" test 时的输出:
0: this
1: is a
2: very
3: very complex
4: test
答案 2 :(得分:0)
这看起来是一个相对简单的问题,所以我没有编写完整的解析器,而是用标准 C 库写了一个解决方案。这个方案是否合适,请自行判断。代码可能还有改进的余地,可以写得更清晰一些,这就留给有兴趣的读者作为练习。
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/*
 * Splits a fixed sample string into tokens with strchr/strpbrk and prints
 * them, one per line, as "[<index>]: <token>".
 *
 * A token is either the text between two double quotes or a run of
 * characters ended by a space, a double quote, or the end of the string.
 * Each token is copied into its own heap buffer.
 *
 * Returns 0 on success and -1 if memory runs out.
 */
int main(void)
{
  char input_string[] = "this \"is a\" very \"very complex\" test";
  char **tokens = NULL;
  int token_count = 0;
  int status = 0;
  const char *ptr = input_string;

  for (;;)
  {
    // Skip separating blanks; stopping here also means trailing blanks do
    // not produce a spurious empty token.
    while (*ptr == ' ')
      ptr++;
    if (*ptr == '\0')
      break;

    // Find the token's extent.  `end` is NULL when the token runs to the end
    // of the input (last word, or a quote that is never closed).
    const int quoted = (*ptr == '"');
    const char *start = ptr + quoted;
    const char *end = quoted ? strchr(start, '"') : strpbrk(start, " \"");
    const size_t length = end ? (size_t)(end - start) : strlen(start);

    // Grow through a temporary so the existing array is not lost on failure.
    char **grown = realloc(tokens, (size_t)(token_count + 1) * sizeof *tokens);
    if (grown == NULL)
    {
      status = -1;
      break;
    }
    tokens = grown;

    // Copy the token into its own NUL-terminated buffer.
    char *copy = malloc(length + 1);
    if (copy == NULL)
    {
      status = -1;
      break;
    }
    memcpy(copy, start, length);
    copy[length] = '\0';
    tokens[token_count++] = copy;

    if (end == NULL)
      break;

    // Step over a closing quote.  A word's terminator is left in place: a
    // space is skipped next round, a quote opens the next token.
    ptr = quoted ? end + 1 : end;
  }

  if (status == 0)
  {
    for (int i = 0; i < token_count; ++i)
      printf("[%d]: %s\n", i, tokens[i]);
  }

  // Release every token copy, then the pointer array itself.
  for (int i = 0; i < token_count; ++i)
    free(tokens[i]);
  free(tokens);

  return status;
}
输出:
[0]: this
[1]: is a
[2]: very
[3]: very complex
[4]: test