Question

我有一个文件，其内容类似于以下内容：

Really my data is here, and I think its really 
cool. Somewhere, i want to break on some really
awesome data. Please let me really explain what is going
'\n'
on. You are amazing. Something is really awesome. 
Please give me the stuffs.

我想创建一个数组，该数组的字符串指针指向定界词之间的字符串。

char **strings：

my data is here, and I think its
cool. Somewhere, i want to break on some
awesome data. Please let me
explain what is going'\n'on. You are amazing. Something is
awesome.'\n'Please give me the stuffs.

尝试输入的代码：

// Attempt from the question: read a file and split it on the word "really",
// collecting a copy of each piece in a growable array of strings.
// NOTE(review): fileLength, end, fp, packets, text and s are not declared in
// this snippet - presumably defined elsewhere; confirm against the full source.

// Buffer intended for the raw file contents; the malloc result is not checked.
char *filedata = malloc(fileLength);
fread(filedata, end, 1, fp); //ABC
// Initial capacity of the pointer list and index of the next free slot.
size_t stringCount = 8;
size_t idx = 0;
// NOTE(review): sized from *packets rather than *data.
char **data = malloc(stringCount * sizeof(*packets));
if(!data) {
    fprintf(stderr, "There was an error");
    return 1;
}
// NOTE(review): this reads file bytes into the pointer array `data`, not into
// `filedata`.
fread(data, end, 1, text);
// strtok treats its second argument as a SET of single-character delimiters,
// so this splits on any of 'r', 'e', 'a', 'l', 'y' - not on the word "really".
// It is also handed `data` (a char **) where a char * is expected.
char *stuff = strtok(data, "really");
while(stuff) {
    // Store a heap copy of the current token.
    data[idx++] = strdup(stuff);
    // NOTE(review): the next token is assigned to `s`, so `stuff` is not
    // advanced here; the delimiter set also changes to "stuff".
    s = strtok(NULL, "stuff");
    if(idx >= stringCount) {
        // Out of slots: double the capacity.
        stringCount *= 2;
        // NOTE(review): this reallocates `stuff` (a token pointer returned by
        // strtok) instead of the list `data`.
        void *tmp = realloc(stuff, stringCount * sizeof(*stuff));
        if(!tmp) {
            perror("Unable to make a larger string list");
            // Undo the doubling so stringCount still matches the old size.
            stringCount /= 2;
            break;
        }
        stuff = tmp;
    }
}

这给出了与我想要的类似的结果，但它并不是按整个单词来分隔，而是按单个字母来分隔。

Answer 1

您想按单词 "really" 对一个“文件”进行标记化（tokenize），这个目标中有几个微妙的难点。是哪些呢？文本文件通常是一次读取一行的；如果要保存整个文件的所有行，通常是保存为若干个指针，每个指针指向一行的开头。也就是说，如果采用通常的面向行的方式读取文件，那么您的令牌（从文件开头开始，或者以单词 "really" 开头）可能会跨越多行。因此，要进行标记化，就需要把多行合并起来。

或者，您可以将整个文件读入单个缓冲区，然后使用 strstr 查找定界符 "really"，但是……您需要确保保存文件内容的缓冲区以 nul 结尾，以避免最后一次调用 strstr 时出现未定义行为。（通常，将整个文件读入缓冲区并不会得到以 nul 结尾的缓冲区。）

话虽如此，即使使用 strstr，您实际上仍然必须手动解析文件内容。您需要维护三个指针：一个指向令牌开头的起始指针；一个用于搜索定界符的解析指针（用来处理找到的定界符其实只是某个更长单词的一部分的情况）；最后是一个标记令牌结尾的结束指针。

该方案相当简单：第一个令牌从文件开头开始，随后的每个令牌都以单词 "really" 开头。因此，您向前扫描以找到 " really"（注意 "really" 之前的空格），将结束指针设置为定界符 " really" 的开头，把令牌复制到缓冲区并以 nul 结尾，执行 /* do stuff with token */，然后 free (token);，接着把起始指针更新为 "really" 的开头，把用于解析的指针设置到 "really" 之后，如此重复，直到再也找不到 "really" 为止。退出解析循环后，您仍然必须对最后一个令牌执行 /* do stuff */。

您还可以决定如何处理每个令牌中包含的 '\n'。为了下面的输出效果，这里只是简单地用 ' ' 覆盖它们。（您可以添加所需的任何其他处理，例如去掉由换行符造成的行尾或中间的多余空格。）

把这些放在一起，您可以执行类似下面的操作：将文件内容读入以 nul 结尾的缓冲区的工作由函数 read_file() 处理，其余的标记化工作则直接在 main() 中完成，例如：

示例输入文件

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/*
 * read_file - read an entire file into a nul-terminated heap buffer.
 *
 * fname   path of the file to read (opened in binary mode).
 * nbytes  on success receives the number of bytes read, not counting the
 *         terminating nul; left untouched on failure.
 *
 * Returns a malloc'd buffer that the caller must free(), or NULL if the file
 * cannot be opened, its size cannot be determined, memory cannot be
 * allocated, or the read comes up short.  The stream is closed and any
 * partially filled buffer is released on every path.
 */
char *read_file (char* fname, size_t *nbytes)
{
    long bytes = 0;
    char *file_content = NULL;
    FILE *file = fopen (fname, "rb");

    if (!file)          /* validate file open for reading */
        return NULL;

    /* seek to the end to learn the size, then rewind for the read */
    if (fseek (file, 0, SEEK_END) != 0 ||
        (bytes = ftell (file)) < 0 ||
        fseek (file, 0, SEEK_SET) != 0) {
        fprintf (stderr, "error: unable to determine file length.\n");
        goto fail;
    }

    /* +1 leaves room for the terminating nul added below */
    if (!(file_content = malloc ((size_t)bytes + 1))) {
        perror ("malloc - virtual memory exhausted");
        goto fail;
    }

    /* read all data into the buffer in a single call to fread */
    if (fread (file_content, 1, (size_t)bytes, file) != (size_t)bytes) {
        fprintf (stderr, "error: failed to read %ld-bytes from '%s'.\n",
                bytes, fname);
        goto fail;
    }
    fclose (file);              /* close file */

    file_content[bytes] = 0;    /* nul terminate - to allow strstr use */

    *nbytes = (size_t)bytes;    /* update nbytes making size available */

    return file_content;        /* return pointer to caller */

fail:
    free (file_content);        /* free (NULL) is a no-op */
    fclose (file);
    return NULL;
}

/* Overwrite every '\n' in the nul-terminated string s with ' '. */
static void newlines_to_spaces (char *s)
{
    for (; *s; s++)
        if (*s == '\n')
            *s = ' ';
}

/*
 * Split the file named by argv[1] on the whole word "really" and print each
 * piece as "token: ...", with embedded newlines shown as spaces.
 *
 * The first token starts at the beginning of the file; every later token
 * starts at the word "really" itself.  A match of " really" only counts as a
 * delimiter when the following character is whitespace or punctuation, so
 * longer words that merely begin with "really" are skipped.
 *
 * Returns 0 on success, 1 if no filename was given or the file could not be
 * read.  Exits with EXIT_FAILURE if a token cannot be allocated.
 */
int main (int argc, char **argv) {

    size_t nbytes;
    char *content;

    if (argc < 2) {     /* validate required argument given */
        fprintf (stderr, "error: insufficient input. filename req'd.\n");
        return 1;
    }

    if (!(content = read_file (argv[1], &nbytes))) {    /* read/validate */
        fprintf (stderr, "error: unable to read '%s'.\n", argv[1]);
        return 1;
    }

    char *sp = content,     /* start pointer for token */
        *p = sp,            /* pointer for parsing token */
        *ep = p;            /* end pointer one past end of token */
    const char *delim = " really";      /* delimiter */
    /* delim is a pointer, so sizeof delim is the pointer size, not the
     * delimiter length - measure the string itself */
    const size_t dlen = strlen (delim);

    while ((ep = strstr (p, delim))) {  /* while delimiter found */
        /* ep[dlen] is at worst the terminating nul; <ctype.h> functions
         * require a value representable as unsigned char */
        unsigned char next = (unsigned char)ep[dlen];

        if (isspace (next) || ispunct (next)) {     /* whole word matched */
            size_t tlen = (size_t)(ep - sp);        /* get token length */
            char *token = malloc (tlen + 1);        /* allocate for token */
            if (!token) {                           /* validate allocation */
                perror ("malloc-token");
                exit (EXIT_FAILURE);
            }
            memcpy (token, sp, tlen);               /* copy to token */
            token[tlen] = 0;                        /* nul-terminate */
            newlines_to_spaces (token);
            printf ("\ntoken: %s\n", token);        /* output token */
            /* do stuff with token */
            free (token);                           /* free token memory */
            sp = ep + 1;    /* next token starts at the 'r' of "really" */
        }
        /* resume right after the match; going any further could skip an
         * adjacent delimiter or step past the terminating nul */
        p = ep + dlen;
    }

    newlines_to_spaces (sp);    /* remaining text is the last token */
    printf ("\ntoken: %s\n", sp);
    /* do stuff with last token */

    free (content);     /* free buffer holding file */

    return 0;
}

使用/输出示例

$ cat dat/breakreally.txt
my data is here, and I think its really
cool. Somewhere, i want to break on some really
awesome data. Please let me really explain what is going
on. You are amazing.

仔细研究一下，如果您有任何疑问，请告诉我。

C：按单词定界符将文件拆分为多个 char * 组

1 个答案: