从C中解析的文件中删除标签

时间:2015-11-05 07:34:07

标签: c parsing text

我想使用C。

在文本文件中打印标签之间的数据

输入声明:

<PERSON> Mark Zuckerberg </PERSON> is a entrepreneur from <LOCATION> USA </LOCATION>. He is also the CEO of <ORGANIZATION> Facebook </ORGANIZATION>.

输出:Mark Zuckerberg USA Facebook。

我的程序代码是:

const char* getfield(char* line, int num)
{
    const char* tok;
    for (tok = strtok(line, "/>");
        tok && *tok;
        tok = strtok(NULL, ">"))
    {
        if (!--num)
            return tok;
    }
    return NULL;
}

int main()
{
    char line[500000];
    while (fgets(line, 500000, stdin))
    {
        char *arg = line;
        const char *tok;
        while ((tok = getfield(arg, 2)) != NULL) {
            printf("%s\n", tok);
            arg = NULL;
        }
    }
}

我的输出是:

Mark Zuckerberg </PERSON

USA </LOCATION

Facebook </ORGANIZATION

我想摆脱</Tag,只获得Mark Zuckerberg USA Facebook作为输出。我在哪里需要更改代码?

1 个答案:

答案 0 :(得分:1)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

char *getfield(char **sp){
    char *left; //point to <
    char *right;//point to >

    if((left = strchr(*sp, '<')) == NULL)
        return NULL;
    if((right = strchr(left, '>')) == NULL)
        return NULL;

    size_t len = right - left;//if len == 1, tag is nothing(<>)
    char *tag = malloc(len);
    memcpy(tag, left + 1, len -1);
    tag[len-1] = '\0';

    char *etag = malloc(len + 3);
    sprintf(etag, "</%s>", tag);
    left = right + 1;
    if((right = strstr(left, etag)) == NULL)//right point to end tag
    {
        free(tag);
        free(etag);
        return NULL;
    }
    len = right - left;
    char *text = malloc(len + 1);
    memcpy(text, left, len);
    text[len] = '\0';

    *sp = right + strlen(etag);
    free(tag);
    free(etag);
    return text;
}

int main(void){
    char line[500000];

    while (fgets(line, sizeof line, stdin)){
        char *arg = line;
        char *text;

        while ((text = getfield(&arg)) != NULL){
            printf("%s\n", text);
            free(text);
        }
    }
    return 0;
}