Question

我正在为大学编写一个小型应用程序来解析一些维基百科页面并输出有关页面中人员的信息。

我用Java编写它并试图用C重写它。我遇到了一个奇怪的错误，有时程序的输出是正确的，有时它是错误的，而不改变输入。

Here is a sample input that triggers the error，名称为“105.html”

这是我有时得到的输出：

105 Linus Pauling Estadunidense 28 de fevereiro de 1901 Portland，Oregon 19 de agosto de 1994 Big Sur，Califórnia93

这是我得到的其他时间的输出：

105 Linus Pauling Estadunidense 28 de f @ evereir @o y dC L e y 19I L 01波特兰，俄勒冈州19 de agosto de 1994 Big Sur，Califórnia93

我注意到如果我在XCode中设置断点，我通常会得到正确的结果......

我是C的新手，所以我实际上不知道如何开始调试。

这是代码，如果有人有兴趣实际阅读它。代码是葡萄牙语和英语的混合，但我添加了英语注释，因此应该很容易理解。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct pessoa{
    int id;
    char *nome; //name
    char *nacionalidade; // nationality
    char *nascimento; // date of birth
    char *local_nascimento; // place of birth
    char *morte; // date of death
    char *local_morte; // place of death
    int idade; // age
};


struct pessoa *inicializar(int n) {
    struct pessoa *pessoas = malloc(sizeof(pessoas) * n);
    return pessoas;
}

void imprimir_pessoa(struct pessoa *p) {
    printf("%i %s %s %s %s %s %s  %i\n", p->id, p->nome, p->nacionalidade,
           p->nascimento, p->local_nascimento, p->morte,
           p->local_morte, p->idade);
}

void imprimir_pessoa_asterisco(struct pessoa *p) {
    printf("%i ## %s ## %s ## %s ## %s ## %s ## %s ## %i\n", p->id, p->nome, p->nacionalidade,
           p->nascimento, p->local_nascimento, p->morte,
           p->local_morte, p->idade);
}

size_t index_of(char *string, char *to_find) {
    return strstr(string, to_find) - string;
}

char *remove_tags(char *string) {
    // inicializa para o mesmo tamanho da string de entrada para garantir que ira caber
    char * resp = malloc(sizeof(char) * strlen(string) + 1);

    // jumps over the html tags and finds the aproppriate information
    for (size_t i = 0; i < strlen(string); i++) {

        while (i < strlen(string) && string[i] == '<') {
            for (i++; string[i] != '>'; i++);
            i++;

            while(i < strlen(string) && string[i] == '&'){
                for (i++; string[i] != ';'; i++);
                i++;
            }
        }

        while(i < strlen(string) && string[i] == '&'){
            for (i++; string[i] != ';'; i++);
            i++;
            resp[strlen(resp)] = ' ';
        }


        if (i < strlen(string)) {
            resp[strlen(resp)] = string[i];
        }
    }

    while(strlen(string) > 0 && resp[0] == ' '){ // jumps over white spaces on the begining
        resp += 1;
    }
    resp[strlen(resp)] = 0;

    return resp;
}

char* extrair_nome(char *string) { // extract the person's name
    size_t index = index_of(string, "<title>") + strlen("<title>");
    size_t index_fim = index_of(string, " Wiki") - 4;
    char *nome = malloc(sizeof(char) * (index_fim - index));
    memcpy(nome, (string+index), index_fim - index);
    return nome;
}

char* substring(char * string, char *c) {
    return string + strcspn(string, c);
}

void remove_new_line(char *string) {
    char *pos;
    if ((pos=strchr(string, '\n')) != NULL)
        *pos = '\0';
}

void ler_pessoa(char *nome_arquivo, struct pessoa *p) { // parse the file to fill the pessoa struct
    size_t length = strlen(nome_arquivo);
    p->id = (nome_arquivo[length - 8] - 48) * 100;
    p->id = (p->id + (nome_arquivo[length - 7] - 48) * 10);
    p->id = p->id + (nome_arquivo[length - 6] - 48);

    int tamanho_linha = 2000;
    char *linha = malloc(sizeof(char) * tamanho_linha);
    FILE *fp = fopen(nome_arquivo, "r");

    if (fp == NULL) {
        printf("Falha ao abrir arquivo %s\n", nome_arquivo);
        exit(1);
    }

    while (fgets(linha, tamanho_linha, fp) != NULL) {
        if (strstr(linha, "<title>")) { // extracts name
            p->nome = extrair_nome(linha);
            remove_new_line(p->nome);
            break;
        }
    }

    while (fgets(linha, tamanho_linha, fp) != NULL) {
        if (strstr(linha, "Nacionalidade")) { // extracts nationality
            fgets(linha, tamanho_linha, fp);
            p->nacionalidade = remove_tags(linha);
            remove_new_line(p->nacionalidade);
            break;
        }
    }

    while (fgets(linha, tamanho_linha, fp) != NULL) {
        if (strstr(linha, "Nascimento")) { // extracts date of births
            fgets(linha, tamanho_linha, fp);
            p->nascimento = remove_tags(linha); // <-- this one is not working all the time??
            remove_new_line(p->nascimento);
            break;
        }
    }

    //se vivo
    if (strstr(p->nascimento, ")") != NULL) { // if the person is alive the date of birth date is of the type: date(age)
        char *tmp = malloc(sizeof(char) * strlen(p->nascimento)); // so we extract the age
        strcpy(tmp, p->nascimento);
        tmp = tmp + strcspn(tmp, "(") + 1;
        tmp[index_of(tmp, " ")] = 0;
        p->idade = atoi(tmp);
        p->morte = "vivo"; // not dead
        p->local_morte = "vivo"; // not dead
    } else {
        p->morte = ""; // we set this later
        p->local_morte = "";
    }

    while (fgets(linha, tamanho_linha, fp) != NULL) {
        if (strstr(linha, "Local")) { // extracts place of death
            fgets(linha, tamanho_linha, fp);
            p->local_nascimento = remove_tags(linha);
            remove_new_line(p->local_nascimento);
            break;
        }
    }

    if (strlen(p->morte) == 0) { // we set this now if the person is not alive (hence size 0)
        while (fgets(linha, tamanho_linha, fp) != NULL) {
            if (strstr(linha, "Morte")) { // extract death day
                fgets(linha, tamanho_linha, fp);
                p->morte = remove_tags(linha);
                remove_new_line(p->morte);
                break;
            }
        }

        if (strstr(p->morte, "(") != NULL) {
            char *tmp = malloc(sizeof(char) * strlen(p->morte));
            strcpy(tmp, p->morte); // extract age when the person died, like above
            tmp = tmp + strcspn(tmp, "(") + 1;
            tmp[index_of(tmp, " ")] = 0;
            p->idade = atoi(tmp);
            p->morte[index_of(p->morte, "(")] = 0;
        }

        while (fgets(linha, tamanho_linha, fp) != NULL) {
            if (strstr(linha, "Local")) { // get the place of death
                fgets(linha, tamanho_linha, fp);
                p->local_morte = remove_tags(linha);
                remove_new_line(p->local_morte);
                break;
            }
        }
    }

    fclose(fp);
}


int main(int argc, const char * argv[]) {
    struct pessoa p;
    ler_pessoa("/tmp/105.html", &p);
    imprimir_pessoa(&p);
    return 0;
}

Answer 1

resp[strlen(resp)] = ' ';和resp[strlen(resp)] = string[i];不好，因为resp[]当然不是空字符终止。

代码需要一种新方法来确定要分配的resp[]元素。

resp[strlen(resp)] = 0;也值得怀疑。

strlen(resp)返回字符串的长度，不计算null终止符。要使strlen()运行良好，resp必须以首先为空终止，否则它不会引用字符串。 null字符在索引中等于长度，因此resp[strlen(resp)] = 0;是一个无操作函数，而不是杀死一些CPU周期。

代码还有其他问题示例：空间不足。 @Weather Vane;

// bad code
char *tmp = malloc(sizeof(char) * strlen(p->nascimento)); // so we extract the age
strcpy(tmp, p->nascimento);

示例字符串分配器/复制器（注意：strdup()通常存在于许多平台上）

char *strdupe(const char *s) {
  size_t size = strlen(s) + 1;
  dupe = malloc(size);
  if (dupe) {
    memcpy(dupe, s, size);
  }
  return dupe;
}

从文件读取的每次运行中的输出更改

1 个答案: