在n-gram算法上找不到泄漏

时间:2017-01-24 21:07:24

标签: c linked-list segmentation-fault

我正在编写一个C程序,以便在某个字符串中找到最常见的n-gram。

n-gram是

  

来自给定文本序列的n个项目的连续序列

但是,我在函数most_freq_ngram中有一个分段错误。

参数依次为:

  • 我想要计算ngrams的文本
  • 文字中的字符数
  • 我想要计算的n-gram的大小
  • 指向最常见的n-gram的字符串的指针

这是我的代码:

#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
#include <stdarg.h>
#include <errno.h>

typedef struct nodo_t{
    char* gram;
    int count;
    struct nodo_t * next;
} nodo_t;

typedef struct linked_list{
    nodo_t * head;
} linked_list;

int compare_arrays(char * igram, char * list_gram, size_t ngram_len){

    int i;
    for(i=0;i<ngram_len;i++){
        if(tolower(list_gram[i]) != tolower(igram[i])) return 0;
    }

    return 1;
}

void copy_array(char * igram, char * list_gram, size_t ngram_len){
    int i;
    for(i=0;i<ngram_len;i++)
        list_gram[i] = tolower(igram[i]);
}

void add_gram(char * igram, linked_list * list, size_t ngram_len ){
    if(list == NULL){
        list = malloc(sizeof(linked_list));
        nodo_t* head = malloc(sizeof(nodo_t));
        head->count = 1;
        head->next = NULL;

        head->gram = malloc(ngram_len * sizeof(char));

        int i;
        for(i=0;i<ngram_len;i++)
            head->gram[i] = igram[i];
        list->head = head;
    }else{
       nodo_t * sent = list->head;
       int found = 0;
       while(sent->next != NULL && !found){
           //Check every element, otherwise add to que
           int same = compare_arrays(igram, sent->gram, ngram_len);
           if(same){
               sent->count++;
               found = 1;
           }
           sent = sent->next;
       }

       if(!found){
           sent->next = malloc(sizeof(nodo_t));
           sent = sent->next;
           sent->next = NULL;
           sent->count = 1;
           copy_array(igram, sent->gram, ngram_len);
       }
    }
}

void most_freq_ngram(const char* text, size_t text_len, size_t ngram_len, char** ngram){
    int i;
    linked_list *  list = NULL;

    for(i=0;i<text_len - ngram_len +1;i++){
        char igram[ngram_len+1];
        int j;

        int temp_i = i;
        for(j=0;j<ngram_len;j++){
            igram[j] = text[temp_i];
            temp_i++;
        }

        igram[ngram_len] = '\0';
        add_gram(igram, list, ngram_len);
    }


    //Check list for most frequent element
    char *  most_frequent = malloc(ngram_len * sizeof(char));
    int frequency = 0;

    nodo_t * sent = list->head;

    if(sent == NULL ){
        int i;
        for(i=0;i<ngram_len;i++)
            most_frequent[i] = '\0';
        return;
    }

    while(sent->next != NULL){
        if(sent->count > frequency){
            copy_array(sent->gram, most_frequent, ngram_len);
            frequency = sent->count;
        }
    }

    *ngram = most_frequent;

    return ;
}

int main(){
    size_t ngram_len = 2;
    char *ngram = malloc((ngram_len+1) * sizeof(char));

    size_t text_len = 5;

    const char text[6] = {'a','a','a','a','a', '\0'};

    most_freq_ngram(text, text_len,  ngram_len, &ngram);

    return 0;
}

1 个答案:

答案 0 :(得分:0)

您的功能void add_gram(char * igram, linked_list * list, size_t ngram_len )不会更改list。它会更改list的副本。 list中的原始most_freq_ngram保持不变(NULL指针),这会导致nodo_t * sent = list->head;中的段错误。将add_gram的第二个参数更改为linked_list ** list并相应地重写该函数。