HashTable:如何让重复的单词只打印一次?

时间:2017-09-03 20:33:04

标签: c string file pointers hashtable

我正在实现一个哈希表,但遇到了一些问题。每添加一个单词,我都会打印更新后的哈希表;问题是,当某个单词再次出现时,我只需要增加它的频率,但我的程序却把它连同更新后的频率又打印了一遍。如何让重复的单词只打印一次,并显示它们的频率?

我遇到的另一个问题出在函数print_freq上。它接收一个int freq,应该打印出具有该频率的单词,但auxTable没有保存htable中的单词,我不知道为什么:auxTable中的频率保存正常,但保存下来的单词却是空字符串""。

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>

#define HTABLE_SIZE 1001
#define MAX_LINE_SIZ 1024

/* Hash Table */
/* One hash-table entry: a word together with its occurrence count. */
struct node {
    char *palavra; /* the word */
    int freq;      /* number of times the word has been counted */
};

/* Slot type of the hash table: a pointer to a struct node. */
typedef struct node *HASH_TABLE;

/* Function declarations (print_test has no definition visible in this file) */
void inserirHashTable(char *s);
void print_ht();
void print_test();

HASH_TABLE htable[HTABLE_SIZE] = { NULL }; /* Hash table that stores the words read from the file */
unsigned int chaves[HTABLE_SIZE]; /* Keys (indices into htable) recorded by inserirHashTable, in insertion order */
int tamanhoChaves=-1; /* Index of the last key written to chaves; -1 while no key has been stored */
int size = 0; /* Element counter maintained by inserirHashTable; bounds the loops of the print functions */

/* Processes a text file: reads it line by line, splits every line into words
 * on the separator characters listed below (blanks, punctuation, brackets,
 * operators) and hands each word to inserirHashTable().
 *
 * fp: stream opened for reading. A NULL stream is ignored, so a failed
 *     fopen() in the caller cannot crash the program inside fgets().
 *
 * Each word passed to inserirHashTable() points into the local line buffer
 * and is therefore only valid until the next line is read or this function
 * returns. */
void processarArquivo(FILE *fp)
{
    const char *seperators = " ?!'\";,.:+-*&%(){}[]<>\\\t\n"; /* characters that delimit words */
    char line[MAX_LINE_SIZ];
    char *s;

    if (fp == NULL)
        return;

    /* fgets() reads at most sizeof line - 1 characters per call. */
    while (fgets(line, sizeof line, fp) != NULL) {
        /* strtok() overwrites the separators in line with '\0' and yields
         * one word per call until the line is exhausted. */
        for (s = strtok(line, seperators); s != NULL; s = strtok(NULL, seperators))
            inserirHashTable(s);
    }
}

/* Computes the table index (key) of a word.
 *
 * Every character is upper-cased and folded into an unsigned accumulator
 * (shift left by 4, then OR), so the result does not depend on letter case.
 *
 * tok: NUL-terminated word; must not be NULL.
 * Returns the accumulator modulo HTABLE_SIZE, i.e. a valid index into the
 * table. */
unsigned int hash(char *tok)
{
    unsigned int hv = 0;

    while (*tok != '\0') {
        /* toupper() is only defined for values representable as unsigned
         * char (or EOF); plain char may be negative for non-ASCII bytes. */
        hv = (hv << 4) | (unsigned int)toupper((unsigned char)*tok);
        tok++;
    }
    return hv % HTABLE_SIZE;
}

/* funcao responsavel por isenrir a palavra lida do arquivo na hash_table,
 * a funçãp recebe como parametro um ponteiro para estra palavra*/
void inserirHashTable(char *palavra) {
    /*printf("Inserindo a palavra %s \n",palavra); Printf utilzado para mostrar a palavra a ser inserida na tabela*/
    tamanhoChaves++; /*Assim que uma palavra é inserida o numero de chaves é incrementado*/
    chaves[tamanhoChaves] = hash(palavra);/*A palavra é convertida na função hash e sua armazenada no vetor de chaves*/
    unsigned int hashval = chaves[tamanhoChaves]; /*Chave da apalvra*/

    if (htable[hashval]==NULL){
      /*printf("indice %u de %s \n",hashval,palavra);Printf utilizado para mostrar a chave e a palavra a ser inserida*/
        htable[hashval] = malloc(sizeof(palavra)); /*Alocando memoria para palavrra*/
        htable[hashval]->palavra = palavra ; /*Inserindo a palavra*/
        htable[hashval]->freq = 1; /*Incrementado sua frequencia*/
        size++;

    }else {
        /*If a words already exists in the table, i just incremente her frequency and the size. I guess the problem for repeated word is in here*/
        htable[hashval]->freq++;
        size++;
    }
    /*A tabela é impressa a cada instante que uma palavra é inserida*/
    printf("\nAtualização da tabela\n");
    print_ht();/*Impressao das palavras já recebidas, a cada instante, com a quantidade de ocorrências*/

}


/* Function responsible to print the words that were addedd to the hash table*/
void print_ht() {
    int i=0;
    /*Tabela auxiliar que servira para impressao das palavras e suas chaves*/
    HASH_TABLE *auxTable = (HASH_TABLE*) malloc(sizeof(HASH_TABLE)*size);
    unsigned int hashval; /* variavel utilizada para pegar a chave das palavras no vetor de chaves */

    for(i; i < size; i++){
        hashval = chaves[i]; /*Pegando a chave*/
        /*printf("indice %u de %s \n",hashval,htable[hashval]->token);Printf utilizado para ver a chave e a palavra*/
        auxTable[i] = htable[hashval]; /*Atribuindo a palavra e a freq para tabela auxiliar*/
    }

    /*qsort(auxTable,size,sizeof(link),compare);*/
    /*Imprimindo a tabela*/
    printf("Palavra  |  Frequencia\n");
    for (i=0; i < size; i++)
        printf("%s  \t        %d\n",auxTable[i]->palavra,auxTable[i]->freq);
    free(auxTable);
}

/*Funcion responsible to print the words with the frequency received in the paramater*/
void print_freq(int freq){
    printf("Palavras com a frequencia: %d\n", freq);
    int i, j =0;
    HASH_TABLE *auxTable = (HASH_TABLE*) malloc(sizeof(HASH_TABLE)*size);
    unsigned int hashval;

    for(i; i < size; i++){
        hashval = chaves[i];
        /*printf("indice %u de %s \n",hashval,htable[hashval]->palavra);*/
        auxTable[i] = htable[hashval]; /*Problem is in here, when I do this, the auxTable[i]->palavra(word) = "", but the freq has been saved normally n*/
    }

    printf("Palavra  |  Frequencia\n");
    for (i=0; i < size; i++) {
        if(auxTable[i]->freq == freq) {
            printf("%s  \t           %d\n",auxTable[i]->palavra,auxTable[i]->freq); /*When I print, only the frequency is showed*/
        }
    }
    free(auxTable);

}
/* Program entry point: opens input.txt, feeds it to processarArquivo() and
 * then prints the words that occurred exactly three times.
 *
 * Returns 0 on success, EXIT_FAILURE when the input file cannot be opened. */
int main(int argc, char *argv[])
{
    const char *caminho = "input.txt";
    FILE *fp;

    (void)argc; /* command-line arguments are not used */
    (void)argv;

    fp = fopen(caminho, "r");
    if (NULL == fp)
    {
        /* Report the file NAME (a FILE* must never be passed to %s) and stop:
         * continuing would hand a NULL stream to the reader and to fclose(). */
        fprintf(stderr,"Error ao abrir o arquivo: %s\n",caminho);
        return EXIT_FAILURE;
    }
    printf("Imprimindo processo \n");
    processarArquivo(fp);

    fclose(fp);
    print_freq(3); /* should print the words with frequency equal to 3 */
    return 0;
}

输出:

https://imgur.com/a/PlRPp

3 个答案:

答案 0 :(得分:1)

下面是我解决冲突的方式;它还允许在必要时调整哈希表的大小,而无需重新计算所有单词的哈希值。

首先,我使用单链表来包含具有相同哈希的所有不同单词。我还保存散列 - 完整散列,而不是模数散列表大小 - 以便轻松调整散列表的大小。最后,我喜欢使用C99 灵活数组成员来表示单词本身:

/* One word stored in the hash table. Entries that share a table slot are
   chained through `next`; the word itself lives in the trailing array. */
struct hash_entry {
    struct hash_entry  *next;   /* Next entry in the same chain, or NULL at the end */
    size_t              hash;   /* Full hash of the word (not reduced modulo the table size) */
    size_t              freq;   /* Frequency: how many times the word was counted */
    char                word[]; /* C99 flexible array member holding the NUL-terminated word */
};

哈希表只是一个包含size个指针的数组;哈希值为hash的每个单词都位于entry[hash % size]处的单链表中:

/* The hash table itself: an array of `size` chain heads. */
struct hash_table {
    size_t              size;   /* Number of slots in `entry` */
    struct hash_entry **entry;  /* entry[i] is the head of the chain for slot i, or NULL */
};

然后是初始化、调整大小和释放哈希表的基本函数:
/* Initialises *ht as an empty table with `size` slots.
 *
 * Returns 0 on success, -1 when ht is NULL or size is zero, and -2 when the
 * slot array cannot be allocated. */
int hash_table_create(struct hash_table *ht, size_t size)
{
    size_t remaining;

    if (!ht || size == 0)
        return -1; /* bad arguments */

    ht->size = size;
    ht->entry = malloc(size * sizeof *ht->entry);
    if (!ht->entry)
        return -2; /* allocation failed */

    /* Every slot starts out as an empty chain. */
    remaining = size;
    while (remaining > 0) {
        remaining--;
        ht->entry[remaining] = NULL;
    }

    return 0;
}

/* Frees a whole chain of entries, starting at `entry` (which may be NULL). */
void hash_entry_free(struct hash_entry *entry)
{
    struct hash_entry *following;

    for (; entry != NULL; entry = following) {
        /* Save the link before the node is released. */
        following = entry->next;

        /* Poison the node so that a later access to freed memory is easier
           to spot while debugging. */
        entry->next = NULL;
        entry->freq = 0;
        entry->hash = 0;

        free(entry);
    }
}

/* Releases every chain and the slot array of *ht, then marks the table as
 * empty (size 0, entry NULL). A NULL ht is ignored. */
void hash_table_free(struct hash_table *ht)
{
    size_t slot;

    if (ht == NULL)
        return;

    for (slot = 0; slot < ht->size; slot++) {
        struct hash_entry *chain = ht->entry[slot];

        if (chain != NULL)
            hash_entry_free(chain);
    }

    free(ht->entry);

    ht->entry = NULL;
    ht->size = 0;
}

/* Rebuilds the table with new_size slots, re-linking the existing entries.
 *
 * No entry is copied or re-hashed: each entry keeps its full hash value, so
 * only its slot index (hash % new_size) is recomputed.
 *
 * Returns 0 on success, -1 for invalid parameters, -2 when the table has
 * already been freed (or was never created), and -3 when the new slot array
 * cannot be allocated; on failure the table is left untouched. */
int hash_table_resize(struct hash_table *ht, size_t new_size)
{
    struct hash_entry **entry;
    struct hash_entry  *curr, *next;
    size_t  i, k;

    if (!ht || new_size < 1)
        return -1; /* Invalid parameters */

    if (ht->size < 1 || !ht->entry)
        return -2; /* Hash table is already freed */

    entry = malloc(new_size * sizeof entry[0]);
    if (!entry)
        return -3; /* Not enough memory */

    for (i = 0; i < new_size; i++)
        entry[i] = NULL;

    for (i = 0; i < ht->size; i++) {

        /* Chain in the old hash table entry */
        curr = ht->entry[i];

        /* We are paranoid, and clear the old entry. */
        ht->entry[i] = NULL;

        while (curr) {
            /* Remember next hash in this chain */
            next = curr->next;

            /* Index to the new hash table */
            k = curr->hash % new_size;

            /* Prepend in front of the new hash table entry chain */
            curr->next = entry[k];
            entry[k] = curr;

            /* Advance to the next entry in the old chain */
            curr = next;
        }
    } /* end of the loop over the old slots (this brace was missing) */

    /* The old table is now useless. Free it, and use the new one. */
    free(ht->entry);
    ht->entry = entry;
    ht->size = new_size;

    return 0; /* Success; no errors. */
}

关于哈希函数,我喜欢djb2 xor hash

/* djb2 (xor variant) hash of a NUL-terminated string.
 *
 * Returns 0 for a NULL pointer and 5381 for the empty string. */
size_t hash(const char *s)
{
    size_t acc;

    if (s == NULL)
        return 0;

    /* Start from the djb2 seed and fold in one character per step. */
    for (acc = 5381; *s != '\0'; s++)
        acc = (acc * 33) ^ (*s);

    return acc;
}

/* djb2 (xor variant) hash of the first `len` characters of `s`.
 *
 * Returns 0 for a NULL pointer and 5381 when len is zero. */
size_t hash_len(const char *s, const size_t len)
{
    size_t acc = 5381;
    size_t pos;

    if (s == NULL)
        return 0;

    for (pos = 0; pos < len; pos++)
        acc = (acc * 33) ^ (s[pos]);

    return acc;
}

我还将一个字符串/单词添加到哈希表中分成两个函数:第一个函数创建struct hash_entry并将源字复制到其中,第二个函数使用第一个函数创建该条目,然后将其添加到哈希表中:

/* Allocates a stand-alone entry holding a copy of the first `len` characters
 * of `src`, with frequency 1, no successor, and its hash already computed.
 *
 * Returns NULL when src is NULL while len is positive, or when memory is
 * exhausted. The caller owns the returned entry. */
struct hash_entry *hash_entry_new_len(const char *src, size_t len)
{
    struct hash_entry *fresh;

    /* A NULL source is acceptable only for an empty word. */
    if (src == NULL && len != 0)
        return NULL;

    /* The flexible array member needs room for the word plus its '\0'. */
    fresh = malloc(sizeof *fresh + len + 1);
    if (fresh == NULL)
        return NULL;

    fresh->next = NULL;
    fresh->freq = 1;
    fresh->hash = hash_len(src, len);

    if (len != 0)
        memcpy(fresh->word, src, len);
    fresh->word[len] = '\0';

    return fresh;
}

/* Convenience wrapper: builds an entry from a whole NUL-terminated string.
 * A NULL src is treated as a word of length 0. */
struct hash_entry *hash_entry_new(const char *src)
{
    size_t len = 0;

    if (src != NULL)
        len = strlen(src);

    return hash_entry_new_len(src, len);
}

/* Adds a new entry for the first `len` characters of `src` to the table.
 *
 * No lookup is performed: the entry is always created and prepended to the
 * chain of its slot, so callers that want unique words must search first.
 *
 * Returns the new entry, or NULL when there is no table, when src is NULL
 * while len is positive, or when memory is exhausted. */
struct hash_entry *hash_table_add_part(struct hash_table *ht, const char *src, const size_t len)
{
    struct hash_entry *h;
    size_t             k;

    if (!ht || ht->size < 1)
        return NULL;  /* No hash table! */

    /* We'll check src and len, so we report the right error. */
    if (!src && len > 0)
        return NULL;  /* Invalid src (NULL)! */

    /* The length-taking constructor is required here; hash_entry_new()
       accepts only a whole string. */
    h = hash_entry_new_len(src, len);
    if (!h)
        return NULL;  /* Must be out of memory! */

    /* Index into the hash table */
    k = h->hash % ht->size;

    /* Prepend new hash table entry to the beginning of the chain. */
    h->next = ht->entry[k];
    ht->entry[k] = h;

    /* Success! */
    return h;
}

/* Convenience wrapper around hash_table_add_part() for callers that want to
   add a copy of an entire NUL-terminated string. A NULL src is treated as a
   word of length 0. */
struct hash_entry *hash_table_add(struct hash_table *ht, const char *src)
{
    size_t len = 0;

    if (src != NULL)
        len = strlen(src);

    return hash_table_add_part(ht, src, len);
}

len = (src) ? strlen(src) : 0表达式是if (src != NULL) len = strlen(src); else len = 0;的简写。我使用它很多,作为检查字符串长度的安全方法,或者如果字符串为空或NULL则使用0。

另请注意,NULL字符串的散列值为0,而空字符串的散列值为5381。这并不重要,我只是喜欢指出这类细节(或者像人们说的那样,过于吹毛求疵、满口空话)。

请注意,我的“常规”函数以_len()结尾,而名称相同但没有_len()后缀的函数是使用整个字符串的辅助函数。如果您不使用strtok()拆分字符串,而是使用例如strspn() / strcspn()甚至正则表达式在每个字符串中查找感兴趣的单词,这种设计就很有用。

要在哈希表中查找特定单词,我们需要将原始字符串与之比较;哈希本身是不够的:

/* Looks up the word formed by the first `len` characters of `src`.
 *
 * Returns the matching entry, or NULL when the word is not stored, when the
 * table is NULL, empty or already freed, or when src is NULL while len is
 * positive. */
struct hash_entry *hash_table_find_len(struct hash_table *ht, const char *src, const size_t len)
{
    size_t             hashval;
    struct hash_entry *curr;

    /* Without slots there is nothing to search, and `% ht->size` below
       would divide by zero. */
    if (!ht || ht->size < 1 || !ht->entry)
        return NULL;

    if (!src && len > 0)
        return NULL;

    hashval = hash_len(src, len);

    /* Walk the chain (singly-linked list) of the slot this word maps to. */
    for (curr = ht->entry[hashval % ht->size]; curr; curr = curr->next) {

        /* The complete hash is stored in each entry, so words that share
           the slot index but differ in the full hash are rejected without
           a string comparison. */
        if (curr->hash != hashval)
            continue;

        if (len == 0) {
            /* src may be NULL here, so it must not be passed to strncmp(). */
            if (curr->word[0] == '\0')
                return curr; /* Match found! */
        } else if (!strncmp(curr->word, src, len) && curr->word[len] == '\0') {
            return curr; /* Match found! */
        }
    }

    /* Nope, no match. */
    return NULL;
}

/* Convenience wrapper: looks up a whole NUL-terminated string.
 * A NULL src is treated as a word of length 0. */
struct hash_entry *hash_table_find(struct hash_table *ht, const char *src)
{
    size_t len = 0;

    if (src != NULL)
        len = strlen(src);

    return hash_table_find_len(ht, src, len);
}

计算单词的频率现在很简单:

/* Records one occurrence of the word formed by the first `len` characters
 * of `src`: increments its frequency when it is already stored, otherwise
 * adds it with a frequency of 1.
 *
 * Returns 0 on success and -1 on invalid parameters or when the word could
 * not be added. */
int hash_table_seen_len(struct hash_table *ht, const char *src, const size_t len)
{
    struct hash_entry *h;

    /* Sanity checks first. */
    if (!ht || (!src && len > 0))
        return -1; /* Invalid parameters! */

    h = hash_table_find_len(ht, src, len);
    if (h) {
        /* Found match; increment freq counter. */
        h->freq++;
        /* All done. */
        return 0;
    }

    /* Not found. Add to hash table. The length-taking add function is
       named hash_table_add_part(). */
    h = hash_table_add_part(ht, src, len);
    if (!h) {
        /* An error occurred; should be "out of memory",
           since we checked the other causes earlier
           in this function. */
        return -1;
    }

    /* The word was added to the hash table.
       Since its freq count is 1, we do not need
       to increment it; we're done. */
    return 0;
}

/* Convenience wrapper: records one occurrence of a whole NUL-terminated
 * string. A NULL src is treated as a word of length 0. */
int hash_table_seen(struct hash_table *ht, const char *src)
{
    size_t len = 0;

    if (src != NULL)
        len = strlen(src);

    return hash_table_seen_len(ht, src, len);
}

如果要按频率顺序打印哈希表条目,我会使用两个辅助函数:一个用于查找最大频率,另一个用于查找小于给定频率的最大频率:

/* Returns the largest frequency stored in the table, or 0 when the table is
 * NULL, has no slots, or holds no entries. */
size_t hash_table_max_freq(struct hash_table *ht)
{
    size_t best = 0;
    size_t slot;

    if (ht == NULL || ht->size == 0)
        return 0;

    for (slot = 0; slot < ht->size; slot++) {
        struct hash_entry *e;

        for (e = ht->entry[slot]; e != NULL; e = e->next) {
            if (best < e->freq)
                best = e->freq;
        }
    }

    return best;
}

/* Returns the largest stored frequency that is strictly smaller than
 * max_freq, or 0 when there is none (or the table is NULL or has no slots). */
size_t hash_table_next_freq(struct hash_table *ht, const size_t max_freq)
{
    size_t best = 0;
    size_t slot;

    if (ht == NULL || ht->size == 0)
        return 0;

    for (slot = 0; slot < ht->size; slot++) {
        struct hash_entry *e;

        for (e = ht->entry[slot]; e != NULL; e = e->next) {
            /* Candidate: below the limit and better than the best so far. */
            if (e->freq < max_freq && best < e->freq)
                best = e->freq;
        }
    }

    return best;
}

最后,我们可以“借鉴”qsort()的接口风格,用于遍历所有单词,或所有具有特定频率的单词:

/* Calls func(entry, user_data) for every entry in the table.
 *
 * Iteration stops as soon as func returns a nonzero value, which is then
 * returned to the caller. Returns 0 after visiting every entry and -1 when
 * ht or func is NULL. */
int hash_table_for_all(struct hash_table *ht,
                       int (*func)(struct hash_entry *, void *),
                       void *user_data)
{
    size_t slot;

    if (ht == NULL || func == NULL)
        return -1;

    for (slot = 0; slot < ht->size; slot++) {
        struct hash_entry *e;

        for (e = ht->entry[slot]; e != NULL; e = e->next) {
            int rc = func(e, user_data);

            if (rc != 0)
                return rc;
        }
    }

    return 0;
}

/* Calls func(entry, user_data) for every entry whose frequency equals freq.
 *
 * Iteration stops as soon as func returns a nonzero value, which is then
 * returned to the caller. Returns 0 after visiting every entry and -1 when
 * ht or func is NULL. */
int hash_table_for_freq(struct hash_table *ht, const size_t freq,
                       int (*func)(struct hash_entry *, void *),
                       void *user_data)
{
    size_t slot;

    if (ht == NULL || func == NULL)
        return -1;

    for (slot = 0; slot < ht->size; slot++) {
        struct hash_entry *e;

        for (e = ht->entry[slot]; e != NULL; e = e->next) {
            int rc;

            if (e->freq != freq)
                continue; /* not the frequency we were asked for */

            rc = func(e, user_data);
            if (rc != 0)
                return rc;
        }
    }

    return 0;
}

以上代码均未经过编译测试,因此如果您发现任何拼写错误(或遇到编译时错误),请在评论中告诉我,以便我验证并修复。

答案 1 :(得分:0)

问题在于你存入node.palavra的内容。顺着调用链看,inserirHashTable的输入是processarArquivo中strtok调用的结果。麻烦就出在这里:该指针指向输入数组内部,而输入数组line是函数processarArquivo的局部变量。一旦从文件读入下一行,或者该函数返回,这个指针就失效了。这正是构造哈希表时(line仍在processarArquivo的作用域内)看起来工作正常,而该函数返回之后(回到main并继续调用print_freq时)出现未定义行为的原因。

现在,inserirHashTable还有2点需要改进:

  • 请注意你是如何调用malloc的:你只分配了sizeof(char *)字节来保存新的HASH_TABLE结构。为什么?
  • 更重要的是,你只是把指针复制为palavra字段的值。你应该在这里复制字符串本身。

附带的问题还有:print_freq中未初始化的变量i、多余的auxTable,以及你假设散列值是唯一的——这是错误的(怎么可能唯一?不同的单词远不止1001个!)。

答案 2 :(得分:0)

我做了你的练习。

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

//hash table size
#define SIZE 1000
//maximum size of the input string
#define STR_LEN 100

// Node of a singly linked list: one distinct string and its counter.
typedef struct linked_list {
    unsigned char value[STR_LEN]; // the stored string, copied in by add_to_table
    unsigned int cnt;   // how many times the string has been added
    struct linked_list *next; // next node in the same table slot, or NULL
} linked_list;

// Forward declarations of the functions defined below main.
void print_items(int freq); 
void print_freq(int freq);
void print_table();
int add_to_table(unsigned char *str); 
unsigned long get_hash(unsigned char *str);

// The hash table: SIZE list heads. As a file-scope object it starts out
// zero-initialised, i.e. every slot is an empty list.
linked_list* table[SIZE];

// Reads strings from standard input, one per line, adds each of them to the
// hash table and, at end of input, prints the strings that occurred twice.
int main() {
    unsigned char str[STR_LEN];
    //the fgets function allows the spaces in the string.
    //so, strings like: 'red green blue' are possible.
    while(fgets((char *)str, STR_LEN, stdin)) {
        //cut the string at the newline kept by fgets. This is done in place
        //with strcspn: calling sscanf with the same buffer as input and
        //output is undefined behaviour, and it left the '\n' untouched on
        //empty lines.
        str[strcspn((char *)str, "\n")] = '\0';

        add_to_table(str);
    }

    print_freq(2);

    return 0;
}

// Prints the heading "Whole table" followed by every item of the hash table
// (print_items with a frequency filter of 0 prints all items).
void print_table(){
    puts("Whole table");
    print_items(0);
}

// Prints a heading naming the requested frequency, then the items whose
// counter equals 'freq' (delegated to print_items).
void print_freq(int freq){
    printf("Words with frequency: %d\n", freq);
    print_items(freq);
}

// Prints the table as "value | frequency" rows between two ruler lines.
// freq == 0   : every item of the hash table is printed.
// freq != 0   : only the items whose counter equals freq are printed.
void print_items(int freq) {
    int slot;

    puts("----------------------");
    puts("Values\t|  Frequency");
    for(slot = 0; slot < SIZE; slot++) {
        linked_list *cur;

        //walk the linked list that hangs off this slot
        for(cur = table[slot]; cur != NULL; cur = cur->next) {
            if(freq == 0 || freq == cur->cnt)
                printf("%s\t|  %d\n", cur->value, cur->cnt);
        }
    }
    puts("----------------------");
}

//Collision processing implemented by the linked list.
//In the beginning, all items of the hash table are NULL.
//
//The first string with specific hash goes to the corresponding 'table' array index
//in the linked list's node form.
//
//Then, if the collision occurs (different string with the same hash)
//the new node shifts the previous node and linking to it 
int add_to_table(unsigned char *str) {
        unsigned long hash;
        hash = get_hash(str);
        hash = hash & SIZE;

        linked_list *node = table[hash];

        while(node) {
            if(strcmp(str, node->value) == 0) {
                node->cnt++;    
                return 1;
            }
            node = node->next;
        }

        //if the item does not exist (wasn't found in the previous while loop),
        //the new node is created
        linked_list *new_node = malloc(sizeof(linked_list));
        //the new node 'next' field is pointing to the previous 
        //first node (head) of the linked list
        new_node->next = table[hash];
        new_node->cnt = 1;
        strcpy(new_node->value, str);

        //and the new node became the new 'head'
        table[hash] = new_node; 

        print_table();

        return 1;
}

// Hashes a NUL-terminated string with the 'djb2' algorithm:
// start from 5381 and, for every byte, multiply by 33 and add the byte.
// The empty string therefore hashes to 5381.
unsigned long get_hash(unsigned char *str)
{
    unsigned long acc = 5381;
    const unsigned char *p;

    for (p = str; *p != 0; p++) {
        int c = *p;
        acc = acc * 33 + c;
    }

    return acc;
}

**测试**

one                          <--- input "one"
Whole table                  <--- output 
----------------------            the whole table is printed each time,
Values  |  Frequency              the new string is added
one     |  1
----------------------
two                          <--- input "two"
Whole table
----------------------
Values  |  Frequency
two     |  1
one     |  1
----------------------
one                          <--- input - no output, because the hash table 
                                  already have this string
two                          <--- input - the same
three                        <--- input - the output is appearing now,
                                  'three' didn't inputted before
Whole table
----------------------
Values  |  Frequency
two     |  2
three   |  1
one     |  2
----------------------
Words with frequency: 2      <--- I am send EOF (end of file) to the
                                  program by Ctrl-D 
----------------------
Values  |  Frequency          
two     |  2
one     |  2
----------------------