HashTable排名前20位的想法

时间:2012-03-25 22:58:22

标签: c hashtable

我在思考解决问题的想法时遇到了一些麻烦。 我有一个单词计数程序,它使用哈希表进行计数 任意数量的文件中的所有单词,并且只打印单词 在所有文件中,以及它们的计数。我还存储了所有用过的东西 链表中的哈希索引。

解决了我自己的问题,我知道答案很简单。我只想出了一个具有最低计数的那个,如果我的新值大于那个,但它在二十个单词结构的数组中最低计数之一的索引处。

感谢所有人的帮助!

     #include <unistd.h>
     #include <stdio.h>
     #include <stdlib.h>
     #include <sys/types.h>
     #include <sys/stat.h>
     #include <fcntl.h>
     #include <pthread.h>
     #include <string.h>


     /*Structures*///////////////////////////////////////////
     //A struct to hold the words in the hash tables and their
     //counts
     struct counter{
        int count;
        int printed;
        char word[51]; 
        int allfiles[101]; 
        struct counter * next; 
     };

     //A struct to hold the hash indexes of the already visited 
     //index, for easy printing 
     struct index{
        int used; 
        struct index * next; 
     };

     //A simple struct to pass arguments to the work function for 
     //threading
     struct arguments{
        void * id;
        int fileid; 
     };
     ////////////////////////////////////////////////////////

     /*Functions*////////////////////////////////////////////
     static int hash(char * word); 
     static void inHash(struct counter * newWord, int hash, int FILEID); 
     static void indexchain(int hash);
     //static void hashprint(int NUMFILES); 
     static void * work(struct arguments *);
     static void toptwenty(int NUMFILES);
     static void print();  
     ////////////////////////////////////////////////////////

     /*Global Variables*/////////////////////////////////////
     struct counter * top[20] = {0}; 
     struct counter * hashtable[6222] = {0}; 
     struct index * head;
     ////////////////////////////////////////////////////////

     int main(int argc, char * argv[])
     {

      //check for valid number of arguments 
      if(argc < 2)
      {
       fprintf(stderr, "invalid number of arguments"); 
       return 1; 
      }


      //set up index chain starts with a null node
      head = malloc(sizeof(struct index)); 
      head->next = NULL;  
      head->used = -1; 

      //loop through any number of files
      int arg;
      for(arg = 1; arg < argc; arg++) 
      {
        struct arguments * argum = malloc(sizeof(struct arguments)); 
        argum->fileid = arg; 
        argum->id = ((void*)argv[arg]); 
        work(argum); 
      }

      //hashprint(argc);
      toptwenty(argc); 
      print();
      return 0; 
     }


     /*Function Definitions*/
     //this function takes a file name and counts 
     //the words in the file
     static void * work(struct arguments * argum)
     {
        int FILEID = argum->fileid; 
        void * in = argum->id; 
        int fd = open((char*)in, O_RDONLY);
        if (fd == -1)
        {
         fprintf(stderr, "can't open %s for reading!\n", (char*)in);
         exit(-1);
        }
        int BUFLEN = (int) lseek(fd, 0, SEEK_END); 
        lseek(fd, 0, 0); 
        //A few variable
        char buf[BUFLEN + 1];
        int lastRead;

        lastRead = read(fd, buf, BUFLEN);
        if (lastRead == -1)
        {
          fprintf(stderr, "error reading file %s!\n", (char*)in);
          exit(-1);
        }

        //Parse the filebuffer for words. 
        char newword[51];
        int c;
        int curindex = 0; 

        buf[BUFLEN + 1] = ' '; 
        //not doing the last space because it is eof
        for(c = 0; c < BUFLEN + 1; c++)
        {
         if((buf[c] >= 'A' && buf[c] <= 'Z'))
         {
            buf[c] += 32; 
         }
         if(buf[c] >= 'a' && buf[c] <= 'z')
         {
           //add the next char to the string. 
           newword[curindex] = buf[c]; 
           curindex++; 
         }
         else
         {
           //make a new struct for the entry, and add it to the hashtable
           //add its hash to the 
           if(strlen(newword) >= 6)
           {
             struct counter * temp = malloc(sizeof(struct counter));
             strcpy(temp->word,newword);
             int thishash = hash(temp->word);

             //Only save hash indexes if they are in the first files
             if(FILEID == 1)
             {       
               indexchain(thishash);
             }
             inHash(temp, thishash, FILEID);          
            }

            int wordlength = strlen(newword);
            int i; 
            for(i = 0;i < wordlength; i++)
            {
             newword[i] = 0; 
            }
            curindex = 0; 
         }


        } 
        close(fd); 
        return in; 

     }

     //Bad hash function by just adding ascii values of the 
     //characters  
     static int hash(char * word)
     {
      int loop = strlen(word);
      int i;
      int hashval = 0;   
      for(i = 0; i < loop; i++)
        hashval += word[i];

      return hashval; 
     }

     //add a new word to the hash table
     static void inHash(struct counter * newWord, int hash, int FILEID)
     {
      int eflag = 0; 
      if(hashtable[hash] == NULL)
      {
        //if the entry isnt in the table
        if(FILEID == 1)
        {
          newWord->allfiles[FILEID] = 1;                      /*FILEID ARRAY TEST*/
          newWord->count = 1; 
          newWord->next = NULL;
          hashtable[hash] = newWord; 
        }
      }
      else
      {
        //if its not, but what if it is?
        struct counter * cur = hashtable[hash];
        if(strcmp(cur->word, newWord->word) == 0)
        {
            //is the word in the first slot? 
            cur->count += 1;
            cur->allfiles[FILEID] = 1;                     /*FILEID ARRAY TEST*/
            eflag = 1; 
        }
        else
        {
          while(cur->next != NULL)
          {
            cur = cur->next; 
            if(strcmp(cur->word, newWord->word) == 0)
            {
              //if the word already exsists, update the count
              cur->allfiles[FILEID] = 1;                            /*FILEID ARRAY TEST*/
              cur->count += 1;
              eflag = 1; 
              break; 
            }

          }
        }

        //if its not in any bucket, make a new bucket
        if(eflag == 0)
        {
          //Else add the new entry to the end of that list
          if(FILEID == 1)
          {
            newWord->allfiles[FILEID] = 1;                            /*FILEID ARRAY TEST*/
            newWord->count = 1; 
            newWord->next = NULL;
            cur->next = newWord;
          }
        }   
      }
     } 

     //adding a value to the linked list for printing
     static void indexchain(int hash)
     {  
       struct index * p = head;
       int eflag = 0; 
       while(p->next != NULL)
       {
         if(p->used != hash)
           p = p->next;
         else
         {
          eflag = 1; 
          break;
         }
       }
       if(eflag == 0)
       {
         struct index * newValue = malloc(sizeof(struct index));
         newValue->used = hash;
         newValue->next = NULL;
         p->next = newValue; 
       }

     }
     /*
     //This function will print the values in the hash tables and their counts
     //Prints based on number of files to check if words are in all files
     static void hashprint(int NUMFILES)
     {
       struct index * p;
       p = head->next; 
       int hash; 
       int i; 
       int printbool = 1; 
       while(p != NULL)
       {
         hash = p->used; 
         struct counter * ptr = hashtable[hash];

         while(ptr != NULL)
         {

           if(ptr->printed == 0)
           {
             for(i = 1; i < NUMFILES; i++)
             {
               if(ptr->allfiles[i] == 0)
               {
                 printbool = 0; 
                 break;
               }
               else
                 printbool = 1; 
             }    
             if(printbool == 1)
             {
               ptr->printed = 1; 
               printf("%s    %d\n", ptr->word, ptr->count);
             }        
           } 
           ptr = ptr->next;        
          }
          p = p->next; 
       }


     }
     */
     //A function to see which numbers have the top twenty highest count
     static void toptwenty(int NUMFILES)
     {
       struct index * p;
       p = head->next; 
       int hash; 
       int i; 
       int printbool = 1; 
       while(p != NULL)
       {
         hash = p->used; 
         struct counter * ptr = hashtable[hash];

         while(ptr != NULL)
         {

           if(ptr->printed == 0)
           {
             for(i = 1; i < NUMFILES; i++)
             {
               if(ptr->allfiles[i] == 0)
               {
                 printbool = 0; 
                 break;
               }
               else
                 printbool = 1; 
             }    
             if(printbool == 1)
             {
               for(i = 0; i < 20; i++)
               {
                 if(top[i] == NULL)
                 {
                   top[i] = ptr; 
                   break;
                 }
                 else if(ptr->count > top[i]->count)
                 {
                   top[i] = ptr;
                   break;              
                 }

               }
             }        
           } 
           ptr = ptr->next;        
          }
          p = p->next; 
       }
     }
     //print the top 20 count
     static void print()
    {
     int i;
     for(i = 0; i < 20; i++)
     {
       if(top[i] != NULL)
       {
         if(top[i]->printed == 0)
         {
           //printf("%s\n", top[i]->word);
           printf("%s    %d\n", top[i]->word, top[i]->count);
           top[i]->printed = 1; 
         }
       }
       else
         break; 
     }
    } 

1 个答案:

答案 0 :(得分:0)

创建一个优先级队列,该队列包含具有最高计数及其相应计数的20个哈希索引。

当您计算最低值时,如果您的新单词胜过,则将其从队列顶部删除,并将其从队列O(1)中删除,并将新值添加到仅O(log(n))的队列中O(log(20)) {1}}。