Question

我有一个程序，需要读取一个巨大的文本文件，文件中每一行的格式如下；我需要根据这个文本文件构造一个数据结构。

microfinance 5 41 5 1650 2 1667 1 1811 1 1988 5 
subminiature 1 432 1

单词后面的第一个数字是包含该单词的文档数。之后的数字在文档ID和该单词在该文档中的出现次数之间交替。因此，对于microfinance，共有5个文档：第一个是文档41，出现5次；其次是文档1650，出现2次；依此类推。

我使用strtok获取每个元素并组织它们。我知道strtok工作正常。问题是将元素正确地附加到我的数据结构中。

// Reads the index file line by line and tokenizes each line on spaces.
// `counter` is the position of the current token within the line:
//   0        -> the word itself
//   1        -> number of documents containing the word
//   2,4,6... -> a document ID
//   3,5,7... -> occurrence count for the preceding document ID
DocumentNode *myDoc;
while (fgets(theLine, sizeof(theLine), newPointer) != NULL)
    {
        counter = 0;
        pch = strtok (theLine," ");
        while (pch != NULL)
        {
         // Token 0: allocate a WordNode for the word and hook it into the hash slot.
         if (0 == counter)
         {
            // NOTE(review): malloc does not zero memory and only `word` and `next`
            // are assigned below, so toInsertPtr->page is left uninitialized.
            WordNode *toInsertPtr = (malloc(sizeof(struct WordNode)));
            word = (malloc(100));
            // NOTE(review): copying exactly strlen(pch) bytes does not copy the
            // terminating '\0', and the malloc'd buffer is not zeroed, so `word`
            // is not guaranteed to be a terminated string (and tokens of 100+
            // bytes would overflow it).
            strncpy (word, pch, strlen(pch));
            toInsertPtr->word = word;
            toInsertPtr->next = NULL;

            // presumably JenkinsHash reads `word` as a C string — confirm
            currIndex = JenkinsHash(word, MAX_HASH_SLOT);
            if ((TheIndex->index[currIndex]) == NULL)
            {
                TheIndex->index[currIndex] = toInsertPtr;
            }
            else 
            {
                // NOTE(review): this replaces the head's `next` instead of walking
                // to the end of the chain, so a node previously linked there is
                // dropped. There is also no check for the word already being present.
                TheIndex->index[currIndex]->next = toInsertPtr;
            }   
         }

         // Token 1: document count; not read anywhere else in this fragment.
         if (1 == counter)
         {
            numOfDocs = atoi(pch);
         }

         // Even tokens after the word: start a new DocumentNode with this document ID.
         // (pch != NULL is already guaranteed by the enclosing while condition.)
         if (counter % 2 == 0 && counter != 0 && pch != NULL)
         {
            myDoc= (malloc(sizeof(struct DocumentNode)));
            myDoc->next = NULL;
            int doc_id = atoi(pch);
            myDoc->documentID = doc_id;         
         }

         // Odd tokens after the count: complete the DocumentNode and attach it.
         if (counter % 2 != 0 && counter != 1 && pch != NULL)
         {
            myDoc->occurences = atoi(pch);

            // NOTE(review): this tests and updates the HEAD WordNode of the slot,
            // not the WordNode created for this line; on a collision the two differ.
            // `page` is also never initialized when the WordNode is allocated, so
            // this comparison reads an indeterminate value.
            if (TheIndex->index[currIndex]->page == NULL)
            {
                TheIndex->index[currIndex]->page = myDoc;
            }
            else
            {
                // NOTE(review): overwrites the second list element each time rather
                // than appending at the tail, so at most two nodes stay reachable.
                TheIndex->index[currIndex]->page->next = myDoc;
            }
         }
          pch = strtok (NULL, " ");
          counter++;
        }
    }

我已经用GDB调试过，确定问题就出在这里。用来检查索引中是否已有文档节点的第一个if语句总是判定为NULL（即使索引中的那个位置上明显有内容），于是它一次又一次地覆盖同一个槽。为什么明明不是NULL，它却总认为是NULL呢？

        // Attach myDoc to the document list of the WordNode at the head of slot
        // currIndex: it becomes the first element when `page` is NULL.
        if (TheIndex->index[currIndex]->page == NULL)
        {
            TheIndex->index[currIndex]->page = myDoc;
        }
        else
        {
            // NOTE(review): assigns the first element's `next` directly, so any
            // node already linked there is replaced instead of appended after.
            TheIndex->index[currIndex]->page->next = myDoc;
        }

数据结构如下：

// One entry in a word's document list: a document and the word's count in it.
typedef struct DocumentNode {
    struct DocumentNode *next;      // pointer to next member of the list.
    int documentID;                 // doc identifier (filename, ie. 1, 2, etc.)
    int occurences;                 // number of occurrences of the word in this document.
} DocumentNode;

// One word stored in the index, together with the list of documents containing it.
typedef struct WordNode {                
    struct WordNode *next;           // pointer to the next word in the same hash slot (for collisions)
    char *word;                      // the word itself.
    DocumentNode *page;              // pointer to the first element of the page list.
} WordNode;

// The inverted index: an array of MAX_HASH_SLOT slots, each pointing to the
// first WordNode of that slot's chain (NULL-checked by the parsing code).
typedef struct InvertedIndex {
    WordNode *index[MAX_HASH_SLOT];   
} InvertedIndex;

Answer 1

你的方法过于复杂：循环试图维持状态，并有一系列条件决定如何处理下一个标记。

不要每次只调用一次strtok再靠状态来判断，而是先调用第一次得到单词，第二次得到文档计数，然后成对地处理其余的标记。代码大致如下：

// Parse one index line per iteration: "<word> <docCount> <docID> <count> ...".
// The word and the count are read first; the remaining tokens are consumed in
// (docID, count) pairs, so no per-token state machine is needed.
while (fgets(theLine, sizeof(theLine), newPointer) != NULL) {
    // fgets keeps the trailing newline, so treat '\n' as a delimiter too;
    // otherwise it stays glued to the last token of the line.
    pch = strtok(theLine, " \n");
    if (pch == NULL) {
        continue;                           // blank line: no token at all
    }
    char *word = malloc(strlen(pch) + 1);   // +1 for the terminating '\0'
    if (word == NULL) {
        break;                              // out of memory: stop reading
    }
    strcpy(word, pch);
    ... // Add the word
    pch = strtok(NULL, " \n");
    if (pch == NULL) {
        continue;                           // word without a document count
    }
    int pairCount = atoi(pch);
    // `<` instead of `!=` so a negative count simply skips the loop.
    for (int i = 0; i < pairCount; i++) {
         pch = strtok(NULL, " \n");
         if (pch == NULL) {
             break;                         // fewer pairs than announced
         }
         int id = atoi(pch);
         pch = strtok(NULL, " \n");
         if (pch == NULL) {
             break;                         // document ID without a count
         }
         int count = atoi(pch);
         ... // Add the document
    }
}

附注：如果你能很好地理解这种方法，你可能会喜欢Edsger Dijkstra讲的这个故事。

在C中重建索引

1 个答案: