Question

我正在尝试将我制作的文本文件读入链表，文本文件如下所示：

 around 1 2 1
 bread 2 4 3 5 1
 four 1 3 2
 head 3 1 2 2 1 5 1
 has 2 3 1 5 2

每行的第一个字符串只是段落中的单词。单词后面的第一个数字是段落中找到单词的行数。然后，以下数字是段落中的（行，出现次数）对。

例如，对于单词bread：

它出现在段落的2行中：在第4行出现了3次，在第5行出现了1次。

我正在尝试根据此文本文件创建链表，到目前为止我的程序看起来像这样：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <ctype.h>

#define MAXWORD 999

/* Forward typedef so the struct can refer to itself as node_t. */
typedef struct node node_t;

/* One element of a singly linked list of words. */
struct node {
    char *word;       /* pointer to the word text; the struct itself owns no storage for it */
    int num_lines;    /* presumably the number of lines the word occurs on -- TODO confirm */
    int paragraph;    /* presumably a line number within the paragraph -- TODO confirm */
    int freq;         /* presumably the occurrence count on that line -- TODO confirm */
    node_t *next;     /* next element of the list */
};

int
main(int argc, char *argv[]) {
    FILE *fp;
    char word[MAXWORD+1];
    int ch, line_count = 0, len = 0;
    node_t *node = (node_t*)malloc(sizeof(*node));
    node_t *curr, *prev;

    fp = fopen(argv[1], "r");

    if (fp == NULL) {
        fprintf(stderr, "Error reading file\n");
        exit(EXIT_FAILURE);
    }

    /* Just trying to store the string so far */
    while ((ch = getc(fp)) != EOF) {
        if (ch == '\n') {
            line_count++;
            strcpy(node->word, word);
        }

        if (isalpha(ch)) {
            word[len] = ch;
            len++;
            word[len] = '\0';
        } 

        if (isdigit(ch)) {
            len = 0;
        }
    }

    printf("line count = %d", line_count);

    free(node)

    fclose(fp);

    return 0;
}

在这个片段中，我一直在尝试将字符串存储在链表数据结构中，但我还没有使用动态数组来存储文本文件中出现的单词之后的数字。我知道我需要使用malloc()和realloc()构建此数据结构，但我不确定如何执行此操作。

我该怎么做？

我想要的输出如下：

There are five words in the text file, 
and 9 pairs of (line, occurrences)

Word: pairs
"around": 2,1
"bread": 4,3; 5,1
"four": 3,2
"head": 1,2; 2,1; 5,1
"has": 3,1; 5,2

更新

我一直在研究它，它似乎与倒排索引问题非常相似，我已经看到使用二叉搜索树是最好的。

我可以像这样实现我的二叉搜索树：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <ctype.h>

#define MAXWORD 999

/* Fixed-size character buffer: MAXWORD characters plus the terminating NUL. */
typedef char word_t[MAXWORD+1];

/* Forward typedef so the struct can refer to itself as node_t. */
typedef struct node node_t;

/* Node of a binary tree. */
struct node {
    void *data;      /* untyped payload pointer -- presumably the word; TODO confirm */
    int *ints;       /* pointer to int storage -- presumably the numbers that follow the word; TODO confirm */
    node_t *rght;    /* right child */
    node_t *left;    /* left child */
};

/* Tree handle: the root node together with a comparison callback. */
typedef struct {
    node_t *root;                 /* top of the tree */
    int (*cmp)(void*, void*);     /* takes two untyped pointers and returns int; presumably orders node data -- TODO confirm sign convention */
} tree_t;

/*
 * Open the file named by argv[1] and count its newline characters.
 *
 * Returns 0 on success.  Exits with EXIT_FAILURE when no file name is
 * given or the file cannot be opened.
 */
int
main(int argc, char *argv[]) {
    FILE *fp;
    int ch;               /* int, not char, so that EOF stays distinguishable */
    int line_count = 0;

    if (argc < 2) {
        fprintf(stderr, "Usage: %s filename\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    fp = fopen(argv[1], "r");

    if (fp == NULL) {
        fprintf(stderr, "Error reading file\n");
        exit(EXIT_FAILURE);
    }

    while ((ch = getc(fp)) != EOF) {
        if (ch == '\n') {
            line_count++;
        }
    }

    (void)line_count;     /* counted but not used yet */

    fclose(fp);

    return 0;
}

Answer 1

你可以这样做：

/* One record pairing a line position with a frequency. */
typedef struct {
    int paragraph;   /* line position of the record */
    int freq;        /* frequency stored for that line */
} stats_t;

/* Linked-list element holding a word and a pointer to its stats_t records. */
struct node {
    char *word;        /* pointer to the word text */
    int num_lines;     /* presumably the number of stats_t records -- TODO confirm */
    stats_t *stats;    /* pointer to an array of stats_t records */
    node_t *next;      /* next element of the list */
};

然后在解析字符串后，您可以执行以下操作：

/* Allocate a zero-initialized array of line_count stats_t records. */
ps = calloc(line_count, sizeof(stats_t));

获取指向stats_t结构数组的指针，您可以使用行位置和频率填充这些结构。然后，您可以将指针ps存储在node结构中。

Answer 2

我写了一个程序来完成我认为你正在寻找的东西。我修改了之前想到的结构：

/* Forward typedef; the struct keyword is required to name the tag in C. */
typedef struct node node_t;

/* Linked-list element: a word plus int arrays for its locations and frequencies. */
struct node {
    char *word;        /* pointer to the word text */
    int num_lines;     /* number of lines the word was found on */
    int *location;     /* pointer to an int array of location information */
    int *frequency;    /* pointer to an int array of frequency information */
    node_t *next;      /* next element of the list */
};

这样，节点包含指向int数组的指针，用于存储位置和频率信息。节点本身以及单词字符串、位置数组和频率数组的存储空间都是动态分配的。这是代码：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define MAXLINE  1000
#define MAXWORD    30

/* Forward typedef so the struct can refer to itself as node_t. */
typedef struct node node_t;

/* Linked-list element describing one distinct word. */
struct node {
    char *word;        /* heap-allocated copy of the word */
    int num_lines;     /* number of entries in location[] and frequency[] */
    int *location;     /* line numbers on which the word was found */
    int *frequency;    /* frequency[i] counts occurrences on line location[i] */
    node_t *next;      /* next element of the list, or NULL at the end */
};

/* Forward declarations for the helpers defined after main(). */
void strip(char *pln);                  /* overwrite newline characters with NUL */
void normalize_word(char *pstr);        /* lowercase and trim trailing non-alphanumerics */
struct node * update_word(char *pwd, int lnum, struct node *phead);  /* record one occurrence; returns the list head */
struct node * find_in_list(char *pwd, struct node *phead);           /* node holding pwd, or NULL */
int find_line_pair(int lnum, struct node *pwn);                      /* index of lnum in the node, or -1 */
int list_len(struct node *phead);       /* number of nodes in the list */
int num_pairs(struct node *phead);      /* sum of num_lines over the list */

/*
 * Build a word index for the text file named by argv[1] and print it.
 *
 * Each input line is split into tokens, every token is normalized with
 * normalize_word(), and non-empty results are recorded in a linked list
 * through update_word() together with the current line number.  The list
 * is then printed and all allocated memory is released.
 *
 * Returns 0 on success.  Exits with EXIT_FAILURE when no file name is
 * given or the file cannot be opened.
 */
int main(int argc, char *argv[])
{
    /* Split on tabs and carriage returns as well as spaces. */
    static const char delims[] = " \t\r";
    FILE *fp;
    struct node *head, *current;
    char *pword;
    char line[MAXLINE + 1];
    int i, n, line_count = 0;
    int at_line_start = 1;      /* false while continuing an over-long line */

    head = NULL;

    if (argc < 2) {
        fprintf(stderr, "Usage: %s filename\n", argv[0]);
        exit(EXIT_FAILURE);
    }
    if ((fp = fopen(argv[1], "r")) == NULL) {
        fprintf(stderr, "Unable to open file %s\n", argv[1]);
        exit(EXIT_FAILURE);
    }

    /* Read in lines and process words */
    while (fgets(line, sizeof line, fp) != NULL) {
        /* A line longer than the buffer arrives in several chunks; count
         * it once.  A word straddling a chunk boundary is still split. */
        if (at_line_start)
            ++line_count;
        at_line_start = (strchr(line, '\n') != NULL);

        strip(line);
        for (pword = strtok(line, delims);
             pword != NULL;
             pword = strtok(NULL, delims)) {
            normalize_word(pword);
            if (*pword != '\0')     // don't add empty words
                head = update_word(pword, line_count, head);
        }
    }

    /* Display list contents */
    printf("There are %d words in the text file,\n",
           list_len(head));
    printf("and %d pairs of (line, occurrences)\n",
           num_pairs(head));
    printf("Word: pairs\n");
    for (current = head; current != NULL; current = current->next) {
        n = current->num_lines;
        printf("%s:", current->word);
        for (i = 0; i < n; i++) {
            printf(" %d, %d;",
                   current->location[i], current->frequency[i]);
        }
        putchar('\n');
    }

    /* Cleanup */
    // close file
    if (fclose(fp) != 0)
        fprintf(stderr, "Error closing file %s\n", argv[1]);

    // free all allocated memory; read the next link before freeing a node
    while (head != NULL) {
        current = head->next;
        free(head->word);
        free(head->location);
        free(head->frequency);
        free(head);
        head = current;
    }

    return 0;
}

/* Remove trailing newlines */
/* Replace newline characters in the string with NUL, which ends the
 * string at its first newline. */
void strip(char *pln)
{
    char *cursor;

    for (cursor = pln; *cursor != '\0'; ++cursor) {
        if (*cursor == '\n') {
            *cursor = '\0';
        }
    }
}

/* Convert word to lowercase and remove trailing
 * non-alphanumeric characters                   */
/* Convert the word to lowercase in place, then remove trailing
 * non-alphanumeric characters by overwriting them with NUL.
 * A word made only of such characters becomes the empty string. */
void normalize_word(char *pstr)
{
    size_t len = 0;

    /* <ctype.h> functions need a value representable as unsigned char;
     * passing a negative plain char is undefined behavior. */
    for (; pstr[len] != '\0'; ++len)
        pstr[len] = (char)tolower((unsigned char)pstr[len]);

    while (len > 0 && !isalnum((unsigned char)pstr[len - 1]))
        pstr[--len] = '\0';
}

/* Update existing word node or create a new one, and return
 * a pointer to the head of the list */
/* Report an allocation failure and terminate the program. */
static void update_word_oom(void)
{
    fprintf(stderr, "Out of memory\n");
    exit(EXIT_FAILURE);
}

/* Record one occurrence of the word pwd on line lnum.
 *
 * If the word is already in the list, its count for lnum is incremented,
 * or a new (line, count) pair is appended when lnum is new for that word.
 * Otherwise a new node holding a copy of pwd is inserted at the front of
 * the list.
 *
 * Returns a pointer to the head of the list, which changes whenever a new
 * node is inserted.  Terminates the program if memory cannot be allocated. */
struct node * update_word(char *pwd, int lnum, struct node *phead)
{
    struct node *found, *newnode;
    char *pword;
    int *ploc, *pfreq;
    int index;

    /* Modify existing node if word is in list */
    found = find_in_list(pwd, phead);
    if (found != NULL) {
        index = find_line_pair(lnum, found);
        if (index != -1) {
            // word already seen on this line: update frequency
            found->frequency[index] += 1;
            return phead;
        }

        // add new (location, freq) pair
        index = found->num_lines;  // index for new pair

        /* Store each grown array as soon as its realloc succeeds, so the
         * node never holds a stale pointer. */
        ploc = realloc(found->location, ((size_t)index + 1) * sizeof *ploc);
        if (ploc == NULL)
            update_word_oom();
        found->location = ploc;

        pfreq = realloc(found->frequency, ((size_t)index + 1) * sizeof *pfreq);
        if (pfreq == NULL)
            update_word_oom();
        found->frequency = pfreq;

        ploc[index] = lnum;        // new location
        pfreq[index] = 1;          // found once in this line so far
        found->num_lines = index + 1;  // count the pair only once it exists
        return phead;
    }

    /* Set up a new node */
    newnode = malloc(sizeof *newnode);
    pword = malloc(strlen(pwd) + 1);
    ploc = malloc(sizeof *ploc);
    pfreq = malloc(sizeof *pfreq);
    if (newnode == NULL || pword == NULL || ploc == NULL || pfreq == NULL)
        update_word_oom();

    strcpy(pword, pwd);
    *ploc = lnum;               // location was passed by caller
    *pfreq = 1;                 // only one occurrence so far

    newnode->word = pword;
    newnode->num_lines = 1;     // only one line so far
    newnode->location = ploc;
    newnode->frequency = pfreq;

    /* Insert at the front; this also covers the empty list, where the
     * new node's next link becomes NULL. */
    newnode->next = phead;

    return newnode;
}

/* Return pointer to node containing word, or NULL */
/* Search the list starting at phead for a node whose word equals pwd.
 * Returns that node, or NULL when no node matches. */
struct node * find_in_list(char *pwd, struct node *phead)
{
    struct node *cursor;

    for (cursor = phead; cursor != NULL; cursor = cursor->next) {
        if (strcmp(cursor->word, pwd) == 0) {
            break;                  // match: cursor is the result
        }
    }

    return cursor;                  // NULL if the loop ran off the end
}

/* Return index of existing line location, or -1 */
/* Look for line number lnum among the locations stored in node pwn.
 * Returns the index of the matching entry, or -1 if there is none. */
int find_line_pair(int lnum, struct node *pwn)
{
    int i;

    for (i = 0; i < pwn->num_lines; i++) {
        if (pwn->location[i] == lnum) {
            return i;               // this line is already recorded
        }
    }

    return -1;                      // no entry for this line yet
}

/* Find number of nodes in linked list */
/* Count the nodes of the linked list that starts at phead. */
int list_len(struct node *phead)
{
    struct node *cursor;
    int count = 0;

    for (cursor = phead; cursor != NULL; cursor = cursor->next) {
        count++;
    }

    return count;
}

/* Find number of (line, occurrence) pairs */
/* Total the num_lines fields of every node in the list, giving the
 * overall number of (line, occurrence) pairs. */
int num_pairs(struct node *phead)
{
    struct node *cursor;
    int total = 0;

    for (cursor = phead; cursor != NULL; cursor = cursor->next) {
        total += cursor->num_lines;
    }

    return total;
}

注意：我在update_word()函数中对先前版本进行了修改。原始代码在列表的末尾插入新节点，因此结果列表中的单词按照它们在输入文本中首次出现的顺序排列。此版本在列表的开头插入新节点，因此结果列表中的单词顺序与其首次出现的顺序相反。这加快了节点插入，并将以下节点插入代码简化：

current = phead;
while (current->next != NULL)  // find tail
    current = current->next;
current->next = newnode;       // add newnode to end

为：

newnode->next = phead;  // insert newnode at front of list: the old head becomes its successor

我毫不怀疑代码可以改进，但这确实有效。我不会说这很简单，但相对简单。我针对这个文本文件运行它：

Three blind mice. Three blind mice.
See how they run. See how they run.
They all ran after the farmer's wife,
Who cut off their tails with a carving knife,
Did you ever see such a sight in your life,
As three blind mice?

结果如下：

There are 31 words in the text file,
and 37 pairs of (line, occurrences)
Word: pairs
as: 6, 1;
life: 5, 1;
your: 5, 1;
in: 5, 1;
sight: 5, 1;
such: 5, 1;
ever: 5, 1;
you: 5, 1;
did: 5, 1;
knife: 4, 1;
carving: 4, 1;
a: 4, 1; 5, 1;
with: 4, 1;
tails: 4, 1;
their: 4, 1;
off: 4, 1;
cut: 4, 1;
who: 4, 1;
wife: 3, 1;
farmer's: 3, 1;
the: 3, 1;
after: 3, 1;
ran: 3, 1;
all: 3, 1;
run: 2, 2;
they: 2, 2; 3, 1;
how: 2, 2;
see: 2, 2; 5, 1;
mice: 1, 2; 6, 1;
blind: 1, 2; 6, 1;
three: 1, 2; 6, 1;

Answer 3

这是我使用二叉搜索树（BST）的版本：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/* Forward typedef so the struct can refer to itself as in_node. */
typedef struct internal_node in_node;

/* Element of a word's occurrence list: one line and its count. */
struct internal_node{
    int line;        /* line number of the occurrence */
    int freq;        /* number of times the word was seen on that line */
    in_node* next;   /* next list element, or NULL at the end */
};

/* Binary search tree node describing one distinct word. */
struct tree{
    char *word;             /* heap-allocated copy of the word; the tree is ordered by strcmp on it */
    int num_lines;          /* set to 1 when the node is created */
    in_node* in_nodeptr;    /* head of the word's occurrence list */
    in_node* current;       /* list element consulted when a new occurrence arrives; described by the author as the list's last element */
    struct tree* right;     /* subtree of words that compare greater */
    struct tree* left;      /* subtree of words that compare smaller */
};

/* Pointer-to-node alias used throughout the tree functions. */
typedef struct tree* treeptr;

/* Free every element of the occurrence list that starts at in_nodeptr.
 * Passing NULL is allowed and does nothing. */
void free_list(in_node* in_nodeptr){
    while(in_nodeptr!=NULL){
        /* read the link before the element is released */
        in_node* next=in_nodeptr->next;
        free(in_nodeptr);
        in_nodeptr=next;
    }
}

/* Release a whole tree: both subtrees first, then the node's occurrence
 * list, its word and finally the node itself.  NULL is accepted. */
void free_bst(treeptr head){
  if (head==NULL) {
    return;
  }
  free_bst(head->right);
  free_bst(head->left);
  free_list(head->in_nodeptr);
  free(head->word);
  free(head);
}

/* Print each element of an occurrence list as "line freq; ". */
void print_list(in_node* in_nodeptr){
    in_node* cursor;

    for(cursor=in_nodeptr; cursor!=NULL; cursor=cursor->next){
        printf("%d %d; ",cursor->line,cursor->freq);
    }
}

/* Print the tree one word per line: the node itself first, then its
 * right subtree, then its left subtree. */
void print_bst(treeptr head){
  if(head==NULL){
    return;
  }
  printf("%s: ",head->word);
  print_list(head->in_nodeptr);
  printf("\n");
  print_bst(head->right);
  print_bst(head->left);
}

/* Record one occurrence of word on the given line in the tree rooted at
 * *head.
 *
 * head  address of the (sub)tree root pointer; it is updated when a new
 *       node is created for a word not yet in the tree
 * word  NUL-terminated word, copied into the tree when new
 * line  line number of the occurrence
 *
 * The most recent list element of an existing word is compared with line:
 * a match bumps its count, otherwise a new element is appended.
 * Terminates the program if memory cannot be allocated. */
void input_to_bst(treeptr* head,char* word,int line){
  if((*head)==NULL){
       treeptr node=malloc(sizeof *node);
       in_node* first=malloc(sizeof *first);
       /* size the copy to the word instead of a fixed 50 bytes */
       char* copy=malloc(strlen(word)+1);

       if(node==NULL || first==NULL || copy==NULL){
           fprintf(stderr, "Out of memory\n");
           exit(1);
       }
       strcpy(copy,word);

       first->line=line;
       first->freq=1;
       first->next=NULL;

       node->word=copy;
       node->num_lines=1;
       node->right=NULL;
       node->left=NULL;
       node->in_nodeptr=first;
       node->current=first;

       (*head)=node;
  }
  else{
      int check=strcmp(((*head)->word),word);
      if(check>0) input_to_bst(&((*head)->left),word,line);
      else if(check<0) input_to_bst(&((*head)->right),word,line);
      else{
           if( (*head)->current->line==line) (*head)->current->freq++;
           else {
              in_node* added=malloc(sizeof *added);

              if(added==NULL){
                  fprintf(stderr, "Out of memory\n");
                  exit(1);
              }
              added->line=line;
              added->freq=1;
              added->next=NULL;
              (*head)->current->next=added;
              /* advance to the new last element; otherwise the next
               * occurrence would overwrite and leak this one */
              (*head)->current=added;
              (*head)->num_lines++;
           }
      }
  }
}

/* Read the file named by argv[1] character by character, collect runs of
 * letters into words, store every word with its line number in the tree,
 * then print the tree and free it.
 *
 * A space or a newline ends the current word; a newline also advances the
 * line counter.  Other non-letter characters are skipped.  Letters beyond
 * the capacity of the word buffer are dropped.
 *
 * Returns 0 on success; exits with status 1 when no file name is given or
 * the file cannot be opened. */
int main(int argc, char *argv[]) {

    treeptr head=NULL;
    FILE *fp;
    char word[50];
    int ch;                 /* int, not char: getc() returns EOF outside the char range */
    int len=0,lines=1;
    const int max_len=(int)sizeof(word)-1;   /* leave room for the terminator */

    if (argc < 2) {
        fprintf(stderr, "Usage: %s filename\n", argv[0]);
        exit(1);
    }

    fp=fopen(argv[1], "r");
    if (fp == NULL) {
        fprintf(stderr, "Error reading file\n");
        exit(1);
    }

    while ((ch = getc(fp)) != EOF) {
        if (ch == '\n') {
            word[len]='\0';
            if(len>0) input_to_bst(&head,word,lines);
            len=0;
            lines++;
        }
        else if (ch==' '){
            word[len]='\0';
            if(len>0) input_to_bst(&head,word,lines);
            len=0;
        }
        else if (isalpha(ch)){
            if(len<max_len){
                word[len]=ch;
                len++;
            }
        }
    }
    /* the last word may not be followed by a space or newline */
    if(len>0) {
        word[len]='\0';
        input_to_bst(&head,word,lines);
    }
    print_bst(head);
    fclose(fp);
    free_bst(head);
    return 0;
}

每个单词都作为BST的一个节点保存。除了单词本身之外，BST的每个节点还保存一个列表，其中包含该单词的所有出现记录（行号和出现次数）。为了尽可能提高效率，我们保存一个指向出现记录列表最后一个元素的指针（in_node* current），这样每次需要添加出现记录时就不必遍历整个列表。

举个例子：

文本：

C is an imperative procedural language. It was designed to be compiled 
using a relatively straightforward compiler and to require minimal 
runtime support.

输出：

  C: 1 1; 
is: 1 1; 
procedural: 1 1; 
was: 1 1; 
to: 1 1; 2 1; 
using: 2 1; 
relatively: 2 1; 
straightforward: 2 1; 
support: 3 1; 
require: 2 1; 
runtime: 3 1; 
language: 1 1; 
minimal: 2 1; 
an: 1 1; 
imperative: 1 1; 
designed: 1 1; 
be: 1 1; 
compiled: 1 1; 
compiler: 2 1; 
and: 2 1; 
It: 1 1; 
a: 2 1;

请注意，上述实现区分大小写，例如“And”与“and”不同。如果您不希望区分大小写，只需将行word[len]=ch;替换为word[len]=tolower(ch);即可。上述算法在最坏情况下的复杂度为O(n^2)，这与仅使用链表时相同；但在平均情况下BST为O(n log n)，比链表好得多，这就是它被认为更好的原因。还要注意，因为我们必须为每个单词保留一个出现记录列表，如果不保留in_node* current指针（它使我们可以在常数时间O(1)内访问每个出现记录列表的末尾），复杂度会更差。所以我认为，就复杂度而言，不可能比O(n log n)更好。

将文件读入链表

3 个答案: