将文件读入链表

时间:2016-09-27 15:09:44

标签: c linked-list

我正在尝试将我制作的文本文件读入链表,文本文件如下所示:

 around 1 2 1
 bread 2 4 3 5 1
 four 1 3 2
 head 3 1 2 2 1 5 1
 has 2 3 1 5 2

每行的第一个字符串只是段落中的单词。单词后面的第一个数字是段落中找到单词的行数。然后,以下数字是段落中的(行,出现次数)对。

例如,对于单词bread

它出现在段落的2行中。第一处是第4行,在该行出现了3次;第二处是第5行,在该行出现了1次。

我正在尝试根据此文本文件创建链表,到目前为止我的程序如下:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <ctype.h>

#define MAXWORD 999

typedef struct node node_t;

/* One linked-list entry describing a word read from the input file. */
struct node {
    /* Text of the word; main() copies the parsed word here with strcpy(). */
    char *word;
    /* Presumably the number of lines on which the word occurs -- TODO confirm. */
    int num_lines;
    /* Presumably a line number; a single int, so only one (line, count) pair fits. */
    int paragraph;
    /* Presumably the occurrence count that goes with `paragraph`. */
    int freq;
    /* Next entry in the list. */
    node_t *next;
};

int
main(int argc, char *argv[]) {
    FILE *fp;
    char word[MAXWORD+1];
    int ch, line_count = 0, len = 0;
    node_t *node = (node_t*)malloc(sizeof(*node));
    node_t *curr, *prev;

    fp = fopen(argv[1], "r");

    if (fp == NULL) {
        fprintf(stderr, "Error reading file\n");
        exit(EXIT_FAILURE);
    }

    /* Just trying to store the string so far */
    while ((ch = getc(fp)) != EOF) {
        if (ch == '\n') {
            line_count++;
            strcpy(node->word, word);
        }

        if (isalpha(ch)) {
            word[len] = ch;
            len++;
            word[len] = '\0';
        } 

        if (isdigit(ch)) {
            len = 0;
        }
    }

    printf("line count = %d", line_count);

    free(node)

    fclose(fp);

    return 0;
}

在这个片段中,我一直在尝试将字符串存储在链表数据结构中,但我还没有使用动态数组来存储文本文件中每个单词之后的数字。我知道我需要使用malloc()和realloc()来构建此数据结构,但我不确定该如何实现。

我该怎么做?

我想要的输出如下:

There are five words in the text file, 
and 9 pairs of (line, occurences)

Word: pairs
"around": 2,1
"bread": 4,3; 5,1
"four": 3,2
"head": 1,2; 2,1; 5,1
"has": 3,1; 5,2 

更新

我一直在研究它,它似乎与倒排索引问题非常相似,我已经看到使用二叉搜索树是最好的。

我可以像这样实现我的二叉搜索树:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <ctype.h>

#define MAXWORD 999

typedef char word_t[MAXWORD+1];

typedef struct node node_t;

/* Node of the binary search tree. */
struct node {
    /* Untyped payload pointer; presumably the word -- TODO confirm. */
    void *data;
    /* Presumably the numbers that follow the word in the input -- TODO confirm. */
    int *ints;
    /* Right child. */
    node_t *rght;
    /* Left child. */
    node_t *left;
};

/* Tree handle: the root node together with a comparison callback. */
typedef struct {
    /* Root node of the tree. */
    node_t *root;
    /* Callback taking two untyped pointers; its return convention is not shown here. */
    int (*cmp)(void*, void*);
} tree_t;

/*
 * Opens the file named by argv[1] and counts its newline characters.
 *
 * Returns 0 on success; exits with EXIT_FAILURE when no file name is given
 * or the file cannot be opened.
 */
int
main(int argc, char *argv[]) {
    FILE *fp;
    int ch;               /* int, not char, so that EOF stays distinguishable */
    int line_count = 0;

    /* argv[1] is only valid when a file name was actually supplied. */
    if (argc < 2) {
        fprintf(stderr, "Usage: %s filename\n", argc > 0 ? argv[0] : "program");
        exit(EXIT_FAILURE);
    }

    fp = fopen(argv[1], "r");

    if (fp == NULL) {
        fprintf(stderr, "Error reading file\n");
        exit(EXIT_FAILURE);
    }

    while ((ch = getc(fp)) != EOF) {
        if (ch == '\n') {
            line_count++;
        }
    }

    /* The count is gathered but not reported yet. */
    (void)line_count;

    fclose(fp);

    return 0;
}

3 个答案:

答案 0 :(得分:3)

你可以这样做:

/* One (line, occurrences) pair for a word. */
typedef struct {
    /* Line position of the word. */
    int paragraph;
    /* How often the word occurs at that position. */
    int freq;
} stats_t;

/* Linked-list entry for one word, with its pairs kept in a separate array. */
struct node {
    /* Text of the word. */
    char *word;
    /* Presumably the number of entries in `stats` -- TODO confirm. */
    int num_lines;
    /* Pointer to an array of stats_t pairs. */
    stats_t *stats;
    /* Next entry in the list. */
    node_t *next;
};

然后在解析字符串后,您可以执行以下操作:

/* Zero-initialised array of line_count stats_t elements. */
ps = calloc(line_count, sizeof(stats_t));

获取指向stats_t结构数组的指针,您可以使用行位置和频率填充这些结构。然后,您可以将指针ps存储在node结构中。

答案 1 :(得分:3)

我写了一个程序来完成我认为你正在寻找的东西。我修改了之前想到的结构:

/* The struct keyword is required: plain `node` is not a type name in C. */
typedef struct node node_t;

/*
 * Linked-list entry for one word. location[] and frequency[] are parallel
 * int arrays: frequency[i] is the number of occurrences on line location[i].
 */
struct node {
    char *word;        /* text of the word */
    int num_lines;     /* number of lines the word was found on */
    int *location;     /* line numbers */
    int *frequency;    /* occurrences on the matching line */
    node_t *next;      /* next entry in the list */
};

这样,节点包含指向int数组的指针,用于存储位置和频率信息。节点本身,以及单词字符串、位置数组和频率数组的存储空间,都是动态分配的。这是代码:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define MAXLINE  1000
#define MAXWORD    30

typedef struct node node_t;

/*
 * Linked-list entry for one distinct word. location[] and frequency[] are
 * parallel arrays with num_lines elements each.
 */
struct node {
    /* Heap-allocated copy of the word. */
    char *word;
    /* Number of (line, occurrences) pairs stored for the word. */
    int num_lines;
    /* Line numbers on which the word was found. */
    int *location;
    /* Occurrence count for the line at the same index in location[]. */
    int *frequency;
    /* Next entry in the list, or NULL at the end. */
    node_t *next;
};

void strip(char *pln);
void normalize_word(char *pstr);
struct node * update_word(char *pwd, int lnum, struct node *phead);
struct node * find_in_list(char *pwd, struct node *phead);
int find_line_pair(int lnum, struct node *pwn);
int list_len(struct node *phead);
int num_pairs(struct node *phead);

/*
 * Builds a word index for the file named by argv[1]: every line is split on
 * spaces, each token is normalized, and non-empty tokens are recorded with
 * the number of the line they came from. The index is then printed and all
 * memory is released.
 *
 * Returns 0; exits with EXIT_FAILURE when no file name is given or the file
 * cannot be opened.
 */
int main(int argc, char *argv[])
{
    FILE *in;
    struct node *words = NULL;
    struct node *it, *following;
    char buffer[MAXLINE + 1];
    char *token;
    int lineno = 0;
    int k;

    if (argc < 2) {
        fprintf(stderr, "Usage: %s filename\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    in = fopen(argv[1], "r");
    if (in == NULL) {
        fprintf(stderr, "Unable to open file %s\n", argv[1]);
        exit(EXIT_FAILURE);
    }

    /* Tokenize each line in place; strtok continues with NULL after the first call. */
    while (fgets(buffer, MAXLINE, in) != NULL) {
        ++lineno;
        strip(buffer);
        for (token = strtok(buffer, " ");
             token != NULL;
             token = strtok(NULL, " ")) {
            normalize_word(token);
            if (*token == '\0')
                continue;               /* nothing left after normalizing */
            words = update_word(token, lineno, words);
        }
    }

    /* Report the totals, then one line per word. */
    printf("There are %d words in the text file,\n",
           list_len(words));
    printf("and %d pairs of (line, occurrences)\n",
           num_pairs(words));
    printf("Word: pairs\n");
    for (it = words; it != NULL; it = it->next) {
        printf("%s:", it->word);
        for (k = 0; k < it->num_lines; k++) {
            printf(" %d, %d;",
                   it->location[k], it->frequency[k]);
        }
        putchar('\n');
    }

    if (fclose(in) != 0)
        fprintf(stderr, "Error closing file %s\n", argv[1]);

    /* Release every node together with the three buffers it owns. */
    for (it = words; it != NULL; it = following) {
        following = it->next;
        free(it->word);
        free(it->location);
        free(it->frequency);
        free(it);
    }

    return 0;
}

/* Overwrite every newline in the string with a terminating '\0'. */
void strip(char *pln)
{
    char *newline = pln;

    /* Each search resumes just past the newline that was replaced. */
    while ((newline = strchr(newline, '\n')) != NULL)
        *newline++ = '\0';
}

/* Convert word to lowercase in place and remove trailing
 * non-alphanumeric characters. Leading and embedded punctuation
 * is kept; a word made only of punctuation becomes empty.      */
void normalize_word(char *pstr)
{
    size_t len = 0;

    /* <ctype.h> functions need an unsigned char value: a negative plain
     * char is undefined behaviour. */
    for (; pstr[len] != '\0'; ++len)
        pstr[len] = (char)tolower((unsigned char)pstr[len]);

    /* Trim from the end until an alphanumeric character is reached. */
    while (len > 0 && !isalnum((unsigned char)pstr[len - 1]))
        pstr[--len] = '\0';
}

/* Return ptr unchanged, or report the failure and exit when it is NULL. */
static void *require_memory(void *ptr)
{
    if (ptr == NULL) {
        fprintf(stderr, "Out of memory\n");
        exit(EXIT_FAILURE);
    }
    return ptr;
}

/* Update existing word node or create a new one, and return
 * a pointer to the head of the list.
 *
 * pwd   - word to record (copied; the caller keeps ownership)
 * lnum  - line number the word was found on
 * phead - current head of the list, or NULL for an empty list
 *
 * A new node is inserted at the front, so the returned head changes
 * whenever the word was not in the list yet. Exits with EXIT_FAILURE
 * if memory cannot be allocated. */
struct node * update_word(char *pwd, int lnum, struct node *phead)
{
    struct node *found, *newnode;
    int *ploc, *pfreq;
    int index;

    found = find_in_list(pwd, phead);

    /* Modify existing node if word is in list */
    if (found != NULL) {
        index = find_line_pair(lnum, found);
        if (index != -1) {
            // word already seen on this line: bump its count
            found->frequency[index] += 1;
            return phead;
        }

        // first occurrence on this line: grow both arrays by one slot
        index = found->num_lines;
        ploc = require_memory(realloc(found->location,
                                      ((size_t)index + 1) * sizeof *ploc));
        found->location = ploc;     // old block may have moved
        pfreq = require_memory(realloc(found->frequency,
                                       ((size_t)index + 1) * sizeof *pfreq));
        found->frequency = pfreq;

        ploc[index] = lnum;         // new location
        pfreq[index] = 1;           // found once in this line so far
        found->num_lines = index + 1;   // count only after both arrays grew
        return phead;
    }

    /* Set up a new node */
    newnode = require_memory(malloc(sizeof *newnode));
    newnode->word = require_memory(malloc(strlen(pwd) + 1));
    strcpy(newnode->word, pwd);

    ploc = require_memory(malloc(sizeof *ploc));
    pfreq = require_memory(malloc(sizeof *pfreq));
    *ploc = lnum;                   // location was passed by caller
    *pfreq = 1;                     // only one occurrence so far
    newnode->location = ploc;
    newnode->frequency = pfreq;
    newnode->num_lines = 1;         // only one line so far

    // inserting at the front also covers the empty list (phead == NULL)
    newnode->next = phead;

    return newnode;
}

/* Look up pwd in the list starting at phead; return the first node whose
 * word compares equal, or NULL when no node matches. */
struct node * find_in_list(char *pwd, struct node *phead)
{
    struct node *it = phead;

    /* Advance until the list ends or the words match. */
    while (it != NULL && strcmp(it->word, pwd) != 0)
        it = it->next;

    return it;
}

/* Search the node's location array for line lnum; return its index,
 * or -1 when the word has not been recorded on that line. */
int find_line_pair(int lnum, struct node *pwn)
{
    int k;

    for (k = 0; k < pwn->num_lines; ++k) {
        if (pwn->location[k] == lnum)
            return k;
    }

    return -1;
}

/* Count the nodes reachable from phead (0 for an empty list). */
int list_len(struct node *phead)
{
    int count = 0;
    struct node *it;

    for (it = phead; it != NULL; it = it->next)
        count++;

    return count;
}

/* Sum num_lines over every node, i.e. the total number of
 * (line, occurrence) pairs stored in the list. */
int num_pairs(struct node *phead)
{
    int total = 0;
    struct node *it;

    for (it = phead; it != NULL; it = it->next)
        total += it->num_lines;

    return total;
}

注意:我在update_word()函数中对先前版本进行了修改。原始代码在列表的末尾插入新节点,因此结果列表中的单词按照它们在输入文本中首次出现的顺序排列。此版本在列表的开头插入新节点,因此结果列表中单词的顺序与它们首次出现的顺序相反。这加速了节点插入并简化了以下节点插入代码:

/* Earlier version: walk to the last node and append there. */
current = phead;
while (current->next != NULL)  // find tail
    current = current->next;
current->next = newnode;       // add newnode to end

为:

/* Current version: the new node simply points at the old head. */
newnode->next = phead;  // insert newnode at front of list

我毫不怀疑代码可以改进,但这确实有效。我不会说这很简单,但相对简单。我针对这个文本文件运行它:

Three blind mice. Three blind mice.
See how they run. See how they run.
They all ran after the farmer's wife,
Who cut off their tails with a carving knife,
Did you ever see such a sight in your life,
As three blind mice?

结果如下:

There are 31 words in the text file,
and 37 pairs of (line, occurrences)
Word: pairs
as: 6, 1;
life: 5, 1;
your: 5, 1;
in: 5, 1;
sight: 5, 1;
such: 5, 1;
ever: 5, 1;
you: 5, 1;
did: 5, 1;
knife: 4, 1;
carving: 4, 1;
a: 4, 1; 5, 1;
with: 4, 1;
tails: 4, 1;
their: 4, 1;
off: 4, 1;
cut: 4, 1;
who: 4, 1;
wife: 3, 1;
farmer's: 3, 1;
the: 3, 1;
after: 3, 1;
ran: 3, 1;
all: 3, 1;
run: 2, 2;
they: 2, 2; 3, 1;
how: 2, 2;
see: 2, 2; 5, 1;
mice: 1, 2; 6, 1;
blind: 1, 2; 6, 1;
three: 1, 2; 6, 1;

答案 2 :(得分:2)

这是我使用二叉搜索树(BST)的版本:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

typedef struct internal_node in_node;

/* One element of a word's occurrence list: a line and its count. */
struct internal_node{
    /* Line number the word was found on. */
    int line;
    /* Number of occurrences on that line. */
    int freq;
    /* Next occurrence entry, or NULL at the end of the list. */
    in_node* next; 
};

/* Binary-search-tree node holding one word and its occurrence list. */
struct tree{
    /* Heap-allocated copy of the word; also the ordering key (strcmp). */
    char *word;
    /* Set to 1 when the node is created. */
    int num_lines;
    /* First element of the occurrence list. */
    in_node* in_nodeptr;
    /* Intended to point at the last element of the occurrence list. */
    in_node* current;
    /* Subtree of words that compare greater than this one. */
    struct tree* right;
    /* Subtree of words that compare less than this one. */
    struct tree* left;
};

typedef struct tree* treeptr;

/* Free every element of an occurrence list, not just the first one.
 * Passing NULL (an empty list) is a no-op. */
void free_list(in_node* in_nodeptr){
    while(in_nodeptr!=NULL){
        /* Save the link before the element holding it is released. */
        in_node* following=in_nodeptr->next;
        free(in_nodeptr);
        in_nodeptr=following;
    }
}

/* Recursively release a whole tree: both subtrees first, then the node's
 * occurrence list, its word, and finally the node itself. */
void free_bst(treeptr head){
    treeptr right_child;
    treeptr left_child;

    if (head==NULL)
        return;

    right_child=head->right;
    left_child=head->left;

    free_bst(right_child);
    free_bst(left_child);
    free_list(head->in_nodeptr);
    free(head->word);
    free(head);
}

/* Print each element of an occurrence list as "line freq; ". */
void print_list(in_node* in_nodeptr){
    in_node* entry;

    for(entry=in_nodeptr; entry!=NULL; entry=entry->next)
        printf("%d %d; ",entry->line,entry->freq);
}

/* Print the tree one word per line: the node itself first, then its
 * right subtree, then its left subtree. */
void print_bst(treeptr head){
    if(head==NULL)
        return;

    printf("%s: ",head->word);
    print_list(head->in_nodeptr);
    printf("\n");

    print_bst(head->right);
    print_bst(head->left);
}

void input_to_bst(treeptr* head,char* word,int line){
  if((*head)==NULL){
       (*head)=(treeptr)malloc(sizeof(struct tree));
       (*head)->word=(char*)malloc(50*sizeof(char));
       strcpy(((*head)->word),word);

       (*head)->num_lines=1;
       (*head)->right=NULL;
       (*head)->left=NULL;
       (*head)->in_nodeptr=(in_node*)malloc(sizeof(in_node));
       (*head)->in_nodeptr->line=line;
       (*head)->in_nodeptr->freq=1;
       (*head)->in_nodeptr->next=NULL;
       (*head)->current=(*head)->in_nodeptr;
  }
  else{
      int check=strcmp(((*head)->word),word);
      if(check>0) input_to_bst(&((*head)->left),word,line);
      else if(check<0) input_to_bst(&((*head)->right),word,line);
      else{
           if( (*head)->current->line==line) (*head)->current->freq++;
           else {
              (*head)->current->next=(in_node*)malloc(sizeof(in_node));
              (*head)->current->next->line=line;
              (*head)->current->next->freq=1;
              (*head)->current->next->next=NULL;
           }
      }
  }
}

/* Reads the file named by argv[1] character by character, splits it into
 * words at spaces and newlines (only alphabetic characters are kept), and
 * stores every word with its line number in a binary search tree, which is
 * then printed and freed.
 *
 * Words longer than the buffer are truncated to its capacity.
 * Returns 0 on success; exits with status 1 when no file name is given or
 * the file cannot be opened. */
int main(int argc, char *argv[]) {

    treeptr head=NULL;
    FILE *fp;
    char word[50];
    int ch;              /* int, not char: getc() returns EOF out of band */
    size_t len=0;
    int lines=1;

    /* argv[1] is only valid when a file name was actually supplied. */
    if (argc < 2) {
        fprintf(stderr, "Usage: %s filename\n", argc > 0 ? argv[0] : "program");
        exit(1);
    }

    fp=fopen(argv[1], "r");
    if (fp == NULL) {
        fprintf(stderr, "Error reading file\n");
        exit(1);
    }

    while ((ch = getc(fp)) != EOF) {
        if (ch == '\n' || ch == ' ') {
            /* A separator ends the current word, if there is one. */
            if (len>0) {
                word[len]='\0';
                input_to_bst(&head,word,lines);
                len=0;
            }
            if (ch == '\n') lines++;
        }
        else if (isalpha(ch) && len < sizeof word - 1) {
            /* Keep one byte free for the terminator. */
            word[len++]=(char)ch;
        }
    }
    /* The last word may not be followed by a separator. */
    if (len>0) {
        word[len]='\0';
        input_to_bst(&head,word,lines);
    }
    print_bst(head);
    fclose(fp);
    free_bst(head);
    return 0;
}

每个单词都作为BST的一个节点保存。除了单词本身之外,BST的每个节点还保存一个列表,其中包含该单词的所有出现记录(行号和频率)。为了尽可能提高效率,我们还保存一个指向出现列表最后一个元素的指针(in_node* current),这样每次需要添加出现记录时就不必遍历整个列表。

举个例子:

文本:

C is an imperative procedural language. It was designed to be compiled 
using a relatively straightforward compiler and to require minimal 
runtime support.

输出:

  C: 1 1; 
is: 1 1; 
procedural: 1 1; 
was: 1 1; 
to: 1 1; 2 1; 
using: 2 1; 
relatively: 2 1; 
straightforward: 2 1; 
support: 3 1; 
require: 2 1; 
runtime: 3 1; 
language: 1 1; 
minimal: 2 1; 
an: 1 1; 
imperative: 1 1; 
designed: 1 1; 
be: 1 1; 
compiled: 1 1; 
compiler: 2 1; 
and: 2 1; 
It: 1 1; 
a: 2 1; 

请注意,上述实现区分大小写,例如“And”与“and”不同。 如果您不希望区分大小写,只需将行word[len]=ch;替换为word[len]=tolower(ch);即可。 上述算法在最坏情况下的复杂度为O(n^2),与仅使用链表时相同;但在平均情况下BST的复杂度为O(n log n),比链表好得多,这也是它被认为更好的原因。 还要注意,因为我们必须为每个单词保留一个出现列表,如果没有in_node* current指针(它让我们可以在常数时间O(1)内访问每个出现列表的末尾),复杂度会更差。所以我认为,就复杂度而言,不可能比O(n log n)更好。