在c中的缓冲区块中读取未知长度的文件

时间:2018-09-27 03:02:58

标签: c

我试图不使用lseek()fseek之类的功能将未知长度的二进制文件读取到缓冲区块中。

  1. 我使用了一次具有1024个字节的结构缓冲区。当读取大于1012字节的文件时,它将分配几个缓冲区。但是,当遇到最后一个块时,它肯定会具有小于或等于1024字节的字节。 因此,我尝试计算最后一块的长度,以便可以读取最后一块直到eof,但是我对如何实现这一点感到困惑。

谢谢。

#include <stdio.h>
#include <stdlib.h>

typedef struct Buffer{
  unsigned char data[1012];
  struct Buffer *next; //12 bytes
}Buffer;

void mymemcpy(void *dest, void *src, size_t length){
  Buffer *buffer_toFill = (Buffer *)dest;
  Buffer *buffer_toAdd = (Buffer *)src;
  int a = 0; 
  for(int i = 0; i < length; i++){
    buffer_toFill->data[i] = buffer_toAdd->data[i];
  }
}

Buffer* add_buffer_front(Buffer *head, Buffer *read_buffer, int size){
  Buffer *new_buffer = malloc(sizeof(Buffer));
  mymemcpy(new_buffer, read_buffer, size);
  if(head != NULL){
    new_buffer->next = head;
  }
  return new_buffer;
}

void display_List(Buffer *head, size_t length){
  Buffer *current = head;
  while(current != NULL){
    for(int i = 0; i < length; i++){
      printf("%02X",(unsigned)current->data[i]); //this shows different value compare with  xxd <filename>
      //printf("%c", current->data[i]);  
    }
    Buffer *prev = current;
    free(prev);
    current = current->next;
  }
}

int main(int argc, char **argv){
  FILE *fd;
  Buffer *head_buffer = NULL;
  int file_length = 0;
  int eof_int = 1;
  if(argc != 2){
    printf("Usage: readFile <filename>\n");
    return 1; 
  }

  fd = fopen(argv[1], "rb");

  while(eof_int != 0){ 
    Buffer *new_buffer = malloc(sizeof(Buffer));
    eof_int = fread(new_buffer, sizeof(Buffer)-12, 1, fd);
    if(eof_int == 0){ 
      //size_t length
      //
      //
      head_buffer = add_buffer_front(head_buffer, new_buffer, length);
      file_length += length;
    }else{
      head_buffer = add_buffer_front(head_buffer, new_buffer, (sizeof(new_buffer->data)));
      file_length += (sizeof(new_buffer->data));
    }
  }
  display_List(head_buffer, file_length);
  fclose(fd);
  return 0;
}

3 个答案:

答案 0 :(得分:2)

您有几个问题。

(1)fread返回读取的项目数,但不会返回eof指示。您需要致电feof(stream*)来确定是否已到达文件末尾。

(2)您说的是下一个指针是12个字节。这是一个非常危险的假设。最好读取已分配给数据结构的1012个字节。您很有可能正在打印未读入的东西,而只是未初始化的内存。

(3)使用fread的返回值来确定要复制多少内存。

答案 1 :(得分:1)

请参见下面的代码中的注释-还可以考虑将1012更改为使用#define。

#include <stdio.h>
#include <stdlib.h>

typedef struct Buffer{
  unsigned char data[1012];
  struct Buffer *next; //12 bytes
}Buffer;

// Create a structure to store stuff about a file

typedef struct {
   Buffer *head;
   Buffer *tail;
   size_t length;
} MyFile;

/*
void mymemcpy(void *dest, void *src, size_t length){
  Buffer *buffer_toFill = (Buffer *)dest;
  Buffer *buffer_toAdd = (Buffer *)src;
  int a = 0; 
  for(int i = 0; i < length; i++){
    buffer_toFill->data[i] = buffer_toAdd->data[i];
  }
}

Buffer* add_buffer_front(Buffer *head, Buffer *read_buffer, int size){
  Buffer *new_buffer = malloc(sizeof(Buffer));
  mymemcpy(new_buffer, read_buffer, size);
  if(head != NULL){
    new_buffer->next = head;
  }
  return new_buffer;
}

*/

// Lets make this easier - The buffer has already been "malloced" once - why do it again

// And why are you reversing the file

// Perhaps 

void add_buffer(Buffer *to_be_added, MyFile *file, size_t extra_length) {
   if (file->tail) { // We have one item in the list
     file->tail->next = to_be_added;
   } else { // First buffer!
     file-> head = to_be_added;
     file-> tail = to_be_added;
   }
   to_be_added->next = NULL;  // This is always the case as it is the last one
   file->length += extra_length;
}

/*
void display_List(Buffer *head, size_t length){
  Buffer *current = head;
  while(current != NULL){
    for(int i = 0; i < length; i++){
      printf("%02X",(unsigned)current->data[i]); //this shows different value compare with  xxd <filename>
      //printf("%c", current->data[i]);  
    }
    Buffer *prev = current;
    free(prev);
    current = current->next;
  }
}

*/

// Instead pass in the new structure

void display_list(MyFile *file) {
   size_t contents_left = file -> length;
   Buffer * current = file -> head;
   while (current) {
      // At most each chunk has 1012 bytes - Check for that
      size_t chunk_length = contents_left > 1012 ? 1012 : contents_left;
       for(int i = 0; i <chunk_length ; i++){
         printf("%02X",(unsigned)current->data[i]);
       }
       current = current -> next;
   }
}


}
int main(int argc, char **argv){
  FILE *fd;
  MyFile read_file;
  read_file.head = NULL;
  read_file.tail = NULL;
  read_file.length = 0;

  Buffer *head_buffer = NULL;
  int file_length = 0;
  int eof_int = 1;
  if(argc != 2){
    printf("Usage: readFile <filename>\n");
    return 1; 
  }

  fd = fopen(argv[1], "rb");

  // Check fd
  if (fd == NULL) {
    // error stuff
    return EXIT_FAILURE; // Look up the include for this
 }
  while(eof_int != 0){ 
    Buffer *new_buffer = malloc(sizeof(Buffer));
    eof_int = fread(new_buffer->data, 1012, 1, fd); // Do not make assumptions on the size of a pointer and store it in the correct location
    if(eof_int == 0) { // Read nothing
       free(new_buffer); // We was too optimistic! Did Not need this in the end 
       break;
    } else {
      add_buffer(&read_file, new_buffer, eof_int);
    }
  }
  display_List(&read_file);
  fclose(fd);
  return 0;
}

答案 2 :(得分:0)

您要找的技巧是fread返回读取的 项目 的数量。您正在读取1个缓冲区已满,因此它只会告诉您读取0或1个缓冲区。相反,请翻转并反转它: 读取缓冲区的字节数

size_t bytes_read = fread(buffer, 1, sizeof(Buffer)-12, fd);

现在您可以知道多少字节已读入缓冲区。我们可以在size上添加一个Buffer字段,以便每个缓冲区可以记住它读取了多少字节,而只打印了那么多字节。

const size_t BUFFER_SIZE = 1024;

typedef struct Buffer {
    // I'll explain why I switched to a pointer in a moment
    unsigned char *data;
    size_t size;
    struct Buffer *next;
} Buffer;

void Buffer_print( Buffer *buffer ) {
    for( size_t i = 0; i < buffer->size; i++ ) {
        printf("%02hhX ", buffer->data[i]);
    }
}

Buffer *Buffer_new() {
    Buffer *buffer = malloc(sizeof(Buffer));

    buffer->size = 0;
    buffer->data = NULL;
    buffer->next = NULL;

    return buffer;
}

请注意,我会谨慎初始化缓冲区的所有字段,否则我们有可能会被垃圾吞没。

现在,我们已经更改了缓冲区,因此关于其大小和位置的假设被打破了。没关系,无论如何,我们应该直接阅读buffer->data

size_t Buffer_read( Buffer *buffer, size_t buffer_size, FILE* fp ) {
    buffer->data = malloc(buffer_size);
    size_t bytes_read = fread(buffer->data, 1, buffer_size, fp);
    buffer->size = bytes_read;
    return bytes_read;
}

现在,缓冲区知道其读取了多少数据,因此我们可以根据需要分配任意大小的数据。无需将其硬编码到结构中。这使代码更加灵活和高效。它使我们能够廉价地分配空缓冲区,这将使事情变得简单得多。

我们还可以避免使用malloc并使用垃圾进行初始化buffer->data。如果fread仅部分填充buffer->data,则其余部分将保持为垃圾。没关系,知道我们已读取的数据大小意味着我们在遇到垃圾之前就停止打印。


现在我们可以构建循环了。读取0字节后,我们知道读取已完成。

while( Buffer_read( buffer, BUFFER_SIZE, fp ) > 0 ) {
    ... now what ...
}
fclose(fp);

链表的工作方式,当您添加到链表时,将其添加到tail->next并使其成为新尾巴。这通常称为“推”。

Buffer *Buffer_push( Buffer *tail, Buffer *new_tail ) {
    tail->next = new_tail;
    return new_tail;
}

Buffer *head = Buffer_new();
Buffer *tail = head;
while( Buffer_read( tail, BUFFER_SIZE, fp ) > 0 ) {
    tail = Buffer_push( tail, Buffer_new() );
}
fclose(fp);

请注意,我们从一个空的head开始,这也是tail。从这两个分配开始,使循环更加简单。无需检查if( head )if( tail )。这确实意味着我们列表的末尾总是有一个空缓冲区。没关系。由于我们不再使用固定的buffer->data空缓冲区,所以现在变得很小且便宜。


最后一步是打印所有内容。我们已经可以打印一个缓冲区,因此只需要遍历链表并打印每个缓冲区即可。

void Buffer_print_all( Buffer *head ) {
    for( Buffer *buffer = head; buffer != NULL; buffer = buffer->next ) {
        Buffer_print(buffer);
    }
}

Buffer_print_all(head);

最后一个空的缓冲区挂在末端就可以了。它知道其大小为0,因此Buffer_print实际上不会使用空buffer->data