这个问题乍看很简单,但它是一道面试题,难点如下:
我写了一段简单的代码,逐字节地把一个文件复制到另一个文件,并返回在 while(!feof) 循环中递增的计数。但面试官说,用这种逐字节的循环复制 1 GB 的文件需要 1 小时,而现实中复制文件并不会这么慢。有人能告诉我计算机上实际是如何复制大文件的吗?底层算法是什么?另外请注意,我需要返回已复制的字节数。
答案 0 :(得分:5)
他可能只是完全错了。
除非你用汇编语言编写代码,否则一次读/写一个字符对整体速度的影响几乎肯定相当小。原因很简单:几乎任何比汇编语言更高级的语言,在你进行面向字符的 I/O 时都会(至少在一定程度上)为你做缓冲。
例如,考虑C中的代码:
#include <stdio.h>
/*
 * Copy the file named by argv[1] to the file named by argv[2] one character
 * at a time with getc/putc, then print how many bytes were copied.
 *
 * Returns 0 on success. Returns 1 if the arguments are missing, either file
 * cannot be opened, or a read/write error is detected.
 */
int main(int argc, char **argv) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s <infile> <outfile>\n",
                argc > 0 ? argv[0] : "copy");
        return 1;
    }

    FILE *infile = fopen(argv[1], "rb");
    if (infile == NULL) {
        perror(argv[1]);
        return 1;
    }

    FILE *outfile = fopen(argv[2], "wb");
    if (outfile == NULL) {
        perror(argv[2]);
        fclose(infile);
        return 1;
    }

    unsigned long count = 0;
    int ch;
    while (EOF != (ch = getc(infile))) {
        /* Stop at the first failed write; only count bytes actually written. */
        if (EOF == putc(ch, outfile))
            break;
        ++count;
    }

    /* getc returns EOF for both end-of-file and errors, so ask the stream. */
    int failed = ferror(infile) || ferror(outfile);

    fclose(infile);
    /* fclose flushes buffered output, so a late write error shows up here. */
    if (EOF == fclose(outfile))
        failed = 1;

    if (failed) {
        fprintf(stderr, "I/O error while copying\n");
        return 1;
    }

    printf("%lu bytes copied\n", count);
    return 0;
}
现实情况是,它的运行速度可能比典型的文件复制稍慢一点,但也只是稍慢一点。原因很简单:只要所用的 C 实现还算像样,getc 和 putc(以及标准 I/O 的大部分其余函数)就会在幕后为你做缓冲。事实上,getc 和 putc 通常以宏的形式实现,因此大部分代码还会被内联展开。虽然各编译器的实现不尽相同,但典型的代码大致如下:
/*
 * Illustrative sketch of how a C library might implement getc/putc as macros:
 * the common case only touches the stream's in-memory buffer, and the helper
 * function (__filbuf / __flshbuf) is called only when the buffer position has
 * reached __len. Arguments and expansions are parenthesized so the macros
 * behave correctly inside larger expressions.
 */
#define getc(f) ((f)->__pos < (f)->__len ? (f)->__buf[(f)->__pos++] : __filbuf(f))
#define putc(ch, f) ((f)->__pos < (f)->__len ? (f)->__buf[(f)->__pos++] = (ch) : __flshbuf((f), (ch)))
这将伴随着类似的代码:
/* Size in bytes of each stream's in-memory buffer in this sketch. */
#define BUF_SIZE 4096

/*
 * Illustrative sketch of the stream object used by the getc/putc macros.
 * C does not allow initializers on struct members, so __len has to be set
 * (to BUF_SIZE in this sketch) by whatever code creates the stream.
 */
typedef struct {
    char __buf[BUF_SIZE];   /* buffered data */
    size_t __pos;           /* index of the next byte to read or write in __buf */
    size_t __len;           /* limit that __pos is compared against */
    int __file_number;      /* presumably the underlying OS file descriptor */
} FILE;
当然,这段代码确实还有改进的余地(例如,使用更大的缓冲区来减少系统调用的次数)。
但请注意,这些改进可能会增加开发时间,而且即便在最理想的情况下,也不应指望得到面试官所说的那种速度差异。即使提速 10 倍也不太可能,更不用说面试官所说的约 1000 倍了。
答案 1 :(得分:2)
算法基本相同,只是使用了大于一个字节的缓冲区。基本做法如下:
这是简单的做法。更快的做法往往也更复杂(例如使用 POSIX 异步 I/O)。下面是使用 POSIX AIO 的一个例子(摘自我写的一个程序)。请注意其中可能有 bug,这段代码我只是写着玩的:
/*
 * Make sure the asynchronous request *cb issued on descriptor fd is no longer
 * in flight, then collect its status. Called before the buffer the request
 * points at is freed.
 */
static void drain_aio(int fd, struct aiocb *cb)
{
    const struct aiocb *pending[1];

    pending[0] = cb;
    /* Best effort: the request may already be finished or not cancelable. */
    (void)aio_cancel(fd, cb);
    while (EINPROGRESS == aio_error(cb)) {
        if (-1 == aio_suspend(pending, 1, NULL)
            && EINTR != errno && EAGAIN != errno)
            return;
    }
    (void)aio_return(cb);
}

/*
 * Copy the contents of descriptor `input` to descriptor `output` using POSIX
 * AIO, keeping at most one read and one write request outstanding at a time.
 *
 * Data moves through NR_BUFFERS buffers of BUFFER_SIZE bytes each, used as a
 * ring: read_buf counts buffers filled so far, write_buf counts buffers
 * written out, and slot (counter % NR_BUFFERS) selects the buffer. A progress
 * line is written to stderr after each wake-up.
 *
 * Returns 0 on success and EXIT_FAILURE on any error (fstat, malloc, AIO
 * submission, read/write error, or a short write). The descriptors are not
 * closed.
 */
int copy_file(int input, int output) {
    struct stat statbuf;
    off_t input_size, input_pos, output_pos;
    struct aiocb read_cb, write_cb;
    const struct aiocb *suspend_list[2];
    char *bufs[NR_BUFFERS];
    ssize_t bufs_size[NR_BUFFERS];
    int i, ex_code, reading, writing, read_depleted;
    uint64_t read_buf, write_buf;

    if (-1 == fstat(input, &statbuf)) {
        perror("fstat(input)");
        return EXIT_FAILURE;
    }
    input_size = statbuf.st_size;
    ex_code = 0;

    /* Clear every slot first so cleanup can free all of them unconditionally. */
    for (i = 0; i < NR_BUFFERS; ++i)
        bufs[i] = NULL;
    for (i = 0; i < NR_BUFFERS; ++i) {
        if (!(bufs[i] = malloc(BUFFER_SIZE))) {
            perror("malloc");
            ex_code = EXIT_FAILURE;
            break;
        }
    }

    memset(&read_cb, 0, sizeof(read_cb));
    memset(&write_cb, 0, sizeof(write_cb));
    output_pos = input_pos = 0;
    read_buf = write_buf = 0;
    read_depleted = reading = writing = 0;

    /* Run until the input is exhausted and every filled buffer is written. */
    while (!ex_code && (!read_depleted || write_buf != read_buf)) {
        /* Start a read if none is pending and a free buffer is available. */
        if (!read_depleted && !reading && ((read_buf - write_buf) < NR_BUFFERS)) {
            read_cb.aio_fildes = input;
            read_cb.aio_offset = input_pos;
            read_cb.aio_buf = bufs[read_buf % NR_BUFFERS];
            read_cb.aio_nbytes = BUFFER_SIZE;
            read_cb.aio_sigevent.sigev_notify = SIGEV_NONE;
            if (-1 == aio_read(&read_cb)) {
                perror("aio_read");
                ex_code = EXIT_FAILURE;
                break;
            }
            reading = 1;
        }

        /* Start a write if none is pending and a filled buffer is waiting. */
        if (!writing && (read_buf > write_buf)) {
            write_cb.aio_fildes = output;
            write_cb.aio_offset = output_pos;
            write_cb.aio_buf = bufs[write_buf % NR_BUFFERS];
            write_cb.aio_nbytes = bufs_size[write_buf % NR_BUFFERS];
            write_cb.aio_sigevent.sigev_notify = SIGEV_NONE;
            if (-1 == aio_write(&write_cb)) {
                perror("aio_write");
                ex_code = EXIT_FAILURE;
                break;
            }
            writing = 1;
        }

        /* Sleep until at least one outstanding request completes. */
        suspend_list[0] = reading ? &read_cb : NULL;
        suspend_list[1] = writing ? &write_cb : NULL;
        if (-1 == aio_suspend(suspend_list, 2, NULL)) {
            if (EINTR != errno && EAGAIN != errno) {
                perror("aio_suspend");
                ex_code = EXIT_FAILURE;
                break;
            }
        } else {
            int err;

            if (reading && EINPROGRESS != (err = aio_error(&read_cb))) {
                if (err) {
                    /* `reading` stays set so the request is reaped below. */
                    fprintf(stderr, "read error: %s\n", strerror(err));
                    ex_code = EXIT_FAILURE;
                    break;
                }
                bufs_size[read_buf % NR_BUFFERS] = aio_return(&read_cb);
                reading = 0;
                input_pos += bufs_size[read_buf % NR_BUFFERS];
                /* A zero-byte read marks end of input. */
                if (0 == bufs_size[read_buf % NR_BUFFERS])
                    read_depleted = 1;
                ++read_buf;
            }

            if (writing && EINPROGRESS != (err = aio_error(&write_cb))) {
                ssize_t written;

                if (err) {
                    /* `writing` stays set so the request is reaped below. */
                    fprintf(stderr, "write error: %s\n", strerror(err));
                    ex_code = EXIT_FAILURE;
                    break;
                }
                /* aio_return may be called only once per request, so clear
                 * the flag immediately to keep cleanup from reaping it again. */
                written = aio_return(&write_cb);
                writing = 0;
                if (bufs_size[write_buf % NR_BUFFERS] != written) {
                    fprintf(stderr, "partial write, fuck, can't handle\n");
                    ex_code = EXIT_FAILURE;
                    break;
                }
                output_pos += bufs_size[write_buf % NR_BUFFERS];
                ++write_buf;
            }

            /* Cast so the arguments match the %llu conversions exactly. */
            fprintf(stderr,
                    "\xd%5.1f%% (%llu of %llu; r: %i, w: %i, r-w: %llu)",
                    100*((double)output_pos)/input_size,
                    (unsigned long long)output_pos,
                    (unsigned long long)input_size,
                    reading, writing,
                    (unsigned long long)(read_buf - write_buf)
            );
        }
    }
    fprintf(stderr, "\n");

    /* On an error exit a request may still reference one of the buffers;
     * finish it before the memory is released. */
    if (reading)
        drain_aio(input, &read_cb);
    if (writing)
        drain_aio(output, &write_cb);

    for (i = 0; i < NR_BUFFERS; ++i)
        free(bufs[i]);
    return ex_code;
}
答案 2 :(得分:-2)
不要逐字节复制。应当使用一个大小合理的内存缓冲区(例如,可参考 dd 的 “bs” 选项),并以该缓冲区为粒度进行读写。我以为这是显而易见的。