Question

我正在编写一个程序，一次读取一个大文件（44GB - 63GB）1MB，然后我正在哈希1MB。但是，我想看看执行这些哈希需要多长时间

我不知道一次读取1MB文件需要多长时间，只关心散列性能时间。目前我使用的是非常基本/通用的哈希函数

关于我从哪里开始并结束时钟的任何想法？

这是我到目前为止所做的：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#define HASH_PRIME 65551// prime number for hash table

// generic hash function
static unsigned short hash_Function(char *hash_1MB)
{
    unsigned short hash;
    int i = 0;
    while(hash_1MB[i]!='\0')//each char of the file name
    {
        hash += (unsigned short)hash_1MB[i];//add it to hash
        i++;
    }
    return hash%HASH_PRIME;//mod hash by table size
}

int main()
{
    struct stat fileSize;
    char *buffer;

    FILE *fp;
    clock_t start, stop;
    double duration;
    char fname[40];

    printf("Enter name of file:");
    fgets(fname, 40, stdin);
    while (fname[strlen(fname) - 1] == '\n')
    {
        fname[strlen(fname) - 1] = '\0';
    }

    // handle file, open file, and read in binary form
    fp = fopen(fname, "rb");
    if (fp == NULL)
    {
        printf("Cannot open %s for reading\n", fname);
        exit(1);
    }

    stat(fname, &fileSize);
    size_t size = fileSize.st_size;
    printf("Size of file: %zd\n", size);

    buffer = (char*) malloc(sizeof(*buffer)*1000*1000);

    unsigned long long counter = 0;
    // read in 1MB at a time // & start timing how long it takes to perform the hash
    start = clock();
    clock_t total = 0;
    while (fread(buffer, sizeof(*buffer), (1<<20), fp) == (1<<20)) 
    {
    start = clock();
    hash_Function(buffer);  
    counter++;
    total += (clock() - start);
    }

    //free(buffer);

     fclose (fp); // close files

     duration = (double)((stop - start)/CLOCKS_PER_SEC);

     printf("Counter: %llu\n", counter); // how many MB were hashed
     printf("Hashing took %.2f seconds\n", (float)duration);
     return 0;
}

我的结果也没有像预期的那样出来，我分析的第一个文件是1,961,893,364字节大，所以应该至少有1,961MB被散列

但是当我打印出我的计数器以检查正确数量的MB是否被哈希时我只得到1871

以下是我的结果：

$ gcc one_mb.c
$ ./a.out
Enter name of file:v.10.nc
Size of file: 1961893364
Counter: 1871
Hashing took 0.00 seconds

提前感谢您的帮助！

/////结果w /（1000 * 1000）

Enter name of file:v.13.nc
Size of file: 15695146912
Counter: 15695
Hashing took 18446744.00 seconds

//////结果w / 1＆lt;＆lt; 20

Enter name of file:v.13.nc
Size of file: 15695146912
Counter: 14968
Hashing took 18446744.00 seconds // why this long?!?!? It didn't take 30mins

/////用for循环替换while循环

// generic hash function
static unsigned short hash_Function(char *hash_1MB)
{
    unsigned short hash;
    int i;

    for(i = 0; i < (1 << 20); i++)
    {
        hash += (unsigned short)hash_1MB[i];//add it to hash
    }

    return hash%HASH_PRIME;//mod hash by table size
}

Answer 1

您需要在while循环中记下时间戳并保留它们的总和以避免计时文件IO。

start = clock();
clock_t total = 0;
while (fread(buffer, 1<<20, (1<<20), fp) == (1<<20)) 
{
    start = clock();
    hash_Function(buffer);  
    counter++;
    total += (clock() - start);
}

注意我将1000 * 1000更改为1＆lt;＆lt; 20，因此它实际上是MB的大小。

还要确保正确分配缓冲区至少1 MB。

buffer = (char*) malloc(1<<20);

以下将评估为（char的大小）* 1000 * 1000 = 1000 * 1000，这将无效。

buffer = (char*) malloc(sizeof(*buffer)*1000*1000);

此外，当你执行sizeof（* buffer）时，它也会返回char的大小（1个字节）。查看更新的fread。

Clock（）无法按预期工作;避免IO

1 个答案: