Question

我有一个多线程应用程序（OpenMP），需要读取一个非常大的文件（大小为 10 GB–350 GB），其中包含基因组读段（字符串）。由于内存有限（8 GB），我们尝试分块处理这个大输入文件：每次将 x 条字符串推入一个 vector，然后使用多个线程对其进行处理；重复此过程，直到输入文件中的所有字符串都处理完毕。但是这种方法很慢。我们还尝试过改变数组的大小（例如 1000000 条字符串），但它比大小为 1000 的数组耗时更多。如何在时间上对其进行优化？

示例代码：

#include <zlib.h>
#include <stdio.h>
#include "kseq.h"
#include <string>
#include <vector>
#include <iostream>
#include <omp.h>
int main()
{
    gzFile fp;
    kseq_t *seq;
    int l;
    int it;
    int read_count=0;
    fp = gzopen("dm.fastq", "r");
    seq = kseq_init(fp);
    vector <string> array;       
 while ((l = kseq_read(seq)) >= 0) 
    {
            if (read_count <= 999) 
        {
                array.push_back(seq->seq.s);
                read_count++;
        }
            if (read_count == 1000) 
        {
        #pragma omp parallel for num_threads(12) schedule(static) private(it) shared(array)
                for (it = 0; it < array.size(); ++it) 
            {
                string line = array[it];
                            int size_s = line.size();
                            char _buf[size_s + 1];
                            strcpy(_buf, line.c_str());
            }
        array.clear();    
        read_count=0;
        }
    }
    #pragma omp parallel for num_threads(12) schedule(static) private(it) shared(array)
        for (it = 0; it < array.size(); ++it) 
            {
                            string line = array[it];
                            int size_s = line.size();
                            char _buf[size_s + 1];
                            strcpy(_buf, line.c_str());
            }
    
    kseq_destroy(seq);gzclose(fp);return 0;
}//main close

Answer 1

首先，您应该知道，仅仅从普通机械硬盘中读取（不做任何处理）一个 100 GB 的文件，就需要大约 17 分钟（按约 100 MB/s 的顺序读取速度估算）。

第二，关于您的代码——OpenMP 被用在了无关紧要的地方，而且循环体没有做任何实际操作（只有 strcpy(_buf, ...)）。这个 for 循环消耗的 CPU 很少，不值得对其进行并行化。这可能只是一个示例，但这一点很重要。

最后，大部分（约 90%）的 CPU 时间都被库函数（kseq_read）和 gzopen 消耗掉了（看起来这个 100 GB 的文件是经过压缩的）。

如何在最短的时间内处理大文件（C ++，OpenMP，输入文件：FASTQ / FASTA）

1 个答案: