我有一个C程序,它读取一个非常大的文件(大小在5GB到65GB之间),对文件中的数据进行转置,然后将转置后的数据写入其他文件。总的来说,由于转置,结果文件大约是输入文件的30倍大。我使用的是Open MPI,因此每个进程都会写入自己的文件。
每个进程都能以非常快的速度将最初约18GB的数据写入各自的结果文件。但是,到了这个阶段,程序就慢得像在爬行,top命令输出中的%CPU也从约100%骤降到0.3%。
有人能解释一下原因吗?我是不是达到了某个系统限制?
代码:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
/* Byte length of the input file; main() stores the ftell() result taken at end-of-file here. */
unsigned long long impute_len=0;
/* Called from main() with a start byte offset, an end byte offset and the calling process's MPI rank. */
void write_results(unsigned long long, unsigned long long, int);
/*
 * Entry point, run by every MPI process.
 *
 * Each process opens "infile.txt" and measures its length. Rank 0 then
 * splits the file into whole-record byte ranges, sends one [start, end)
 * range to every other rank, and processes the first range itself. All
 * other ranks receive their range and pass it to write_results().
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if the input file cannot be opened
 * or measured. Those failures are detected before MPI_Init() so the process
 * can simply exit.
 *
 * MPI return codes are not inspected: under the default error handler
 * (MPI_ERRORS_ARE_FATAL) a failing MPI call aborts the job.
 */
int main(int argc, char **argv){
    /* Length in bytes of one input record (one line of the input file). */
    enum { RECORD_LEN = 54577 };

    // the impute output
    impute_fp = fopen("infile.txt", "r");
    if (impute_fp == NULL) {
        perror("infile.txt");
        return EXIT_FAILURE;
    }

    // find input file length
    if (fseek(impute_fp, 0, SEEK_END) != 0) {
        perror("fseek");
        fclose(impute_fp);
        return EXIT_FAILURE;
    }
    /*
     * ftell() returns long and -1 on error; check before converting so an
     * error does not turn into a huge unsigned length.
     * NOTE(review): where long is 32 bits this fails for files over 2 GB;
     * a 64-bit offset API would be needed there.
     */
    const long file_len = ftell(impute_fp);
    if (file_len < 0) {
        perror("ftell");
        fclose(impute_fp);
        return EXIT_FAILURE;
    }
    impute_len = (unsigned long long)file_len;

    // Now replicate this process to create parallel processes.
    MPI_Init(&argc, &argv);

    // Rank and communicator size are plain int in the MPI C API; passing a
    // pointer to a wider type leaves part of the variable unwritten.
    int proc_id = 0;
    int num_procs = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &proc_id);
    MPI_Comm_size(MPI_COMM_WORLD, &num_procs);

    if (proc_id == 0) {
        const unsigned long long tot_recs = impute_len / RECORD_LEN;
        const unsigned long long workers = (unsigned long long)num_procs - 1;

        /*
         * Every worker gets the rounded-down share and rank 0 takes that
         * share plus the remainder. Rounding the worker share up instead
         * can make workers * share exceed tot_recs, which underflows the
         * unsigned subtraction below.
         */
        const unsigned long long recs_per_proc = tot_recs / (unsigned long long)num_procs;
        const unsigned long long root_recs = tot_recs - recs_per_proc * workers;

        const unsigned long long root_bytes = root_recs * RECORD_LEN;
        const unsigned long long chunk_bytes = recs_per_proc * RECORD_LEN;

        // distribute a portion to each child process
        for (int rank = 1; rank < num_procs; rank++) {
            unsigned long long range[2];
            range[0] = root_bytes + (unsigned long long)(rank - 1) * chunk_bytes;
            range[1] = range[0] + chunk_bytes;
            MPI_Send(range, 2, MPI_UNSIGNED_LONG_LONG, rank, 0, MPI_COMM_WORLD);
        }

        // root proc bit of work
        write_results(0, root_bytes, proc_id);
    } else {
        // worker process: receive [start, end) and write my portion of file
        unsigned long long range[2];
        MPI_Status status;
        MPI_Recv(range, 2, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD, &status);
        write_results(range[0], range[1], proc_id);
    }

    MPI_Finalize();
    fclose(impute_fp);
    return EXIT_SUCCESS;
}
/*
 * Writes this process's share of the transposed output.
 *
 * start, end: byte offsets passed by main(); presumably the half-open range
 *             [start, end) of the input file -- TODO confirm against the
 *             elided body.
 * proc_id:    MPI rank of the calling process.
 *
 * NOTE(review): the body below is a placeholder in the original listing. The
 * line starting with "**" is not C, the brace after it presumably closes an
 * elided loop, and results_fp is not declared anywhere in the visible code.
 */
void write_results(unsigned long long start, unsigned long long end, int proc_id){
**logic to write out transposed data here
}
/* Close the output stream once all data has been written. */
fclose(results_fp);
}