I have a strange problem. I am writing a simple asynchronous optimization algorithm in C++ using the OpenMP library. The code works and runs without any errors.
I then tried to measure the computation time of a certain section of the code. On my system that section takes about 12 seconds.
Then I noticed that if I comment out lines of code that are completely unrelated to that section, its computation time drops dramatically, down to about 1 second!
I don't know how to give you a minimal piece of code that shows the problem. The code attached below is my original code with every part removed that does not contribute to the timing issue. Unfortunately I cannot strip any more lines, because every line I try to remove changes the execution time of the section I am interested in.
The section I am referring to is this one, near the end of the code:
double gradientD_time = omp_get_wtime();
compute_function_gradient_D(gradient_D, DX, K, M, N);
double gradientD_total = (omp_get_wtime()- gradientD_time);
You can see that I am measuring the computation time of the compute_function_gradient_D() function. If I run the code as-is, that call takes about 12 seconds. If I delete certain lines from the code, its execution time drops to about 1 second. Examples of lines you can try removing:
std::string str_1 = folder + "parameters.dat";
std::string str_2 = folder + "times.dat";
std::string str_3 = folder + "merits.dat";
std::string str_4 = folder + "values.dat";
std::string str_5 = folder + "lipx.dat";
std::string str_6 = folder + "lipd.dat";
or
throw std::exception();
or
merits[iter] = max_br_init;
These lines have absolutely nothing to do with the section whose execution time I am measuring. Why does the execution time change when I remove one of them? What is going on here?
#include <omp.h>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <cstdlib>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <iostream>
#include <stdexcept>
#include <algorithm>
#include "mkl.h"
void compute_function_gradient_D(double *gradient_D, double *DX, int K, int M, int N) {
for (int j = 0; j < K; j++){
for (int i = 0; i < M; i++){
gradient_D[j*M+i] = 0;
for (int k = 0; k < N; k++)
gradient_D[i+M*j] += DX[i+k*M];
}
}
}
double compute_D_const(double *D, int M, int K){
double L1norm_col = 0.0, err0=0, err1 = 0.0, tol=1e-6, normx = 0.0, normy= 0.0, nrm2= 0.0;
int count = 0;
double *Dt_col = new double[K]();
double *DDtb = new double[M]();
double *Dtb = new double[K]();
for (int i = 0; i < M; i++){
Dt_col[0:K:1] = D[i:K:M];
L1norm_col = cblas_dasum(K, Dt_col, 1);
DDtb[i] = L1norm_col;
}
nrm2 = cblas_dnrm2(M, DDtb, 1);
cblas_dscal(M, 1.0/nrm2, DDtb, 1);
err1 = nrm2;
while(std::abs(err1-err0)>tol*err1 && count<20){
err0 = err1;
cblas_dgemv(CblasColMajor, CblasTrans, M, K, 1.0, D, M, DDtb, 1 , 0.0, Dtb, 1);
cblas_dgemv(CblasColMajor, CblasNoTrans, M, K, 1.0, D, M, Dtb, 1, 0.0, DDtb, 1);
normx = cblas_dnrm2(M, DDtb, 1);
normy = cblas_dnrm2(K, Dtb, 1);
err1 = normx/normy;
cblas_dscal(M, 1.0/normx, DDtb, 1);
count++;
if(count>100) break;
}
err1*= err1;
delete [] Dt_col; delete [] DDtb; delete [] Dtb;
return err1;
}
void compute_function_gradient_X(double *gradient_X, double *D, double *DX, int over_X, int fe_X, int K, int M, int kn) {
int current_index_X = 0, col = 0, row = 0;
for (int i = 0; i < (kn+over_X); i++){
gradient_X[i] = 0.0;
current_index_X = fe_X + i;
col = std::floor(current_index_X/K);
row = current_index_X - col*K;
for(int j = 0; j < M; j++)
gradient_X[i] += D[M*row+j]*DX[M*col+j];
}
}
int main (int argc, char **argv) {
srand(time(NULL));
int max_time = 15000;
int max_iter = 1;
int time_flag = 0;
int merit_flag = 0;
int iter_flag = 0;
int iter = 0;
int core_count = 0;
double merit_limit = 1e-6;
double tau_0 = 1;
int number_of_threads = 1;
int M = 0;
int K = 0;
int N = 0;
double entry = 0.0;
int kn = 0.0;
int uneven_X = 0;
int uneven_D = 0;
int k = 0;
double lambda = 1;
double constr = 1;
double warm_up = 10;
std::string data = "../../data/param.dat";
FILE *file = fopen(data.c_str(), "r");
if (file == NULL) {
std::cout << "ERROR" << std::endl;
throw std::exception();
}
fscanf(file, "%lf", &entry); M = entry; fscanf(file, "\n");
fscanf(file, "%lf", &entry); K = entry; fscanf(file, "\n");
fscanf(file, "%lf", &entry); N = entry; fscanf(file, "\n");
fscanf(file, "%lf", &entry); lambda = entry; fscanf(file, "\n");
fscanf(file, "%lf", &entry); constr = entry; fscanf(file, "\n");
fscanf(file, "%lf", &entry); warm_up = entry;
fclose(file);
double *X = new double[N*K]();
double *D = new double[M*K]();
double *S = new double[N*M]();
double *times = new double[max_iter+2*number_of_threads+1]();
double *merits = new double[max_iter+2*number_of_threads+1]();
double *values = new double[max_iter+2*number_of_threads+1]();
double *Lip_X = new double[max_iter+2*number_of_threads+1]();
double *Lip_D = new double[max_iter+2*number_of_threads+1]();
int *actual_iteration_vector = new int[number_of_threads]();
double f_value = 0.0;
for (int i = 0; i < M*N; i++)
f_value += S[i]*S[i];
double *nabla_X_init = new double[K*N]();
double max_br_init = 0.0;
double x_hat_init = 0.0, gradient_init = 0.0, parameter_init = 0.0, tauX_init = 0.0, LipD_init = 0.0;
double m_value = 9999;
int t_warm_up = warm_up*number_of_threads;
LipD_init = compute_D_const(D, M, K);
tauX_init = std::max(LipD_init, tau_0);
cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, K, N, M, -1.0, D, M, S, M, 0.0, nabla_X_init, K);
for (int i = 0; i < (K*N); i++){
gradient_init = nabla_X_init[i];
x_hat_init = X[i] - gradient_init/tauX_init;
parameter_init = lambda/tauX_init;
if (x_hat_init >= parameter_init)
x_hat_init -= parameter_init;
else {
if (x_hat_init <= -parameter_init)
x_hat_init += parameter_init;
else
x_hat_init = 0.0;
}
if(std::abs(x_hat_init-X[i]) >= max_br_init)
max_br_init = std::abs(x_hat_init-X[i]);
}
double *D_col_init = new double[M]();
double *D_hat_init = new double[M*K]();
double *max_br = new double[number_of_threads]();
std::fill(max_br, max_br+number_of_threads, -9999);
D_hat_init[0:M*K:1] = D[0:M*K:1];
double col_norm_init = 0.0;
for (int i = 0; i < K; i++){
D_col_init[0:M:1] = D[(i*M):M:1];
col_norm_init = cblas_dnrm2(M, D_col_init, 1);
if(col_norm_init > constr)
D_hat_init[(i*M):M:1] *= constr/col_norm_init;
}
for (int i = 0; i < (M*K); i++){
if(std::abs(D_hat_init[i]-D[i]) >= max_br_init)
max_br_init = std::abs(D_hat_init[i]-D[i]);
}
values[iter] = 0.5*f_value;
merits[iter] = max_br_init;
times[iter] = 0.0;
iter++;
kn = std::floor((K*N)/number_of_threads);
uneven_X = (K*N % number_of_threads);
k = std::floor(K/number_of_threads);
uneven_D = (K % number_of_threads);
delete [] nabla_X_init; delete [] D_col_init; delete [] D_hat_init;
double total = omp_get_wtime();
double init_time = omp_get_wtime() - total;
int thread_id = 0;
thread_id = omp_get_thread_num();
int over_X = 0;
int over_D = 0;
if ((uneven_X != 0) && (thread_id == (number_of_threads-1)))
over_X = uneven_X;
if ((uneven_D != 0) && (thread_id == (number_of_threads-1)))
over_D = uneven_D;
double *gradient_X = new double[kn+over_X]();
double *delta_X = new double[kn+over_X]();
double *delta_D = new double[(k+over_D)*M]();
double *D_col = new double[M]();
int fe_X = thread_id*kn;
int fe_D = thread_id*k;
double end = 0.0, LipX = 0.0, LipD = 0.0, tauX = 0.0, tauD = 0.0, X_hat = 0.0, col_norm = 0.0, max_br_local = 0.0;
double *D_hat = new double[(k+over_D)*M]();
double *times_local = new double[max_iter+1]();
double *merits_local = new double[max_iter+1]();
double *values_local = new double[max_iter+1]();
int current_index_X = 0, current_index_D = 0;
int actual_iteration = 1;
times_local[0] = times[0];
merits_local[0] = merits[0];
values_local[0] = values[0];
actual_iteration_vector[thread_id] = 1;
double start = omp_get_wtime();
double gradientX_total = 0.0;
double *gradient_D = new double[(k+over_D)*M]();
double *DX = new double[M*N]();
while (iter_flag == 0 && merit_flag == 0 && time_flag == 0){
double gradientX_time = omp_get_wtime();
compute_function_gradient_X(gradient_X, D, DX, over_X, fe_X, K, M, kn);
gradientX_total += (omp_get_wtime()-gradientX_time);
double gradientD_time = omp_get_wtime();
compute_function_gradient_D(gradient_D, DX, K, M, N);
double gradientD_total = (omp_get_wtime()- gradientD_time);
printf("Gradient D total = %f \n", gradientD_total);
iter++;
if ((omp_get_wtime() - total) >= max_time)
time_flag = 1;
if (m_value <= merit_limit)
merit_flag = 1;
if (iter >= max_iter)
iter_flag = 1;
}
end = omp_get_wtime();
#pragma omp barrier
int value = 0;
for(int i = 0; i < thread_id; i++)
value += (actual_iteration_vector[i]-1);
for (int i = 0; i < (actual_iteration_vector[thread_id]-1); i++){
times[value+1+i] = times_local[i+1];
merits[value+1+i] = merits_local[i+1];
values[value+1+i] = values_local[i+1];
}
delete [] X; delete [] D; delete [] S; delete [] times; delete [] merits; delete [] values; delete [] Lip_X;
delete [] Lip_D; delete [] actual_iteration_vector; delete [] max_br; delete [] gradient_D; delete [] DX;
delete [] gradient_X; delete [] delta_X; delete [] delta_D; delete [] D_col; delete [] D_hat;
delete [] times_local; delete [] merits_local; delete [] values_local;
std::string folder = "../results/";
std::string str_1 = folder + "parameters.dat";
std::string str_2 = folder + "times.dat";
std::string str_3 = folder + "merits.dat";
std::string str_4 = folder + "values.dat";
std::string str_5 = folder + "lipx.dat";
std::string str_6 = folder + "lipd.dat";
return 0;
}
The meaning of the code does not matter; in fact, since I removed so many lines, it no longer does anything meaningful. At the beginning a file called "param" is read; it simply contains six nonzero input values:
64
64
255025
0.125
1
1000000
To run the code, I use the following CMake file:
project(example)
cmake_minimum_required(VERSION 2.8)
set(CMAKE_CXX_COMPILER "icc")
set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS)
set(CMAKE_CXX_FLAGS "-qopenmp -mkl=sequential")
add_executable(example main.cpp)
I generate the Makefile with the cmake command, then run make, and finally execute the binary.
Answer (score: 1):
Fortunately, I was able to reproduce the problem. The Intel compiler has a very useful flag, -qopt-report=5. It creates a file main.optrpt containing everything interesting about the compiler's optimizations, and in particular, for the slow version:
-> (225,5) compute_function_gradient_D(double *, double *, int, int, int) (isz = 56) (sz = 69)
[[ Inlining inhibited by overrideable criterion <2>]]
The fast version is simply missing that second line.
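(In case you want to generate the same report with the question's build setup, a minimal sketch, assuming the CMakeLists.txt shown above, is to append the flag to CMAKE_CXX_FLAGS:
set(CMAKE_CXX_FLAGS "-qopenmp -mkl=sequential -qopt-report=5")
and rerun cmake and make.)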
Based on the following:
INLINING OPTION VALUES:
-inline-factor: 100
-inline-min-size: 30
-inline-max-size: 230
-inline-max-total-size: 2000
-inline-max-per-routine: 10000
-inline-max-per-compile: 500000
my guess is that criterion <2> is -inline-max-size.
Indeed, adding -inline-max-size=999 speeds the slow version of the code up to the same level! So the size of the main function, changed by seemingly unrelated statements, is what inhibits the inlining.
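For completeness, a sketch of setting that override with the question's build files (999 is simply the value used in the experiment above, not a recommended setting):
set(CMAKE_CXX_FLAGS "-qopenmp -mkl=sequential -qopt-report=5 -inline-max-size=999")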
You may still be wondering where the huge difference between the inlined and non-inlined versions comes from. The function call itself is certainly not the issue, but let's look at the respective reports.
For the function itself:
Begin optimization report for: compute_function_gradient_D(double *, double *, int, int, int)
Report from: Interprocedural optimizations [ipo]
INLINE REPORT: (compute_function_gradient_D(double *, double *, int, int, int)) [10/60=16.7%] main.cpp(17,41)
Report from: Loop nest, Vector & Auto-parallelization optimizations [loop, vec, par]
LOOP BEGIN at main.cpp(18,3)
remark #15344: loop was not vectorized: vector dependence prevents vectorization
remark #15346: vector dependence: assumed FLOW dependence between gradient_D[j*M+i] (20:7) and DX[i+k*M] (22:9)
remark #15346: vector dependence: assumed ANTI dependence between DX[i+k*M] (22:9) and gradient_D[j*M+i] (20:7)
LOOP BEGIN at main.cpp(19,5)
remark #15344: loop was not vectorized: vector dependence prevents vectorization
remark #15346: vector dependence: assumed FLOW dependence between gradient_D[j*M+i] (20:7) and DX[i+k*M] (22:9)
remark #15346: vector dependence: assumed ANTI dependence between DX[i+k*M] (22:9) and gradient_D[j*M+i] (20:7)
LOOP BEGIN at main.cpp(21,7)
remark #15344: loop was not vectorized: vector dependence prevents vectorization
remark #15346: vector dependence: assumed FLOW dependence between gradient_D[i+M*j] (22:9) and gradient_D[i+M*j] (22:9)
remark #15346: vector dependence: assumed ANTI dependence between gradient_D[i+M*j] (22:9) and gradient_D[i+M*j] (22:9)
remark #25439: unrolled with remainder by 2
LOOP END
LOOP BEGIN at main.cpp(21,7)
<Remainder>
LOOP END
LOOP END
LOOP END
The inlined version:
LOOP BEGIN at main.cpp(18,3) inlined into main.cpp(225,5)
<Distributed chunk1>
remark #25426: Loop Distributed (2 way)
remark #15542: loop was not vectorized: inner loop was already vectorized
LOOP BEGIN at main.cpp(19,5) inlined into main.cpp(225,5)
<Distributed chunk1>
remark #25426: Loop Distributed (2 way)
remark #25408: memset generated
remark #15542: loop was not vectorized: inner loop was already vectorized
LOOP BEGIN at main.cpp(19,5) inlined into main.cpp(225,5)
<Distributed chunk1>
remark #15389: vectorization support: reference U55_V[j*M+i] has unaligned access [ main.cpp(20,7) ]
remark #15381: vectorization support: unaligned access used inside loop body
remark #15305: vectorization support: vector length 2
remark #15399: vectorization support: unroll factor set to 2
remark #15309: vectorization support: normalized vectorization overhead 0.300
remark #15301: PARTIAL LOOP WAS VECTORIZED
remark #15451: unmasked unaligned unit stride stores: 1
remark #15475: --- begin vector cost summary ---
remark #15476: scalar cost: 4
remark #15477: vector cost: 2.500
remark #15478: estimated potential speedup: 1.450
remark #15488: --- end vector cost summary ---
remark #25015: Estimate of max trip count of loop=3
LOOP END
LOOP BEGIN at main.cpp(19,5) inlined into main.cpp(225,5)
<Remainder loop for vectorization, Distributed chunk1>
remark #25015: Estimate of max trip count of loop=12
LOOP END
LOOP END
LOOP END
LOOP BEGIN at main.cpp(18,3) inlined into main.cpp(225,5)
<Distributed chunk2>
remark #25444: Loopnest Interchanged: ( 1 2 3 ) --> ( 1 3 2 )
remark #15542: loop was not vectorized: inner loop was already vectorized
LOOP BEGIN at main.cpp(21,7) inlined into main.cpp(225,5)
<Distributed chunk2>
remark #15542: loop was not vectorized: inner loop was already vectorized
LOOP BEGIN at main.cpp(19,5) inlined into main.cpp(225,5)
<Peeled loop for vectorization>
remark #25015: Estimate of max trip count of loop=1
LOOP END
LOOP BEGIN at main.cpp(19,5) inlined into main.cpp(225,5)
remark #15388: vectorization support: reference U55_V[i+M*j] has aligned access [ main.cpp(22,9) ]
remark #15388: vectorization support: reference U55_V[i+M*j] has aligned access [ main.cpp(22,9) ]
remark #15388: vectorization support: reference U58_V[i+k*M] has aligned access [ main.cpp(22,34) ]
remark #15305: vectorization support: vector length 2
remark #15399: vectorization support: unroll factor set to 4
remark #15309: vectorization support: normalized vectorization overhead 0.700
remark #15301: PERMUTED LOOP WAS VECTORIZED
remark #15442: entire loop may be executed in remainder
remark #15448: unmasked aligned unit stride loads: 2
remark #15449: unmasked aligned unit stride stores: 1
remark #15475: --- begin vector cost summary ---
remark #15476: scalar cost: 8
remark #15477: vector cost: 2.500
remark #15478: estimated potential speedup: 3.050
remark #15488: --- end vector cost summary ---
LOOP END
LOOP BEGIN at main.cpp(19,5) inlined into main.cpp(225,5)
<Alternate Alignment Vectorized Loop>
LOOP END
LOOP BEGIN at main.cpp(19,5) inlined into main.cpp(225,5)
<Remainder loop for vectorization>
remark #15388: vectorization support: reference U55_V[i+M*j] has aligned access [ main.cpp(22,9) ]
remark #15388: vectorization support: reference U55_V[i+M*j] has aligned access [ main.cpp(22,9) ]
remark #15389: vectorization support: reference U58_V[i+k*M] has unaligned access [ main.cpp(22,34) ]
remark #15381: vectorization support: unaligned access used inside loop body
remark #15335: remainder loop was not vectorized: vectorization possible but seems inefficient. Use vector always directive or -vec-threshold0 to override
remark #15305: vectorization support: vector length 2
remark #15309: vectorization support: normalized vectorization overhead 1.083
LOOP END
LOOP END
LOOP END
In the inlined version the compiler knows more about the parameters, whereas the standalone function has to work for arbitrary arguments. The report, however, reveals the crucial optimization: the loop nest is distributed into two parts, and the loop order of the second part is interchanged into a more efficient one that walks through memory linearly. The same transformation can also be applied to the C code itself:
void compute_function_gradient_D(double *gradient_D, double *DX, int K, int M,
                                 int N) {
  for (int j = 0; j < K; j++) {
    for (int i = 0; i < M; i++) {
      gradient_D[j * M + i] = 0;
    }
  }
  for (int j = 0; j < K; j++) {
    for (int k = 0; k < N; k++) {
      for (int i = 0; i < M; i++) {
        gradient_D[i + M * j] += DX[i + k * M];
      }
    }
  }
}
With this code, the compiler vectorizes the second loop nest even when the function is not inlined, giving similar performance.
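The report also hints at why the non-inlined version is not vectorized at all: the compiler assumes possible FLOW/ANTI dependences between gradient_D and DX, i.e. that the two pointers might alias. Another option, shown here only as a sketch I have not benchmarked, is to promise the compiler that the buffers never overlap (which is true in this program, since both come from separate new[] allocations):
void compute_function_gradient_D(double *__restrict gradient_D,
                                 const double *__restrict DX,
                                 int K, int M, int N) {
  // Same loops as in the question; __restrict tells icpc that the output
  // array cannot overlap the input, removing the assumed dependences.
  for (int j = 0; j < K; j++) {
    for (int i = 0; i < M; i++) {
      gradient_D[j * M + i] = 0;
      for (int k = 0; k < N; k++)
        gradient_D[i + M * j] += DX[i + k * M];
    }
  }
}
This should let the vectorizer handle the loop even without inlining, although whether it also enables the loop distribution and interchange is something you would have to check in the report.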
As you can see, the whole thing has nothing to do with OpenMP.
icpc 17.0.1, -fopenmp -mkl=sequential -Wall -g -O3