系数为a和b的两个Chebyshev多项式系列的乘积可以用公式表示
问题是要尽可能并行化。
我已经成功地使用cuda通过简单地对每个向量元素应用一个线程来并行化上述公式。因此,一个线程执行求和/乘法。
class YoutubeAdapter(internal var context: Context, internal var youtubevideosList: List<YoutubeModel>, recyclerView: RecyclerView) : RecyclerView.Adapter<YoutubeAdapter.MViewHolder>() {

    // Hosting list view, kept for callers that need direct access to it.
    internal var recyclerView: RecyclerView? = null

    // Spinner instance; currently unused (see the disabled WebViewClient below).
    internal var progressBar: ProgressBar? = null

    init {
        this.recyclerView = recyclerView
        this.progressBar = ProgressBar(context)
    }

    @RequiresApi(Build.VERSION_CODES.O)
    override fun onCreateViewHolder(parent: ViewGroup, viewType: Int): MViewHolder =
        MViewHolder(LayoutInflater.from(context).inflate(R.layout.youtube_row, parent, false))

    override fun onBindViewHolder(holder: MViewHolder, position: Int) {
        // Wrap the video id in an embed iframe and hand the HTML to the WebView.
        val videoId = youtubevideosList[position].getYoutubeID()
        val embedHtml = "<iframe width=\"100%\" height=\"100%\" src=\"https://www.youtube.com/embed/$videoId\" frameborder=\"0\" allowfullscreen></iframe>"
        holder.youTubePlayerView.loadData(embedHtml, "text/html", "utf-8")
        //holder.descTextview.text=youtubevideosList.get(position).getVideoDesc()
        /* Disabled: toggle a progress spinner while the embed page loads.
        holder.youTubePlayerView.webViewClient = object : WebViewClient() {
            override fun onPageStarted(view: WebView, url: String, favicon: Bitmap) {
                super.onPageStarted(view, url, favicon)
                view.visibility = View.INVISIBLE
                progressBar!!.visibility = View.VISIBLE
            }
            override fun onPageFinished(view: WebView, url: String) {
                super.onPageFinished(view, url)
                view.visibility = View.VISIBLE
                progressBar!!.visibility = View.INVISIBLE
            }
        }*/
    }

    override fun getItemCount(): Int = youtubevideosList.size

    @RequiresApi(Build.VERSION_CODES.O)
    inner class MViewHolder(internal var view: View) : RecyclerView.ViewHolder(view) {
        // Embed host and description label from R.layout.youtube_row.
        internal var youTubePlayerView: WebView = view.findViewById<View>(R.id.webVideoView) as WebView
        internal var descTextview: TextView = view.findViewById<View>(R.id.describtion_tv) as TextView
        internal var progressBar: ProgressBar? = null

        init {
            // The YouTube embed page requires JavaScript.
            youTubePlayerView.getSettings().setJavaScriptEnabled(true)
            progressBar = ProgressBar(context)
            /* youTubePlayerView.setWebChromeClient(object : WebChromeClient() {
            })*/
        }
    }
}
在另一段代码中，我已经成功地使用求和归约（sum reduction）对向量中的所有元素求和（代码是我从 NVIDIA 的演示示例中复制的别人的代码）。现在的问题是：如何把这两种方法结合起来？我想用一组线程来计算 c 的每个元素中的全部求和/乘法。有什么建议吗？或者有没有类似的问题可以供我学习？
矩阵中的行归约（row reduction）可能与这个问题类似。但是我这里的各个求和长度互不相同，并且还带有乘法。
这是nvidia员工提供的代码(我认为)
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <time.h>
// Product of two Chebyshev series with coefficients a and b.
// One thread per output coefficient c[i]; each thread serially
// accumulates the convolution and cross-term sums for its coefficient.
__global__ void chebyprod(int n, float *a, float *b, float *c){
  int i = blockIdx.x *blockDim.x + threadIdx.x;
  if (i < n) {
    float sum = 0.f;
    // Convolution part: sum_{j=0..i} a[j]*b[i-j].
    // BUG FIX: the original read b[j-i], which is a negative (out-of-bounds)
    // index whenever j < i; the correct convolution term is b[i-j].
    for (int j = 0; j <= i; j++){
      sum += a[j]*b[i-j];
    }
    // Cross terms: sum_{j=1..n-i-1} a[j]*b[j+i] + a[j+i]*b[j].
    for (int j = 1; j < n-i; j++){
      sum += a[j]*b[j+i]+a[j+i]*b[j];
    }
    c[i] = 0.5f*sum;
  }
}
int main(void){
  // Coarse host-side timer start.
  clock_t tStart = clock();
  int N = 10000;
  float *a, *b, *c, *d_a, *d_b, *d_c;
  a = (float*)malloc(N*sizeof(float));
  b = (float*)malloc(N*sizeof(float));
  c = (float*)malloc(N*sizeof(float));
  if (!a || !b || !c) { fprintf(stderr, "host allocation failed\n"); return 1; }
  cudaMalloc(&d_a, N*sizeof(float));
  cudaMalloc(&d_b, N*sizeof(float));
  cudaMalloc(&d_c, N*sizeof(float));
  // Constant test coefficients so results are easy to eyeball.
  for (int i = 0; i < N; i++) {
    a[i] = 0.1f;
    b[i] = 0.2f;
  }
  cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice);
  int blockSize, gridSize;
  // Number of threads in each thread block
  blockSize = 1024;
  // Number of thread blocks in grid (ceiling division)
  gridSize = (int)ceil((float)N/blockSize);
  std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";
  // Perform chebyprod on N elements
  chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
  // FIX: kernel launches are asynchronous. Check for launch-config errors
  // and synchronize before stopping the clock; otherwise only the launch
  // overhead is measured, not the kernel's execution time.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
  cudaDeviceSynchronize();
  printf("Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
  cudaMemcpy(c, d_c, N*sizeof(float), cudaMemcpyDeviceToHost);
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << c[k] << " ";
  std::cout <<"]\n";
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  free(a);
  free(b);
  free(c);
}
答案 0 :(得分:2)
问题中提供的代码是实现的明智第一步。线程策略是最常见/典型的策略:每个输出点(此处为N
个输出点)分配一个线程。每个线程必须执行计算特定输出点所需的所有计算。改善CUDA代码性能的动机应始终解决至少2个CUDA优化优先级:
关于第1项,问题中提供的代码的有效性将取决于GPU。根据经验,我们希望在运行的GPU中为每个SM至少启动2048个线程(在Turing上为1024),从而有机会使GPU“饱和”。对于N
= 10000,我们可以使具有5个SM的GPU饱和。对于拥有80个SM的Tesla V100,我们没有希望将具有10,000个线程的GPU饱和。
关于第2项,所提供的代码在某种程度上也不够完善;在合并时会遇到问题:在许多情况下,相邻线程不会读取内存中的相邻值。仅举一个例子,我看到的第一个全局负载为a[j]
。这将为每个线程加载相同的值/位置,而不是在相邻线程中加载相邻的值。
我们能否提出一个在这两个方面都可能得到改善的替代实现？我们将考虑对线程策略做如下更改：为每个输出点分配一个线程块，而不是为每个输出点分配一个线程。每个输出点所需的计算可以看作矩阵的一"行"。线程块沿着该行"跨步"，执行所需的计算，并最终进行线程块级别的归约，以产生该行的单个结果。这将使我们能够同时解决这两个问题：同一个 warp 中的相邻线程将能够从a
和b
中读取相邻值，并且我们还能立即把线程总数最多提高 1024 倍（也就是从一万个线程增加到最多约一千万个线程——一千万个线程足以使当前任何 CUDA GPU 饱和）。该线程策略还有另一个不错的特性：上述计算的"行"具有不同的长度。第一行和最后一行将是最长的，具有大约N
个计算元素,而中间的行将更接近N/2
个计算元素。通过选择跨步循环(概念上类似于grid-stride loop),我们可以有效地处理变化的行长度。每个线程块仅在需要时才沿行“跨步”,从而累积结果。
以下是该实现的可行示例:
$ cat t1497.cu
#include <stdio.h>
#include <iostream>
#include <cuda.h>
typedef float mt;
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
const bool sync = true;   // pass to dtime_usec: cudaDeviceSynchronize before timestamping
const bool nosync = false; // default: timestamp immediately (no GPU sync)
// Microsecond interval timer: returns (now - start) in microseconds.
// When use_sync == sync, waits for all outstanding GPU work first so the
// interval covers kernel execution rather than just the launch.
unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
  if (use_sync == sync) cudaDeviceSynchronize();
  timeval now;
  gettimeofday(&now, 0);
  unsigned long long stamp = (unsigned long long)now.tv_sec * USECPSEC + now.tv_usec;
  return stamp - start;
}
// Baseline kernel: one thread per output coefficient of the Chebyshev
// series product. Each thread serially accumulates its coefficient's
// convolution and cross-term sums (same accumulation order as before,
// so float results are bit-identical).
__global__ void chebyprod(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n) return;   // guard the grid tail
  mt acc = 0.f;
  // Convolution part: sum_{j=0..idx} a[j]*b[idx-j], j ascending.
  for (int j = 0; j <= idx; ++j)
    acc += a[j] * b[idx - j];
  // Cross terms: sum_{j=1..n-idx-1} a[j]*b[j+idx] + a[j+idx]*b[j].
  for (int j = 1; j < n - idx; ++j)
    acc += a[j] * b[j + idx] + a[j + idx] * b[j];
  c[idx] = 0.5f * acc;
}
// assume one threadblock per c_k coefficient
// assume a power-of-2 threadblock size
const int tpb_p2 = 8;   // log2(threads per block)
const int nTPB = 1<<tpb_p2;   // threads per block (256)
const unsigned row_mask = ~((0xFFFFFFFFU>>tpb_p2)<<tpb_p2);   // low tpb_p2 bits set; used to test row_width % nTPB != 0
// Improved kernel: one threadblock (nTPB threads) per output coefficient
// c[k], with k = blockIdx.x. The block strides across the "row" of products
// needed for c[k]; each thread keeps a private partial sum, then the block
// reduces them (warp-shuffle path by default; shared-memory sweep when
// NO_WS is defined).
// NOTE(review): tmp_a is intentionally left unset when j >= n; the guards
// on the two accumulation lines prevent it from being read in that case.
__global__ void chebyprod_imp(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
#ifndef NO_WS
// One slot per warp for the inter-warp stage of the reduction;
// zeroed so warp 0 can safely read slots for warps that wrote nothing.
__shared__ mt sd[32];
if (threadIdx.x < 32) sd[threadIdx.x] = 0;
__syncthreads();
#else
__shared__ mt sd[nTPB];
#endif
int k = blockIdx.x;   // the coefficient this block computes
mt sum = 0.0f;
// Row length is max(k, n-k)+1; strides = ceil(row_width / nTPB).
int row_width = (((k)>(n-k))?(k):(n-k))+1;
int strides = (row_width>>tpb_p2)+ ((row_width&row_mask)?1:0);
int j = threadIdx.x;
mt tmp_a;
for (int s=0; s < strides; s++){ // block-stride loop
if (j < n) tmp_a = a[j];
if (j <= k) sum += tmp_a*b[k-j];
if ((j > 0) && (j < (n-k))) sum += tmp_a*b[j+k] + a[j+k]*b[j];
j += nTPB;
}
#ifndef NO_WS
// 1st warp-shuffle reduction
int lane = threadIdx.x & (warpSize-1);
int warpID = threadIdx.x >> 5; // assumes warpSize == 32
unsigned mask = 0xFFFFFFFFU;
for (int offset = warpSize>>1; offset > 0; offset >>= 1)
sum += __shfl_down_sync(mask, sum, offset);
if (lane == 0) sd[warpID] = sum;
__syncthreads(); // put warp results in shared mem
// hereafter, just warp 0
if (warpID == 0){
// reload val from shared mem if warp existed
sum = sd[lane];
// final warp-shuffle reduction
for (int offset = warpSize>>1; offset > 0; offset >>= 1)
sum += __shfl_down_sync(mask, sum, offset);
}
#else
// Classic shared-memory tree reduction over the whole block.
sd[threadIdx.x] = sum;
for (int s = nTPB>>1; s > 0; s>>=1){ // sweep reduction
__syncthreads();
if (threadIdx.x < s) sd[threadIdx.x] += sd[threadIdx.x+s];}
if (!threadIdx.x) sum = sd[0];
#endif
// Thread 0 holds the block-wide sum; apply the 0.5 factor and store.
if (!threadIdx.x) c[k] = sum*0.5f;
}
int main(int argc, char *argv[]){
// Problem size; may be overridden from the command line.
int N = 10000;
if (argc>1) N = atoi(argv[1]);
std::cout << "N = " << N << std::endl;
// c: baseline kernel's result; ic: improved kernel's result (for comparison).
mt *a, *b, *c, *ic, *d_a, *d_b, *d_c;
a = (mt*)malloc(N*sizeof(mt));
b = (mt*)malloc(N*sizeof(mt));
c = (mt*)malloc(N*sizeof(mt));
ic = (mt*)malloc(N*sizeof(mt));
cudaMalloc(&d_a, N*sizeof(mt));
cudaMalloc(&d_b, N*sizeof(mt));
cudaMalloc(&d_c, N*sizeof(mt));
// Constant test coefficients.
for (int i = 0; i < N; i++) {
a[i] = 0.1f;
b[i] = 0.2f;
}
cudaMemcpy(d_a, a, N*sizeof(mt), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, N*sizeof(mt), cudaMemcpyHostToDevice);
int blockSize, gridSize;
// Number of threads in each thread block
blockSize = 1024;
// Number of thread blocks in grid
gridSize = (int)ceil((float)N/blockSize);
std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";
// Perform chebyprod on N elements
// Timing brackets the launch; the 'sync' flag makes dtime_usec wait for
// kernel completion so execution time (not just launch) is measured.
unsigned long long dt = dtime_usec(0);
chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
dt = dtime_usec(dt,sync);
cudaMemcpy(c, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
printf("Time taken: %fs\n", dt/(float)USECPSEC);
std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
std::cout << "Vector c: [ ";
for (int k = 0; k < 10; ++k)
std::cout << c[k] << " ";
std::cout <<"]\n";
// Improved kernel: one threadblock per output coefficient, so grid size N.
dt = dtime_usec(0);
chebyprod_imp<<< N, nTPB >>>(N, d_a, d_b, d_c);
dt = dtime_usec(dt,sync);
cudaMemcpy(ic, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
printf("Time taken: %fs\n", dt/(float)USECPSEC);
std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
std::cout << "Vector c: [ ";
for (int k = 0; k < 10; ++k)
std::cout << ic[k] << " ";
std::cout <<"]\n";
// Largest absolute element-wise difference between the two results
// (nonzero differences are expected in float due to accumulation order).
mt max_error = 0;
for (int k = 0; k < N; k++)
max_error = fmax(max_error, fabs(c[k] - ic[k]));
std::cout << "Max error = " << max_error << std::endl;
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
free(a);
free(b);
free(c);
free(ic);
}
$ nvcc -arch=sm_52 -o t1497 t1497.cu
$ ./t1497
blockSize: 1024
gridSize: 10
Time taken: 0.001687s
no error
Vector c: [ 199.996 199.986 199.976 199.966 199.956 199.946 199.936 199.926 199.916 199.906 ]
Time taken: 0.000350s
no error
Vector c: [ 199.99 199.98 199.97 199.96 199.95 199.94 199.93 199.92 199.91 199.9 ]
Max error = 0.0137787
$
(更改-arch
开关以匹配您的GPU)
以上示例显示,修改后的算法运行速度提高了约5倍(在Tesla V100上)。尽管存在数值差异,但这是由于浮点问题引起的。为了证明算法给出了正确的结果,请将typedef
从float
切换到double
。您将看到结果基本上不再存在任何数值差异(建议算法在逻辑上是相同的),并且float
分辨率的改进算法版本为数值上的前10个元素提供了答案更接近于double
算法产生的“更准确”的结果。
如前所述，该算法的主要收益出现在无法被原始算法（N 个线程）饱和的GPU上。相对较小的GPU(例如,对于N
= 10000,可能为8个SM或更少)可能无法从中受益,并且实际上代码的运行速度可能会比原始算法慢。
尽管我提到了合并,但对于N
= 10000,此处的输入数据非常小(〜80K字节),将适合大多数GPU的L2缓存。一旦数据位于L2缓存中,低效的访问模式就不再是问题。因此,在这种情况下,该算法的主要好处可能是由于第1项。如果无法利用第1项,则该算法几乎没有收益。
出于测试目的,我使用了warp-stride循环创建了另一个版本。但是,在小型GPU上,它似乎并没有明显更快,而在V100上,它实际上却要慢一些:
#include <stdio.h>
#include <iostream>
#include <cuda.h>
typedef float mt;
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
const bool sync = true;   // pass to dtime_usec: cudaDeviceSynchronize before timestamping
const bool nosync = false; // default: timestamp immediately (no GPU sync)
// Microsecond interval timer: returns (now - start) in microseconds.
// When use_sync == sync, waits for all outstanding GPU work first so the
// interval covers kernel execution rather than just the launch.
unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
if (use_sync == sync) cudaDeviceSynchronize();
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
// Baseline kernel: one thread per output coefficient c[i] of the
// Chebyshev series product; each thread serially accumulates the
// convolution and cross-term sums for its coefficient.
__global__ void chebyprod(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
int i = blockIdx.x *blockDim.x + threadIdx.x;
mt sum;
if (i < n) {
sum = 0.f;
// Convolution part: sum_{j=0..i} a[j]*b[i-j].
for (int j = 0; j<=i; j++){
sum += a[j]*b[i-j];
}
// Cross terms: sum_{j=1..n-i-1} a[j]*b[j+i] + a[j+i]*b[j].
for (int j = 1; j < n-i; j++){
sum += a[j]*b[j+i]+a[j+i]*b[j];
}
c[i] = 0.5f*sum;
}
}
// assume one warp per c_k coefficient
// assume a multiple-of-32 threadblock size
const int nTPB = 32*8;   // threads per block (8 warps)
const int warpSize_p2 = 5; // assumes warpSize == 32
const int nWarps = nTPB>>warpSize_p2;   // warps (= coefficients) per block
const unsigned row_mask = ~((0xFFFFFFFFU>>warpSize_p2)<<warpSize_p2);   // low 5 bits set; tests row_width % warpSize != 0
// Alternative improved kernel: one warp (32 threads) per coefficient,
// nWarps coefficients per block. Each warp strides across its "row" of
// products and finishes with a single warp-shuffle reduction, so no
// shared memory is needed.
__global__ void chebyprod_imp(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
int warpID = threadIdx.x >> warpSize_p2;
int k = blockIdx.x*(nWarps)+warpID;   // coefficient handled by this warp
if (k < n){
mt sum = 0.0f;
int lane = threadIdx.x & (warpSize-1);
// Row length is max(k, n-k)+1; strides = ceil(row_width / warpSize).
int row_width = (((k)>(n-k))?(k):(n-k))+1;
int strides = (row_width>>warpSize_p2)+ ((row_width&row_mask)?1:0);
int j = lane;
// NOTE(review): tmp_a is left unset when j >= n; the guards on the two
// accumulation lines prevent it from being read in that case.
mt tmp_a;
for (int s=0; s < strides; s++){ // warp-stride loop
if (j < n) tmp_a = a[j];
if (j <= k) sum += tmp_a*b[k-j];
if ((j > 0) && (j < (n-k))) sum += tmp_a*b[j+k] + a[j+k]*b[j];
j += warpSize;
}
// warp-shuffle reduction
for (int offset = warpSize>>1; offset > 0; offset >>= 1)
sum += __shfl_down_sync(0xFFFFFFFFU, sum, offset);
if (lane==0) c[k] = sum*0.5f;}
}
int main(int argc, char *argv[]){
    // Problem size; may be overridden from the command line.
    int N = 10000;
    if (argc > 1) N = atoi(argv[1]);
    std::cout << "N = " << N << std::endl;

    // Host buffers: c holds the baseline result, ic the improved kernel's.
    mt *a, *b, *c, *ic, *d_a, *d_b, *d_c;
    a  = (mt*)malloc(N*sizeof(mt));
    b  = (mt*)malloc(N*sizeof(mt));
    c  = (mt*)malloc(N*sizeof(mt));
    ic = (mt*)malloc(N*sizeof(mt));
    cudaMalloc(&d_a, N*sizeof(mt));
    cudaMalloc(&d_b, N*sizeof(mt));
    cudaMalloc(&d_c, N*sizeof(mt));

    // Constant test coefficients.
    for (int i = 0; i < N; i++) {
        a[i] = 0.1f;
        b[i] = 0.2f;
    }
    cudaMemcpy(d_a, a, N*sizeof(mt), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, N*sizeof(mt), cudaMemcpyHostToDevice);

    // Baseline launch: one thread per output coefficient.
    int blockSize = 1024;                          // threads per block
    int gridSize = (int)ceil((float)N/blockSize);  // ceiling division
    std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

    // Time the baseline kernel; 'sync' makes dtime_usec wait for completion.
    unsigned long long dt = dtime_usec(0);
    chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
    dt = dtime_usec(dt,sync);
    cudaMemcpy(c, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
    printf("Time taken: %fs\n", dt/(float)USECPSEC);
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
    std::cout << "Vector c: [ ";
    for (int k = 0; k < 10; ++k)
        std::cout << c[k] << " ";
    std::cout <<"]\n";

    // Improved launch: one warp per coefficient, nWarps coefficients per block.
    dt = dtime_usec(0);
    chebyprod_imp<<< (N/nWarps)+1, nTPB >>>(N, d_a, d_b, d_c);
    dt = dtime_usec(dt,sync);
    cudaMemcpy(ic, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
    printf("Time taken: %fs\n", dt/(float)USECPSEC);
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
    std::cout << "Vector c: [ ";
    for (int k = 0; k < 10; ++k)
        std::cout << ic[k] << " ";
    std::cout <<"]\n";

    // Largest absolute element-wise difference between the two results.
    mt max_error = 0;
    for (int k = 0; k < N; k++)
        max_error = fmax(max_error, fabs(c[k] - ic[k]));
    std::cout << "Max error = " << max_error << std::endl;

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(a);
    free(b);
    free(c);
    free(ic);
}