Question

问题：我试图实时渲染动态的Julia分形。因为分形不断变化，我需要能够每秒渲染至少20帧，最好是更多。您需要了解的关于Julia分形的信息是每个像素都可以独立计算，因此任务很容易并行化。

第一种方法：因为我已经习惯在C＃中使用Monogame，所以我尝试用HLSL编写一个可以完成这项工作的着色器，但编译器一直报错，因为我超出了所允许的64个算术指令槽的上限（我至少需要一千个）。

第二种方法：使用CPU，可以预期大约两分钟生成一帧。

第三种方法：我开始通过名为Cloo的包装器学习OpenCL的基础知识。我用OpenCL计算图像数据，再从GPU取回数据，将其存入Texture2D并把纹理绘制到屏幕上，实际上很快就得到了不错的结果。对于1000x1000的图像，我每秒大约可以获得13帧。这仍然达不到我的期望，因为图像应该是1920x1080才能填满我的屏幕，而且帧率偏低这一点非常明显。我意识到我实际上是在GPU上生成图像，把数据发送到CPU，然后再把它发回GPU，这似乎是一个不必要的步骤；如果能去掉它，可能就能解决我的问题。我在一些论坛上读到OpenGL能够做到这一点，但我还没能找到具体的信息。

问题：首先，是否有一种简单的方法可以直接绘制OpenCL生成的数据而不涉及CPU（最好与Monogame兼容）？如果不是这样的话，是否可以使用OpenGL实现它，然后将它与Monogame结合使用？其次，为什么使用简单的HLSL着色器不能实现这一点？由于HLSL和OpenCL都使用GPU，为什么HLSL在进行许多算术运算时会受到如此多的限制？

修改

我发现这个网站实现的功能与我想要的大致相同，但使用的是GLSL着色器。这让我再次疑惑为什么我用HLSL做不到。不幸的是，由于Monogame（目前还）不支持GLSL，我的问题仍然没有答案。

Answer 1

抱歉，我不使用 OpenCL ，也不使用 C＃，但您可以使用 GLSL 完全在着色器内完成此操作（不过您可能会遇到精度问题：对于Julia这类分形，有时甚至64位double都不够）。无论如何，这里有一个Mandelbrot集的简单例子，是我几年前做的……

CPU端应用程序的 C++ / OpenGL / GLSL / VCL 代码：

//---------------------------------------------------------------------------
#include <vcl.h>
#pragma hdrstop
#include "Unit1.h" // VCL window header
#include "gl\\OpenGL3D_double.cpp" // my GL engine
//---------------------------------------------------------------------------
#pragma package(smart_init)
#pragma resource "*.dfm"
// The form instance; gl_draw() writes the measured GPU time into its Caption.
TForm1 *Form1;
// GL engine screen object: used below for init/resize/exit, the texture list and text output.
OpenGLscreen scr;
// GLSL program loaded from the Mandelbrot vertex/fragment shader files in the constructor.
GLSLprogram shd;
// mx,my   - pan position passed to the shader as uniform "p0"
// mx0,my0 - mouse position seen by the previous FormMouseMove call
// mx1,my1 - mouse position seen by the current FormMouseMove call
float mx=0.0,my=0.0,mx0=0.0,my0=0.0,mx1=0.0,my1=0.0;
// Shift/button state of the previous (sh0) and current (sh1) mouse event.
TShiftState sh0,sh1;
// Client area size in pixels, refreshed in FormResize.
int xs=1,ys=1;
// Index of the gradient texture as returned by scr.txrs.add(); -1 until it is loaded.
int txrmap=-1;
// Zoom factor passed to the shader as uniform "zoom" (multiplies the fragment coordinate).
float zoom=1.000;
// Two GL query objects used for GL_TIMESTAMP queries around the quad draw.
unsigned int queryID[2];
//---------------------------------------------------------------------------
void gl_draw()
    {
    float x,y,dx,dy;
    scr.cls();
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

    // matrix for old GL rendering
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
    glMatrixMode(GL_TEXTURE);
    glLoadIdentity();


    // GLSL uniforms
    shd.bind();
    shd.set1i("txrmap",0);      // texture unit
    shd.set2f("p0",mx,my);      // pan position
    shd.set1f("zoom",zoom);     // zoom

    // issue the first query
    // Records the time only after all previous
    // commands have been completed
    glQueryCounter(queryID[0], GL_TIMESTAMP);

    // QUAD covering screen
    scr.txrs.bind(txrmap);
    glColor3f(1.0,1.0,1.0);
    glBegin(GL_QUADS);
    glTexCoord2f(0.0,0.0); glVertex2f(-1.0,+1.0);
    glTexCoord2f(0.0,1.0); glVertex2f(-1.0,-1.0);
    glTexCoord2f(1.0,1.0); glVertex2f(+1.0,-1.0);
    glTexCoord2f(1.0,0.0); glVertex2f(+1.0,+1.0);
    glEnd();
    shd.unbind();
    scr.txrs.unbind();

    // issue the second query
    // records the time when the sequence of OpenGL
    // commands has been fully executed
    glQueryCounter(queryID[1], GL_TIMESTAMP);


    // GL driver info and GLSL log
    scr.text_init_pix(1.0);
    glColor4f(0.0,0.2,1.0,0.8);
    scr.text(glGetAnsiString(GL_VENDOR));
    scr.text(glGetAnsiString(GL_RENDERER));
    scr.text("OpenGL ver: "+glGetAnsiString(GL_VERSION));
    glColor4f(0.4,0.7,0.8,0.8);
    for (int i=1;i<=shd.log.Length();) scr.text(str_load_lin(shd.log,i,true));
    scr.text_exit();

    scr.exe();
    scr.rfs();

    // wait until the results are available
    int e;
    unsigned __int64 t0,t1;
    for (e=0;!e;) glGetQueryObjectiv(queryID[0],GL_QUERY_RESULT_AVAILABLE,&e);
    for (e=0;!e;) glGetQueryObjectiv(queryID[1],GL_QUERY_RESULT_AVAILABLE,&e);
    glGetQueryObjectui64v(queryID[0], GL_QUERY_RESULT, &t0);
    glGetQueryObjectui64v(queryID[1], GL_QUERY_RESULT, &t1);
    Form1->Caption=AnsiString().sprintf("Time spent on the GPU: %f ms\n", (t1-t0)/1000000.0);
    }
//---------------------------------------------------------------------------
// Form constructor: sets up everything gl_draw() relies on.
// Order: initialise the GL engine on this form, load the gradient texture and
// remember its index in txrmap, load the shader sources, then create the two
// timer query objects.
__fastcall TForm1::TForm1(TComponent* Owner):TForm(Owner)
    {
    // presumably creates the GL context for this window - must precede the GL calls below (confirm in the engine)
    scr.init(this);

    // gradient image sampled by the fragment shader to colour the fractal
    OpenGLtexture txr;
    txr.load      ("gradient.jpg");
    txrmap=scr.txrs.add(txr);

    // only the last two arguments (vertex and fragment shader files) are supplied;
    // the first three are empty (other shader stages, presumably - confirm in the engine)
    shd.set_source_file("","","","Mandelbrot_set.glsl_vert","Mandelbrot_set.glsl_frag");

    // two query objects: timestamps taken before and after the quad in gl_draw()
    glGenQueries(2, queryID);
    }
//---------------------------------------------------------------------------
// Form destroy handler: shuts down the GL engine screen object.
void __fastcall TForm1::FormDestroy(TObject *Sender)
    {
    scr.exit();
    }
//---------------------------------------------------------------------------
// Form resize handler: lets the GL engine adapt to the new window size,
// caches the new client size in xs/ys (used by FormMouseMove) and redraws.
void __fastcall TForm1::FormResize(TObject *Sender)
    {
    scr.resize();
    xs=ClientWidth;
    ys=ClientHeight;
    gl_draw();
    }
//---------------------------------------------------------------------------
// Form paint handler: redraws the whole frame.
void __fastcall TForm1::FormPaint(TObject *Sender)
    {
    gl_draw();
    }
//---------------------------------------------------------------------------
// Mouse move handler: pans the view while the left button is held, then redraws.
//
// X,Y   - mouse position in client pixels
// Shift - button/modifier state for this event
//
// The pixel position is converted to the <-1,+1> range (assuming divide(a,b)
// returns a/b - confirm in the engine): X is mirrored (1 - 2X/(xs-1)) and
// Y becomes 2Y/(ys-1) - 1. The pan position mx,my is moved by the mouse delta
// since the previous call, scaled by zoom.
void __fastcall TForm1::FormMouseMove(TObject *Sender, TShiftState Shift, int X,int Y)
    {
    mx1=1.0-divide(X+X,xs-1);
    my1=divide(Y+Y,ys-1)-1.0;
    sh1=Shift;
    if (sh1.Contains(ssLeft))
        {
        mx-=(mx1-mx0)*zoom;
        my-=(my1-my0)*zoom;
        }
    // remember this event's state as "previous" for the next call
    mx0=mx1; my0=my1; sh0=sh1;
    gl_draw();
    }
//---------------------------------------------------------------------------
// Mouse button press handler: forwards to FormMouseMove so the new button
// state and position are recorded and the frame is redrawn.
void __fastcall TForm1::FormMouseDown(TObject *Sender, TMouseButton Button,TShiftState Shift, int X, int Y)
    {
    FormMouseMove(Sender,Shift,X,Y);
    }
//---------------------------------------------------------------------------
// Mouse button release handler: forwards to FormMouseMove so the new button
// state and position are recorded and the frame is redrawn.
void __fastcall TForm1::FormMouseUp(TObject *Sender, TMouseButton Button,TShiftState Shift, int X, int Y)
    {
    FormMouseMove(Sender,Shift,X,Y);
    }
//---------------------------------------------------------------------------
// Mouse wheel down handler: multiplies zoom by 1.2 and redraws.
// The fragment shader multiplies its coordinate by zoom, so a larger value
// shows a larger area. Handled is left untouched.
void __fastcall TForm1::FormMouseWheelDown(TObject *Sender, TShiftState Shift, TPoint &MousePos, bool &Handled)
    {
    zoom*=1.2;
    gl_draw();
    }
//---------------------------------------------------------------------------
// Mouse wheel up handler: divides zoom by 1.2 and redraws.
// The fragment shader multiplies its coordinate by zoom, so a smaller value
// shows a smaller area. Handled is left untouched.
void __fastcall TForm1::FormMouseWheelUp(TObject *Sender, TShiftState Shift, TPoint &MousePos, bool &Handled)
    {
    zoom/=1.2;
    gl_draw();
    }
//---------------------------------------------------------------------------

您可以忽略大部分代码，重要的是gl_draw()：它渲染一个覆盖整个屏幕的QUAD，并传递zoom和pan位置。此代码使用旧式的glBegin/glEnd以及nVidia的默认属性位置，因此在其他厂商的gfx驱动程序上可能无法运行。网格应当放在VAO / VBO中，这样布局位置才能匹配；具体做法请参见答案末尾的链接，或者将着色器移植到兼容性配置文件（compatibility profile）。

**顶点：**

// Vertex
// Pass-through stage: emits the quad corner unchanged and forwards it to the
// fragment stage as the interpolated coordinate p.
#version 420 core
layout(location=0) in vec2 pos;     // quad corner from glVertex2f, range <-1,+1>
smooth out vec2 p;                  // interpolated copy of pos for the fragment stage, range <-1,+1>
void main()
    {
    gl_Position=vec4(pos.x,pos.y,0.0,1.0);
    p=pos;
    }

**片段：**

// Fragment
// Computes the Mandelbrot escape-time iteration for the point under this
// fragment and colours it by sampling a gradient texture.
#version 420 core
uniform sampler2D txrmap;           // gradient texture used as colour lookup
uniform vec2 p0=vec2(0.0,0.0);      // pan position <-1,+1>
uniform float zoom=1.000;           // zoom [-]
in smooth vec2 p;                   // interpolated screen position <-1,+1>
out vec4 col;
void main()
    {
    int i,n;
    vec2 pp;
    float x,y,q,xx,yy;
    pp=(p*zoom)-p0;         // y (-1, 1)
    pp.x=(1.75*pp.x)-0.75;  // x (-2.5, 1)
    // iterate z = z*z + pp, keeping x*x and y*y cached in xx,yy;
    // stop after n=200 steps or once |z|^2 = xx+yy reaches 4
    for (x=0.0,y=0.0,xx=0.0,yy=0.0,i=0,n=200;(i<n)&&(xx+yy<4.0);i++)
        {
        q=xx-yy+pp.x;
        y=(2.0*x*y)+pp.y;
        x=q;
        xx=x*x;
        yy=y*y;
        }
    // normalised iteration count <0,1> selects the colour along the gradient
    q=float(i)/float(n);
    // texture() is the core-profile lookup; texture2D() does not exist in "#version 420 core"
    col=texture(txrmap,vec2(q,0.5));
//  col=vec4(q,q,q,1.0);
    }

将此纹理用作渐变：

结果截图：

如果你需要开始使用 GLSL （替换我的gl引擎的东西），请参阅：

simple complete GL+VAO/VBO+GLSL+shaders example in C++

但我相信 C＃方面一定也有大量的教程，请自行搜索（google）。

如果您对色彩增强感兴趣，请参阅：

Mandelbrot Set - Color Spectrum Suggestions?

Answer 2

逐一回答这些问题：是的，OpenCL可以绘图，但Monogame显然并没有封装在CL之上，所以问题1的答案是否定的。问题2问得对：也许可以，请参阅下面的建议。问题3：HLSL基本上是PS 1.1，所以“为什么不可能”的原因在于PS后来演进到2.x，才得以通过更宽的数据管线来处理并行化……所以你需要Dx12支持或GLSL / OpenGL。

由于您接近使用CLoo的性能预期，为什么不尝试使用OpenCL.Net和/或OpenTK将Julia计算与Monogame API更紧密地绑定在一起？ - 如果你必须使用GPU-CPU-GPU至少使其成为尽可能宽的管道。

或者，对并行化和帧速率问题稍微侧重的解决方案可能是将GP-GPU包装器（如Quanta的Alea）与Monogame解决方案集成在一起。我建议看看Cudafy，但Alea更强大，支持跨厂商GPU。

构建过程将决定Julia代码的哪一部分通过Alea在GPU上计算，而Monogame部分将接收用于渲染的像素场。关键点在于各个库之间能否“和睦相处”（play-nice）的兼容性，以及——如果你能让它工作的话——最终的帧速率。

底线：你选择了HLSL（也就是微软Dx9），而Monogame不支持GLSL / Dx12……你将不得不另辟蹊径才能摆脱困境。

C＃渲染OpenCL生成的图像

修改

2 个答案: