如何在SM3的着色器代码上减少或优化指令槽?

时间:2014-11-26 00:29:33

标签: optimization directx shader hlsl directx-9

以下着色器代码使用了过多的指令槽:

// Read as 1.0 / texture_size.{x,y} in main_vertex and as the scale factor in
// frac(tex0 * texture_size) in main_fragment.
// NOTE(review): presumably the source texture size in texels, set by the host application - confirm.
float2 texture_size;
// The incoming vertex position is multiplied by this matrix in main_vertex.
float4x4 matrixTransform;

// Multiplier applied to df_fg / df_hc in main_fragment before the
// edr_left / edr_up comparisons.
const static float coef = 2.0;
// Per-channel weights: every sampled .xyz triple is reduced to one scalar by
// multiplying with this vector in main_fragment.
// NOTE(review): the name suggests scaled luma (Y) weights - confirm.
const static float3 yuv_weighted = float3(14.352, 28.176, 5.472);

// Sampler for the source texture, bound to sampler register s0.
sampler decal : register(s0);

// Returns the component-wise absolute difference |A - B| of two float4 values.
float4 df(float4 A, float4 B)
{
    return abs(float4(A.x - B.x, A.y - B.y, A.z - B.z, A.w - B.w));
}

// Returns, per component, the sum of four absolute differences
// |a-b| + |a-c| + |d-e| + |d-f| plus four times |g-h|.
float4 weighted_distance(float4 a, float4 b, float4 c, float4 d, float4 e, float4 f, float4 g, float4 h)
{
    return (df(a, b) + df(a, c) + df(d, e) + df(d, f) + 4.0 * df(g, h));
}

// Vertex shader entry point.
//   color, texCoord - inout; not modified here, so they pass through unchanged.
//   position        - inout; replaced by mul(position, matrixTransform).
// Returns (TEXCOORD1) the two sampling steps packed into one float4:
//   xy = (1 / texture_size.x, 0), zw = (0, 1 / texture_size.y).
float4 main_vertex(inout float4 color : COLOR0, inout float2 texCoord : TEXCOORD0, inout float4 position : POSITION0) : TEXCOORD1
{
    // Reciprocal of texture_size per axis.
    float2 ps = float2(1.0 / texture_size.x, 1.0 / texture_size.y);
    float4 t1;

    // The F / H tags match the sample names in main_fragment:
    // F is fetched at tex0 + tex1.xy, H at tex0 + tex1.zw.
    t1.xy = float2(ps.x, 0); // F
    t1.zw = float2(0, ps.y); // H

    position = mul(position, matrixTransform);

    return t1;
}

/*    FRAGMENT SHADER    */
// Pixel shader entry point.
// Fetches 21 samples of decal around tex0 (the 5x5 grid spanned by the dx / dy
// steps, without its four corners), reduces each sample to one scalar with
// yuv_weighted, and outputs either the centre sample E or one of its four
// direct neighbours B, D, F, H. Output alpha is always 1.0.
//   p    - POSITION0 input; not read in the body.
//   tex0 - texture coordinate of the centre sample E.
//   tex1 - xy = step to the next sample along x, zw = step along y
//          (as returned by main_vertex).
// Every bool4 / float4 below carries the same test for four cases at once:
// component x decides between F and H, y between B and F, z between D and B,
// w between H and D (see the final select).
float4 main_fragment(float4 p : POSITION0, float2 tex0 : TEXCOORD0, float4 tex1 : TEXCOORD1) : COLOR0
{
    bool4 edr, edr_left, edr_up, px; // px = pixel, edr = edge detection rule
    bool4 ir_lv1, ir_lv2_left, ir_lv2_up;
    bool4 nc; // new_color
    bool4 fx, fx_left, fx_up; // inequations of straight lines.

    // Fractional part of the coordinate after scaling by texture_size.
    float2 fp = frac(tex0 * texture_size);
    float2 dx = tex1.xy;
    float2 dy = tex1.zw;

    // Sample layout relative to E (x to the right, y downwards in offset terms):
    //        A1 B1 C1
    //     A0 A  B  C  C4
    //     D0 D  E  F  F4
    //     G0 G  H  I  I4
    //        G5 H5 I5
    float3 A  = tex2D(decal, tex0 - dx - dy).xyz;
    float3 B  = tex2D(decal, tex0 - dy).xyz;
    float3 C  = tex2D(decal, tex0 + dx - dy).xyz;
    float3 D  = tex2D(decal, tex0 - dx).xyz;
    float3 E  = tex2D(decal, tex0).xyz;
    float3 F  = tex2D(decal, tex0 + dx).xyz;
    float3 G  = tex2D(decal, tex0 - dx + dy).xyz;
    float3 H  = tex2D(decal, tex0 + dy).xyz;
    float3 I  = tex2D(decal, tex0 + dx + dy).xyz;
    float3 A1 = tex2D(decal, tex0 - dx - 2.0 * dy).xyz;
    float3 C1 = tex2D(decal, tex0 + dx - 2.0 * dy).xyz;
    float3 A0 = tex2D(decal, tex0 - 2.0 * dx - dy).xyz;
    float3 G0 = tex2D(decal, tex0 - 2.0 * dx + dy).xyz;
    float3 C4 = tex2D(decal, tex0 + 2.0 * dx - dy).xyz;
    float3 I4 = tex2D(decal, tex0 + 2.0 * dx + dy).xyz;
    float3 G5 = tex2D(decal, tex0 - dx + 2.0 * dy).xyz;
    float3 I5 = tex2D(decal, tex0 + dx + 2.0 * dy).xyz;
    float3 B1 = tex2D(decal, tex0 - 2.0 * dy).xyz;
    float3 D0 = tex2D(decal, tex0 - 2.0 * dx).xyz;
    float3 H5 = tex2D(decal, tex0 + 2.0 * dy).xyz;
    float3 F4 = tex2D(decal, tex0 + 2.0 * dx).xyz;

    // mul(float4x3, float3) yields one dot product per matrix row, i.e. one
    // weighted scalar per sample. The swizzles re-order the same four scalars:
    //   b = (B, D, H, F)   d = (D, H, F, B)   f = (F, B, D, H)   h = (H, F, B, D)
    //   c = (C, A, G, I)   g = (G, I, C, A)   i = (I, C, A, G)
    float4 b = mul(float4x3(B, D, H, F), yuv_weighted);
    float4 c = mul(float4x3(C, A, G, I), yuv_weighted);
    float4 e = mul(float4x3(E, E, E, E), yuv_weighted);
    float4 d = b.yzwx;
    float4 f = b.wxyz;
    float4 g = c.zwxy;
    float4 h = b.zwxy;
    float4 i = c.wxyz;

    // Outer-ring samples; f4 = (F4, B1, D0, H5).
    float4 i4 = mul(float4x3(I4, C1, A0, G5), yuv_weighted);
    float4 i5 = mul(float4x3(I5, C4, A1, G0), yuv_weighted);
    float4 h5 = mul(float4x3(H5, F4, B1, D0), yuv_weighted);
    float4 f4 = h5.yzwx;

    // Coefficients of the inequality A*fp.y + B*fp.x > C, one component per case.
    // Ao/Bo/Co feed fx, Ax/Bx/Cx feed fx_left, Ay/By/Cy feed fx_up.
    float4 Ao = float4(1.0, -1.0, -1.0, 1.0);
    float4 Bo = float4(1.0, 1.0, -1.0, -1.0);
    float4 Co = float4(1.5, 0.5, -0.5, 0.5);
    float4 Ax = float4(1.0, -1.0, -1.0, 1.0);
    float4 Bx = float4(0.5, 2.0, -0.5, -2.0);
    float4 Cx = float4(1.0, 1.0, -0.5, 0.0);
    float4 Ay = float4(1.0, -1.0, -1.0, 1.0);
    float4 By = float4(2.0, 0.5, -2.0, -0.5);
    float4 Cy = float4(2.0, 0.0, -1.0, 0.5);

    // These inequations define the line below which interpolation occurs.
    fx.x      = (Ao.x * fp.y + Bo.x * fp.x > Co.x);
    fx_left.x = (Ax.x * fp.y + Bx.x * fp.x > Cx.x);
    fx_up.x   = (Ay.x * fp.y + By.x * fp.x > Cy.x);

    fx.y      = (Ao.y * fp.y + Bo.y * fp.x > Co.y);
    fx_left.y = (Ax.y * fp.y + Bx.y * fp.x > Cx.y);
    fx_up.y   = (Ay.y * fp.y + By.y * fp.x > Cy.y);

    fx.z      = (Ao.z * fp.y + Bo.z * fp.x > Co.z);
    fx_left.z = (Ax.z * fp.y + Bx.z * fp.x > Cx.z);
    fx_up.z   = (Ay.z * fp.y + By.z * fp.x > Cy.z);

    fx.w      = (Ao.w * fp.y + Bo.w * fp.x > Co.w);
    fx_left.w = (Ax.w * fp.y + Bx.w * fp.x > Cx.w);
    fx_up.w   = (Ay.w * fp.y + By.w * fp.x > Cy.w);

    // Restrictions that gate the edge rules below:
    //   ir_lv1      - e differs from both f and h
    //   ir_lv2_left - g differs from both e and d
    //   ir_lv2_up   - c differs from both e and b
    ir_lv1.x      = ((e.x != f.x) && (e.x != h.x));
    ir_lv2_left.x = ((e.x != g.x) && (d.x != g.x));
    ir_lv2_up.x   = ((e.x != c.x) && (b.x != c.x));

    ir_lv1.y      = ((e.y != f.y) && (e.y != h.y));
    ir_lv2_left.y = ((e.y != g.y) && (d.y != g.y));
    ir_lv2_up.y   = ((e.y != c.y) && (b.y != c.y));

    ir_lv1.z      = ((e.z != f.z) && (e.z != h.z));
    ir_lv2_left.z = ((e.z != g.z) && (d.z != g.z));
    ir_lv2_up.z   = ((e.z != c.z) && (b.z != c.z));

    ir_lv1.w      = ((e.w != f.w) && (e.w != h.w));
    ir_lv2_left.w = ((e.w != g.w) && (d.w != g.w));
    ir_lv2_up.w   = ((e.w != c.w) && (b.w != c.w));

    // Two weighted distance sums that are compared against each other for edr.
    float4 w1 = weighted_distance(e, c, g, i, h5, f4, h, f);
    float4 w2 = weighted_distance(h, d, i5, f, i4, b, e, i);
    float4 df_fg = df(f, g);
    float4 df_hc = df(h, c);
    // t1/t2 feed edr_left (coef*|f-g| vs |h-c|); t4/t3 feed edr_up (coef*|h-c| vs |f-g|).
    float4 t1 = (coef * df_fg);
    float4 t2 = df_hc;
    float4 t3 = df_fg;
    float4 t4 = (coef * df_hc);

    // edr: w1 strictly smaller than w2, and the level-1 restriction holds.
    edr = bool4((w1.x < w2.x) && ir_lv1.x, 
                (w1.y < w2.y) && ir_lv1.y, 
                (w1.z < w2.z) && ir_lv1.z, 
                (w1.w < w2.w) && ir_lv1.w);

    // edr_left / edr_up use <= (ties count as true) plus their level-2 restriction.
    edr_left = bool4((t1.x <= t2.x) && ir_lv2_left.x, 
                     (t1.y <= t2.y) && ir_lv2_left.y, 
                     (t1.z <= t2.z) && ir_lv2_left.z, 
                     (t1.w <= t2.w) && ir_lv2_left.w);

    edr_up = bool4((t4.x <= t3.x) && ir_lv2_up.x, 
                   (t4.y <= t3.y) && ir_lv2_up.y, 
                   (t4.z <= t3.z) && ir_lv2_up.z, 
                   (t4.w <= t3.w) && ir_lv2_up.w);

    // && binds tighter than ||, so each line reads:
    //   edr && (fx || (edr_left && fx_left) || (edr_up && fx_up))
    nc.x = (edr.x && (fx.x || edr_left.x && fx_left.x || edr_up.x && fx_up.x));
    nc.y = (edr.y && (fx.y || edr_left.y && fx_left.y || edr_up.y && fx_up.y));
    nc.z = (edr.z && (fx.z || edr_left.z && fx_left.z || edr_up.z && fx_up.z));
    nc.w = (edr.w && (fx.w || edr_left.w && fx_left.w || edr_up.w && fx_up.w));

    // t1 / t2 are reused here: distance from e to f and from e to h.
    t1 = df(e, f);
    t2 = df(e, h);

    // px: true when e is at least as close to f as to h (ties pick f).
    px = bool4(t1.x <= t2.x, 
               t1.y <= t2.y, 
               t1.z <= t2.z, 
               t1.w <= t2.w);

    // The first set nc component (checked in x, y, z, w order) selects the
    // replacement colour; px picks which of the two neighbours. If no
    // component is set, the centre sample E is kept.
    float3 res = nc.x ? px.x ? F : H : 
                 nc.y ? px.y ? B : F : 
                 nc.z ? px.z ? D : B : 
                 nc.w ? px.w ? H : D : E;

    return float4(res.x, res.y, res.z, 1.0);
}

// Effect technique with a single pass P0: main_vertex is compiled for the
// vs_3_0 profile and main_fragment for the ps_3_0 profile.
technique T0
{
    pass P0
    {
        VertexShader = compile vs_3_0 main_vertex();
        PixelShader = compile ps_3_0 main_fragment();
    }
}

目前这段代码使用了 601 个指令槽,而 ps_3_0 最多只允许 512 个,因此编译时会出现如下错误:

  

编译后的着色器代码使用了过多的指令槽(601)。目标(ps_3_0)允许的最大值为 512。

最初这段代码使用了 649 个指令槽,我已经把它降到了 601 个。但现在我不知道还能从哪里继续优化了。任何帮助我都将不胜感激。

1 个答案:

答案 0 :(得分:0)

多亏了 @Gnietschow 提供的信息,我解决了这个问题。下面这个优化后的版本使用的指令槽少于 512 个。

// Read as 1.0 / texture_size.{x,y} in main_vertex and as the scale factor in
// frac(tex0 * texture_size) in main_fragment.
// NOTE(review): presumably the source texture size in texels, set by the host application - confirm.
float2 texture_size;
// The incoming vertex position is multiplied by this matrix in main_vertex.
float4x4 matrixTransform;

// Multiplier applied to df_fg / df_hc in main_fragment before the
// edr_left / edr_up comparisons.
const static float coef = 2.0;
// Per-channel weights: every sampled .xyz triple is reduced to one scalar by
// multiplying with this vector in main_fragment.
// NOTE(review): the name suggests scaled luma (Y) weights - confirm.
const static float3 yuv_weighted = float3(14.352, 28.176, 5.472);

// Sampler for the source texture, bound to sampler register s0.
sampler decal : register(s0);

// Per-component absolute difference between two float4 values.
float4 df(float4 A, float4 B)
{
    float4 delta = A - B;
    return abs(delta);
}

// Accumulates, per component, |a-b| + |a-c| + |d-e| + |d-f| + 4*|g-h|.
// Terms are added in the same left-to-right order as a single expression would.
float4 weighted_distance(float4 a, float4 b, float4 c, float4 d,
                         float4 e, float4 f, float4 g, float4 h)
{
    float4 total = df(a, b);
    total = total + df(a, c);
    total = total + df(d, e);
    total = total + df(d, f);
    total = total + 4.0 * df(g, h);
    return total;
}

// Vertex stage.
//   color, texCoord - inout and untouched, so they are forwarded as-is.
//   position        - inout, overwritten with mul(position, matrixTransform).
// The TEXCOORD1 result packs the two sampling steps used by the pixel shader:
//   xy = (1 / texture_size.x, 0)  -> step towards sample F
//   zw = (0, 1 / texture_size.y)  -> step towards sample H
float4 main_vertex(inout float4 color : COLOR0, inout float2 texCoord : TEXCOORD0,
                   inout float4 position : POSITION0) : TEXCOORD1
{
    position = mul(position, matrixTransform);

    // Per-axis reciprocal of texture_size.
    float2 texel = 1.0 / texture_size;

    return float4(texel.x, 0, 0, texel.y);
}

/*    FRAGMENT SHADER    */
// Pixel shader entry point.
// Fetches 21 samples of decal around tex0 (the 5x5 grid spanned by the dx / dy
// steps, without its four corners), reduces each sample to one scalar with
// yuv_weighted, and outputs either the centre sample E or one of its four
// direct neighbours B, D, F, H. Output alpha is always 1.0.
//   p    - POSITION0 input; not read in the body.
//   tex0 - texture coordinate of the centre sample E.
//   tex1 - xy = step to the next sample along x, zw = step along y
//          (as returned by main_vertex).
// Every bool4 / float4 below carries the same test for four cases at once:
// component x decides between F and H, y between B and F, z between D and B,
// w between H and D (see the final select). All comparisons and logical
// operators are therefore written once on whole vectors; they apply per component.
// NOTE(review): the ps_3_0 instruction-slot count of this exact form has not
// been re-measured - recompile and confirm it stays within the 512 limit.
float4 main_fragment(float4 p : POSITION0, float2 tex0 : TEXCOORD0,
                     float4 tex1 : TEXCOORD1) : COLOR0
{
    // Fractional part of the coordinate after scaling by texture_size.
    float2 fp = frac(tex0 * texture_size);
    float2 dx = tex1.xy;
    float2 dy = tex1.zw;

    // Sample layout relative to E:
    //        A1 B1 C1
    //     A0 A  B  C  C4
    //     D0 D  E  F  F4
    //     G0 G  H  I  I4
    //        G5 H5 I5
    float3 A  = tex2D(decal, tex0 - dx - dy).xyz;
    float3 B  = tex2D(decal, tex0 - dy).xyz;
    float3 C  = tex2D(decal, tex0 + dx - dy).xyz;
    float3 D  = tex2D(decal, tex0 - dx).xyz;
    float3 E  = tex2D(decal, tex0).xyz;
    float3 F  = tex2D(decal, tex0 + dx).xyz;
    float3 G  = tex2D(decal, tex0 - dx + dy).xyz;
    float3 H  = tex2D(decal, tex0 + dy).xyz;
    float3 I  = tex2D(decal, tex0 + dx + dy).xyz;
    float3 A1 = tex2D(decal, tex0 - dx - 2.0 * dy).xyz;
    float3 C1 = tex2D(decal, tex0 + dx - 2.0 * dy).xyz;
    float3 A0 = tex2D(decal, tex0 - 2.0 * dx - dy).xyz;
    float3 G0 = tex2D(decal, tex0 - 2.0 * dx + dy).xyz;
    float3 C4 = tex2D(decal, tex0 + 2.0 * dx - dy).xyz;
    float3 I4 = tex2D(decal, tex0 + 2.0 * dx + dy).xyz;
    float3 G5 = tex2D(decal, tex0 - dx + 2.0 * dy).xyz;
    float3 I5 = tex2D(decal, tex0 + dx + 2.0 * dy).xyz;
    float3 B1 = tex2D(decal, tex0 - 2.0 * dy).xyz;
    float3 D0 = tex2D(decal, tex0 - 2.0 * dx).xyz;
    float3 H5 = tex2D(decal, tex0 + 2.0 * dy).xyz;
    float3 F4 = tex2D(decal, tex0 + 2.0 * dx).xyz;

    // mul(float4x3, float3) yields one dot product per matrix row, i.e. one
    // weighted scalar per sample. The swizzles re-order the same four scalars:
    //   b = (B, D, H, F)   d = (D, H, F, B)   f = (F, B, D, H)   h = (H, F, B, D)
    //   c = (C, A, G, I)   g = (G, I, C, A)   i = (I, C, A, G)
    float4 b = mul(float4x3(B, D, H, F), yuv_weighted);
    float4 c = mul(float4x3(C, A, G, I), yuv_weighted);
    float4 e = mul(float4x3(E, E, E, E), yuv_weighted);
    float4 d = b.yzwx;
    float4 f = b.wxyz;
    float4 g = c.zwxy;
    float4 h = b.zwxy;
    float4 i = c.wxyz;

    // Outer-ring samples; f4 = (F4, B1, D0, H5).
    float4 i4 = mul(float4x3(I4, C1, A0, G5), yuv_weighted);
    float4 i5 = mul(float4x3(I5, C4, A1, G0), yuv_weighted);
    float4 h5 = mul(float4x3(H5, F4, B1, D0), yuv_weighted);
    float4 f4 = h5.yzwx;

    // Coefficients of the inequality A*fp.y + B*fp.x > C, one component per case.
    float4 Ao = float4(1.0, -1.0, -1.0, 1.0);
    float4 Bo = float4(1.0, 1.0, -1.0, -1.0);
    float4 Co = float4(1.5, 0.5, -0.5, 0.5);
    float4 Ax = float4(1.0, -1.0, -1.0, 1.0);
    float4 Bx = float4(0.5, 2.0, -0.5, -2.0);
    float4 Cx = float4(1.0, 1.0, -0.5, 0.0);
    float4 Ay = float4(1.0, -1.0, -1.0, 1.0);
    float4 By = float4(2.0, 0.5, -2.0, -0.5);
    float4 Cy = float4(2.0, 0.0, -1.0, 0.5);

    // These inequations define the line below which interpolation occurs.
    bool4 fx      = (Ao * fp.y + Bo * fp.x > Co);
    bool4 fx_left = (Ax * fp.y + Bx * fp.x > Cx);
    bool4 fx_up   = (Ay * fp.y + By * fp.x > Cy);

    // Restrictions that gate the edge rules below.
    bool4 ir_lv1      = (e != f) && (e != h);
    bool4 ir_lv2_left = (e != g) && (d != g);
    bool4 ir_lv2_up   = (e != c) && (b != c);

    // Two weighted distance sums that are compared against each other for edr.
    float4 w1 = weighted_distance(e, c, g, i, h5, f4, h, f);
    float4 w2 = weighted_distance(h, d, i5, f, i4, b, e, i);
    float4 df_fg = df(f, g);
    float4 df_hc = df(h, c);

    // edr = edge detection rule: strict < on the weighted sums.
    bool4 edr = (w1 < w2) && ir_lv1;

    // The level-2 rules use <=, so a tie counts as true. A strict < here would
    // give a different result whenever the two distances are equal.
    bool4 edr_left = ((coef * df_fg) <= df_hc) && ir_lv2_left;
    bool4 edr_up   = ((coef * df_hc) <= df_fg) && ir_lv2_up;

    // nc = new_color: which of the four cases replaces the centre pixel.
    bool4 nc = edr && (fx || (edr_left && fx_left) || (edr_up && fx_up));

    // px = pixel: true when e is at least as close to f as to h.
    // <= again, so a tie picks the first neighbour of each pair.
    bool4 px = (df(e, f) <= df(e, h));

    // The first set nc component (checked in x, y, z, w order) selects the
    // replacement colour; px picks which of the two neighbours. If no
    // component is set, the centre sample E is kept.
    float3 res = nc.x ? px.x ? F : H :
                 nc.y ? px.y ? B : F :
                 nc.z ? px.z ? D : B :
                 nc.w ? px.w ? H : D : E;

    return float4(res, 1.0);
}

// Effect technique with a single pass P0: main_vertex is compiled for the
// vs_3_0 profile and main_fragment for the ps_3_0 profile.
technique T0
{
    pass P0
    {
        VertexShader = compile vs_3_0 main_vertex();
        PixelShader = compile ps_3_0 main_fragment();
    }
}