OpenGLで計算シェーダーを使用して遅延タイルシェーディングを実行しようとしていますが、各タイルの錐台を作成しようとしたときに障害が発生しています。AMDのForward +デモ（D3Dで記述）をガイドとして使用していますが、ライトがカリングされるべきではないときにカリングされているようです。

更新

更新については以下をお読みください。

これは私の（完全な）計算シェーダーです：

    #version 430 core

#define MAX_LIGHTS 1024
#define MAX_LIGHTS_PER_TILE 40

#define WORK_GROUP_SIZE 16

struct PointLight
{
    vec3 position;
    float radius;
    vec3 color;
    float intensity;
};

layout (binding = 0, rgba32f) uniform writeonly image2D outTexture;
layout (binding = 1, rgba32f) uniform readonly image2D normalDepth;
layout (binding = 2, rgba32f) uniform readonly image2D diffuse;
layout (binding = 3, rgba32f) uniform readonly image2D specular;
layout (binding = 4, rgba32f) uniform readonly image2D glowMatID;

layout (std430, binding = 5) buffer BufferObject
{
    PointLight pointLights[];
};

uniform mat4 view;
uniform mat4 proj;
uniform mat4 viewProj;
uniform mat4 invViewProj;
uniform mat4 invProj;
uniform vec2 framebufferDim;

layout (local_size_x = WORK_GROUP_SIZE, local_size_y = WORK_GROUP_SIZE) in;

shared uint minDepth = 0xFFFFFFFF;
shared uint maxDepth = 0;
shared uint pointLightIndex[MAX_LIGHTS];
shared uint pointLightCount = 0;

vec3 ReconstructWP(float z, vec2 uv_f)
{
    vec4 sPos = vec4(uv_f * 2.0 - 1.0, z, 1.0);
    sPos = invViewProj * sPos;

    return (sPos.xyz / sPos.w);
}

vec4 ConvertProjToView( vec4 p )
{
    p = invProj * p;
    p /= p.w;
    return p;
}

// calculate the number of tiles in the horizontal direction
uint GetNumTilesX()
{
    return uint(( ( 1280 + WORK_GROUP_SIZE - 1 ) / float(WORK_GROUP_SIZE) ));
}

// calculate the number of tiles in the vertical direction
uint GetNumTilesY()
{
    return uint(( ( 720 + WORK_GROUP_SIZE - 1 ) / float(WORK_GROUP_SIZE) ));
}


vec4 CreatePlaneEquation( vec4 b, vec4 c )
{
    vec4 n;

    // normalize(cross( b.xyz-a.xyz, c.xyz-a.xyz )), except we know "a" is the origin
     n.xyz = normalize(cross( b.xyz, c.xyz ));

    // -(n dot a), except we know "a" is the origin
    n.w = 0;

    return n;
}

float GetSignedDistanceFromPlane( vec4 p, vec4 eqn )
{
    // dot( eqn.xyz, p.xyz ) + eqn.w, , except we know eqn.w is zero 
    // (see CreatePlaneEquation above)
    return dot( eqn.xyz, p.xyz );
}

vec4 CalculateLighting( PointLight p, vec3 wPos, vec3 wNormal, vec4 wSpec, vec4 wGlow)
{
    vec3 direction = p.position - wPos;

    if(length(direction) > p.radius)
        return vec4(0.0f, 0.0f, 0.0f, 0.0f);

    float attenuation = 1.0f - length(direction) / (p.radius);
    direction = normalize(direction);
    float diffuseFactor = max(0.0f, dot(direction, wNormal)) * attenuation;
    return vec4(p.color.xyz, 0.0f) * diffuseFactor * p.intensity;
}


void main()
{
        ivec2 pixelPos = ivec2(gl_GlobalInvocationID.xy);
        vec2 tilePos = vec2(gl_WorkGroupID.xy * gl_WorkGroupSize.xy) / vec2(1280, 720);

        vec4 normalColor = imageLoad(normalDepth, pixelPos);

        float d = normalColor.w;

        uint depth = uint(d * 0xFFFFFFFF);

        atomicMin(minDepth, depth);
        atomicMax(maxDepth, depth);

        barrier();

        float minDepthZ = float(minDepth / float(0xFFFFFFFF));
        float maxDepthZ = float(maxDepth / float(0xFFFFFFFF));

        vec4 frustumEqn[4];
        uint pxm = WORK_GROUP_SIZE * gl_WorkGroupID.x;
        uint pym = WORK_GROUP_SIZE * gl_WorkGroupID.y;
        uint pxp = WORK_GROUP_SIZE * (gl_WorkGroupID.x + 1);
        uint pyp = WORK_GROUP_SIZE * (gl_WorkGroupID.y + 1);

        uint uWindowWidthEvenlyDivisibleByTileRes = WORK_GROUP_SIZE * GetNumTilesX();
        uint uWindowHeightEvenlyDivisibleByTileRes = WORK_GROUP_SIZE * GetNumTilesY();

        vec4 frustum[4];
        frustum[0] = ConvertProjToView( vec4( pxm / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pym) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );
        frustum[1] = ConvertProjToView( vec4( pxp / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pym) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );
        frustum[2] = ConvertProjToView( vec4( pxp / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pyp) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f ,1.0f) );
        frustum[3] = ConvertProjToView( vec4( pxm / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pyp) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );

        for (int i = 0; i < 4; i++)
            frustumEqn[i] = CreatePlaneEquation(frustum[i], frustum[(i+1) & 3]);

        barrier();

        int threadsPerTile = WORK_GROUP_SIZE * WORK_GROUP_SIZE;

        for (uint i = 0; i < MAX_LIGHTS; i+= threadsPerTile)
        {
            uint il = gl_LocalInvocationIndex + i;

            if (il < MAX_LIGHTS)
            {
                PointLight p = pointLights[il];

                vec4 viewPos = view * vec4(p.position, 1.0f);
                float r = p.radius;

                if (viewPos.z + minDepthZ < r && viewPos.z - maxDepthZ < r)
                {

                if( ( GetSignedDistanceFromPlane( viewPos, frustumEqn[0] ) < r ) &&
                    ( GetSignedDistanceFromPlane( viewPos, frustumEqn[1] ) < r ) &&
                    ( GetSignedDistanceFromPlane( viewPos, frustumEqn[2] ) < r ) &&
                    ( GetSignedDistanceFromPlane( viewPos, frustumEqn[3] ) < r) )

                    {
                        uint id = atomicAdd(pointLightCount, 1);
                        pointLightIndex[id] = il;
                    }
                }

            }
        }

        barrier();

        vec4 diffuseColor = imageLoad(diffuse, pixelPos);
        vec4 specularColor = imageLoad(specular, pixelPos);
        vec4 glowColor = imageLoad(glowMatID, pixelPos);

        vec2 uv = vec2(pixelPos.x / 1280.0f, pixelPos.y / 720.0f);

        vec3 wp = ReconstructWP(d, uv);
        vec4 color = vec4(0.0f, 0.0f, 0.0f, 1.0f);

        for (int i = 0; i < pointLightCount; i++)
        {
            color += CalculateLighting( pointLights[pointLightIndex[i]], wp, normalColor.xyz, specularColor, glowColor);
        }

        barrier();

        if (gl_LocalInvocationID.x == 0 || gl_LocalInvocationID.y == 0 || gl_LocalInvocationID.x == 16 || gl_LocalInvocationID.y == 16)
            imageStore(outTexture, pixelPos, vec4(.2f, .2f, .2f, 1.0f));
        else
        {
            imageStore(outTexture, pixelPos, color);
            //imageStore(outTexture, pixelPos, vec4(maxDepthZ));
            //imageStore(outTexture, pixelPos, vec4(pointLightCount / 128.0f));
            //imageStore(outTexture, pixelPos, vec4(vec2(tilePos.xy), 0.0f, 1.0f));
        }
}

これは私が問題だと思う部分、カリングの部分です：

        barrier();

    float minDepthZ = float(minDepth / float(0xFFFFFFFF));
    float maxDepthZ = float(maxDepth / float(0xFFFFFFFF));

    vec4 frustumEqn[4];
    uint pxm = WORK_GROUP_SIZE * gl_WorkGroupID.x;
    uint pym = WORK_GROUP_SIZE * gl_WorkGroupID.y;
    uint pxp = WORK_GROUP_SIZE * (gl_WorkGroupID.x + 1);
    uint pyp = WORK_GROUP_SIZE * (gl_WorkGroupID.y + 1);

    uint uWindowWidthEvenlyDivisibleByTileRes = WORK_GROUP_SIZE * GetNumTilesX();
    uint uWindowHeightEvenlyDivisibleByTileRes = WORK_GROUP_SIZE * GetNumTilesY();

    vec4 frustum[4];
    frustum[0] = ConvertProjToView( vec4( pxm / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pym) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );
    frustum[1] = ConvertProjToView( vec4( pxp / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pym) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );
    frustum[2] = ConvertProjToView( vec4( pxp / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pyp) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f ,1.0f) );
    frustum[3] = ConvertProjToView( vec4( pxm / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pyp) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );

    for (int i = 0; i < 4; i++)
        frustumEqn[i] = CreatePlaneEquation(frustum[i], frustum[(i+1) & 3]);

    barrier();

    int threadsPerTile = WORK_GROUP_SIZE * WORK_GROUP_SIZE;

    for (uint i = 0; i < MAX_LIGHTS; i+= threadsPerTile)
    {
        uint il = gl_LocalInvocationIndex + i;

        if (il < MAX_LIGHTS)
        {
            PointLight p = pointLights[il];

            vec4 viewPos = view * vec4(p.position, 1.0f);
            float r = p.radius;

            if (viewPos.z + minDepthZ < r && viewPos.z - maxDepthZ < r)
            {

            if( ( GetSignedDistanceFromPlane( viewPos, frustumEqn[0] ) < r ) &&
                ( GetSignedDistanceFromPlane( viewPos, frustumEqn[1] ) < r ) &&
                ( GetSignedDistanceFromPlane( viewPos, frustumEqn[2] ) < r ) &&
                ( GetSignedDistanceFromPlane( viewPos, frustumEqn[3] ) < r) )

                {
                    uint id = atomicAdd(pointLightCount, 1);
                    pointLightIndex[id] = il;
                }
            }

        }
    }

    barrier();

奇妙なことに、タイルごとの光の数を視覚化すると、何らかの方法で光があるすべてのタイルが表示されます（最初の画像）。

2番目の画像は、最終出力を示しています。画面の中央に細いライトのラインがあり、上下には何もありません。カリングを削除すると（GetSignedDistanceFromPlane（））、フレームレートが岩のように落ちるにもかかわらず、望ましい結果が得られます。

ここに画像の説明を入力してください

私の推測では、錐台は正しく構築されていないと思われますが、その背後にある数学がよくわからないので、現時点でいくつかの助けを使用できます。

編集：予想される出力を示す別の画像を追加しました。

ここに画像の説明を入力してください

アップデート1

カリングの方法を変更しました。コードは次のようになります。

barrier();

float minDepthZ = float(minDepth / float(0xFFFFFFFF));
float maxDepthZ = float(maxDepth / float(0xFFFFFFFF));

//total tiles = tileScale * 2
vec2 tileScale = vec2(1280, 720) * (1.0f / float(2*WORK_GROUP_SIZE));
vec2 tileBias = tileScale - vec2(gl_WorkGroupID.xy);

vec4 c1 = vec4(-proj[0][0] * tileScale.x, 0.0f, tileBias.x, 0.0f);
vec4 c2 = vec4(0.0f, -proj[1][1] * tileScale.y, tileBias.y, 0.0f);
vec4 c4 = vec4(0.0f, 0.0f, 1.0f, 0.0f);

 // Derive frustum planes
vec4 frustumPlanes[6];
// Sides
//right
frustumPlanes[0] = c4 - c1;
//left
frustumPlanes[1] = c4 + c1;
//bottom
frustumPlanes[2] = c4 - c2;
//top
frustumPlanes[3] = c4 + c2;
// Near/far
frustumPlanes[4] = vec4(0.0f, 0.0f,  1.0f, -minDepthZ);
frustumPlanes[5] = vec4(0.0f, 0.0f, -1.0f,  maxDepthZ);

for(int i = 0; i < 4; i++)
{
    frustumPlanes[i] *= 1.0f / length(frustumPlanes[i].xyz);
}

//DO CULLING HERE
for (uint lightIndex = gl_LocalInvocationIndex; lightIndex < numActiveLights; lightIndex += WORK_GROUP_SIZE)
{
    PointLight p = pointLights[lightIndex];

    if (lightIndex < numActiveLights)
    {
        bool inFrustum = true;
        for (uint i = 0; i < 4; i++)
        {
            float dd = dot(frustumPlanes[i], view * vec4(p.position, 1.0f));
            inFrustum = inFrustum && (dd >= -p.radius_length);
        }

        if (inFrustum)
        {
            uint id = atomicAdd(pointLightCount, 1);
            pointLightIndex[id] = lightIndex;
        }
    }
}

barrier();

これにより、タイルに対してライトが適切にカリングされます（最小深度/最大深度はまだ適切に実装されていないため除く）。これまでのところ、とても良い、しかし！ライトのエッジに問題があります。タイルがライトの半径全体をカバーしておらず、パフォーマンスは素晴らしいです。1024ライトは、スタッターのトンで最高40fpsを提供します。

このビデオは、エッジで何が発生するかを示しています。灰色のタイルは、ライト（単一のポイントライト）の影響を受けるタイルで、赤い部分はシェーディングされたジオメトリです。

http://www.youtube.com/watch?v=PiwGcFb9rWk&feature=youtu.be

カリングの際に半径が大きくなるように半径をスケーリングしますが、パフォーマンスの低下をさらに難しくします。

— 弁当
ソース

5

最終的な回答、パフォーマンスの問題を解決しました！代わりに私のカリングループをこれに変更しました（BF3でDiceが使用したものに基づく）

uint threadCount = WORK_GROUP_SIZE * WORK_GROUP_SIZE;
    uint passCount = (numActiveLights + threadCount - 1) /threadCount;
for (uint passIt = 0; passIt < passCount; ++passIt)
{
    uint lightIndex =  passIt * threadCount + gl_LocalInvocationIndex;

    lightIndex = min(lightIndex, numActiveLights);

    p = pointLights[lightIndex];
    pos = view * vec4(p.position, 1.0f);
    rad = p.radius_length;

    if (pointLightCount < MAX_LIGHTS_PER_TILE)
    {
        inFrustum = true;
        for (uint i = 3; i >= 0 && inFrustum; i--)
        {
            dist = dot(frustumPlanes[i], pos);
            inFrustum = (-rad <= dist);
        }

        if (inFrustum)
        {
            id = atomicAdd(pointLightCount, 1);
            pointLightIndex[id] = lightIndex;
        }
    }
}

これで、80 fpsで4096のライトを作成できるようになり、とても満足しています。

— 弁当
ソース

2

問題を部分的に解決しました。これは、遠近面以外のすべてに機能する新しいカリングコードです。パフォーマンスはまだかなり悪いので、誰かが何が原因かを見ることができればそれは高く評価されます。

        ivec2 pixel = ivec2(gl_GlobalInvocationID.xy);

    vec4 normalColor = imageLoad(normalDepth, pixel);

    float d = normalColor.w;

    uint depth = uint(d * 0xFFFFFFFF);

    atomicMin(minDepth, depth);
    atomicMax(maxDepth, depth);

    barrier();

    float minDepthZ = float(minDepth / float(0xFFFFFFFF));
    float maxDepthZ = float(maxDepth / float(0xFFFFFFFF));

    vec2 tileScale = vec2(1280, 720) * (1.0f / float( 2 * WORK_GROUP_SIZE));
    vec2 tileBias = tileScale - vec2(gl_WorkGroupID.xy);

    vec4 col1 = vec4(-proj[0][0]  * tileScale.x, proj[0][1], tileBias.x, proj[0][3]); 

    vec4 col2 = vec4(proj[1][0], -proj[1][1] * tileScale.y, tileBias.y, proj[1][3]);

    vec4 col4 = vec4(proj[3][0], proj[3][1],  -1.0f, proj[3][3]); 

    vec4 frustumPlanes[6];

    //Left plane
    frustumPlanes[0] = col4 + col1;

    //right plane
    frustumPlanes[1] = col4 - col1;

    //top plane
    frustumPlanes[2] = col4 - col2;

    //bottom plane
    frustumPlanes[3] = col4 + col2;

    //near
    frustumPlanes[4] =vec4(0.0f, 0.0f, -1.0f,  -minDepthZ);

    //far
    frustumPlanes[5] = vec4(0.0f, 0.0f, -1.0f,  maxDepthZ);

    for(int i = 0; i < 4; i++)
    {
        frustumPlanes[i] *= 1.0f / length(frustumPlanes[i].xyz);
    }

    //DO CULLING HERE
    for (uint lightIndex = gl_LocalInvocationIndex; lightIndex < numActiveLights; lightIndex += WORK_GROUP_SIZE)
    {
        PointLight p = pointLights[lightIndex];

        if (pointLightCount < MAX_LIGHTS_PER_TILE)
        {
            bool inFrustum = true;
            for (uint i = 3; i >= 0 && inFrustum; i--)
            {
                float dd = dot(frustumPlanes[i], view * vec4(p.position, 1.0f));
                inFrustum = (dd >= -p.radius_length);
            }

            if (inFrustum)
            {
                uint id = atomicAdd(pointLightCount, 1);
                pointLightIndex[id] = lightIndex;
            }
        }
    }

    barrier();

実行中：

http://www.youtube.com/watch?v=8SnvYya1Jn8&feature=youtu.be

— 弁当
ソース

1

ライトインデックスレンダリング/据え置きを実装した経験があります。ライトのエッジについては、imdoingitwrong.wordpress.com / 2011/01/31 / light-attenuationを確認することをお勧めします。これにより、ライトを遮断するしきい値を指定し、計算式を得ることができます。シェーダーに渡すスケール。近くの飛行機と遠くの飛行機については、インデックスされたライトで多くの問題がありました。私が見つけた最良の方法は、近くの平面と交差するライトに対して画面全体を行うことでした。遠方の平面については、深度クランプ（GL_ARB_depth_clamp）を調べることができます

— ashleysmithgpu

1

すみません、スペースが足りません:) パフォーマンスに関しては、おそらくアプリケーションのプロファイルを作成する必要があります。照明を計算するためにメモリへの書き込み、ループ、メモリからの読み取りを行う必要がないので、照明の計算をif（inFrustum）テストの内部に移動すると役立つと思います。

— ashleysmithgpu 2013

助けてくれてありがとう！私はいくつかのプロファイリングをしようとしてきました、そしてそれは現在パフォーマンスを殺している淘汰の段階です。具体的には、それはinFrustum（inFrustum =（dd> = -p.radius_length）に書き込んでいるようですが、何らかの理由でパフォーマンスが完全に殺害されているので、なぜかわかりません。過度の分岐を引き起こしていますか？各スレッドがライトの完全なリストを必要とするため、ライト計算をif（inFrustum）条件内に移動する方法が完全にわかりませんか？

— Bentebent

OpenGLでの遅延タイルシェーディング、タイルフラスタ計算

更新

アップデート1