XNAでxBRまたはhqxフィルターを実行するにはどうすればよいですか？

シェーダーでhqxフィルター（hq2x、hq3xまたはhq4x）の1つまたはxBRフィルターを使用して、ゲームを拡大表示したいと思います。

XNA 4.0およびSM3でこれを行うにはどうすればよいですか？

補足：この質問は、現在の問題になるように大幅に編集されています。

— テスト
ソース

おもしろい質問です。FXAAの粗いポストプロセスフィルターを使ってポイントサンプリングを行うと、似たような結果が得られます。

— ヤーノシュTuránszki

en.wikipedia.org/wiki/Hqxは、それらがどのように機能するかを大まかに説明し、実装へのリンクをいくつか持っています。

— アダム

おそらく興味深いgithub.com/pdjonov/hqnx

— ClassicThunder

私はhqxSharpプロジェクトを使用してハックバージョンを動作させましたが、これは遅いです（警告）。まともなフレームレートを維持できるものが必要です。

— テスト

また、CGは、XNAのベースとなっているDirectX 9と互換性があると思いました。リンクの例の1つをHLSLファイルであるかのようにコンパイルしてみてください。github.com/libretro/common-shaders/tree/master/hqx

— ClassicThunder

回答:

ベクトル演算を使用して、命令の数を減らすことができます。たとえば、

edr = bool4((w1.x < w2.x) && ir_lv1.x, 
            (w1.y < w2.y) && ir_lv1.y, 
            (w1.z < w2.z) && ir_lv1.z, 
            (w1.w < w2.w) && ir_lv1.w);

あなたは書ける

edr = (w1 < w2) && ir_lv1;

HLSLの演算子は&&、2つのbool3値のような論理的なものであっても、ベクトルに適用できます。これらの演算子は、操作をコンポーネントごとに実行します。

シェーダーコード

float2 texture_size;
float4x4 matrixTransform;

const static float coef = 2.0;
const static float3 yuv_weighted = float3(14.352, 28.176, 5.472);

sampler decal : register(s0);

float4 df(float4 A, float4 B)
{

    // begin optimization: reduction of 42 instruction slots
    float4 result = float4(A.x - B.x, A.y - B.y, A.z - B.z, A.w - B.w);

    return abs(result);
    // end optimization

    /* old code 

    //return float4(abs(A.x - B.x), abs(A.y - B.y), abs(A.z - B.z), abs(A.w - B.w));
    */
}

float4 weighted_distance(float4 a, float4 b, float4 c, float4 d, float4 e, float4 f, float4 g, float4 h)
{
    return (df(a, b) + df(a, c) + df(d, e) + df(d, f) + 4.0 * df(g, h));
}

float4 main_vertex(inout float2 texCoord : TEXCOORD0, inout float4 position : SV_Position) : TEXCOORD1
{
    float2 ps = float2(1.0 / texture_size.x, 1.0 / texture_size.y);
    float4 t1;

    t1.xy = float2(ps.x, 0); // F
    t1.zw = float2(0, ps.y); // H

    position = mul(position, matrixTransform);

    return t1;
}

/*    FRAGMENT SHADER    */
float4 main_fragment(float4 p : POSITION0, float2 tex0 : TEXCOORD0, float4 tex1 : TEXCOORD1) : COLOR0
{
    bool4 edr, edr_left, edr_up, px; // px = pixel, edr = edge detection rule
    bool4 ir_lv1, ir_lv2_left, ir_lv2_up;
    bool4 nc; // new_color
    bool4 fx, fx_left, fx_up; // inequations of straight lines.

    float2 fp = frac(tex0 * texture_size);
    float2 dx = tex1.xy;
    float2 dy = tex1.zw;

    float3 A = tex2D(decal, tex0 - dx - dy).xyz;
    float3 B = tex2D(decal, tex0 - dy).xyz;
    float3 C = tex2D(decal, tex0 + dx - dy).xyz;
    float3 D = tex2D(decal, tex0 - dx).xyz;
    float3 E = tex2D(decal, tex0).xyz;
    float3 F = tex2D(decal, tex0 + dx).xyz;
    float3 G = tex2D(decal, tex0 - dx + dy).xyz;
    float3 H = tex2D(decal, tex0 + dy).xyz;
    float3 I = tex2D(decal, tex0 + dx + dy).xyz;
    float3 A1 = tex2D(decal, tex0 - dx - 2.0*dy).xyz;
    float3 C1 = tex2D(decal, tex0 + dx - 2.0*dy).xyz;
    float3 A0 = tex2D(decal, tex0 - 2.0*dx - dy).xyz;
    float3 G0 = tex2D(decal, tex0 - 2.0*dx + dy).xyz;
    float3 C4 = tex2D(decal, tex0 + 2.0*dx - dy).xyz;
    float3 I4 = tex2D(decal, tex0 + 2.0*dx + dy).xyz;
    float3 G5 = tex2D(decal, tex0 - dx + 2.0*dy).xyz;
    float3 I5 = tex2D(decal, tex0 + dx + 2.0*dy).xyz;
    float3 B1 = tex2D(decal, tex0 - 2.0*dy).xyz;
    float3 D0 = tex2D(decal, tex0 - 2.0*dx).xyz;
    float3 H5 = tex2D(decal, tex0 + 2.0*dy).xyz;
    float3 F4 = tex2D(decal, tex0 + 2.0*dx).xyz;

    float4 b = mul(float4x3(B, D, H, F), yuv_weighted);
    float4 c = mul(float4x3(C, A, G, I), yuv_weighted);
    float4 e = mul(float4x3(E, E, E, E), yuv_weighted);
    float4 d = b.yzwx;
    float4 f = b.wxyz;
    float4 g = c.zwxy;
    float4 h = b.zwxy;
    float4 i = c.wxyz;

    float4 i4 = mul(float4x3(I4, C1, A0, G5), yuv_weighted);
    float4 i5 = mul(float4x3(I5, C4, A1, G0), yuv_weighted);
    float4 h5 = mul(float4x3(H5, F4, B1, D0), yuv_weighted);
    float4 f4 = h5.yzwx;

    float4 Ao = float4(1.0, -1.0, -1.0, 1.0);
    float4 Bo = float4(1.0, 1.0, -1.0, -1.0);
    float4 Co = float4(1.5, 0.5, -0.5, 0.5);
    float4 Ax = float4(1.0, -1.0, -1.0, 1.0);
    float4 Bx = float4(0.5, 2.0, -0.5, -2.0);
    float4 Cx = float4(1.0, 1.0, -0.5, 0.0);
    float4 Ay = float4(1.0, -1.0, -1.0, 1.0);
    float4 By = float4(2.0, 0.5, -2.0, -0.5);
    float4 Cy = float4(2.0, 0.0, -1.0, 0.5);

    // These inequations define the line below which interpolation occurs.
    fx.x = (Ao.x*fp.y + Bo.x*fp.x > Co.x);
    fx_left.x = (Ax.x*fp.y + Bx.x*fp.x > Cx.x);
    fx_up.x = (Ay.x*fp.y + By.x*fp.x > Cy.x);

    fx.y = (Ao.y*fp.y + Bo.y*fp.x > Co.y);
    fx_left.y = (Ax.y*fp.y + Bx.y*fp.x > Cx.y);
    fx_up.y = (Ay.y*fp.y + By.y*fp.x > Cy.y);

    fx.z = (Ao.z*fp.y + Bo.z*fp.x > Co.z);
    fx_left.z = (Ax.z*fp.y + Bx.z*fp.x > Cx.z);
    fx_up.z = (Ay.z*fp.y + By.z*fp.x > Cy.z);

    fx.w = (Ao.w*fp.y + Bo.w*fp.x > Co.w);
    fx_left.w = (Ax.w*fp.y + Bx.w*fp.x > Cx.w);
    fx_up.w = (Ay.w*fp.y + By.w*fp.x > Cy.w);

    //ir_lv1.x = ((e.x != f.x) && (e.x != h.x));
    //ir_lv1.y = ((e.y != f.y) && (e.y != h.y));
    //ir_lv1.z = ((e.z != f.z) && (e.z != h.z));
    //ir_lv1.w = ((e.w != f.w) && (e.w != h.w));
    ir_lv1 = ((e != f) && (e != h));

    //ir_lv2_left.x = ((e.x != g.x) && (d.x != g.x));
    //ir_lv2_left.y = ((e.y != g.y) && (d.y != g.y));
    //ir_lv2_left.z = ((e.z != g.z) && (d.z != g.z));
    //ir_lv2_left.w = ((e.w != g.w) && (d.w != g.w));
    ir_lv2_left = ((e != g) && (d != g));

    //ir_lv2_up.x = ((e.x != c.x) && (b.x != c.x));
    //ir_lv2_up.y = ((e.y != c.y) && (b.y != c.y));
    //ir_lv2_up.z = ((e.z != c.z) && (b.z != c.z));
    //ir_lv2_up.w = ((e.w != c.w) && (b.w != c.w));
    ir_lv2_up = ((e != c) && (b != c));

    float4 w1 = weighted_distance(e, c, g, i, h5, f4, h, f);
    float4 w2 = weighted_distance(h, d, i5, f, i4, b, e, i);

    // begin optimization: reduction of 6 instruction slots
    float4 df_fg = df(f, g);
    float4 df_hc = df(h, c);
    // end optimization

    float4 t1 = (coef * df_fg);
    float4 t2 = df_hc;
    float4 t3 = df_fg;
    float4 t4 = (coef * df_hc);

    //edr = bool4((w1.x < w2.x) && ir_lv1.x, 
    //            (w1.y < w2.y) && ir_lv1.y, 
    //            (w1.z < w2.z) && ir_lv1.z, 
    //            (w1.w < w2.w) && ir_lv1.w);
    edr = (w1 < w2) && ir_lv1;

    //edr_left = bool4((t1.x <= t2.x) && ir_lv2_left.x, 
    //                 (t1.y <= t2.y) && ir_lv2_left.y, 
    //                 (t1.z <= t2.z) && ir_lv2_left.z, 
    //                 (t1.w <= t2.w) && ir_lv2_left.w);
    edr_left = (t1 <= t2) && ir_lv2_left;

    //edr_up = bool4((t4.x <= t3.x) && ir_lv2_up.x, 
    //               (t4.y <= t3.y) && ir_lv2_up.y, 
    //               (t4.z <= t3.z) && ir_lv2_up.z, 
    //               (t4.w <= t3.w) && ir_lv2_up.w);
    edr_up = (t4 <= t3) && ir_lv2_up;

    //nc.x = (edr.x && (fx.x || edr_left.x && fx_left.x || edr_up.x && fx_up.x));
    //nc.y = (edr.y && (fx.y || edr_left.y && fx_left.y || edr_up.y && fx_up.y));
    //nc.z = (edr.z && (fx.z || edr_left.z && fx_left.z || edr_up.z && fx_up.z));
    //nc.w = (edr.w && (fx.w || edr_left.w && fx_left.w || edr_up.w && fx_up.w));
    nc = (edr && (fx || edr_left && fx_left || edr_up && fx_up));

    // to actually compile this shader, uncomment the following line
    // which reduces the instruction count to under 512
    //nc.zw = (float2)0;

    t1 = df(e, f);
    t2 = df(e, h);

    //px = bool4(t1.x <= t2.x, 
    //           t1.y <= t2.y, 
    //           t1.z <= t2.z, 
    //           t1.w <= t2.w);
    px = t1 <= t2;

    float3 res = nc.x ? px.x ? F : H : nc.y ? px.y ? B : F : nc.z ? px.z ? D : B : nc.w ? px.w ? H : D : E;

    return float4(res.x, res.y, res.z, 1.0);
}

technique mainTech
{
    pass mainPass
    {
        VertexShader = compile vs_3_0 main_vertex();
        PixelShader = compile ps_3_0 main_fragment();
    }
}

ピクチャー

Redshrikeによって原画像を 4倍にスケールアップされています。

ポイントサンプリング

— ゾギ
ソース

私はすでにそれらの最適化を私の回答で使用しました。それは私が見ていた命令スロットエラーを乗り越えることができた方法でした。

— テスト

じゃあ心配しないで。私は少し遅すぎました:)

— zogi 2014年

ir_lv1 = ((e != f) && (e != h)); ir_lv2_left = ((e != g) && (d != g)); ir_lv2_up = ((e != c) && (b != c)); これらはあなたが見つけたことを逃してしまった優れた最適化です。他の最適化で命令数を減らすことができたので、問題のためにそれらを必要とすることにはなりませんでした。

— テスト

OK。素敵なトピックですが。あなたの質問の前に、これらのアルゴリズムについて聞いたことがありません。アルゴリズムをある程度理解するのに役立つhqxに関するこのブログ投稿を見つけました。興味のある方は是非お勧めします。

— zogi 2014年

私はこれを手に入れました。これはhqxフィルターを使用せず、xBRフィルターを使用します（私が好む）。私にとって、これは問題ではありません。hqxフィルターが必要な場合は、.cgファイルを適切なXNAに変換する必要があります。

完全性と検索上の理由から、質問をより簡潔になるように編集し、関連するすべての情報を投稿して、ここで質問に回答します。

ステップ1：ゲームコードの設定

最初に、ゲームを1：1のスケールで描画し、次にフィルターをレンダリングするレンダーターゲットをセットアップすることをお勧めします。

using Microsoft.Xna.Framework;
using Microsoft.Xna.Framework.Graphics;

namespace xbr
{
    /// <summary>
    /// This is the main type for your game
    /// </summary>
    public class Game1 : Microsoft.Xna.Framework.Game
    {

        GraphicsDeviceManager graphics;
        SpriteBatch spriteBatch;
        RenderTarget2D renderTarget;
        Effect xbrEffect;
        Matrix projection;
        Matrix halfPixelOffset = Matrix.CreateTranslation(-0.5f, -0.5f, 0);
        Texture2D pretend240x160Scene;

        // the bounds of your 1:1 scene
        Rectangle renderBounds = new Rectangle(0, 0, 240, 160);

        // the bounds of your output scene (same w:h ratio)
        Rectangle outputBounds = new Rectangle(0, 0, 720, 480);

        public Game1()
        {
           base.Content.RootDirectory = "Content";

           this.graphics = new GraphicsDeviceManager(this);
           this.graphics.PreferredBackBufferWidth = outputBounds.Width;
           this.graphics.PreferredBackBufferHeight = outputBounds.Height;
        }

        /// <summary>
        /// Allows the game to perform any initialization it needs to before starting to run.
        /// This is where it can query for any required services and load any non-graphic
        /// related content.  Calling base.Initialize will enumerate through any components
        /// and initialize them as well.
        /// </summary>
        protected override void Initialize()
        {
            // TODO: Add your initialization logic here

            base.Initialize();
        }

        /// <summary>
        /// LoadContent will be called once per game and is the place to load
        /// all of your content.
        /// </summary>
        protected override void LoadContent()
        {
            // Create a new SpriteBatch, which can be used to draw textures.
            this.spriteBatch = new SpriteBatch(base.GraphicsDevice);
            this.xbrEffect = Content.Load<Effect>("xbr");

            // a fake scene that is a 240x160 image
            this.pretend240x160Scene = base.Content.Load<Texture2D>("240x160Scene");
            this.renderTarget = new RenderTarget2D(base.GraphicsDevice, this.renderBounds.Width, this.renderBounds.Height);

            // default vertex matrix for the vertex method
            this.projection = Matrix.CreateOrthographicOffCenter(0, this.outputBounds.Width, this.outputBounds.Height, 0, 0, 1);

            // set the values of this effect, should only have to do this once
            this.xbrEffect.Parameters["matrixTransform"].SetValue(halfPixelOffset * projection);
            this.xbrEffect.Parameters["textureSize"].SetValue(new float[] { renderBounds.Width, renderBounds.Height });
        }

        /// <summary>
        /// UnloadContent will be called once per game and is the place to unload
        /// all content.
        /// </summary>
        protected override void UnloadContent()
        {
        }

        /// <summary>
        /// Allows the game to run logic such as updating the world,
        /// checking for collisions, gathering input, and playing audio.
        /// </summary>
        /// <param name="gameTime">Provides a snapshot of timing values.</param>
        protected override void Update(GameTime gameTime)
        {
            base.Update(gameTime);
        }

        /// <summary>
        /// This is called when the game should draw itself.
        /// </summary>
        /// <param name="gameTime">Provides a snapshot of timing values.</param>
        protected override void Draw(GameTime gameTime)
        {
            base.GraphicsDevice.Clear(Color.CornflowerBlue);
            base.GraphicsDevice.SetRenderTarget(this.renderTarget);

            // draw your scene here scaled 1:1. for now I'll just draw
            // my fake 240x160 texture
            spriteBatch.Begin(SpriteSortMode.Deferred, BlendState.NonPremultiplied, 
                              SamplerState.PointClamp, null, null);

            spriteBatch.Draw(this.pretend240x160Scene, this.renderBounds, this.renderBounds, Color.White);

            spriteBatch.End();

            // now we'll draw to the back buffer
            base.GraphicsDevice.SetRenderTarget(null);

            // this renders the effect
            spriteBatch.Begin(SpriteSortMode.Immediate, BlendState.NonPremultiplied, 
                              SamplerState.PointClamp, null, null, this.xbrEffect);

            spriteBatch.Draw(this.renderTarget, this.outputBounds, this.renderBounds, Color.White);
            spriteBatch.End();

            base.Draw(gameTime);
        }
    }
}

ステップ2：エフェクトファイル

以下は、xBRフィルターを実行するためのXNA互換エフェクトファイルです。

// all identified optimizations have been amalgamated into this file
float2 textureSize;
float4x4 matrixTransform;

const static float coef = 2.0;
const static float3 yuv_weighted = float3(14.352, 28.176, 5.472);

sampler decal : register(s0);

float4 df(float4 A, float4 B)
{
    return abs(A - B);
}

float4 weighted_distance(float4 a, float4 b, float4 c, float4 d, 
                         float4 e, float4 f, float4 g, float4 h)
{
    return (df(a, b) + df(a, c) + df(d, e) + df(d, f) + 4.0 * df(g, h));
}

float4 main_vertex(inout float4 col0 : COLOR0, inout float2 tex0 : TEXCOORD0, 
                   inout float4 pos0 : POSITION0) : TEXCOORD1
{
    float2 ps = 1.0 / textureSize;

    pos0 = mul(pos0, matrixTransform);

    return float4(ps.x, 0, 0, ps.y);
}

float4 main_fragment(float4 pos0 : POSITION0, float2 tex0 : TEXCOORD0, 
                     float4 tex1 : TEXCOORD1) : COLOR0
{
    bool4 edr, edr_left, edr_up, px; // px = pixel, edr = edge detection rule
    bool4 ir_lv1, ir_lv2_left, ir_lv2_up;
    bool4 nc; // new_color
    bool4 fx, fx_left, fx_up; // inequations of straight lines.

    float2 fp = frac(tex0 * textureSize);
    float2 dx = tex1.xy;
    float2 dy = tex1.zw;

    float3 A  = tex2D(decal, tex0 - dx - dy).xyz;
    float3 B  = tex2D(decal, tex0 - dy).xyz;
    float3 C  = tex2D(decal, tex0 + dx - dy).xyz;
    float3 D  = tex2D(decal, tex0 - dx).xyz;
    float3 E  = tex2D(decal, tex0).xyz;
    float3 F  = tex2D(decal, tex0 + dx).xyz;
    float3 G  = tex2D(decal, tex0 - dx + dy).xyz;
    float3 H  = tex2D(decal, tex0 + dy).xyz;
    float3 I  = tex2D(decal, tex0 + dx + dy).xyz;
    float3 A1 = tex2D(decal, tex0 - dx - 2.0 * dy).xyz;
    float3 C1 = tex2D(decal, tex0 + dx - 2.0 * dy).xyz;
    float3 A0 = tex2D(decal, tex0 - 2.0 * dx - dy).xyz;
    float3 G0 = tex2D(decal, tex0 - 2.0 * dx + dy).xyz;
    float3 C4 = tex2D(decal, tex0 + 2.0 * dx - dy).xyz;
    float3 I4 = tex2D(decal, tex0 + 2.0 * dx + dy).xyz;
    float3 G5 = tex2D(decal, tex0 - dx + 2.0 * dy).xyz;
    float3 I5 = tex2D(decal, tex0 + dx + 2.0 * dy).xyz;
    float3 B1 = tex2D(decal, tex0 - 2.0 * dy).xyz;
    float3 D0 = tex2D(decal, tex0 - 2.0 * dx).xyz;
    float3 H5 = tex2D(decal, tex0 + 2.0 * dy).xyz;
    float3 F4 = tex2D(decal, tex0 + 2.0 * dx).xyz;

    float4 b = mul(float4x3(B, D, H, F), yuv_weighted);
    float4 c = mul(float4x3(C, A, G, I), yuv_weighted);
    float4 e = mul(float4x3(E, E, E, E), yuv_weighted);
    float4 d = b.yzwx;
    float4 f = b.wxyz;
    float4 g = c.zwxy;
    float4 h = b.zwxy;
    float4 i = c.wxyz;

    float4 i4 = mul(float4x3(I4, C1, A0, G5), yuv_weighted);
    float4 i5 = mul(float4x3(I5, C4, A1, G0), yuv_weighted);
    float4 h5 = mul(float4x3(H5, F4, B1, D0), yuv_weighted);
    float4 f4 = h5.yzwx;

    float4 Ao = float4(1.0, -1.0, -1.0, 1.0);
    float4 Bo = float4(1.0, 1.0, -1.0, -1.0);
    float4 Co = float4(1.5, 0.5, -0.5, 0.5);
    float4 Ax = float4(1.0, -1.0, -1.0, 1.0);
    float4 Bx = float4(0.5, 2.0, -0.5, -2.0);
    float4 Cx = float4(1.0, 1.0, -0.5, 0.0);
    float4 Ay = float4(1.0, -1.0, -1.0, 1.0);
    float4 By = float4(2.0, 0.5, -2.0, -0.5);
    float4 Cy = float4(2.0, 0.0, -1.0, 0.5);

    // These inequations define the line below which interpolation occurs.
    fx.x = (Ao.x * fp.y + Bo.x * fp.x > Co.x);
    fx.y = (Ao.y * fp.y + Bo.y * fp.x > Co.y);
    fx.z = (Ao.z * fp.y + Bo.z * fp.x > Co.z);
    fx.w = (Ao.w * fp.y + Bo.w * fp.x > Co.w);

    fx_left.x = (Ax.x * fp.y + Bx.x * fp.x > Cx.x);
    fx_left.y = (Ax.y * fp.y + Bx.y * fp.x > Cx.y);
    fx_left.z = (Ax.z * fp.y + Bx.z * fp.x > Cx.z);
    fx_left.w = (Ax.w * fp.y + Bx.w * fp.x > Cx.w);

    fx_up.x = (Ay.x * fp.y + By.x * fp.x > Cy.x);
    fx_up.y = (Ay.y * fp.y + By.y * fp.x > Cy.y);
    fx_up.z = (Ay.z * fp.y + By.z * fp.x > Cy.z);
    fx_up.w = (Ay.w * fp.y + By.w * fp.x > Cy.w);

    ir_lv1      = ((e != f) && (e != h));
    ir_lv2_left = ((e != g) && (d != g));
    ir_lv2_up   = ((e != c) && (b != c));

    float4 w1 = weighted_distance(e, c, g, i, h5, f4, h, f);
    float4 w2 = weighted_distance(h, d, i5, f, i4, b, e, i);
    float4 df_fg = df(f, g);
    float4 df_hc = df(h, c);
    float4 t1 = (coef * df_fg);
    float4 t2 = df_hc;
    float4 t3 = df_fg;
    float4 t4 = (coef * df_hc);

    edr      = (w1 < w2)  && ir_lv1;
    edr_left = (t1 <= t2) && ir_lv2_left;
    edr_up   = (t4 <= t3) && ir_lv2_up;

    nc = (edr && (fx || edr_left && fx_left || edr_up && fx_up));

    t1 = df(e, f);
    t2 = df(e, h);
    px = t1 <= t2;

    float3 res = nc.x ? px.x ? F : H : 
                 nc.y ? px.y ? B : F : 
                 nc.z ? px.z ? D : B : 
                 nc.w ? px.w ? H : D : E;

    return float4(res.xyz, 1.0);
}

technique T0
{
    pass P0
    {
        VertexShader = compile vs_3_0 main_vertex();
        PixelShader = compile ps_3_0 main_fragment();
    }
}

結果

240x160のレンダリングに使用したテクスチャ：

xBR入力

ゲームの実行による出力：

xBR出力

ソース

XNA互換に変換した.cgファイルは、ここから取得したものです。クレジットはそれを書くために彼らに行きます。

— テスト
ソース