319 lines
13 KiB
C
319 lines
13 KiB
C
#define USE_CMP_FIDELITY_FX_H
|
|
#ifndef A_CPU
|
|
#define A_CPU
|
|
#endif
|
|
#include "common_def.h"
|
|
|
|
// Call to setup required constant values (works on CPU or GPU).
|
|
// Fills the four constant vectors that FsrEasuF() takes as con0..con3.
// Every float value is stored into the unsigned vectors through AU1_AF1().
//   con0..con3               - outputs; written element by element below.
//   inputViewportInPixelsX/Y - the rendered part of the input image.
//   inputSizeInPixelsX/Y     - the size of the input image.
//   outputSizeInPixelsX/Y    - only used as the divisor of the viewport scale in con0.
A_STATIC void FsrEasuCon(CMP_OUT CGU_Vec4ui con0,
                         CMP_OUT CGU_Vec4ui con1,
                         CMP_OUT CGU_Vec4ui con2,
                         CMP_OUT CGU_Vec4ui con3,
                         // This is the rendered part of the input image.
                         CGU_FLOAT inputViewportInPixelsX,
                         CGU_FLOAT inputViewportInPixelsY,
                         // This is the size of the input image.
                         CGU_FLOAT inputSizeInPixelsX,
                         CGU_FLOAT inputSizeInPixelsY,
                         CGU_FLOAT outputSizeInPixelsX,
                         CGU_FLOAT outputSizeInPixelsY )
{
    // Reciprocals are reused by several constants, so evaluate each one once.
    CGU_FLOAT rcpOutX = ARcpF1(outputSizeInPixelsX);
    CGU_FLOAT rcpOutY = ARcpF1(outputSizeInPixelsY);
    CGU_FLOAT rcpInX  = ARcpF1(inputSizeInPixelsX);
    CGU_FLOAT rcpInY  = ARcpF1(inputSizeInPixelsY);

    // con0: output integer position -> pixel position in the viewport
    //       (xy = scale, zw = offset).
    con0[0] = AU1_AF1(inputViewportInPixelsX * rcpOutX);
    con0[1] = AU1_AF1(inputViewportInPixelsY * rcpOutY);
    con0[2] = AU1_AF1(0.5f * inputViewportInPixelsX * rcpOutX - 0.5f);
    con0[3] = AU1_AF1(0.5f * inputViewportInPixelsY * rcpOutY - 0.5f);

    // con1.xy: viewport pixel position -> normalized image space.
    //          Used to get the upper-left of the 'F' tap.
    con1[0] = AU1_AF1(rcpInX);
    con1[1] = AU1_AF1(rcpInY);

    // Centers of gather4; the first offset is taken from the upper-left of 'F'.
    //      +---+---+
    //      |   |   |
    //      +--(0)--+
    //      | b | c |
    //  +---F---+---+---+
    //  | e | f | g | h |
    //  +--(1)--+--(2)--+
    //  | i | j | k | l |
    //  +---+---+---+---+
    //      | n | o |
    //      +--(3)--+
    //      |   |   |
    //      +---+---+
    con1[2] = AU1_AF1( 1.0f * rcpInX);
    con1[3] = AU1_AF1(-1.0f * rcpInY);

    // The remaining offsets are relative to (0) rather than to 'F'.
    con2[0] = AU1_AF1(-1.0f * rcpInX);
    con2[1] = AU1_AF1( 2.0f * rcpInY);
    con2[2] = AU1_AF1( 1.0f * rcpInX);
    con2[3] = AU1_AF1( 2.0f * rcpInY);

    con3[0] = AU1_AF1( 0.0f * rcpInX);
    con3[1] = AU1_AF1( 4.0f * rcpInY);
    // The last two slots carry no constant and are cleared.
    con3[3] = 0;
    con3[2] = 0;
}
|
|
|
|
//==============================================================================================================================
|
|
// NON-PACKED 32-BIT VERSION
|
|
//==============================================================================================================================
|
|
// CPU prototypes: get the pixel value at the given co-ordinates (Red, Green, Blue).
|
|
// Placeholder CPU samplers for the red, green and blue channels.
// FsrEasuF() calls each of these with a sample position and reads all four
// components of the result. No real sampling is implemented in this file, so
// each stub returns an all-zero vector: falling off the end of a non-void
// function whose value is used would be undefined behaviour.
CGU_Vec4f FsrEasuRF(CGU_Vec2f p)
{
    (void)p;  // position is unused by the stub
    return CGU_Vec4f(0.0f);
}

CGU_Vec4f FsrEasuGF(CGU_Vec2f p)
{
    (void)p;  // position is unused by the stub
    return CGU_Vec4f(0.0f);
}

CGU_Vec4f FsrEasuBF(CGU_Vec2f p)
{
    (void)p;  // position is unused by the stub
    return CGU_Vec4f(0.0f);
}
|
|
|
|
|
|
//------------------------------------------------------------------------------------------------------------------------------
|
|
// Filtering for a given tap for the scalar.
|
|
void FsrEasuTapF(CGU_Vec3f aC, // Accumulated color, with negative lobe.
|
|
CGU_FLOAT aW, // Accumulated weight.
|
|
CGU_Vec2f off, // Pixel offset from resolve position to tap.
|
|
CGU_Vec2f dir, // Gradient direction.
|
|
CGU_Vec2f len, // Length.
|
|
CGU_FLOAT lob, // Negative lobe strength.
|
|
CGU_FLOAT clp, // Clipping point.
|
|
CGU_Vec3f c)
|
|
{
|
|
// Tap color.
|
|
// Rotate offset by direction.
|
|
CGU_Vec2f v;
|
|
v.x =(off.x*( dir.x))+(off.y*dir.y);
|
|
v.y =(off.x*(-dir.y))+(off.y*dir.x);
|
|
// Anisotropy.
|
|
v = len*v;
|
|
// Compute distance^2.
|
|
CGU_FLOAT d2=v.x*v.x+v.y*v.y;
|
|
// Limit to the window as at corner, 2 taps can easily be outside.
|
|
d2=min(d2,clp);
|
|
// Approximation of lancos2 without sin() or rcp(), or sqrt() to get x.
|
|
// (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2
|
|
// |_______________________________________| |_______________|
|
|
// base window
|
|
// The general form of the 'base' is,
|
|
// (a*(b*x^2-1)^2-(a-1))
|
|
// Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe.
|
|
CGU_FLOAT wB=(2.0f/5.0f)*d2-1.0f;
|
|
CGU_FLOAT wA=lob*d2-1.0f;
|
|
wB*=wB;
|
|
wA*=wA;
|
|
wB=(25.0f/16.0f)*wB-(25.0f/16.0f-1.0f);
|
|
CGU_FLOAT w=wB*wA;
|
|
// Do weighted average.
|
|
aC+=c*w;
|
|
aW+=w;
|
|
}
|
|
|
|
|
|
//------------------------------------------------------------------------------------------------------------------------------
|
|
// Accumulate direction and length.
|
|
//------------------------------------------------------------------------------------------------------------------------------
|
|
// Accumulates one bilinearly weighted direction/length contribution.
//   dir, len            - running direction and length sums, updated in place.
//   pp                  - fractional position used for the bilinear weights.
//   biS, biT, biU, biV  - select which corner weight to use:
//                           s t
//                           u v
//   lA..lE              - luma values laid out as a '+':
//                             a
//                           b c d
//                             e
void FsrEasuSetF( CMP_INOUT CGU_Vec2f dir,
                  CMP_INOUT CGU_FLOAT len,
                  CGU_Vec2f pp,
                  AP1 biS,AP1 biT,AP1 biU,AP1 biV,
                  CGU_FLOAT lA,CGU_FLOAT lB,CGU_FLOAT lC,CGU_FLOAT lD,CGU_FLOAT lE)
{
    // Compute bilinear weight, branches factor out as predicates are compile time immediates.
    // Start at zero so the weight is well-defined even if no predicate is set.
    CGU_FLOAT w = 0.0f;
    // Each weight is a product of two factors; both (1 - x) terms need their own parentheses.
    if(biS) w = (1.0f - pp.x) * (1.0f - pp.y);
    if(biT) w =         pp.x  * (1.0f - pp.y);
    if(biU) w = (1.0f - pp.x) *         pp.y ;
    if(biV) w =         pp.x  *         pp.y ;

    // Direction is the '+' diff.
    // Then takes magnitude from abs average of both sides of 'c'.
    // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped,
    // then adding horz and vert terms.
    CGU_FLOAT dc   = lD - lC;
    CGU_FLOAT cb   = lC - lB;
    CGU_FLOAT lenX = max(abs(dc), abs(cb));
    lenX = APrxLoRcpF1(lenX);

    CGU_FLOAT dirX = lD - lB;
    dir.x += dirX * w;
    lenX   = ASatF1(abs(dirX) * lenX);
    lenX  *= lenX;
    len   += lenX * w;

    // Repeat for the y axis.
    CGU_FLOAT ec   = lE - lC;
    CGU_FLOAT ca   = lC - lA;
    CGU_FLOAT lenY = max(abs(ec), abs(ca));
    lenY = APrxLoRcpF1(lenY);

    CGU_FLOAT dirY = lE - lA;
    dir.y += dirY * w;
    lenY   = ASatF1(abs(dirY) * lenY);
    lenY  *= lenY;
    len   += lenY * w;
}
|
|
|
|
|
|
//------------------------------------------------------------------------------------------------------------------------------
|
|
// Computes one output pixel from a 12-tap neighbourhood of the input image.
//   pix        - receives the accumulated color, normalized by the accumulated
//                weight and clamped to the min/max of the 4 nearest taps.
//   ip         - integer pixel position in output.
//   con0..con3 - constants generated by FsrEasuCon().
// Samples are fetched through FsrEasuRF/GF/BF; direction and length are built
// with FsrEasuSetF and the taps are accumulated with FsrEasuTapF.
void FsrEasuF( CMP_OUT CGU_Vec3f pix,
               CGU_Vec2ui ip,   // Integer pixel position in output.
               CGU_Vec4ui con0, // Constants generated by FsrEasuCon().
               CGU_Vec4ui con1,
               CGU_Vec4ui con2,
               CGU_Vec4ui con3)
{
    //------------------------------------------------------------------------------------------------------------------------------
    // Get position of 'f'.
    // NOTE(review): FsrEasuCon() fills con0..con3 through AU1_AF1(); here the components are
    // passed straight to CGU_Vec2f constructors. If AU1_AF1 stores raw float bits, these need a
    // bit reinterpretation rather than a value conversion - confirm against common_def.h.
    CGU_Vec2f pp = CGU_Vec2f(ip.x,ip.y)*CGU_Vec2f(con0.x,con0.y)+CGU_Vec2f(con0.z,con0.w);
    // Floor each axis separately (the y component must come from pp.y, not pp.x).
    CGU_Vec2f fp = CGU_Vec2f(cmp_floor(pp.x),cmp_floor(pp.y));
    pp-=fp;

    //------------------------------------------------------------------------------------------------------------------------------
    // 12-tap kernel.
    //    b c
    //  e f g h
    //  i j k l
    //    n o
    // Gather 4 ordering.
    //  a b
    //  r g
    // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions,
    //    a b    <- unused (z)
    //    r g
    //  a b a b
    //  r g r g
    //    a b
    //    r g    <- unused (z)

    // Allowing dead-code removal to remove the 'z's.
    CGU_Vec2f p0=fp*CGU_Vec2f(con1.x,con1.y)+CGU_Vec2f(con1.z,con1.w);

    // These are from p0 to avoid pulling two constants on pre-Navi hardware.
    CGU_Vec2f p1=p0+CGU_Vec2f(con2.x,con2.y);
    CGU_Vec2f p2=p0+CGU_Vec2f(con2.z,con2.w);
    CGU_Vec2f p3=p0+CGU_Vec2f(con3.x,con3.y);

    CGU_Vec4f bczzR=FsrEasuRF(p0);
    CGU_Vec4f bczzG=FsrEasuGF(p0);
    CGU_Vec4f bczzB=FsrEasuBF(p0);
    CGU_Vec4f ijfeR=FsrEasuRF(p1);
    CGU_Vec4f ijfeG=FsrEasuGF(p1);
    CGU_Vec4f ijfeB=FsrEasuBF(p1);
    CGU_Vec4f klhgR=FsrEasuRF(p2);
    CGU_Vec4f klhgG=FsrEasuGF(p2);
    CGU_Vec4f klhgB=FsrEasuBF(p2);
    CGU_Vec4f zzonR=FsrEasuRF(p3);
    CGU_Vec4f zzonG=FsrEasuGF(p3);
    CGU_Vec4f zzonB=FsrEasuBF(p3);

    //------------------------------------------------------------------------------------------------------------------------------
    // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD).
    CGU_Vec4f bczzL=bczzB*CGU_Vec4f(0.5)+(bczzR*CGU_Vec4f(0.5)+bczzG);
    CGU_Vec4f ijfeL=ijfeB*CGU_Vec4f(0.5)+(ijfeR*CGU_Vec4f(0.5)+ijfeG);
    CGU_Vec4f klhgL=klhgB*CGU_Vec4f(0.5)+(klhgR*CGU_Vec4f(0.5)+klhgG);
    CGU_Vec4f zzonL=zzonB*CGU_Vec4f(0.5)+(zzonR*CGU_Vec4f(0.5)+zzonG);

    // Rename.
    CGU_FLOAT bL=bczzL.x;
    CGU_FLOAT cL=bczzL.y;
    CGU_FLOAT iL=ijfeL.x;
    CGU_FLOAT jL=ijfeL.y;
    CGU_FLOAT fL=ijfeL.z;
    CGU_FLOAT eL=ijfeL.w;
    CGU_FLOAT kL=klhgL.x;
    CGU_FLOAT lL=klhgL.y;
    CGU_FLOAT hL=klhgL.z;
    CGU_FLOAT gL=klhgL.w;
    CGU_FLOAT oL=zzonL.z;
    CGU_FLOAT nL=zzonL.w;

    // Accumulate for bilinear interpolation.
    CGU_Vec2f dir= {0.0f,0.0f};
    CGU_FLOAT len= 0.0f;

    FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL);
    FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL);
    FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL);
    FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL);

    //---------------------------------------------------------
    // Normalize with approximation, and cleanup close to zero.
    //---------------------------------------------------------
    CGU_Vec2f dir2=dir*dir;
    CGU_FLOAT dirR=dir2.x+dir2.y;
    AP1 zro=dirR<(1.0f/32768.0f);
    dirR=APrxLoRsqF1(dirR);
    dirR=zro?1.0f:dirR;
    dir.x=zro?1.0f:dir.x;
    dir = CGU_Vec2f(dirR)*dir;

    //---------------------------------------------------------
    // Transform from {0 to 2} to {0 to 1} range, and shape with square.
    //---------------------------------------------------------
    len=len*0.5f;
    len*=len;

    //---------------------------------------------------------
    // Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}.
    //---------------------------------------------------------
    CGU_FLOAT stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpF1(CMP_MAX(abs(dir.x),abs(dir.y)));

    //---------------------------------------------------------
    // Anisotropic length after rotation,
    //  x := 1.0 lerp to 'stretch' on edges
    //  y := 1.0 lerp to 2x on edges
    //---------------------------------------------------------
    CGU_Vec2f len2=CGU_Vec2f(1.0f+(stretch-1.0f)*len,1.0f-0.5f*len);

    //---------------------------------------------------------
    // Based on the amount of 'edge',
    // the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}.
    //---------------------------------------------------------
    CGU_FLOAT lob=0.5f+((1.0f/4.0f-0.04f)-0.5f)*len;

    //---------------------------------------------------------
    // Set distance^2 clipping point to the end of the adjustable window.
    //---------------------------------------------------------
    CGU_FLOAT clp=APrxLoRcpF1(lob);

    //---------------------------------------------------------
    // Accumulation mixed with min/max of 4 nearest.
    //    b c
    //  e f g h
    //  i j k l
    //    n o
    //---------------------------------------------------------
    // NOTE(review): the inner CMP_MIN/CMP_MAX is invoked with three arguments (f, g, j);
    // confirm that a three-argument form is provided by common_def.h.
    CGU_Vec3f min4=CMP_MIN(CMP_MIN(CGU_Vec3f(ijfeR.z,ijfeG.z,ijfeB.z),CGU_Vec3f(klhgR.w,klhgG.w,klhgB.w),CGU_Vec3f(ijfeR.y,ijfeG.y,ijfeB.y)),
                           CGU_Vec3f(klhgR.x,klhgG.x,klhgB.x));
    CGU_Vec3f max4=CMP_MAX(CMP_MAX(CGU_Vec3f(ijfeR.z,ijfeG.z,ijfeB.z),CGU_Vec3f(klhgR.w,klhgG.w,klhgB.w),CGU_Vec3f(ijfeR.y,ijfeG.y,ijfeB.y)),
                           CGU_Vec3f(klhgR.x,klhgG.x,klhgB.x));

    //---------------------------------------------------------
    // Accumulation.
    //---------------------------------------------------------
    CGU_Vec3f aC = {0.0f,0.0f,0.0f};
    CGU_FLOAT aW = 0.0f;

    FsrEasuTapF(aC,aW,CGU_Vec2f( 0.0,-1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(bczzR.x,bczzG.x,bczzB.x)); // b
    FsrEasuTapF(aC,aW,CGU_Vec2f( 1.0,-1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(bczzR.y,bczzG.y,bczzB.y)); // c
    FsrEasuTapF(aC,aW,CGU_Vec2f(-1.0, 1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(ijfeR.x,ijfeG.x,ijfeB.x)); // i
    FsrEasuTapF(aC,aW,CGU_Vec2f( 0.0, 1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(ijfeR.y,ijfeG.y,ijfeB.y)); // j
    FsrEasuTapF(aC,aW,CGU_Vec2f( 0.0, 0.0)-pp,dir,len2,lob,clp,CGU_Vec3f(ijfeR.z,ijfeG.z,ijfeB.z)); // f
    FsrEasuTapF(aC,aW,CGU_Vec2f(-1.0, 0.0)-pp,dir,len2,lob,clp,CGU_Vec3f(ijfeR.w,ijfeG.w,ijfeB.w)); // e
    FsrEasuTapF(aC,aW,CGU_Vec2f( 1.0, 1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(klhgR.x,klhgG.x,klhgB.x)); // k
    FsrEasuTapF(aC,aW,CGU_Vec2f( 2.0, 1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(klhgR.y,klhgG.y,klhgB.y)); // l
    FsrEasuTapF(aC,aW,CGU_Vec2f( 2.0, 0.0)-pp,dir,len2,lob,clp,CGU_Vec3f(klhgR.z,klhgG.z,klhgB.z)); // h
    FsrEasuTapF(aC,aW,CGU_Vec2f( 1.0, 0.0)-pp,dir,len2,lob,clp,CGU_Vec3f(klhgR.w,klhgG.w,klhgB.w)); // g
    FsrEasuTapF(aC,aW,CGU_Vec2f( 1.0, 2.0)-pp,dir,len2,lob,clp,CGU_Vec3f(zzonR.z,zzonG.z,zzonB.z)); // o
    FsrEasuTapF(aC,aW,CGU_Vec2f( 0.0, 2.0)-pp,dir,len2,lob,clp,CGU_Vec3f(zzonR.w,zzonG.w,zzonB.w)); // n

    //------------------------------------------------------------------------------------------------------------------------------
    // Normalize and dering.
    pix=min(max4,max(min4,aC*CGU_Vec3f(ARcpF1(aW))));
}
|