319 lines
13 KiB
C

#define USE_CMP_FIDELITY_FX_H
#ifndef A_CPU
#define A_CPU
#endif
#include "common_def.h"
// Call to setup required constant values (works on CPU or GPU).
A_STATIC void FsrEasuCon(CMP_OUT CGU_Vec4ui con0,
CMP_OUT CGU_Vec4ui con1,
CMP_OUT CGU_Vec4ui con2,
CMP_OUT CGU_Vec4ui con3,
// This the rendered part of the input image.
CGU_FLOAT inputViewportInPixelsX,
CGU_FLOAT inputViewportInPixelsY,
// This is the size of the input image.
CGU_FLOAT inputSizeInPixelsX,
CGU_FLOAT inputSizeInPixelsY,
CGU_FLOAT outputSizeInPixelsX,
CGU_FLOAT outputSizeInPixelsY )
{
// Output integer position to a pixel position in viewport.
con0[0]=AU1_AF1(inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX));
con0[1]=AU1_AF1(inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY));
con0[2]=AU1_AF1(0.5f*inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)-0.5f);
con0[3]=AU1_AF1(0.5f*inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)-0.5f);
// Viewport pixel position to normalized image space.
// This is used to get upper-left of 'F' tap.
con1[0]=AU1_AF1(ARcpF1(inputSizeInPixelsX));
con1[1]=AU1_AF1(ARcpF1(inputSizeInPixelsY));
// Centers of gather4, first offset from upper-left of 'F'.
// +---+---+
// | | |
// +--(0)--+
// | b | c |
// +---F---+---+---+
// | e | f | g | h |
// +--(1)--+--(2)--+
// | i | j | k | l |
// +---+---+---+---+
// | n | o |
// +--(3)--+
// | | |
// +---+---+
con1[2]=AU1_AF1( 1.0f*ARcpF1(inputSizeInPixelsX));
con1[3]=AU1_AF1(-1.0f*ARcpF1(inputSizeInPixelsY));
// These are from (0) instead of 'F'.
con2[0]=AU1_AF1(-1.0f*ARcpF1(inputSizeInPixelsX));
con2[1]=AU1_AF1( 2.0f*ARcpF1(inputSizeInPixelsY));
con2[2]=AU1_AF1( 1.0f*ARcpF1(inputSizeInPixelsX));
con2[3]=AU1_AF1( 2.0f*ARcpF1(inputSizeInPixelsY));
con3[0]=AU1_AF1( 0.0f*ARcpF1(inputSizeInPixelsX));
con3[1]=AU1_AF1( 4.0f*ARcpF1(inputSizeInPixelsY));
con3[2]=con3[3]=0;
}
//==============================================================================================================================
// NON-PACKED 32-BIT VERSION
//==============================================================================================================================
// CPU Prototypes.: Get pixel value ar given co-ordinates, Red, Green, Blue
CGU_Vec4f FsrEasuRF(CGU_Vec2f p) {} ;
CGU_Vec4f FsrEasuGF(CGU_Vec2f p) {} ;
CGU_Vec4f FsrEasuBF(CGU_Vec2f p) {} ;
//------------------------------------------------------------------------------------------------------------------------------
// Filtering for a given tap for the scalar.
void FsrEasuTapF(CGU_Vec3f aC, // Accumulated color, with negative lobe.
CGU_FLOAT aW, // Accumulated weight.
CGU_Vec2f off, // Pixel offset from resolve position to tap.
CGU_Vec2f dir, // Gradient direction.
CGU_Vec2f len, // Length.
CGU_FLOAT lob, // Negative lobe strength.
CGU_FLOAT clp, // Clipping point.
CGU_Vec3f c)
{
// Tap color.
// Rotate offset by direction.
CGU_Vec2f v;
v.x =(off.x*( dir.x))+(off.y*dir.y);
v.y =(off.x*(-dir.y))+(off.y*dir.x);
// Anisotropy.
v = len*v;
// Compute distance^2.
CGU_FLOAT d2=v.x*v.x+v.y*v.y;
// Limit to the window as at corner, 2 taps can easily be outside.
d2=min(d2,clp);
// Approximation of lancos2 without sin() or rcp(), or sqrt() to get x.
// (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2
// |_______________________________________| |_______________|
// base window
// The general form of the 'base' is,
// (a*(b*x^2-1)^2-(a-1))
// Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe.
CGU_FLOAT wB=(2.0f/5.0f)*d2-1.0f;
CGU_FLOAT wA=lob*d2-1.0f;
wB*=wB;
wA*=wA;
wB=(25.0f/16.0f)*wB-(25.0f/16.0f-1.0f);
CGU_FLOAT w=wB*wA;
// Do weighted average.
aC+=c*w;
aW+=w;
}
//------------------------------------------------------------------------------------------------------------------------------
// Accumulate direction and length.
//------------------------------------------------------------------------------------------------------------------------------
void FsrEasuSetF( CMP_INOUT CGU_Vec2f dir,
CMP_INOUT CGU_FLOAT len,
CGU_Vec2f pp,
AP1 biS,AP1 biT,AP1 biU,AP1 biV,
CGU_FLOAT lA,CGU_FLOAT lB,CGU_FLOAT lC,CGU_FLOAT lD,CGU_FLOAT lE)
{
// Compute bilinear weight, branches factor out as predicates are compiler time immediates.
// s t
// u v
CGU_FLOAT w;
if(biS) w= (1.0f-pp.x)*1.0f-pp.y;
if(biT) w= pp.x *1.0f-pp.y;
if(biU) w= (1.0f-pp.x)* pp.y ;
if(biV) w= pp.x * pp.y;
// Direction is the '+' diff.
// a
// b c d
// e
// Then takes magnitude from abs average of both sides of 'c'.
// Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms.
CGU_FLOAT dc=lD-lC;
CGU_FLOAT cb=lC-lB;
CGU_FLOAT lenX=max(abs(dc),abs(cb));
lenX=APrxLoRcpF1(lenX);
CGU_FLOAT dirX=lD-lB;
dir.x+=dirX*w;
lenX=ASatF1(abs(dirX)*lenX);
lenX*=lenX;
len+=lenX*w;
// Repeat for the y axis.
CGU_FLOAT ec=lE-lC;
CGU_FLOAT ca=lC-lA;
CGU_FLOAT lenY=max(abs(ec),abs(ca));
lenY=APrxLoRcpF1(lenY);
CGU_FLOAT dirY=lE-lA;
dir.y+=dirY*w;
lenY=ASatF1(abs(dirY)*lenY);
lenY*=lenY;
len+=lenY*w;
}
//------------------------------------------------------------------------------------------------------------------------------
void FsrEasuF( CMP_OUT CGU_Vec3f pix,
CGU_Vec2ui ip, // Integer pixel position in output.
CGU_Vec4ui con0, // Constants generated by FsrEasuCon().
CGU_Vec4ui con1,
CGU_Vec4ui con2,
CGU_Vec4ui con3)
{
//------------------------------------------------------------------------------------------------------------------------------
// Get position of 'f'.
CGU_Vec2f pp = CGU_Vec2f(ip.x,ip.y)*CGU_Vec2f(con0.x,con0.y)+CGU_Vec2f(con0.z,con0.w);
CGU_Vec2f fp = CGU_Vec2f(cmp_floor(pp.x),cmp_floor(pp.x));
pp-=fp;
//------------------------------------------------------------------------------------------------------------------------------
// 12-tap kernel.
// b c
// e f g h
// i j k l
// n o
// Gather 4 ordering.
// a b
// r g
// For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions,
// a b <- unused (z)
// r g
// a b a b
// r g r g
// a b
// r g <- unused (z)
// Allowing dead-code removal to remove the 'z's.
CGU_Vec2f p0=fp*CGU_Vec2f(con1.x,con1.y)+CGU_Vec2f(con1.z,con1.w);
// These are from p0 to avoid pulling two constants on pre-Navi hardware.
CGU_Vec2f p1=p0+CGU_Vec2f(con2.x,con2.y);
CGU_Vec2f p2=p0+CGU_Vec2f(con2.z,con2.w);
CGU_Vec2f p3=p0+CGU_Vec2f(con3.x,con3.y);
CGU_Vec4f bczzR=FsrEasuRF(p0);
CGU_Vec4f bczzG=FsrEasuGF(p0);
CGU_Vec4f bczzB=FsrEasuBF(p0);
CGU_Vec4f ijfeR=FsrEasuRF(p1);
CGU_Vec4f ijfeG=FsrEasuGF(p1);
CGU_Vec4f ijfeB=FsrEasuBF(p1);
CGU_Vec4f klhgR=FsrEasuRF(p2);
CGU_Vec4f klhgG=FsrEasuGF(p2);
CGU_Vec4f klhgB=FsrEasuBF(p2);
CGU_Vec4f zzonR=FsrEasuRF(p3);
CGU_Vec4f zzonG=FsrEasuGF(p3);
CGU_Vec4f zzonB=FsrEasuBF(p3);
// ------------------------------------------------------------------------------------------------------------------------------
// Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD).
CGU_Vec4f bczzL=bczzB*CGU_Vec4f(0.5)+(bczzR*CGU_Vec4f(0.5)+bczzG);
CGU_Vec4f ijfeL=ijfeB*CGU_Vec4f(0.5)+(ijfeR*CGU_Vec4f(0.5)+ijfeG);
CGU_Vec4f klhgL=klhgB*CGU_Vec4f(0.5)+(klhgR*CGU_Vec4f(0.5)+klhgG);
CGU_Vec4f zzonL=zzonB*CGU_Vec4f(0.5)+(zzonR*CGU_Vec4f(0.5)+zzonG);
// Rename.
CGU_FLOAT bL=bczzL.x;
CGU_FLOAT cL=bczzL.y;
CGU_FLOAT iL=ijfeL.x;
CGU_FLOAT jL=ijfeL.y;
CGU_FLOAT fL=ijfeL.z;
CGU_FLOAT eL=ijfeL.w;
CGU_FLOAT kL=klhgL.x;
CGU_FLOAT lL=klhgL.y;
CGU_FLOAT hL=klhgL.z;
CGU_FLOAT gL=klhgL.w;
CGU_FLOAT oL=zzonL.z;
CGU_FLOAT nL=zzonL.w;
// Accumulate for bilinear interpolation.
CGU_Vec2f dir= {0.0f,0.0f};
CGU_FLOAT len= 0.0f;
FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL);
FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL);
FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL);
FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL);
//---------------------------------------------------------
// Normalize with approximation, and cleanup close to zero.
//---------------------------------------------------------
CGU_Vec2f dir2=dir*dir;
CGU_FLOAT dirR=dir2.x+dir2.y;
AP1 zro=dirR<(1.0f/32768.0f);
dirR=APrxLoRsqF1(dirR);
dirR=zro?1.0f:dirR;
dir.x=zro?1.0f:dir.x;
dir = CGU_Vec2f(dirR)*dir;
//---------------------------------------------------------
// Transform from {0 to 2} to {0 to 1} range, and shape with square.
//---------------------------------------------------------
len=len*0.5f;
len*=len;
//---------------------------------------------------------
// Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}.
//---------------------------------------------------------
CGU_FLOAT stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpF1(CMP_MAX(abs(dir.x),abs(dir.y)));
//---------------------------------------------------------
// Anisotropic length after rotation,
// x := 1.0 lerp to 'stretch' on edges
// y := 1.0 lerp to 2x on edges
//---------------------------------------------------------
CGU_Vec2f len2=CGU_Vec2f(1.0f+(stretch-1.0f)*len,1.0f-0.5*len);
//---------------------------------------------------------
// Based on the amount of 'edge',
// the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}.
//---------------------------------------------------------
CGU_FLOAT lob=0.5f+((1.0f/4.0f-0.04f)-0.5f)*len;
//---------------------------------------------------------
// Set distance^2 clipping point to the end of the adjustable window.
//---------------------------------------------------------
CGU_FLOAT clp=APrxLoRcpF1(lob);
//----------------------------------------------------------
// Accumulation mixed with min/max of 4 nearest.
// b c
// e f g h
// i j k l
// n o
//---------------------------------------------------------
CGU_Vec3f min4=CMP_MIN(CMP_MIN(CGU_Vec3f(ijfeR.z,ijfeG.z,ijfeB.z),CGU_Vec3f(klhgR.w,klhgG.w,klhgB.w),CGU_Vec3f(ijfeR.y,ijfeG.y,ijfeB.y)),
CGU_Vec3f(klhgR.x,klhgG.x,klhgB.x));
CGU_Vec3f max4=CMP_MAX(CMP_MAX(CGU_Vec3f(ijfeR.z,ijfeG.z,ijfeB.z),CGU_Vec3f(klhgR.w,klhgG.w,klhgB.w),CGU_Vec3f(ijfeR.y,ijfeG.y,ijfeB.y)),
CGU_Vec3f(klhgR.x,klhgG.x,klhgB.x));
//---------------------------------------------------------
// Accumulation.
//---------------------------------------------------------
CGU_Vec3f aC = {0.0f,0.0f,0.0f};
CGU_FLOAT aW = 0.0f;
FsrEasuTapF(aC,aW,CGU_Vec2f( 0.0,-1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(bczzR.x,bczzG.x,bczzB.x)); // b
FsrEasuTapF(aC,aW,CGU_Vec2f( 1.0,-1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(bczzR.y,bczzG.y,bczzB.y)); // c
FsrEasuTapF(aC,aW,CGU_Vec2f(-1.0, 1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(ijfeR.x,ijfeG.x,ijfeB.x)); // i
FsrEasuTapF(aC,aW,CGU_Vec2f( 0.0, 1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(ijfeR.y,ijfeG.y,ijfeB.y)); // j
FsrEasuTapF(aC,aW,CGU_Vec2f( 0.0, 0.0)-pp,dir,len2,lob,clp,CGU_Vec3f(ijfeR.z,ijfeG.z,ijfeB.z)); // f
FsrEasuTapF(aC,aW,CGU_Vec2f(-1.0, 0.0)-pp,dir,len2,lob,clp,CGU_Vec3f(ijfeR.w,ijfeG.w,ijfeB.w)); // e
FsrEasuTapF(aC,aW,CGU_Vec2f( 1.0, 1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(klhgR.x,klhgG.x,klhgB.x)); // k
FsrEasuTapF(aC,aW,CGU_Vec2f( 2.0, 1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(klhgR.y,klhgG.y,klhgB.y)); // l
FsrEasuTapF(aC,aW,CGU_Vec2f( 2.0, 0.0)-pp,dir,len2,lob,clp,CGU_Vec3f(klhgR.z,klhgG.z,klhgB.z)); // h
FsrEasuTapF(aC,aW,CGU_Vec2f( 1.0, 0.0)-pp,dir,len2,lob,clp,CGU_Vec3f(klhgR.w,klhgG.w,klhgB.w)); // g
FsrEasuTapF(aC,aW,CGU_Vec2f( 1.0, 2.0)-pp,dir,len2,lob,clp,CGU_Vec3f(zzonR.z,zzonG.z,zzonB.z)); // o
FsrEasuTapF(aC,aW,CGU_Vec2f( 0.0, 2.0)-pp,dir,len2,lob,clp,CGU_Vec3f(zzonR.w,zzonG.w,zzonB.w)); // n
//------------------------------------------------------------------------------------------------------------------------------
// Normalize and dering.
pix=min(max4,max(min4,aC*CGU_Vec3f(ARcpF1(aW))));
}