#define USE_CMP_FIDELITY_FX_H #ifndef A_CPU #define A_CPU #endif #include "common_def.h" // Call to setup required constant values (works on CPU or GPU). A_STATIC void FsrEasuCon(CMP_OUT CGU_Vec4ui con0, CMP_OUT CGU_Vec4ui con1, CMP_OUT CGU_Vec4ui con2, CMP_OUT CGU_Vec4ui con3, // This the rendered part of the input image. CGU_FLOAT inputViewportInPixelsX, CGU_FLOAT inputViewportInPixelsY, // This is the size of the input image. CGU_FLOAT inputSizeInPixelsX, CGU_FLOAT inputSizeInPixelsY, CGU_FLOAT outputSizeInPixelsX, CGU_FLOAT outputSizeInPixelsY ) { // Output integer position to a pixel position in viewport. con0[0]=AU1_AF1(inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)); con0[1]=AU1_AF1(inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)); con0[2]=AU1_AF1(0.5f*inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)-0.5f); con0[3]=AU1_AF1(0.5f*inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)-0.5f); // Viewport pixel position to normalized image space. // This is used to get upper-left of 'F' tap. con1[0]=AU1_AF1(ARcpF1(inputSizeInPixelsX)); con1[1]=AU1_AF1(ARcpF1(inputSizeInPixelsY)); // Centers of gather4, first offset from upper-left of 'F'. // +---+---+ // | | | // +--(0)--+ // | b | c | // +---F---+---+---+ // | e | f | g | h | // +--(1)--+--(2)--+ // | i | j | k | l | // +---+---+---+---+ // | n | o | // +--(3)--+ // | | | // +---+---+ con1[2]=AU1_AF1( 1.0f*ARcpF1(inputSizeInPixelsX)); con1[3]=AU1_AF1(-1.0f*ARcpF1(inputSizeInPixelsY)); // These are from (0) instead of 'F'. con2[0]=AU1_AF1(-1.0f*ARcpF1(inputSizeInPixelsX)); con2[1]=AU1_AF1( 2.0f*ARcpF1(inputSizeInPixelsY)); con2[2]=AU1_AF1( 1.0f*ARcpF1(inputSizeInPixelsX)); con2[3]=AU1_AF1( 2.0f*ARcpF1(inputSizeInPixelsY)); con3[0]=AU1_AF1( 0.0f*ARcpF1(inputSizeInPixelsX)); con3[1]=AU1_AF1( 4.0f*ARcpF1(inputSizeInPixelsY)); con3[2]=con3[3]=0; } //============================================================================================================================== // NON-PACKED 32-BIT VERSION //============================================================================================================================== // CPU Prototypes.: Get pixel value ar given co-ordinates, Red, Green, Blue CGU_Vec4f FsrEasuRF(CGU_Vec2f p) {} ; CGU_Vec4f FsrEasuGF(CGU_Vec2f p) {} ; CGU_Vec4f FsrEasuBF(CGU_Vec2f p) {} ; //------------------------------------------------------------------------------------------------------------------------------ // Filtering for a given tap for the scalar. void FsrEasuTapF(CGU_Vec3f aC, // Accumulated color, with negative lobe. CGU_FLOAT aW, // Accumulated weight. CGU_Vec2f off, // Pixel offset from resolve position to tap. CGU_Vec2f dir, // Gradient direction. CGU_Vec2f len, // Length. CGU_FLOAT lob, // Negative lobe strength. CGU_FLOAT clp, // Clipping point. CGU_Vec3f c) { // Tap color. // Rotate offset by direction. CGU_Vec2f v; v.x =(off.x*( dir.x))+(off.y*dir.y); v.y =(off.x*(-dir.y))+(off.y*dir.x); // Anisotropy. v = len*v; // Compute distance^2. CGU_FLOAT d2=v.x*v.x+v.y*v.y; // Limit to the window as at corner, 2 taps can easily be outside. d2=min(d2,clp); // Approximation of lancos2 without sin() or rcp(), or sqrt() to get x. // (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2 // |_______________________________________| |_______________| // base window // The general form of the 'base' is, // (a*(b*x^2-1)^2-(a-1)) // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe. CGU_FLOAT wB=(2.0f/5.0f)*d2-1.0f; CGU_FLOAT wA=lob*d2-1.0f; wB*=wB; wA*=wA; wB=(25.0f/16.0f)*wB-(25.0f/16.0f-1.0f); CGU_FLOAT w=wB*wA; // Do weighted average. aC+=c*w; aW+=w; } //------------------------------------------------------------------------------------------------------------------------------ // Accumulate direction and length. //------------------------------------------------------------------------------------------------------------------------------ void FsrEasuSetF( CMP_INOUT CGU_Vec2f dir, CMP_INOUT CGU_FLOAT len, CGU_Vec2f pp, AP1 biS,AP1 biT,AP1 biU,AP1 biV, CGU_FLOAT lA,CGU_FLOAT lB,CGU_FLOAT lC,CGU_FLOAT lD,CGU_FLOAT lE) { // Compute bilinear weight, branches factor out as predicates are compiler time immediates. // s t // u v CGU_FLOAT w; if(biS) w= (1.0f-pp.x)*1.0f-pp.y; if(biT) w= pp.x *1.0f-pp.y; if(biU) w= (1.0f-pp.x)* pp.y ; if(biV) w= pp.x * pp.y; // Direction is the '+' diff. // a // b c d // e // Then takes magnitude from abs average of both sides of 'c'. // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms. CGU_FLOAT dc=lD-lC; CGU_FLOAT cb=lC-lB; CGU_FLOAT lenX=max(abs(dc),abs(cb)); lenX=APrxLoRcpF1(lenX); CGU_FLOAT dirX=lD-lB; dir.x+=dirX*w; lenX=ASatF1(abs(dirX)*lenX); lenX*=lenX; len+=lenX*w; // Repeat for the y axis. CGU_FLOAT ec=lE-lC; CGU_FLOAT ca=lC-lA; CGU_FLOAT lenY=max(abs(ec),abs(ca)); lenY=APrxLoRcpF1(lenY); CGU_FLOAT dirY=lE-lA; dir.y+=dirY*w; lenY=ASatF1(abs(dirY)*lenY); lenY*=lenY; len+=lenY*w; } //------------------------------------------------------------------------------------------------------------------------------ void FsrEasuF( CMP_OUT CGU_Vec3f pix, CGU_Vec2ui ip, // Integer pixel position in output. CGU_Vec4ui con0, // Constants generated by FsrEasuCon(). CGU_Vec4ui con1, CGU_Vec4ui con2, CGU_Vec4ui con3) { //------------------------------------------------------------------------------------------------------------------------------ // Get position of 'f'. CGU_Vec2f pp = CGU_Vec2f(ip.x,ip.y)*CGU_Vec2f(con0.x,con0.y)+CGU_Vec2f(con0.z,con0.w); CGU_Vec2f fp = CGU_Vec2f(cmp_floor(pp.x),cmp_floor(pp.x)); pp-=fp; //------------------------------------------------------------------------------------------------------------------------------ // 12-tap kernel. // b c // e f g h // i j k l // n o // Gather 4 ordering. // a b // r g // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions, // a b <- unused (z) // r g // a b a b // r g r g // a b // r g <- unused (z) // Allowing dead-code removal to remove the 'z's. CGU_Vec2f p0=fp*CGU_Vec2f(con1.x,con1.y)+CGU_Vec2f(con1.z,con1.w); // These are from p0 to avoid pulling two constants on pre-Navi hardware. CGU_Vec2f p1=p0+CGU_Vec2f(con2.x,con2.y); CGU_Vec2f p2=p0+CGU_Vec2f(con2.z,con2.w); CGU_Vec2f p3=p0+CGU_Vec2f(con3.x,con3.y); CGU_Vec4f bczzR=FsrEasuRF(p0); CGU_Vec4f bczzG=FsrEasuGF(p0); CGU_Vec4f bczzB=FsrEasuBF(p0); CGU_Vec4f ijfeR=FsrEasuRF(p1); CGU_Vec4f ijfeG=FsrEasuGF(p1); CGU_Vec4f ijfeB=FsrEasuBF(p1); CGU_Vec4f klhgR=FsrEasuRF(p2); CGU_Vec4f klhgG=FsrEasuGF(p2); CGU_Vec4f klhgB=FsrEasuBF(p2); CGU_Vec4f zzonR=FsrEasuRF(p3); CGU_Vec4f zzonG=FsrEasuGF(p3); CGU_Vec4f zzonB=FsrEasuBF(p3); // ------------------------------------------------------------------------------------------------------------------------------ // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD). CGU_Vec4f bczzL=bczzB*CGU_Vec4f(0.5)+(bczzR*CGU_Vec4f(0.5)+bczzG); CGU_Vec4f ijfeL=ijfeB*CGU_Vec4f(0.5)+(ijfeR*CGU_Vec4f(0.5)+ijfeG); CGU_Vec4f klhgL=klhgB*CGU_Vec4f(0.5)+(klhgR*CGU_Vec4f(0.5)+klhgG); CGU_Vec4f zzonL=zzonB*CGU_Vec4f(0.5)+(zzonR*CGU_Vec4f(0.5)+zzonG); // Rename. CGU_FLOAT bL=bczzL.x; CGU_FLOAT cL=bczzL.y; CGU_FLOAT iL=ijfeL.x; CGU_FLOAT jL=ijfeL.y; CGU_FLOAT fL=ijfeL.z; CGU_FLOAT eL=ijfeL.w; CGU_FLOAT kL=klhgL.x; CGU_FLOAT lL=klhgL.y; CGU_FLOAT hL=klhgL.z; CGU_FLOAT gL=klhgL.w; CGU_FLOAT oL=zzonL.z; CGU_FLOAT nL=zzonL.w; // Accumulate for bilinear interpolation. CGU_Vec2f dir= {0.0f,0.0f}; CGU_FLOAT len= 0.0f; FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL); FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL); FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL); FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL); //--------------------------------------------------------- // Normalize with approximation, and cleanup close to zero. //--------------------------------------------------------- CGU_Vec2f dir2=dir*dir; CGU_FLOAT dirR=dir2.x+dir2.y; AP1 zro=dirR<(1.0f/32768.0f); dirR=APrxLoRsqF1(dirR); dirR=zro?1.0f:dirR; dir.x=zro?1.0f:dir.x; dir = CGU_Vec2f(dirR)*dir; //--------------------------------------------------------- // Transform from {0 to 2} to {0 to 1} range, and shape with square. //--------------------------------------------------------- len=len*0.5f; len*=len; //--------------------------------------------------------- // Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}. //--------------------------------------------------------- CGU_FLOAT stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpF1(CMP_MAX(abs(dir.x),abs(dir.y))); //--------------------------------------------------------- // Anisotropic length after rotation, // x := 1.0 lerp to 'stretch' on edges // y := 1.0 lerp to 2x on edges //--------------------------------------------------------- CGU_Vec2f len2=CGU_Vec2f(1.0f+(stretch-1.0f)*len,1.0f-0.5*len); //--------------------------------------------------------- // Based on the amount of 'edge', // the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}. //--------------------------------------------------------- CGU_FLOAT lob=0.5f+((1.0f/4.0f-0.04f)-0.5f)*len; //--------------------------------------------------------- // Set distance^2 clipping point to the end of the adjustable window. //--------------------------------------------------------- CGU_FLOAT clp=APrxLoRcpF1(lob); //---------------------------------------------------------- // Accumulation mixed with min/max of 4 nearest. // b c // e f g h // i j k l // n o //--------------------------------------------------------- CGU_Vec3f min4=CMP_MIN(CMP_MIN(CGU_Vec3f(ijfeR.z,ijfeG.z,ijfeB.z),CGU_Vec3f(klhgR.w,klhgG.w,klhgB.w),CGU_Vec3f(ijfeR.y,ijfeG.y,ijfeB.y)), CGU_Vec3f(klhgR.x,klhgG.x,klhgB.x)); CGU_Vec3f max4=CMP_MAX(CMP_MAX(CGU_Vec3f(ijfeR.z,ijfeG.z,ijfeB.z),CGU_Vec3f(klhgR.w,klhgG.w,klhgB.w),CGU_Vec3f(ijfeR.y,ijfeG.y,ijfeB.y)), CGU_Vec3f(klhgR.x,klhgG.x,klhgB.x)); //--------------------------------------------------------- // Accumulation. //--------------------------------------------------------- CGU_Vec3f aC = {0.0f,0.0f,0.0f}; CGU_FLOAT aW = 0.0f; FsrEasuTapF(aC,aW,CGU_Vec2f( 0.0,-1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(bczzR.x,bczzG.x,bczzB.x)); // b FsrEasuTapF(aC,aW,CGU_Vec2f( 1.0,-1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(bczzR.y,bczzG.y,bczzB.y)); // c FsrEasuTapF(aC,aW,CGU_Vec2f(-1.0, 1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(ijfeR.x,ijfeG.x,ijfeB.x)); // i FsrEasuTapF(aC,aW,CGU_Vec2f( 0.0, 1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(ijfeR.y,ijfeG.y,ijfeB.y)); // j FsrEasuTapF(aC,aW,CGU_Vec2f( 0.0, 0.0)-pp,dir,len2,lob,clp,CGU_Vec3f(ijfeR.z,ijfeG.z,ijfeB.z)); // f FsrEasuTapF(aC,aW,CGU_Vec2f(-1.0, 0.0)-pp,dir,len2,lob,clp,CGU_Vec3f(ijfeR.w,ijfeG.w,ijfeB.w)); // e FsrEasuTapF(aC,aW,CGU_Vec2f( 1.0, 1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(klhgR.x,klhgG.x,klhgB.x)); // k FsrEasuTapF(aC,aW,CGU_Vec2f( 2.0, 1.0)-pp,dir,len2,lob,clp,CGU_Vec3f(klhgR.y,klhgG.y,klhgB.y)); // l FsrEasuTapF(aC,aW,CGU_Vec2f( 2.0, 0.0)-pp,dir,len2,lob,clp,CGU_Vec3f(klhgR.z,klhgG.z,klhgB.z)); // h FsrEasuTapF(aC,aW,CGU_Vec2f( 1.0, 0.0)-pp,dir,len2,lob,clp,CGU_Vec3f(klhgR.w,klhgG.w,klhgB.w)); // g FsrEasuTapF(aC,aW,CGU_Vec2f( 1.0, 2.0)-pp,dir,len2,lob,clp,CGU_Vec3f(zzonR.z,zzonG.z,zzonB.z)); // o FsrEasuTapF(aC,aW,CGU_Vec2f( 0.0, 2.0)-pp,dir,len2,lob,clp,CGU_Vec3f(zzonR.w,zzonG.w,zzonB.w)); // n //------------------------------------------------------------------------------------------------------------------------------ // Normalize and dering. pix=min(max4,max(min4,aC*CGU_Vec3f(ARcpF1(aW)))); }