//===============================================================================
|
|
// Copyright (c) 2007-2016 Advanced Micro Devices, Inc. All rights reserved.
|
|
// Copyright (c) 2004-2006 ATI Technologies Inc.
|
|
//===============================================================================
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files(the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions :
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in
|
|
// all copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
// THE SOFTWARE.
|
|
//
|
|
//
|
|
// BC7_Encode.cpp : A reference encoder for BC7
|
|
//
|
|
|
|
#include <assert.h>
|
|
#include <float.h>
|
|
#include <stdio.h>
|
|
#include <math.h>
|
|
#include "common.h"
|
|
#include "bc7_definitions.h"
|
|
#include "bc7_partitions.h"
|
|
#include "bc7_encode.h"
|
|
#include "bc7_utils.h"
|
|
#include "3dquant_vpc.h"
|
|
#include "shake.h"
|
|
#include "debug.h"
|
|
|
|
//#ifdef USE_CMP_CORE_API
|
|
//#include "bcn_common_kernel.h"
|
|
//#include "bcn_common_api.h"
|
|
//#include "bc7_encode_kernel.h"
|
|
//#endif
|
|
|
|
#ifdef BC7_COMPDEBUGGER
|
|
#include "compclient.h"
|
|
#endif
|
|
|
|
#ifdef USE_FILEIO
|
|
#include <stdio.h>
|
|
extern FILE * bc7_File;
|
|
#endif
|
|
|
|
// Threshold quality below which we will always run fast quality and shaking
|
|
// Selfnote: User should be able to set this?
|
|
// Default FQuality is at 0.1 < g_qFAST_THRESHOLD which will cause the SingleIndex compression to start skipping shape blocks
|
|
// during compression
|
|
// if user sets a value above this then all shapes will be used for compression scan for quality
|
|
// Quality threshold (see comments above): compression quality below this
// value triggers the reduced/fast partition search and shape skipping.
double g_qFAST_THRESHOLD = 0.5;

// This limit is used for DualIndex Block and if fQuality is above this limit then Quantization shaking will always be performed
// on all indexs
// NOTE(review): name keeps its historical spelling ("HIGHQULITY") — external
// references may depend on it, so do not rename without a global sweep.
double g_HIGHQULITY_THRESHOLD = 0.7;
|
|
//
|
|
// For a given block mode this sets up the data needed by the compressor
|
|
//
|
|
// Note that BC7 only uses NO_PBIT, ONE_PBIT and TWO_PBIT encodings
|
|
// for endpoints
|
|
//
|
|
|
|
//
// Prepares the per-mode compression state: maps the mode's p-bit encoding to
// the quantiser parity type, distributes the mode's endpoint bits across the
// colour/alpha components, and derives the cluster counts from the index bits.
//
// Note that BC7 only uses NO_PBIT, ONE_PBIT and TWO_PBIT encodings
// for endpoints.
//
void BC7BlockEncoder::BlockSetup(CMP_DWORD blockMode) {
#ifdef USE_DBGTRACE
    DbgTrace(());
#endif
    const auto& modeInfo = bti_cpu[blockMode];

    // Translate the block's p-bit layout into the parity type the quantiser
    // understands.
    switch (modeInfo.pBitType) {
    case NO_PBIT:
        m_parityBits = CART;
        break;
    case ONE_PBIT:
        m_parityBits = SAME_PAR;
        break;
    case TWO_PBIT:
        m_parityBits = BCC;
        break;
    case THREE_PBIT:
        m_parityBits = SAME_FCC;
        break;
    case FOUR_PBIT:
        m_parityBits = FCC;
        break;
    case FIVE_PBIT:
        m_parityBits = FCC_SAME_BCC;
        break;
    }

    if (modeInfo.encodingType == NO_ALPHA) {
        // RGB only: the vector bits are split evenly over three channels.
        const auto channelBits = modeInfo.vectorBits / 3;
        m_componentBits[COMP_RED]   = channelBits;
        m_componentBits[COMP_GREEN] = channelBits;
        m_componentBits[COMP_BLUE]  = channelBits;
        m_componentBits[COMP_ALPHA] = 0;

        m_clusters[0] = 1 << modeInfo.indexBits[0];
        m_clusters[1] = 0;
    } else if (modeInfo.encodingType == COMBINED_ALPHA) {
        // RGBA in a single vector: split evenly over four channels.
        const auto channelBits = modeInfo.vectorBits / 4;
        m_componentBits[COMP_RED]   = channelBits;
        m_componentBits[COMP_GREEN] = channelBits;
        m_componentBits[COMP_BLUE]  = channelBits;
        m_componentBits[COMP_ALPHA] = channelBits;

        m_clusters[0] = 1 << modeInfo.indexBits[0];
        m_clusters[1] = 0;
    } else if (modeInfo.encodingType == SEPARATE_ALPHA) {
        // RGB vector plus a scalar alpha channel with its own index set.
        const auto channelBits = modeInfo.vectorBits / 3;
        m_componentBits[COMP_RED]   = channelBits;
        m_componentBits[COMP_GREEN] = channelBits;
        m_componentBits[COMP_BLUE]  = channelBits;
        m_componentBits[COMP_ALPHA] = modeInfo.scalarBits;

        m_clusters[0] = 1 << modeInfo.indexBits[0];
        m_clusters[1] = 1 << modeInfo.indexBits[1];
    }
}
|
|
|
|
//
|
|
// This function sorts out the bit encoding for the BC7 block and packs everything
|
|
// in the right order for the hardware decoder
|
|
//
|
|
//
|
|
//
|
|
|
|
//
// Packs one single-index BC7 block into the 128-bit output in hardware bit
// order: unary mode header, partition id, per-component endpoint bits,
// optional parity (p) bits, then the per-texel indices — with each fixup
// index losing its MSB, which is implicitly zero in the encoding.
//
// blockMode  - BC7 mode number (also the length of the unary header)
// partition  - partition/shape index for this mode
// colour     - packed endpoint pair per subset (may be swapped in place here)
// indices    - per-subset ramp indices (consumed in partition-table order)
// block      - receives the COMPRESSED_BLOCK_SIZE-byte encoded block
//
void BC7BlockEncoder::EncodeSingleIndexBlock(CMP_DWORD blockMode,
        CMP_DWORD partition,
        CMP_DWORD colour[MAX_SUBSETS][2],
        int indices[MAX_SUBSETS][MAX_SUBSET_SIZE],
        //CMP_DWORD entryCount[MAX_SUBSETS],
        CMP_BYTE block[COMPRESSED_BLOCK_SIZE]) {
#ifdef USE_DBGTRACE
    DbgTrace(("-> WriteBit()"));
#endif
    CMP_DWORD i,j,k;
    CMP_DWORD *partitionTable;
    int bitPosition = 0;    // Position the pointer at the LSB
    CMP_BYTE *basePtr = (CMP_BYTE*)block;
    CMP_DWORD blockIndices[MAX_SUBSET_SIZE];

    // Generate Unary header: blockMode zero bits followed by a single one bit
    for(i=0; i < (int)blockMode; i++) {
        WriteBit(basePtr, bitPosition++, 0);
    }
    WriteBit(basePtr, bitPosition++, 1);

    // Write partition bits (LSB first)
    for(i=0; i<bti_cpu[blockMode].partitionBits; i++) {
        WriteBit(basePtr, bitPosition++, (CMP_BYTE)(partition>>i) & 0x1);
    }

    // Extract the index bits from the partitions
    partitionTable = (CMP_DWORD*)BC7_PARTITIONS_CPU[bti_cpu[blockMode].subsetCount-1][partition];

    CMP_DWORD idxCount[3] = {0, 0, 0};
    bool flipColours[3] = {false, false, false};

    // Sort out the index set and tag whether we need to flip the
    // endpoints to get the correct state in the implicit index bits
    // The implicitly encoded MSB of the fixup index must be 0
    CMP_DWORD fixup[3] = {0, 0, 0};
    switch(bti_cpu[blockMode].subsetCount) {
    case 3:
        fixup[1] = BC7_FIXUPINDICES[2][partition][1];
        fixup[2] = BC7_FIXUPINDICES[2][partition][2];
        break;
    case 2:
        fixup[1] = BC7_FIXUPINDICES[1][partition][1];
        break;
    default:
        // Single subset: the fixup index is always texel 0 (fixup[] init)
        break;
    }

    // Extract indices and mark subsets that need to have their colours flipped to get the
    // right state for the implicit MSB of the fixup index
    for(i=0; i < MAX_SUBSET_SIZE; i++) {
        CMP_DWORD p = partitionTable[i];
        blockIndices[i] = indices[p][idxCount[p]++];

        for(j=0; j<(int)bti_cpu[blockMode].subsetCount; j++) {
            if(i==fixup[j]) {
                // MSB set on a fixup index means the whole subset must be inverted
                if(blockIndices[i] & (1<<(bti_cpu[blockMode].indexBits[0]-1))) {
                    flipColours[j] = true;
                }
            }
        }
    }

    // Now we must flip the endpoints where necessary so that the implicitly encoded
    // index bits have the correct state
    for(i=0; i<(int)bti_cpu[blockMode].subsetCount; i++) {
        if(flipColours[i]) {
            CMP_DWORD temp;
            temp = colour[i][0];
            colour[i][0] = colour[i][1];
            colour[i][1] = temp;
        }
    }

    // ...next flip the indices where necessary
    for(i=0; i<MAX_SUBSET_SIZE; i++) {
        CMP_DWORD p = partitionTable[i];
        if(flipColours[p]) {
            // Invert the index across the ramp: (2^indexBits - 1) - index
            blockIndices[i] = ((1 << bti_cpu[blockMode].indexBits[0]) - 1) - blockIndices[i];
        }
    }

    CMP_DWORD subset, ep, component;

    // Endpoints are stored in the following order RRRR GGGG BBBB (AAAA) (PPPP)
    // i.e. components are packed together
    CMP_DWORD unpackedColours[MAX_SUBSETS][2][MAX_DIMENSION_BIG];
    CMP_DWORD parityBits[MAX_SUBSETS][2];

    // Unpack the colour values for the subsets
    for(i=0; i<bti_cpu[blockMode].subsetCount; i++) {
        CMP_DWORD packedColours[2] = {colour[i][0],
                                      colour[i][1]
                                     };

        // Peel the parity bit(s) off the bottom of each packed endpoint first
        if(bti_cpu[blockMode].pBitType == TWO_PBIT) {
            parityBits[i][0] = packedColours[0] & 1;
            parityBits[i][1] = packedColours[1] & 1;
            packedColours[0] >>= 1;
            packedColours[1] >>= 1;
        } else if(bti_cpu[blockMode].pBitType == ONE_PBIT) {
            // Shared p-bit: both endpoints carry the same parity value,
            // taken from the second packed endpoint
            parityBits[i][0] = packedColours[1] & 1;
            parityBits[i][1] = packedColours[1] & 1;
            packedColours[0] >>= 1;
            packedColours[1] >>= 1;
        } else {
            parityBits[i][0] = 0;
            parityBits[i][1] = 0;
        }

        // Split the remaining packed value into per-component fields,
        // consuming m_componentBits bits per enabled channel
        CMP_DWORD component1;
        for(component1=0; component1<MAX_DIMENSION_BIG; component1++) {
            if(m_componentBits[component1]) {
                unpackedColours[i][0][component1] = packedColours[0] & ((1 << m_componentBits[component1]) - 1);
                unpackedColours[i][1][component1] = packedColours[1] & ((1 << m_componentBits[component1]) - 1);
                packedColours[0] >>= m_componentBits[component1];
                packedColours[1] >>= m_componentBits[component1];
            }
        }
    }

    // Loop over components
    for(component=0; component < MAX_DIMENSION_BIG; component++) {
        // loop over subsets
        for(subset=0; subset<(int)bti_cpu[blockMode].subsetCount; subset++) {
            // Loop over endpoints and write colour bits
            for(ep=0; ep<2; ep++) {
                // Write this component (LSB first)
                for(k = 0; k < m_componentBits[component]; k++) {
                    WriteBit(basePtr,
                             bitPosition++,
                             (CMP_BYTE)(unpackedColours[subset][ep][component] >> k) & 0x1);
                }
            }
        }
    }

    // Now write parity bits if present
    if(bti_cpu[blockMode].pBitType != NO_PBIT) {
        for(subset=0; subset<(int)bti_cpu[blockMode].subsetCount; subset++) {
            if(bti_cpu[blockMode].pBitType == ONE_PBIT) {
                WriteBit(basePtr,
                         bitPosition++,
                         parityBits[subset][0] & 1);
            } else if(bti_cpu[blockMode].pBitType == TWO_PBIT) {
                WriteBit(basePtr,
                         bitPosition++,
                         parityBits[subset][0] & 1);
                WriteBit(basePtr,
                         bitPosition++,
                         parityBits[subset][1] & 1);
            }
        }
    }

    // Now encode the index bits
    for(i=0; i<MAX_SUBSET_SIZE; i++) {
        CMP_DWORD p = partitionTable[i];
        // If this is a fixup index then drop the MSB which is implicitly 0
        if(i==fixup[p]) {
            for(j=0; j<(bti_cpu[blockMode].indexBits[0]-1); j++) {
                WriteBit(basePtr, bitPosition++,(CMP_BYTE)(blockIndices[i]>>j));
            }
        } else {
            for(j=0; j<bti_cpu[blockMode].indexBits[0]; j++) {
                WriteBit(basePtr, bitPosition++,(CMP_BYTE)(blockIndices[i]>>j));
            }
        }
    }

    // Check that we encoded exactly the right number of bits; a mismatch
    // means the mode table and the writes above disagree, so bail out
    if(bitPosition != (COMPRESSED_BLOCK_SIZE * 8)) {
#ifdef USE_DBGTRACE
        DbgTrace(("Error:Encoded incorrect number of bits"));
#endif
        return;
    }

#ifdef USE_DBGTRACE
    DbgTrace(("OUTPUT [%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x]",
              block[ 0],block[ 1],block[ 2],block[ 3],
              block[ 4],block[ 5],block[ 6],block[ 7],
              block[ 8],block[ 9],block[10],block[11],
              block[12],block[13],block[14],block[15]));
#endif

}
|
|
|
|
|
|
//
|
|
// This routine can be used to compress a block to any of the modes with a shared index set
|
|
//
|
|
// It will encode the best result for this mode into a BC7 block
|
|
//
|
|
//
|
|
//
|
|
// For debugging this is a no color 4x4 BC7 block
|
|
//BYTE BlankBC7Block[16] = { 0x40, 0xC0, 0x1F, 0xF0, 0x07, 0xFC, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
|
|
|
|
|
|
//
// Compresses one 4x4 block to the given single-index block mode.
// Pipeline: (1) quantize every candidate partition and record its error,
// (2) sort partitions by error, (3) refine the endpoints of the best
// candidates with the endpoint shakers, (4) pack the winner via
// EncodeSingleIndexBlock. Returns the smallest error found.
//
// in        - 16 input texels, one MAX_DIMENSION_BIG vector per texel
// out       - receives the encoded COMPRESSED_BLOCK_SIZE-byte block
// blockMode - BC7 mode to compress to
//
double BC7BlockEncoder::CompressSingleIndexBlock(
    double in[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG],
    CMP_BYTE out[COMPRESSED_BLOCK_SIZE],
    CMP_DWORD blockMode) {
#ifdef USE_DBGTRACE
    DbgTrace(("<---------CompressSingleIndexBlock----------->"));
#endif
    CMP_DWORD i, k, n;
    CMP_DWORD dimension;

    // Figure out the effective dimension of this block mode
    if(bti_cpu[blockMode].encodingType == NO_ALPHA) {
        dimension = 3;
    } else {
        dimension = 4;
    }

    CMP_DWORD numPartitionModes = 1 << bti_cpu[blockMode].partitionBits;
    CMP_DWORD partitionsToTry = numPartitionModes;

    // Linearly reduce the number of partitions to try as the quality falls below a threshold
    if(m_quality < g_qFAST_THRESHOLD) {
        partitionsToTry = (CMP_DWORD)floor((double)(partitionsToTry * m_partitionSearchSize) + 0.5);
        partitionsToTry = cmp_minT(numPartitionModes, cmp_maxT(1, partitionsToTry));
    }

    CMP_DWORD blockPartition;
    double partition[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG];
    CMP_DWORD entryCount[MAX_SUBSETS];
    CMP_DWORD subset;

#ifdef BC7_DEBUG_TO_RESULTS_TXT
    // NOTE(review): "\C" below is not a valid escape sequence — the intended
    // string was probably "\nCompressSingleIndexBlock\n"; confirm before use.
    fprintf(fp,"\CompressSingleIndexBlock\n");
    fprintf(fp,"blockMode = %d\n",blockMode);
    fprintf(fp,"numPartitionModes = %d\n",numPartitionModes);
    fprintf(fp,"partitionsToTry = %d\n",partitionsToTry);
    fprintf(fp,"m_blockMaxRange = %4.0f\n",m_blockMaxRange);
    fprintf(fp,"m_quantizerRangeThreshold = %4.0f\n",m_quantizerRangeThreshold);
    fprintf(fp,"m_clusters[0] = %d\n",m_clusters[0]);
#endif

#ifdef USE_DBGTRACE
    DbgTrace(("blockMode [%d] numPartitionModes [%d] partitionsToTry [%2d]",
              blockMode,
              numPartitionModes,
              partitionsToTry));
    DbgTrace((" m_blockMaxRange [%2d] m_quantizerRangeThreshold [%4.0f] m_clusters[0] = %d",
              m_blockMaxRange,
              m_quantizerRangeThreshold,
              m_clusters[0]));
#endif

    // Loop over the available partitions for the block mode and quantize them
    // to figure out the best candidates for further refinement
    for(blockPartition = 0;
            blockPartition < partitionsToTry;
            blockPartition++) {
        // Split the input texels into per-subset lists for this shape
        Partition(blockPartition,
                  in,
                  partition,
                  entryCount,
                  blockMode,
                  dimension);

        double error = 0.;
        double outB[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG];
        double direction[MAX_DIMENSION_BIG];
        double step;

        for(subset=0; subset < bti_cpu[blockMode].subsetCount; subset++) {
            int indices[MAX_SUBSETS][MAX_SUBSET_SIZE];

            if(entryCount[subset]) {

                // Choose the quantizer: the annealing variant for many
                // clusters or low-range blocks, the trace variant otherwise
                if((m_clusters[0] > 8) ||
                        (m_blockMaxRange <= m_quantizerRangeThreshold)) {

#ifdef BC7_DEBUG_TO_RESULTS_TXT
                    fprintf(fp,"\noptQuantAnD_d\n");
#endif
                    error += optQuantAnD_d(partition[subset],
                                           entryCount[subset],
                                           m_clusters[0],
                                           indices[subset],
                                           outB,
                                           direction,
                                           &step,
                                           dimension);

#ifdef BC7_DEBUG_TO_RESULTS_TXT
                    // NOTE(review): several integer values below are printed
                    // with %4.2f (e.g. indices, dimension) — format/argument
                    // mismatch; confirm intent before enabling this path.
                    if (blockPartition == 11) {
                        fprintf(fp,"\n");
                        for (int row=0; row<16; row++) {
                            fprintf(fp,"partition[%2d] = %4.2f, %4.2f, %4.2f\n",row,partition[subset][row][0],partition[subset][row][1],partition[subset][row][2]);
                        }

                        fprintf(fp,"\n");
                        for (int row=0; row<16; row++) {
                            fprintf(fp,"indices[0][%2d] = %4.2f\n",row,indices[0][row]);
                        }

                        fprintf(fp,"\n");
                        for (int row=0; row<16; row++) {
                            fprintf(fp,"outB[%2d] = %4.2f, %4.2f, %4.2f\n",row,outB[row][0],outB[row][1],outB[row][2]);
                        }

                        fprintf(fp,"\n");
                        fprintf(fp,"entryCount = %d\n",entryCount[subset]);
                        fprintf(fp,"m_clusters[0] = %d\n",m_clusters[0]);
                        fprintf(fp,"Direction = %4.2f, %4.2f, %4.2f\n",direction[0],direction[1],direction[2]);
                        fprintf(fp,"step = %4.2f\n",step);
                        fprintf(fp,"dimension = %4.2f\n",dimension);
                        fprintf(fp,"error = %4.2f\n",error);
                    }
#endif
                } else {
#ifdef BC7_DEBUG_TO_RESULTS_TXT
                    // NOTE(review): "\o" is an octal-escape prefix, not a
                    // newline — intended string was probably
                    // "\noptQuantTrace_d\n"; confirm before use.
                    fprintf(fp,"\optQuantTrace_d\n");
#endif
                    error += optQuantTrace_d(partition[subset],
                                             entryCount[subset],
                                             m_clusters[0],
                                             indices[subset],
                                             outB,
                                             direction,
                                             &step,
                                             dimension);

#ifdef BC7_DEBUG_TO_RESULTS_TXT
                    if (blockPartition == 11) {
                        fprintf(fp,"\n");
                        for (int row=0; row<16; row++) {
                            fprintf(fp,"partition[%2d] = %4.2f, %4.2f, %4.2f\n",row,partition[subset][row][0],partition[subset][row][1],partition[subset][row][2]);
                        }

                        fprintf(fp,"\n");
                        for (int row=0; row<16; row++) {
                            fprintf(fp,"indices[0][%2d] = %4.2f\n",row,indices[0][row]);
                        }

                        fprintf(fp,"\n");
                        for (int row=0; row<16; row++) {
                            fprintf(fp,"outB[%2d] = %4.2f, %4.2f, %4.2f\n",row,outB[row][0],outB[row][1],outB[row][2]);
                        }

                        fprintf(fp,"\n");
                        fprintf(fp,"entryCount = %d\n",entryCount[subset]);
                        fprintf(fp,"m_clusters[0] = %d\n",m_clusters[0]);
                        fprintf(fp,"Direction = %4.2f, %4.2f, %4.2f\n",direction[0],direction[1],direction[2]);
                        fprintf(fp,"step = %4.2f\n",step);
                        fprintf(fp,"dimension = %4.2f\n",dimension);
                        fprintf(fp,"error = %4.2f\n",error);
                    }
#endif

                }

                // Store off the indices for later (the shaking pass reuses
                // them via m_storedIndices, keyed by partition and subset)
                for(CMP_DWORD idx=0; idx < entryCount[subset]; idx++) {
                    m_storedIndices[blockPartition][subset][idx] = indices[subset][idx];
                }
            }
        }

        m_storedError[blockPartition] = error;
    }

    // Sort the results (m_sortedModes receives partition ids ordered by error)
    sortProjection(m_storedError,
                   m_sortedModes,
                   partitionsToTry);

    // Run shaking (endpoint refinement) pass for partitions that gave the
    // best set of errors from quantization

    // ep_shaker will take its endpoint information from bits[0-2]
    // ep_shaker_2_d will take its information from bits[3]
    int bits[4] = {0,0,0,0};

    // ep_shaker_d needs bits specified individually per channel including parity
    bits[0] = m_componentBits[COMP_RED] + (m_parityBits ? 1:0);
    bits[1] = m_componentBits[COMP_GREEN] + (m_parityBits ? 1:0);
    bits[2] = m_componentBits[COMP_BLUE] + (m_parityBits ? 1:0);

    // ep_shaker_2_d needs bits specified as total bits for both endpoints including parity
    for(i=0; i < dimension; i++) {
        bits[3] += m_componentBits[i];
    }
    bits[3] *= 2;
    if(m_parityBits == BCC) {
        bits[3] += 2;
    } else if (m_parityBits == SAME_PAR) {
        bits[3] += 1;
    }

    int epo_code[MAX_SUBSETS][2][MAX_DIMENSION_BIG];
    double epo[2][MAX_DIMENSION_BIG];
    double outB[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG];

    int bestEndpoints[MAX_SUBSETS][2][MAX_DIMENSION_BIG];
    int bestIndices[MAX_SUBSETS][MAX_SUBSET_SIZE];
    CMP_DWORD bestEntryCount[MAX_SUBSETS];
    CMP_DWORD bestPartition = 0;
    double bestError = DBL_MAX;

    // Extensive shaking is most important when the ramp is short, and
    // when we have less indices. On a long ramp the quality of the
    // initial quantizing is relatively more important
    // We modulate the shake size according to the number of ramp indices
    // - the more indices we have the less shaking should be required to find a near
    // optimal match

    // shakeSize gives the size of the shake cube (for ep_shaker_2_d)
    // ep_shaker always runs on a 1x1x1 cube on both endpoints
    CMP_DWORD shakeSize = 8 - (CMP_DWORD)floor(1.5 * bti_cpu[blockMode].indexBits[0]);
    shakeSize = cmp_maxT(2, cmp_minT((CMP_DWORD)floor(shakeSize * m_quality + 0.5), 6));

    // Shake attempts indicates how many partitions to try to shake
    CMP_DWORD numShakeAttempts = cmp_maxT(1, cmp_minT((CMP_DWORD)floor(8 * m_quality + 0.5), partitionsToTry));

    // Set up all the parameters for the shakers
    // Must increase shake size if these block endpoints use parity
    if((m_parityBits == SAME_PAR) ||
            (m_parityBits == BCC)) {
        shakeSize += 2;
    }

#ifdef USE_DBGTRACE
    DbgTrace(("%2d numPartitionModes %2d SearchSize %3.3f shakeSize %2d numShakeAttempts %2d\n",
              partitionsToTry,
              numPartitionModes,
              m_partitionSearchSize,
              shakeSize,
              numShakeAttempts));
#endif

    // Now do the endpoint shaking, working from the best-ranked partitions
    for(i=0; i < numShakeAttempts; i++) {
        double error = 0;

        blockPartition = m_sortedModes[i];

        Partition(blockPartition,
                  in,
                  partition,
                  entryCount,
                  blockMode,
                  dimension);

        for(subset=0; subset < bti_cpu[blockMode].subsetCount; subset++) {
            if(entryCount[subset]) {
                // If quality is set low or the dimension is not compatible with
                // shaker_d then just run shaker_2_d
                if((m_blockMaxRange > m_shakerRangeThreshold) ||
                        (dimension != 3)) {
                    error += ep_shaker_2_d(partition[subset],
                                           entryCount[subset],
                                           m_storedIndices[blockPartition][subset],
                                           outB,
                                           epo_code[subset],
                                           shakeSize,
                                           m_clusters[0]-1,
                                           bits[3],
                                           dimension,
                                           epo);
                } else {
                    double tempError[2];
                    int tempIndices[MAX_SUBSET_SIZE];
                    int temp_epo_code[2][MAX_DIMENSION_BIG];

                    // Step one - run ep_shaker and ep_shaker_2 in parallel, and get the error from each

                    for(k=0; k < entryCount[subset]; k++) {
                        tempIndices[k] = m_storedIndices[blockPartition][subset][k];
                    }
                    tempError[0] = ep_shaker_d(partition[subset],
                                               entryCount[subset],
                                               tempIndices,
                                               outB,
                                               temp_epo_code,
                                               m_clusters[0]-1,
                                               bits,
                                               (CMP_qt_cpu)m_parityBits,
                                               dimension);

                    tempError[1] = ep_shaker_2_d(partition[subset],
                                                 entryCount[subset],
                                                 m_storedIndices[blockPartition][subset],
                                                 outB,
                                                 epo_code[subset],
                                                 shakeSize,
                                                 m_clusters[0]-1,
                                                 bits[3],
                                                 dimension,
                                                 epo);

                    if(tempError[0] < tempError[1]) {
                        // If ep_shaker did better than ep_shaker_2 then we need to reshake
                        // the output from ep_shaker using ep_shaker_2 for further refinement

                        tempError[1] = ep_shaker_2_d(partition[subset],
                                                     entryCount[subset],
                                                     tempIndices,
                                                     outB,
                                                     temp_epo_code,
                                                     shakeSize,
                                                     m_clusters[0]-1,
                                                     bits[3],
                                                     dimension,
                                                     epo);

                        // Copy the results into the expected location
                        for(k=0; k<entryCount[subset]; k++) {
                            m_storedIndices[blockPartition][subset][k] = tempIndices[k];
                        }

                        for(k=0; k < MAX_DIMENSION_BIG; k++) {
                            epo_code[subset][0][k] = temp_epo_code[0][k];
                            epo_code[subset][1][k] = temp_epo_code[1][k];
                        }
                    }

                    error += tempError[1];
                }
            }
        }

        // Track the best partition/endpoints/indices seen so far
        if(error < bestError) {
            bestPartition = blockPartition;

            for(subset=0; subset < bti_cpu[blockMode].subsetCount; subset++) {
                bestEntryCount[subset] = entryCount[subset];

                if(entryCount[subset]) {
                    for(k=0; k < dimension; k++) {
                        bestEndpoints[subset][0][k] = epo_code[subset][0][k];
                        bestEndpoints[subset][1][k] = epo_code[subset][1][k];
                    }

                    for(n=0; n < entryCount[subset]; n++) {
                        bestIndices[subset][n] = m_storedIndices[blockPartition][subset][n];
                    }
                }
            }

            bestError = error;
        }

        // Early out if we found we can compress with error below the quality threshold
        if (m_errorThreshold > 0) {
            if(bestError <= m_errorThreshold) {
                break;
            }
        }
    }

    // Now we have all the data needed to encode the block
    // We need to pack the endpoints prior to encoding
    CMP_DWORD packedEndpoints[3][2];
    for(subset=0; subset<bti_cpu[blockMode].subsetCount; subset++) {
        if(bestEntryCount[subset]) {
            CMP_DWORD rightAlignment = 0;
            packedEndpoints[subset][0] = 0;
            packedEndpoints[subset][1] = 0;

            // Sort out parity bits: the parity value is packed into bit 0
            // and the component values shift down one bit to make room
            if(m_parityBits != CART) {
                packedEndpoints[subset][0] = bestEndpoints[subset][0][0] & 1;
                packedEndpoints[subset][1] = bestEndpoints[subset][1][0] & 1;
                for(k=0; k<MAX_DIMENSION_BIG; k++) {
                    bestEndpoints[subset][0][k] >>= 1;
                    bestEndpoints[subset][1][k] >>= 1;
                }
                rightAlignment++;
            }

            // Fixup endpoints: pack each enabled component field above the
            // previously packed ones
            for(k=0; k<dimension; k++) {
                if(m_componentBits[k]) {
                    packedEndpoints[subset][0] |= bestEndpoints[subset][0][k] << rightAlignment;
                    packedEndpoints[subset][1] |= bestEndpoints[subset][1][k] << rightAlignment;
                    rightAlignment += m_componentBits[k];
                }
            }
        }
    }

    // Save the data to output
    EncodeSingleIndexBlock(blockMode,
                           bestPartition,
                           packedEndpoints,
                           bestIndices,
                           out);
    return bestError;
}
|
|
|
|
// Channel permutations applied by the BC7 "rotation" field in the dual-index
// modes. Entry [r][0] names the source component routed to the scalar
// (alpha) channel and [r][1..3] name the components routed to R, G, B
// (see CompressDualIndexBlock, which builds aBlock from [r][0] and cBlock
// from [r][1..3]). Row 0 is the identity; rows 1-3 swap alpha with R/G/B.
static CMP_DWORD componentRotations[4][4] = {
    {COMP_ALPHA, COMP_RED, COMP_GREEN, COMP_BLUE},
    {COMP_RED, COMP_ALPHA, COMP_GREEN, COMP_BLUE},
    {COMP_GREEN, COMP_RED, COMP_ALPHA, COMP_BLUE},
    {COMP_BLUE, COMP_RED, COMP_GREEN, COMP_ALPHA}
};
|
|
|
|
//
// Packs one dual-index BC7 block into the 128-bit output in hardware bit
// order: unary mode header, rotation bits, index-selection bit(s), vector
// (RGB) endpoints, scalar (alpha) endpoints, then the two index sets — with
// texel 0 of each set losing its MSB, which is implicitly zero.
//
// blockMode         - BC7 mode number (also the length of the unary header)
// indexSelection    - nonzero swaps which index set drives vector vs scalar
// componentRotation - rotation field value (see componentRotations table)
// endpoint          - [0] = vector endpoint pair, [1] = scalar endpoint pair
//                     (may be swapped in place here)
// indices           - the two per-texel index sets (may be inverted in place)
// out               - receives the COMPRESSED_BLOCK_SIZE-byte encoded block
//
void BC7BlockEncoder::EncodeDualIndexBlock(CMP_DWORD blockMode,
        CMP_DWORD indexSelection,
        CMP_DWORD componentRotation,
        int endpoint[2][2][MAX_DIMENSION_BIG],
        int indices[2][MAX_SUBSET_SIZE],
        CMP_BYTE out[COMPRESSED_BLOCK_SIZE]) {

#ifdef USE_DBGTRACE
    DbgTrace(("-> WriteBit()"));
#endif
    CMP_DWORD i,j,k;
    int bitPosition = 0;    // Position the pointer at the LSB
    CMP_BYTE *basePtr = out;
    CMP_DWORD idxBits[2];
    CMP_BOOL swapIndices;

    // Generate Unary header for this mode
    for(i=0; i<blockMode; i++) {
        WriteBit(basePtr, bitPosition++, 0);
    }
    WriteBit(basePtr, bitPosition++, 1);

    // Write rotation bits (LSB first)
    // NOTE(review): the mask here is 0xff rather than 0x1 — harmless only if
    // WriteBit masks its value argument down to one bit internally; confirm.
    for(i=0; i<bti_cpu[blockMode].rotationBits; i++) {
        WriteBit(basePtr, bitPosition++, (CMP_BYTE)((componentRotation>>i) & 0xff));
    }

    // Write index selector bits
    for(i=0; i<bti_cpu[blockMode].indexModeBits; i++) {
        WriteBit(basePtr, bitPosition++, (CMP_BYTE)(indexSelection ? 1: 0));
    }

    // When index selection is on, the two sets swap their bit widths and the
    // sets themselves are written in swapped order below
    if(indexSelection) {
        swapIndices = TRUE;
        idxBits[0] = bti_cpu[blockMode].indexBits[1];
        idxBits[1] = bti_cpu[blockMode].indexBits[0];
    } else {
        swapIndices = FALSE;
        idxBits[0] = bti_cpu[blockMode].indexBits[0];
        idxBits[1] = bti_cpu[blockMode].indexBits[1];
    }

    bool flipColours[2] = {false, false};

    // Indicate if we need to fixup the indices: the implicitly encoded MSB
    // of texel 0's index must be 0 in each set
    if(indices[0][0] & (1<<(idxBits[0]-1))) {
        flipColours[0] = true;
    }
    if(indices[1][0] & (1<<(idxBits[1]-1))) {
        flipColours[1] = true;
    }

    // Fixup the indices: invert every index across the ramp
    for(i=0; i<2; i++) {
        if(flipColours[i]) {
            for(j=0; j<MAX_SUBSET_SIZE; j++) {
                indices[i][j] = ((1 << idxBits[i]) - 1) - indices[i][j];
            }
        }
    }

    // Now fixup the endpoints so that the implicitly encoded
    // index bits have the correct state
    for(i=0; i<2; i++) {
        if(flipColours[i]) {
            for(k=0; k<4; k++) {
                CMP_DWORD temp;
                temp = endpoint[i][0][k];
                endpoint[i][0][k] = endpoint[i][1][k];
                endpoint[i][1][k] = temp;
            }
        }
    }

    CMP_DWORD ep, component;
    // Encode the colour and alpha information
    CMP_DWORD vectorComponentBits = bti_cpu[blockMode].vectorBits / 3;

    // Loop over components: RGB channels come from the vector endpoint pair
    // (endpoint[0]), alpha from the scalar pair (endpoint[1])
    for(component=0; component < MAX_DIMENSION_BIG; component++) {
        if(component != COMP_ALPHA) {
            for(ep=0; ep<2; ep++) {
                for(k=0; k<vectorComponentBits; k++) {
                    WriteBit(basePtr,
                             bitPosition++,
                             (CMP_BYTE)((endpoint[0][ep][component] >> k) & 0x1));
                }
            }
        } else {
            for(ep=0; ep<2; ep++) {
                for(j=0; j<bti_cpu[blockMode].scalarBits; j++) {
                    WriteBit(basePtr,
                             bitPosition++,
                             (CMP_BYTE)((endpoint[1][ep][0] >> j) & 0x1));
                }
            }
        }
    }

    // Now encode the index bits
    for(i=0; i<2; i++) {
        CMP_DWORD idxSelect = i;

        if(swapIndices) {
            idxSelect = i ^ 1;
        }
        for(j=0; j<MAX_SUBSET_SIZE; j++) {
            if(j==0) {
                // Texel 0 is the fixup index: drop its implicitly-zero MSB
                for(k=0; k<(idxBits[idxSelect]-1); k++) {
                    WriteBit(basePtr, bitPosition++,(CMP_BYTE)(indices[idxSelect][j]>>k));
                }
            } else {
                for(k=0; k<idxBits[idxSelect]; k++) {
                    WriteBit(basePtr, bitPosition++,(CMP_BYTE)(indices[idxSelect][j]>>k));
                }
            }
        }
    }

    // Check that we encoded exactly the right number of bits; a mismatch
    // means the mode table and the writes above disagree, so bail out
    if(bitPosition != (COMPRESSED_BLOCK_SIZE * 8)) {
        return;
    }

#ifdef USE_DBGTRACE
    DbgTrace(("OUTPUT [%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x,%2x]",
              out[ 0],out[ 1],out[ 2],out[ 3],
              out[ 4],out[ 5],out[ 6],out[ 7],
              out[ 8],out[ 9],out[10],out[11],
              out[12],out[13],out[14],out[15]));
#endif

}
|
|
|
|
|
|
double BC7BlockEncoder::CompressDualIndexBlock(double in[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG],
|
|
CMP_BYTE out[COMPRESSED_BLOCK_SIZE],
|
|
CMP_DWORD blockMode) {
|
|
#ifdef USE_DBGTRACE
|
|
DbgTrace(("<---------CompressDualIndexBlock----------->"));
|
|
#endif
|
|
CMP_DWORD i;
|
|
double cBlock[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG];
|
|
double aBlock[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG];
|
|
|
|
CMP_DWORD maxRotation = 1 << bti_cpu[blockMode].rotationBits;
|
|
CMP_DWORD rotation;
|
|
|
|
CMP_DWORD maxIndexSelection = 1 << bti_cpu[blockMode].indexModeBits;
|
|
CMP_DWORD indexSelection;
|
|
|
|
int indices[2][MAX_SUBSET_SIZE];
|
|
double outQ[2][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG];
|
|
double direction[MAX_DIMENSION_BIG];
|
|
double step;
|
|
|
|
double quantizerError;
|
|
double bestQuantizerError = DBL_MAX;
|
|
double overallError;
|
|
double bestOverallError = DBL_MAX;
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fprintf(fp,"\nCompressDualIndexBlock\n");
|
|
fprintf(fp,"blockMode = %d\n",blockMode);
|
|
fprintf(fp,"maxIndexSelection = %d\n",maxIndexSelection);
|
|
fprintf(fp,"maxRotation = %d\n",maxRotation);
|
|
fprintf(fp,"m_blockMaxRange = %4.0f\n",m_blockMaxRange);
|
|
fprintf(fp,"m_quantizerRangeThreshold = %4.0f\n",m_quantizerRangeThreshold);
|
|
#endif
|
|
|
|
// Go through each possible rotation and selection of indices
|
|
for(rotation = 0; rotation < maxRotation; rotation++) {
|
|
// A
|
|
|
|
|
|
for(i=0; i<MAX_SUBSET_SIZE; i++) {
|
|
cBlock[i][COMP_RED] = in[i][componentRotations[rotation][1]];
|
|
cBlock[i][COMP_GREEN] = in[i][componentRotations[rotation][2]];
|
|
cBlock[i][COMP_BLUE] = in[i][componentRotations[rotation][3]];
|
|
|
|
aBlock[i][COMP_RED] = in[i][componentRotations[rotation][0]];
|
|
aBlock[i][COMP_GREEN] = in[i][componentRotations[rotation][0]];
|
|
aBlock[i][COMP_BLUE] = in[i][componentRotations[rotation][0]];
|
|
}
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fprintf(fp,"\ncBlock[16][3]\n");
|
|
for(i=0; i<MAX_SUBSET_SIZE; i++) {
|
|
fprintf(fp,"%4.0f, %4.0f, %4.0f\n",cBlock[i][COMP_RED],cBlock[i][COMP_GREEN],cBlock[i][COMP_BLUE]);
|
|
}
|
|
|
|
|
|
fprintf(fp,"\naBlock[16][3]\n");
|
|
for(i=0; i<MAX_SUBSET_SIZE; i++) {
|
|
fprintf(fp,"%4.0f, %4.0f, %4.0f\n",aBlock[i][COMP_RED],aBlock[i][COMP_GREEN],aBlock[i][COMP_BLUE]);
|
|
}
|
|
#endif
|
|
|
|
for(indexSelection = 0; indexSelection < maxIndexSelection; indexSelection++) {
|
|
// B
|
|
quantizerError = 0.;
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fprintf(fp,"\n-------------- Quantize the vector block ----------------\n");
|
|
#endif
|
|
// Quantize the vector block
|
|
if(m_blockMaxRange <= m_quantizerRangeThreshold) {
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fprintf(fp,"\noptQuantAnD_d\n");
|
|
fprintf(fp,"IndexSelection = %d\n",indexSelection);
|
|
fprintf(fp,"NumClusters = %d\n",1 << bti_cpu[blockMode].indexBits[0 ^ indexSelection]);
|
|
#endif
|
|
quantizerError = optQuantAnD_d(cBlock,
|
|
MAX_SUBSET_SIZE,
|
|
(1 << bti_cpu[blockMode].indexBits[0 ^ indexSelection]),
|
|
indices[0],
|
|
outQ[0],
|
|
direction,
|
|
&step,
|
|
3);
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fprintf(fp,"\n");
|
|
for (int row=0; row<16; row++) {
|
|
fprintf(fp,"indices[0][%2d] = %4.2f\n",row,indices[0][row]);
|
|
}
|
|
|
|
fprintf(fp,"\n");
|
|
for (int row=0; row<16; row++) {
|
|
fprintf(fp,"outQ[0][%2d] = %4.2f, %4.2f, %4.2f\n",row,outQ[0][row][0],outQ[0][row][1],outQ[0][row][2]);
|
|
}
|
|
|
|
fprintf(fp,"\n");
|
|
fprintf(fp,"Direction = %4.2f, %4.2f, %4.2f\n",direction[0],direction[1],direction[2]);
|
|
fprintf(fp,"step = %4.2f\n",step);
|
|
fprintf(fp,"quantizerError = %4.2f\n",quantizerError);
|
|
#endif
|
|
|
|
} else {
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fprintf(fp,"\noptQuantTrace_d\n");
|
|
fprintf(fp,"IndexSelection = %d\n",indexSelection);
|
|
fprintf(fp,"NumClusters = %d\n",1 << bti_cpu[blockMode].indexBits[0 ^ indexSelection]);
|
|
#endif
|
|
quantizerError = optQuantTrace_d(cBlock,
|
|
MAX_SUBSET_SIZE,
|
|
(1 << bti_cpu[blockMode].indexBits[0 ^ indexSelection]),
|
|
indices[0],
|
|
outQ[0],
|
|
direction,
|
|
&step,
|
|
3);
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fprintf(fp,"\n");
|
|
for (int row=0; row<16; row++) {
|
|
fprintf(fp,"indices[0][%2d] = %4.2f\n",row,indices[0][row]);
|
|
}
|
|
|
|
fprintf(fp,"\n");
|
|
for (int row=0; row<16; row++) {
|
|
fprintf(fp,"outQ[0][%2d] = %4.2f, %4.2f, %4.2f\n",row,outQ[0][row][0],outQ[0][row][1],outQ[0][row][2]);
|
|
}
|
|
|
|
fprintf(fp,"\n");
|
|
fprintf(fp,"Direction = %4.2f, %4.2f, %4.2f\n",direction[0],direction[1],direction[2]);
|
|
fprintf(fp,"step = %4.2f\n",step);
|
|
fprintf(fp,"quantizerError = %4.2f\n",quantizerError);
|
|
#endif
|
|
|
|
}
|
|
|
|
// Quantize the scalar block
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fprintf(fp,"\nQuantize the scalar block\n");
|
|
#endif
|
|
if(m_blockMaxRange <= m_quantizerRangeThreshold) {
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fprintf(fp,"\noptQuantAnD_d\n");
|
|
fprintf(fp,"IndexSelection = %d\n",indexSelection);
|
|
fprintf(fp,"NumClusters = %d\n",1 << bti_cpu[blockMode].indexBits[1 ^ indexSelection]);
|
|
#endif
|
|
quantizerError += optQuantAnD_d(aBlock,
|
|
MAX_SUBSET_SIZE,
|
|
(1 << bti_cpu[blockMode].indexBits[1 ^ indexSelection]),
|
|
indices[1],
|
|
outQ[1],
|
|
direction,
|
|
&step,
|
|
3) / 3.;
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fprintf(fp,"\n");
|
|
for (int row=0; row<16; row++) {
|
|
fprintf(fp,"indices[1][%2d] = %4.2f\n",row,indices[1][row]);
|
|
}
|
|
|
|
fprintf(fp,"\n");
|
|
for (int row=0; row<16; row++) {
|
|
fprintf(fp,"outQ[1][%2d] = %4.2f, %4.2f, %4.2f\n",row,outQ[1][row][0],outQ[1][row][1],outQ[1][row][2]);
|
|
}
|
|
|
|
fprintf(fp,"\n");
|
|
fprintf(fp,"Direction = %4.2f, %4.2f, %4.2f\n",direction[0],direction[1],direction[2]);
|
|
fprintf(fp,"step = %4.2f\n",step);
|
|
fprintf(fp,"quantizerError = %4.2f\n",quantizerError);
|
|
#endif
|
|
} else {
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fprintf(fp,"\noptQuantTrace_d\n");
|
|
fprintf(fp,"IndexSelection = %d\n",indexSelection);
|
|
fprintf(fp,"NumClusters = %d\n",1 << bti_cpu[blockMode].indexBits[1 ^ indexSelection]);
|
|
#endif
|
|
quantizerError += optQuantTrace_d(aBlock,
|
|
MAX_SUBSET_SIZE,
|
|
(1 << bti_cpu[blockMode].indexBits[1 ^ indexSelection]),
|
|
indices[1],
|
|
outQ[1],
|
|
direction,
|
|
&step,
|
|
3) / 3.;
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fprintf(fp,"\n");
|
|
for (int row=0; row<16; row++) {
|
|
fprintf(fp,"indices[1][%2d] = %4.2f\n",row,indices[1][row]);
|
|
}
|
|
|
|
fprintf(fp,"\n");
|
|
for (int row=0; row<16; row++) {
|
|
fprintf(fp,"outQ[1][%2d] = %4.2f, %4.2f, %4.2f\n",row,outQ[1][row][0],outQ[1][row][1],outQ[1][row][2]);
|
|
}
|
|
|
|
fprintf(fp,"\n");
|
|
fprintf(fp,"Direction = %4.2f, %4.2f, %4.2f\n",direction[0],direction[1],direction[2]);
|
|
fprintf(fp,"step = %4.2f\n",step);
|
|
fprintf(fp,"quantizerError = %4.2f\n",quantizerError);
|
|
#endif
|
|
|
|
}
|
|
|
|
// If quality is high then run the full shaking for this config and
|
|
// store the result if it beats the best overall error
|
|
// Otherwise only run the shaking if the error is better than the best
|
|
// quantizer error
|
|
if((m_quality > g_HIGHQULITY_THRESHOLD) || (quantizerError <= bestQuantizerError)) {
|
|
// Shake size gives the size of the shake cube
|
|
CMP_DWORD shakeSize;
|
|
|
|
shakeSize = cmp_maxT(2, cmp_minT((CMP_DWORD)(6 * m_quality), 6));
|
|
|
|
int bits[2][4];
|
|
|
|
// Specify number of bits for vector block
|
|
bits[0][COMP_RED] = m_componentBits[COMP_RED];
|
|
bits[0][COMP_GREEN] = m_componentBits[COMP_GREEN];
|
|
bits[0][COMP_BLUE] = m_componentBits[COMP_BLUE];
|
|
bits[0][3] = 2 * (m_componentBits[COMP_RED] + m_componentBits[COMP_GREEN] + m_componentBits[COMP_BLUE]);
|
|
|
|
// Specify number of bits for scalar block
|
|
bits[1][0] = m_componentBits[COMP_ALPHA];
|
|
bits[1][1] = m_componentBits[COMP_ALPHA];
|
|
bits[1][2] = m_componentBits[COMP_ALPHA];
|
|
bits[1][3] = 6 * m_componentBits[COMP_ALPHA];
|
|
|
|
overallError = 0;
|
|
int epo_code[2][2][MAX_DIMENSION_BIG];
|
|
double epo[2][MAX_DIMENSION_BIG];
|
|
|
|
if(m_blockMaxRange > m_shakerRangeThreshold) {
|
|
overallError += ep_shaker_2_d(cBlock,
|
|
MAX_SUBSET_SIZE,
|
|
indices[0],
|
|
outQ[0],
|
|
epo_code[0],
|
|
shakeSize,
|
|
(1 << bti_cpu[blockMode].indexBits[0 ^ indexSelection])-1,
|
|
bits[0][3],
|
|
3,
|
|
epo);
|
|
} else {
|
|
ep_shaker_d(cBlock,
|
|
MAX_SUBSET_SIZE,
|
|
indices[0],
|
|
outQ[0],
|
|
epo_code[0],
|
|
(1 << bti_cpu[blockMode].indexBits[0 ^ indexSelection])-1,
|
|
bits[0],
|
|
(CMP_qt_cpu)0,
|
|
3);
|
|
|
|
overallError += ep_shaker_2_d(cBlock,
|
|
MAX_SUBSET_SIZE,
|
|
indices[0],
|
|
outQ[0],
|
|
epo_code[0],
|
|
shakeSize,
|
|
(1 << bti_cpu[blockMode].indexBits[0 ^ indexSelection])-1,
|
|
bits[0][3],
|
|
3,
|
|
epo);
|
|
}
|
|
|
|
if(m_blockMaxRange > m_shakerRangeThreshold) {
|
|
overallError += ep_shaker_2_d(aBlock,
|
|
MAX_SUBSET_SIZE,
|
|
indices[1],
|
|
outQ[1],
|
|
epo_code[1],
|
|
shakeSize,
|
|
(1 << bti_cpu[blockMode].indexBits[1 ^ indexSelection])-1,
|
|
bits[1][3],
|
|
3,
|
|
epo) / 3.;
|
|
} else {
|
|
ep_shaker_d(aBlock,
|
|
MAX_SUBSET_SIZE,
|
|
indices[1],
|
|
outQ[1],
|
|
epo_code[1],
|
|
(1 << bti_cpu[blockMode].indexBits[1 ^ indexSelection])-1,
|
|
bits[1],
|
|
(CMP_qt_cpu)0,
|
|
3);
|
|
|
|
overallError += ep_shaker_2_d(aBlock,
|
|
MAX_SUBSET_SIZE,
|
|
indices[1],
|
|
outQ[1],
|
|
epo_code[1],
|
|
shakeSize,
|
|
(1 << bti_cpu[blockMode].indexBits[1 ^ indexSelection])-1,
|
|
bits[1][3],
|
|
3,
|
|
epo) / 3.;
|
|
}
|
|
|
|
// If we beat the previous best then encode the block
|
|
if(overallError < bestOverallError) {
|
|
EncodeDualIndexBlock(blockMode,
|
|
indexSelection,
|
|
rotation,
|
|
epo_code,
|
|
indices,
|
|
out);
|
|
|
|
bestOverallError = overallError;
|
|
}
|
|
|
|
if(quantizerError < bestQuantizerError) {
|
|
bestQuantizerError = quantizerError;
|
|
}
|
|
|
|
}
|
|
} // B
|
|
} // A
|
|
return bestOverallError;
|
|
}
|
|
|
|
|
|
|
|
//
// This routine compresses a block and returns the RMS error
//
#include <stdio.h>  // NOTE(review): redundant — <stdio.h> is already included at the top of this file
|
|
|
|
|
|
double BC7BlockEncoder::CompressBlock(double in[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG],
|
|
CMP_BYTE out[COMPRESSED_BLOCK_SIZE]) {
|
|
|
|
#ifdef USE_DBGTRACE
|
|
DbgTrace(());
|
|
#endif
|
|
CMP_DWORD i, j;
|
|
CMP_BOOL blockNeedsAlpha = FALSE;
|
|
CMP_BOOL blockAlphaZeroOne = FALSE;
|
|
CMP_DWORD validModeMask = m_validModeMask;
|
|
CMP_BOOL encodedBlock = FALSE;
|
|
|
|
#ifdef USE_CMP_BC7_CORE
|
|
if (m_performance < 0.01) {
|
|
// prototype code for next revision, currently accessible only through SDK
|
|
unsigned char srcBlock[64];
|
|
|
|
int px=0;
|
|
for (i=0; i<16; i++)
|
|
{
|
|
srcBlock[px++] = (unsigned char)(in[i][0]);
|
|
srcBlock[px++] = (unsigned char)(in[i][1]);
|
|
srcBlock[px++] = (unsigned char)(in[i][2]);
|
|
srcBlock[px++] = (unsigned char)(in[i][3]);
|
|
}
|
|
|
|
CompressBlockBC7(srcBlock,16,out);
|
|
|
|
return 0.0f;
|
|
}
|
|
#endif
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fp = fopen("debugdata.txt","w");
|
|
if (fp) {
|
|
fprintf(fp,"Data INPUT\n");
|
|
double data[16][4];
|
|
memcpy(data,in,sizeof(data));
|
|
for (int row=0; row<16; row++)
|
|
fprintf(fp,"%4.0f, %4.0f, %4.0f\n", data[row][0],data[row][1],data[row][2]);
|
|
#endif
|
|
|
|
for(i=0; i<MAX_DIMENSION_BIG; i++) {
|
|
m_blockMin[i] = DBL_MAX;
|
|
m_blockMax[i] = 0.0;
|
|
m_blockRange[i] = 0.0;
|
|
}
|
|
|
|
// Check if the input block has any alpha values that are not 1
|
|
// We assume 8-bit input here, so 1 is mapped to 255.
|
|
// Also check if the block encodes an explicit zero or one in the
|
|
// alpha channel. If so then we might need also need special as the
|
|
// block may have a thresholded or punch-through alpha
|
|
for(i=0; i<MAX_SUBSET_SIZE; i++) {
|
|
if(in[i][COMP_ALPHA] != 255.0) {
|
|
blockNeedsAlpha = TRUE;
|
|
} else if((in[i][COMP_ALPHA] == 255.0) ||
|
|
(in[i][COMP_ALPHA] == 0.0)) {
|
|
blockAlphaZeroOne = TRUE;
|
|
}
|
|
}
|
|
|
|
for(i=0; i<MAX_SUBSET_SIZE; i++) {
|
|
for(j=0; j<MAX_DIMENSION_BIG; j++) {
|
|
m_blockMin[j] = (in[i][j] < m_blockMin[j]) ? in[i][j] : m_blockMin[j];
|
|
m_blockMax[j] = (in[i][j] > m_blockMax[j]) ? in[i][j] : m_blockMax[j];
|
|
}
|
|
}
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fprintf(fp,"m_blockMin[0] = %4.2f\n",m_blockMin[0]);
|
|
fprintf(fp,"m_blockMin[1] = %4.2f\n",m_blockMin[1]);
|
|
fprintf(fp,"m_blockMin[2] = %4.2f\n",m_blockMin[2]);
|
|
fprintf(fp,"m_blockMin[3] = %4.2f\n\n",m_blockMin[3]);
|
|
|
|
fprintf(fp,"m_blockMax[0] = %4.2f\n",m_blockMax[0]);
|
|
fprintf(fp,"m_blockMax[1] = %4.2f\n",m_blockMax[1]);
|
|
fprintf(fp,"m_blockMax[2] = %4.2f\n",m_blockMax[2]);
|
|
fprintf(fp,"m_blockMax[3] = %4.2f\n\n",m_blockMax[3]);
|
|
#endif
|
|
|
|
m_blockRange[0] = m_blockMax[0] - m_blockMin[0];
|
|
m_blockRange[1] = m_blockMax[1] - m_blockMin[1];
|
|
m_blockRange[2] = m_blockMax[2] - m_blockMin[2];
|
|
m_blockRange[3] = m_blockMax[3] - m_blockMin[3];
|
|
m_blockMaxRange = cmp_maxT(m_blockRange[0], m_blockRange[1]);
|
|
m_blockMaxRange = cmp_maxT(m_blockMaxRange, m_blockRange[2]);
|
|
m_blockMaxRange = cmp_maxT(m_blockMaxRange, m_blockRange[3]);
|
|
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fprintf(fp,"m_blockRange[0] = %4.2f\n",m_blockRange[0]);
|
|
fprintf(fp,"m_blockRange[1] = %4.2f\n",m_blockRange[1]);
|
|
fprintf(fp,"m_blockRange[2] = %4.2f\n",m_blockRange[2]);
|
|
fprintf(fp,"m_blockRange[3] = %4.2f\n",m_blockRange[3]);
|
|
fprintf(fp,"m_blockMaxRange = %4.2f\n\n",m_blockMaxRange);
|
|
|
|
fprintf(fp,"=========================================\n");
|
|
#endif
|
|
|
|
|
|
// Initial loop - go through the block modes and get the ones that are valid
|
|
for(CMP_DWORD blockMode=0; blockMode < NUM_BLOCK_TYPES; blockMode++) {
|
|
// Check if this mode is allowed based on the global settings
|
|
if(!(validModeMask & (1 << blockMode))) {
|
|
continue;
|
|
}
|
|
|
|
// If the block needs Alpha and this mode doesn't support alpha then
|
|
// indicate that this is not a valid mode and continue
|
|
if((blockNeedsAlpha == TRUE) &&
|
|
(bti_cpu[blockMode].encodingType == NO_ALPHA)) {
|
|
validModeMask &= ~(1<<blockMode);
|
|
}
|
|
|
|
// Optional restriction for colour-only blocks so that they
|
|
// don't use modes that have combined colour+alpha - this
|
|
// avoids the possibility that the encoder might choose an
|
|
// alpha other than 1.0 (due to parity) and cause something to
|
|
// become accidentally slightly transparent (it's possible that
|
|
// when encoding 3-component texture applications will assume that
|
|
// the 4th component can safely be assumed to be 1.0 all the time)
|
|
if((blockNeedsAlpha == FALSE) &&
|
|
(m_colourRestrict == TRUE) &&
|
|
(bti_cpu[blockMode].encodingType == COMBINED_ALPHA)) {
|
|
validModeMask &= ~(1<<blockMode);
|
|
}
|
|
|
|
// Optional restriction for blocks with alpha to avoid issues with
|
|
// punch-through or thresholded alpha encoding
|
|
if((blockNeedsAlpha == TRUE) &&
|
|
(m_alphaRestrict == TRUE) &&
|
|
(blockAlphaZeroOne == TRUE) &&
|
|
(bti_cpu[blockMode].encodingType == COMBINED_ALPHA)) {
|
|
validModeMask &= ~(1<<blockMode);
|
|
}
|
|
}
|
|
|
|
assert(validModeMask != 0);
|
|
|
|
#ifdef USE_DBGTRACE
|
|
DbgTrace(("validModeMask [%x]",validModeMask));
|
|
#endif
|
|
// Try all the legal block modes that we flagged
|
|
|
|
CMP_BYTE temporaryOutputBlock[COMPRESSED_BLOCK_SIZE];
|
|
double bestError = DBL_MAX;
|
|
double thisError;
|
|
CMP_DWORD bestblockMode=99;
|
|
|
|
// We change the order in which we visit the block modes to try to maximize the chance
|
|
// that we manage to early out as quickly as possible.
|
|
// This is a significant performance optimization for the lower quality modes where the
|
|
// exit threshold is higher, and also tends to improve quality (as the generally higher quality
|
|
// modes are now enumerated earlier, so the first encoding that passes the threshold will
|
|
// tend to pass by a greater margin than if we used a dumb ordering, and thus overall error will
|
|
// be improved)
|
|
CMP_DWORD blockModeOrder[NUM_BLOCK_TYPES] = {6, 4, 3, 1, 0, 2, 7, 5};
|
|
|
|
// used for debugging and mode tests
|
|
// 76543210
|
|
// validModeMask = 0b00100000;
|
|
|
|
for(CMP_DWORD j1=0; j1 < NUM_BLOCK_TYPES; j1++) {
|
|
CMP_DWORD blockMode = blockModeOrder[j1];
|
|
CMP_DWORD Mode = 0x0001 << blockMode;
|
|
|
|
if(!(validModeMask & Mode)) {
|
|
continue;
|
|
}
|
|
|
|
// CPU:HPC #1
|
|
// Setup mode parameters for this block
|
|
BlockSetup(blockMode);
|
|
|
|
if(bti_cpu[blockMode].encodingType != SEPARATE_ALPHA) {
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fprintf(fp,"=================== CompressSingleIndexBlock ======================\n");
|
|
#endif
|
|
thisError = CompressSingleIndexBlock(in, temporaryOutputBlock, blockMode);
|
|
|
|
} else {
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fprintf(fp,"================== CompressDualIndexBlock =======================\n");
|
|
#endif
|
|
|
|
thisError = CompressDualIndexBlock(in, temporaryOutputBlock, blockMode);
|
|
}
|
|
|
|
// If this compression did better than all previous attempts then copy the result
|
|
// to the output block
|
|
if(thisError < bestError) {
|
|
for(i=0; i < COMPRESSED_BLOCK_SIZE; i++) {
|
|
out[i] = temporaryOutputBlock[i];
|
|
}
|
|
bestError = thisError;
|
|
encodedBlock = TRUE;
|
|
bestblockMode = blockMode;
|
|
}
|
|
|
|
// If we have achieved an error lower than the requirement threshold then just exit now
|
|
// Early out if we found we can compress with error below the quality threshold
|
|
if (m_errorThreshold > 0) {
|
|
if(bestError <= m_errorThreshold) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(bestError < m_smallestError) {
|
|
m_smallestError = bestError;
|
|
}
|
|
if(bestError > m_largestError) {
|
|
m_largestError = bestError;
|
|
}
|
|
|
|
if (!encodedBlock) {
|
|
// return some sort of error and abort sequence!
|
|
encodedBlock = FALSE;
|
|
}
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
fclose(fp);
|
|
#endif
|
|
|
|
return bestError;
|
|
|
|
#ifdef BC7_DEBUG_TO_RESULTS_TXT
|
|
} else return (0);
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|