//===============================================================================
// Copyright (c) 2007-2016  Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2004-2006 ATI Technologies Inc.
//===============================================================================
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//

#include <assert.h>
#include <math.h>
#include <float.h>

#include "3dquant_constants.h"
#include "3dquant_vpc.h"
#include "shake.h"
#include "bc7_utils.h"
#include "debug.h"
#include "bc7_encode.h"

#define LOG_CL_BASE 2
#define BIT_BASE 5

#define LOG_CL_RANGE 5
#define BIT_RANGE 9

#define CLT(cl)   (cl-LOG_CL_BASE)
#define BTT(bits) (bits-BIT_BASE)

int npv_nd[][2*MAX_DIMENSION_BIG]= {
    {}, // 0 - for alingment
    {}, // 1
    {}, // 2
    {1,2,4,8,16,32}, //3
    {1,2,4}          //4
};

// Reworking these tables
// Since endpoints can be flipped for the encoding (reclaims an index bit) this means
// that the parity tables must be symmetric for the two endpoints, as parity could also be flipped
// The tables above do not guarantee this behaviour, but this one does

int par_vectors_nd[][2*MAX_DIMENSION_BIG][(1<<(2*MAX_DIMENSION_BIG-1))][2][MAX_DIMENSION_BIG] = {
    {{{}}}, // 0 - for alingment

    {{{}}}, //1D

    {{{}}}, //2D

    {
        {{}},

// 3*n+1    BCC          3*n+1        Cartesian 3*n            //same parity
        {
            // SAME_PAR
            {{0,0,0},{0,0,0}},
            {{1,1,1},{1,1,1}}
        },
// 3*n+2    BCC          3*n+1        BCC          3*n+1
        {
            // BCC
            {{0,0,0},{0,0,0}},
            {{0,0,0},{1,1,1}},
            {{1,1,1},{0,0,0}},
            {{1,1,1},{1,1,1}}
        },
// 3*n+3    FCC                    ???                        // ??????
// BCC with FCC same or inverted, symmetric
        {
            // BCC_SAME_FCC
            {{0,0,0},{0,0,0}},
            {{1,1,0},{1,1,0}},
            {{1,0,1},{1,0,1}},
            {{0,1,1},{0,1,1}},

            {{0,0,0},{1,1,1}},
            {{1,1,1},{0,0,0}},
            {{0,1,0},{0,1,0}},  // ??
            {{1,1,1},{1,1,1}},

        },
// 3*n+4    FCC          3*n+2        FCC          3*n+2
        {

            {{0,0,0},{0,0,0}},
            {{1,1,0},{0,0,0}},
            {{1,0,1},{0,0,0}},
            {{0,1,1},{0,0,0}},

            {{0,0,0},{1,1,0}},
            {{1,1,0},{1,1,0}},
            {{1,0,1},{1,1,0}},
            {{0,1,1},{1,1,0}},

            {{0,0,0},{1,0,1}},
            {{1,1,0},{1,0,1}},
            {{1,0,1},{1,0,1}},
            {{0,1,1},{1,0,1}},

            {{0,0,0},{0,1,1}},
            {{1,1,0},{0,1,1}},
            {{1,0,1},{0,1,1}},
            {{0,1,1},{0,1,1}}
        },


// 3*n+5    Cartesian 3*n+3        FCC          3*n+2            //D^*[6]
        {

            {{0,0,0},{0,0,0}},
            {{1,1,0},{0,0,0}},
            {{1,0,1},{0,0,0}},
            {{0,1,1},{0,0,0}},

            {{0,0,0},{1,1,0}},
            {{1,1,0},{1,1,0}},
            {{1,0,1},{1,1,0}},
            {{0,1,1},{1,1,0}},

            {{0,0,0},{1,0,1}},
            {{1,1,0},{1,0,1}},
            {{1,0,1},{1,0,1}},
            {{0,1,1},{1,0,1}},

            {{0,0,0},{0,1,1}},
            {{1,1,0},{0,1,1}},
            {{1,0,1},{0,1,1}},
            {{0,1,1},{0,1,1}},


            {{1,0,0},{1,1,1}},
            {{0,1,0},{1,1,1}},
            {{0,0,1},{1,1,1}},
            {{1,1,1},{1,1,1}},

            {{1,0,0},{0,0,1}},
            {{0,1,0},{0,0,1}},
            {{0,0,1},{0,0,1}},
            {{1,1,1},{0,0,1}},

            {{1,0,0},{1,0,0}},
            {{0,1,0},{1,0,0}},
            {{0,0,1},{1,0,0}},
            {{1,1,1},{1,0,0}},

            {{1,0,0},{0,1,0}},
            {{0,1,0},{0,1,0}},
            {{0,0,1},{0,1,0}},
            {{1,1,1},{0,1,0}}
        }
    },

    {
        {{}},

// 3*n+1    BCC          3*n+1        Cartesian 3*n            //same parity
        {
            // SAME_PAR
            {{0,0,0,0},{0,0,0,0}},
            {{1,1,1,1},{1,1,1,1}}
        },
// 3*n+2    BCC          3*n+1        BCC          3*n+1
        {
            // BCC
            {{0,0,0,0},{0,0,0,0}},
            {{0,0,0,0},{1,1,1,1}},
            {{1,1,1,1},{0,0,0,0}},
            {{1,1,1,1},{1,1,1,1}}
        },
// 3 PBIT
        {
            {{0,0,0,0},{0,0,0,0}},
            {{0,0,0,0},{0,1,1,1}},
            {{0,1,1,1},{0,0,0,0}},
            {{0,1,1,1},{0,1,1,1}},

            {{1,0,0,0},{1,0,0,0}},
            {{1,0,0,0},{1,1,1,1}},
            {{1,1,1,1},{1,0,0,0}},
            {{1,1,1,1},{1,1,1,1}}
        },

// 4 PBIT
        {
            {{0,0,0,0},{0,0,0,0}},
            {{0,0,0,0},{0,1,1,1}},
            {{0,1,1,1},{0,0,0,0}},
            {{0,1,1,1},{0,1,1,1}},

            {{1,0,0,0},{1,0,0,0}},
            {{1,0,0,0},{1,1,1,1}},
            {{1,1,1,1},{1,0,0,0}},
            {{1,1,1,1},{1,1,1,1}},

            {{0,0,0,0},{0,0,0,0}},
            {{0,0,0,0},{0,0,1,1}},
            {{0,0,1,1},{0,0,0,0}},
            {{0,1,0,1},{0,1,0,1}},

            {{1,0,0,0},{1,0,0,0}},
            {{1,0,0,0},{1,0,1,1}},
            {{1,0,1,1},{1,0,0,0}},
            {{1,1,0,1},{1,1,0,1}},

        },

    }, //4

};


//#define GIG_TABLE
#ifdef  GIG_TABLE
static double ramp_err[LOG_CL_RANGE-LOG_CL_BASE][BIT_RANGE-BIT_BASE][256][256][256][16];
#endif

static double ramp[LOG_CL_RANGE-LOG_CL_BASE][BIT_RANGE-BIT_BASE][256][256][16];
static double ep_d[BIT_RANGE-BIT_BASE][256];
// inverted table
// <log2 clusters >,  bits, value, par1, par2, <ep1>
static int      sp_idx[LOG_CL_RANGE-LOG_CL_BASE][BIT_RANGE-BIT_BASE][256][2][2][MAX_CLUSTERS_BIG][2];
// <log2 clusters >,  bits, value, par1, par2,
static double   sp_err[LOG_CL_RANGE-LOG_CL_BASE][BIT_RANGE-BIT_BASE][256][2][2][MAX_CLUSTERS_BIG];
//#endif

//
int expand_ (int bits, int v) {
    assert(bits >=4);
    return (  v << (8-bits) | v >> (2* bits - 8));
}

static bool ramp_init = false;

void init_ramps (void) {
    if (ramp_init) return;
#ifdef USE_DBGTRACE
    DbgTrace(());
#endif
    int clog1;
    int bits;
    int p1;
    int p2;
    int i,j;
    int o1,o2;

    for (bits=BIT_BASE; bits<BIT_RANGE; bits++)
        for (p1=0; p1<(1<<bits); p1++)
            ep_d[BTT(bits)][p1]=(double) expand_(bits,p1);


    for (clog1=LOG_CL_BASE; clog1<LOG_CL_RANGE; clog1++)
        for (bits=BIT_BASE; bits<BIT_RANGE; bits++)
            for (p1=0; p1<(1<<bits); p1++)
                for (p2=0; p2<(1<<bits); p2++) {
                    for (i=0; i<(1<<clog1); i++)
                        ramp[CLT(clog1)][BTT(bits)][p1][p2][i] =
                            floor(
                                (double) ep_d[BTT(bits)][p1] + rampLerpWeights[clog1][i] * (double)((ep_d[BTT(bits)][p2]- ep_d[BTT(bits)][p1]))
                                +0.5);

#ifdef GIG_TABLE
                    double v;
                    int vi;
                    for (vi=0; vi<256; vi++)
                        for (i=0; i<(1<<clog1); i++)
                            ramp_err[CLT(clog1)][BTT(bits)][p1][p2][vi][i] =
                                (ramp[CLT(clog1)][BTT(bits)][p1][p2][i]-(double) vi) *
                                (ramp[CLT(clog1)][BTT(bits)][p1][p2][i]-(double) vi);
#endif
                }


    //-----------------------------------------------------------------------------

    for (clog1=LOG_CL_BASE; clog1<LOG_CL_RANGE; clog1++)
        for (bits=BIT_BASE; bits<BIT_RANGE; bits++)
            for(j=0; j<256; j++)
                for (o1=0; o1<2; o1++)
                    for (o2=0; o2<2; o2++)
                        for(i=0; i<16; i++) {
                            sp_idx[CLT(clog1)][BTT(bits)][j][o1][o2][i][0]=-1;
                            sp_err[CLT(clog1)][BTT(bits)][j][o1][o2][i]=DBL_MAX;
                        }

    for (clog1=LOG_CL_BASE; clog1<LOG_CL_RANGE; clog1++)
        for (bits=BIT_BASE; bits<BIT_RANGE; bits++)
            for (p1=0; p1<(1<<bits); p1++)
                for (p2=0; p2<(1<<bits); p2++)
                    for (i=0; i<(1<<clog1); i++) {
                        sp_idx[CLT(clog1)][BTT(bits)][(int) ramp[CLT(clog1)][BTT(bits)][p1][p2][i]][p1 & 0x1][p2 & 0x1][i][0]=p1;
                        sp_idx[CLT(clog1)][BTT(bits)][(int) ramp[CLT(clog1)][BTT(bits)][p1][p2][i]][p1 & 0x1][p2 & 0x1][i][1]=p2;
                        sp_err[CLT(clog1)][BTT(bits)][(int) ramp[CLT(clog1)][BTT(bits)][p1][p2][i]][p1 & 0x1][p2 & 0x1][i]=0.;
                    }

    for (clog1=LOG_CL_BASE; clog1<LOG_CL_RANGE; clog1++)
        for (bits=BIT_BASE; bits<BIT_RANGE; bits++)
            for(j=0; j<256; j++)
                for (o1=0; o1<2; o1++)
                    for (o2=0; o2<2; o2++)
                        for(i=0; i<(1<<clog1); i++)
                            if (sp_idx[CLT(clog1)][BTT(bits)][j][o1][o2][i][0]<0) {
                                int k;
                                for (k=1; k<256; k++)
                                    if ( (j-k >= 0 && sp_err[CLT(clog1)][BTT(bits)][j-k][o1][o2][i]==0) ||
                                            (j+k < 256 && sp_err[CLT(clog1)][BTT(bits)][j+k][o1][o2][i]==0) )
                                        break;
                                {
                                    if ( (j-k >= 0 && sp_err[CLT(clog1)][BTT(bits)][j-k][o1][o2][i]==0)) {
                                        sp_idx[CLT(clog1)][BTT(bits)][j][o1][o2][i][0]=sp_idx[CLT(clog1)][BTT(bits)][j-k][o1][o2][i][0];
                                        sp_idx[CLT(clog1)][BTT(bits)][j][o1][o2][i][1]=sp_idx[CLT(clog1)][BTT(bits)][j-k][o1][o2][i][1];
                                    } else if ((j+k < 256 && sp_err[CLT(clog1)][BTT(bits)][j+k][o1][o2][i]==0)) {
                                        sp_idx[CLT(clog1)][BTT(bits)][j][o1][o2][i][0]=sp_idx[CLT(clog1)][BTT(bits)][j+k][o1][o2][i][0];
                                        sp_idx[CLT(clog1)][BTT(bits)][j][o1][o2][i][1]=sp_idx[CLT(clog1)][BTT(bits)][j+k][o1][o2][i][1];
                                    }
                                    sp_err[CLT(clog1)][BTT(bits)][j][o1][o2][i]=k*k;
                                }
                            }

    ramp_init = true;
}

// finds "floor in the set" if exists, otherwise returns min
inline int ep_find_floor( double v, int bits, int use_par, int odd) {
    assert(use_par==0 || use_par==1 || odd==0 || odd==1 );

    double *p = ep_d[BTT(bits)];
    int i1=0;
    int i2=1<<(bits-use_par);
    odd = use_par ? odd : 0;
    while (i2-i1>1) {
        int j = (i1+i2)/2;
        if (v >= p[(j<<use_par)+odd] )
            i1=j;
        else
            i2=j;
    }
    return (i1<<use_par)+odd;
}

// find closest one
inline int ep_find_near( double v, int bits, int use_par, int odd) {
#ifdef USE_DBGTRACE
    DbgTrace(());
#endif
    assert(use_par==0 || use_par==1 || odd==0 || odd==1 ) ;
    double *p = ep_d[BTT(bits)];
    int p1 = ep_find_floor(v, bits, use_par,odd);
    int p2 = p1+(1<<use_par);
    p2 = p2 < (1<<bits) ? p2 : p1;
    if (fabs(v-p[p2]) < fabs(v-p[p1]))
        return p2;
    else
        return p1;
}

inline void mean_d (double d[][DIMENSION], double mean[DIMENSION], int n) {
#ifdef USE_DBGTRACE
    DbgTrace(());
#endif
    int i,j;
    assert(n!=0);
    for(j=0; j< DIMENSION; j++)
        mean[j] =0;
    for(i=0; i< n; i++)
        for(j=0; j< DIMENSION; j++)
            mean[j] +=d[i][j];
    for(j=0; j< DIMENSION; j++)
        mean[j] /=(double) n;
}

inline void mean_d_d (double d[][MAX_DIMENSION_BIG], double mean[MAX_DIMENSION_BIG], int n, int dimension) {
#ifdef USE_DBGTRACE
    DbgTrace(());
#endif
    int i,j;
    assert(n!=0);
    for(j=0; j< dimension; j++)
        mean[j] =0;
    for(i=0; i< n; i++)
        for(j=0; j< dimension; j++)
            mean[j] +=d[i][j];
    for(j=0; j< dimension; j++)
        mean[j] /=(double) n;
}

inline int cluster_mean_d (double d[][DIMENSION],  double mean[][DIMENSION], int index[],int i_comp[],int i_cnt[], int n) {
#ifdef USE_DBGTRACE
    DbgTrace(());
#endif
    // unused index values are underfined
    int i,j,k;
    assert(n!=0);

    for(i=0; i< n; i++)
        for(j=0; j< DIMENSION; j++) {
            assert(index[i]<MAX_CLUSTERS_BIG);
            mean[index[i]][j] =0;
            i_cnt[index[i]]=0;
        }
    k=0;
    for(i=0; i< n; i++) {
        for(j=0; j< DIMENSION; j++)
            mean[index[i]][j] +=d[i][j];
        if (i_cnt[index[i]]==0)
            i_comp[k++]=index[i];
        i_cnt[index[i]]++;
    }

    for(i=0; i< k; i++)
        for(j=0; j< DIMENSION; j++)
            mean[i_comp[i]][j] /=(double) i_cnt[i_comp[i]];
    return k;
}

inline int cluster_mean_d_d (double d[][MAX_DIMENSION_BIG],  double mean[][MAX_DIMENSION_BIG], int index[],int i_comp[],int i_cnt[], int n, int dimension) {
    // unused index values are underfined
    int i,j,k;
    assert(n!=0);

    for(i=0; i< n; i++)
        for(j=0; j< dimension; j++) {
            assert(index[i]<MAX_CLUSTERS_BIG);
            mean[index[i]][j] =0;
            i_cnt[index[i]]=0;
        }
    k=0;
    for(i=0; i< n; i++) {
        for(j=0; j< dimension; j++)
            mean[index[i]][j] +=d[i][j];
        if (i_cnt[index[i]]==0)
            i_comp[k++]=index[i];
        i_cnt[index[i]]++;
    }

    for(i=0; i< k; i++)
        for(j=0; j< dimension; j++)
            mean[i_comp[i]][j] /=(double) i_cnt[i_comp[i]];
    return k;
}

inline int all_same (double d[][DIMENSION],  int n) {
#ifdef USE_DBGTRACE
    DbgTrace(());
#endif
    assert(n>0);
    int i,j;
    int same = 1;
    for(i=1; i< n; i++)
        for(j=0; j< DIMENSION; j++)
            same = same && (d[0][j] ==d[i][j]);

    return(same);
}

inline int all_same_d (double d[][MAX_DIMENSION_BIG],  int n, int dimension) {
    assert(n>0);
    int i,j;
    int same = 1;
    for(i=1; i< n; i++)
        for(j=0; j< dimension; j++)
            same = same && (d[0][j] ==d[i][j]);

#ifdef USE_DBGTRACE
    DbgTrace(("[%d]",same));
#endif

    return(same);
}

inline int max_i (int a[], int n) {
    assert(n>0);
    int i,m=a[0];
    for(i=0; i< n; i++)
        m = m > a[i] ? m : a[i];
    return (m);
}

void index_collapse_ (int index[], int numEntries) {
#ifdef USE_DBGTRACE
    DbgTrace(());
#endif
    int k;
    int d,D;
    int mi;
    int Mi;
    if (numEntries ==0)
        return;

    mi=Mi=index[0];
    for (k=1; k<numEntries; k++) {
        mi = mi < index[k] ? mi : index[k];
        Mi = Mi > index[k] ? Mi : index[k];
    }
    D=1;
    for (d=2; d<=Mi-mi; d++) {

        for (k=0; k<numEntries; k++)
            if ((index[k] -mi) % d !=0)
                break;
        if (k>=numEntries)
            D =d;
    }
    for (k=0; k<numEntries; k++)
        index[k] = (index[k]- mi)/D;
}

//
// New version that will take varying dimension
//
//
//

double BC7BlockEncoder::quant_single_point_d
(
    double data[MAX_ENTRIES][MAX_DIMENSION_BIG],
    int numEntries, int index[MAX_ENTRIES],
    double out[MAX_ENTRIES][MAX_DIMENSION_BIG],
    int epo_1[2][MAX_DIMENSION_BIG],
    int Mi_,                // last cluster
    int bits[3],            // including parity
    int type,
    int dimension
) {
#ifdef USE_DBGTRACE
    DbgTrace(());
#endif

    int i,j;

    double err_0=DBL_MAX;

    double err_1=DBL_MAX;
    int idx = 0;
    int idx_1 = 0;

    int epo_0[2][MAX_DIMENSION_BIG];

    int use_par =(type !=0);

    int clog2=0;
    i = Mi_+1;
    while (i>>=1)
        clog2++;
    assert((1<<clog2)== Mi_+1);

    int pn;
    int (*pv)[MAX_DIMENSION_BIG];
    for (pn=0; pn<npv_nd[dimension][type]; pn++) {
        {
            pv = par_vectors_nd[dimension][type][pn];

            int o1[2][MAX_DIMENSION_BIG]= {0,2};
            int o2[2][MAX_DIMENSION_BIG]= {0,2};

            for (j=0; j<dimension; j++) {
                o2[0][j]=o1[0][j]=0;
                o2[1][j]=o1[1][j]=2;

                if (use_par) {
                    if (pv[0][j])
                        o1[0][j]=1;
                    else
                        o1[1][j]=1;
                    if (pv[1][j])
                        o2[0][j]=1;
                    else
                        o2[1][j]=1;
                }
            };

            int t1,t2;

            int dr[MAX_DIMENSION_BIG];
            int dr_0[MAX_DIMENSION_BIG];
            double tr;

            for (i=0; i< (1<<clog2); i++) {
                double t=0;
                int t1o[MAX_DIMENSION_BIG],t2o[MAX_DIMENSION_BIG];

                for (j=0; j<dimension; j++) {
                    double t_=DBL_MAX;

                    for (t1=o1[0][j]; t1<o1[1][j]; t1++) {
                        for (t2=o2[0][j]; t2<o2[1][j]; t2++)
                            // This is needed for non-integer mean points of "collapsed" sets
                        {
                            int tf = (int) floor(data[0][j]);
                            int tc = (int) ceil (data[0][j]);
                            // if data[o][j] is integer the above are equal, and so are errors

                            assert(tf >=0 && tc <=255.);

                            if (  sp_err[CLT(clog2)][BTT(bits[j])][tf][t1][t2][i]
                                    > sp_err[CLT(clog2)][BTT(bits[j])][tc][t1][t2][i])
                                // if they are not equal, the same representalbe point is used for
                                // both of them, as all representable points are integers in the rage
                                dr[j]=tc;
                            else if (sp_err[CLT(clog2)][BTT(bits[j])][ tf ][t1][t2][i]
                                     < sp_err[CLT(clog2)][BTT(bits[j])][ tc ][t1][t2][i])
                                dr[j]=tf;
                            else
                                dr[j]=(int)floor(data[0][j]+0.5);

                            tr = sp_err[CLT(clog2)][BTT(bits[j])][dr[j]][t1][t2][i] +
                                 2*sqrt(sp_err[CLT(clog2)][BTT(bits[j])][dr[j]][t1][t2][i]) * fabs((double)dr[j]-data[0][j])+
                                 (dr[j]-data[0][j])* (dr[j]-data[0][j]);

                            if (tr < t_) {
                                t_=tr;
                                t1o[j]=t1;
                                t2o[j]=t2;
                                dr_0[j]=dr[j];
                            }
                        }
                    }

                    t += t_;
                }

                if (t < err_0) {
                    idx=i;
                    for (j=0; j<dimension; j++) {
                        epo_0[0][j]=sp_idx[CLT(clog2)][BTT(bits[j])][ dr_0[j]][t1o[j]][t2o[j]][i][0];
                        epo_0[1][j]=sp_idx[CLT(clog2)][BTT(bits[j])][ dr_0[j]][t1o[j]][t2o[j]][i][1];
                    }
                    err_0=t;
                }
                if (err_0==0)
                    break;
            }

            if (err_0 < err_1) {
                idx_1=idx;
                for (j=0; j<dimension; j++) {
                    epo_1[0][j]=epo_0[0][j];
                    epo_1[1][j]=epo_0[1][j];
                }
                err_1=err_0;
            }

            if (err_1==0)
                break;
        }
    }

    for (i=0; i< numEntries; i++) {
        index[i]=idx_1;
        for (j=0; j<dimension; j++) {
            out[i][j]=ramp[CLT(clog2)][BTT(bits[j])][epo_1[0][j]][epo_1[1][j]][idx_1];
        }
    }
    return err_1 * numEntries;
}

double BC7BlockEncoder::ep_shaker_2_d(
    double data[MAX_ENTRIES][MAX_DIMENSION_BIG],
    int numEntries,
    int index_[MAX_ENTRIES],
    double out[MAX_ENTRIES][MAX_DIMENSION_BIG],
    int epo_code[2][MAX_DIMENSION_BIG],
    int size,
    int Mi_,             // last cluster
    int bits,            // total for all channels
    // defined by total numbe of bits and dimensioin
    int dimension,
    double epo[2][MAX_DIMENSION_BIG]

) {
#ifdef USE_DBGTRACE
    DbgTrace(("<-------------------->"));
#endif
    int i,j,k;

//
//###############################
// decode for "new style" interface, this can be passed in directly
//
    int max_bits[MAX_DIMENSION_BIG];
    int type = bits % (2*dimension);

    int use_par =(type !=0);
    for (j=0; j<dimension; j++)
        max_bits[j] = (bits+2*dimension-1) / (2*dimension);
//
//###############################
//

    int clog3=0;
    i = Mi_+1;
    while (i>>=1)
        clog3++;

    assert((1<<clog3)== Mi_+1);

    double mean[MAX_DIMENSION_BIG];
    int index[MAX_ENTRIES];
    int Mi;

    int maxTry=8;

    for (k=0; k<numEntries; k++)
        index[k]=index_[k];


    int done;
    int change;

    int better;

    double err_o = DBL_MAX;

    int epo_0[2][MAX_DIMENSION_BIG];

    double outg[MAX_ENTRIES][MAX_DIMENSION_BIG];

    // handled below automatically
    int alls= all_same_d(data, numEntries, dimension);
    mean_d_d (data, mean, numEntries, dimension);

    do {
        index_collapse_ (index, numEntries);

        Mi= max_i(index, numEntries);  // index can be from requantizer

        int p, q;
        int p0=-1,q0=-1;

        double err_0 = DBL_MAX;

#ifdef USE_DBGTRACE
        DbgTrace(("Mi [%2d] numEntries [%2d]",Mi,numEntries));
#endif

        if (Mi==0) {
            double t;
            // either single point from the beginning or collapsed index
            if (alls) {
                t =  quant_single_point_d( data,numEntries,index, outg, epo_0,  Mi_, max_bits,type, dimension);
            } else {
                quant_single_point_d(&mean,numEntries,index, outg, epo_0, Mi_, max_bits, type, dimension);
                t = totalError_d(data,outg,numEntries, dimension);
            }

            if (t < err_o) {
                for (k=0; k<numEntries; k++) {
                    index_[k]=index[k];
                    for (j=0; j<dimension; j++) {
                        out[k][j]=outg[k][j];
                        epo_code[0][j]=epo_0[0][j];
                        epo_code[1][j]=epo_0[1][j];
                    }
                };
                err_o=t;
            }

            for(j=0; j<dimension; j++) {
                epo[0][j]= ramp[CLT(clog3)][BTT(max_bits[j])][epo_code[0][j]][epo_code[1][j]][0];
                epo[1][j]= ramp[CLT(clog3)][BTT(max_bits[j])][epo_code[0][j]][epo_code[1][j]][(1<<clog3)-1];
            }

            return err_o;
        }

//          set stuff for  collapsed index shake (if needed)??
//          if it did collaps should we just check if it's better and get out ?

        assert(Mi <= Mi_);

        int cluster_mean_calls = 0;

        for (q=1; Mi!=0 && q*Mi <= Mi_; q++) // does not work for single point collapsed index!!!
            for (p=0; p<=Mi_-q*Mi; p++) {
                int cidx[MAX_ENTRIES];

                for (k=0; k<numEntries; k++)
                    cidx[k]=index[k] * q + p;

                double epa[2][MAX_DIMENSION_BIG];

                {
                    //
                    // solve RMS problem for center
                    //

                    double im [2][2] = {{0,0},{0,0}};   // matrix /inverse matrix
                    double rp[2][MAX_DIMENSION_BIG];            // right part for RMS fit problem

                    // get ideal clustr centers
                    double cc[MAX_CLUSTERS_BIG][MAX_DIMENSION_BIG];
                    int i_cnt[MAX_CLUSTERS_BIG]; // count of index entries
                    int i_comp[MAX_CLUSTERS_BIG];   // compacted index
                    int ncl;                        // number of unique indexes

                    cluster_mean_calls++; // used for debugging code: Remove when optimizing
                    ncl=cluster_mean_d_d (data,  cc, cidx, i_comp, i_cnt, numEntries, dimension); // unrounded

                    // round
                    for (i=0; i<ncl; i++)
                        for (j=0; j<dimension; j++)
                            cc[i_comp[i]][j]=floor(cc[i_comp[i]][j]+0.5); // more or less ideal location

                    for (j=0; j<dimension; j++)
                        rp[0][j]=rp[1][j]=0;

                    // weight with cnt if runnning on compacted index
                    for (k=0; k<numEntries; k++) {
                        im[0][0] += (Mi_-cidx[k])* (Mi_-cidx[k]);
                        im[0][1] +=      cidx[k] * (Mi_-cidx[k]);           // im is symmetric
                        im[1][1] +=      cidx[k] *      cidx[k];

                        for (j=0; j<dimension; j++) {
                            rp[0][j]+=(Mi_-cidx[k]) * cc[cidx[k]][j];
                            rp[1][j]+=     cidx[k]  * cc[cidx[k]][j];
                        }
                    }

                    double dd = im[0][0]*im[1][1]-im[0][1]*im[0][1];

                    assert(dd !=0);

                    // dd=0 means that cidx[k] and (Mi_-cidx[k]) collinear which implies only one active index;
                    // taken care of separately

                    im[1][0]=im[0][0];
                    im[0][0]=im[1][1]/dd;
                    im[1][1]=im[1][0]/dd;
                    im[1][0]=im[0][1]=-im[0][1]/dd;

                    for (j=0; j<dimension; j++) {
                        epa[0][j]=(im[0][0]*rp[0][j]+im[0][1]*rp[1][j])*Mi_;
                        epa[1][j]=(im[1][0]*rp[0][j]+im[1][1]*rp[1][j])*Mi_;
                    }
                }
                // shake single or                                   - cartesian
                // shake odd/odd and even/even or                    - same parity
                // shake odd/odd odd/even , even/odd and even/even   - bcc

                double err_1 = DBL_MAX;
                int epo_1[2][MAX_DIMENSION_BIG];

                double ed[2][2][MAX_DIMENSION_BIG];
                int epo_2_[2][2][2][MAX_DIMENSION_BIG];

                for (j=0; j<dimension; j++) {
                    double  (*rb) [256][16] =ramp[CLT(clog3)][BTT(max_bits[j])];

                    int pp[2]= {0,0};
                    int rr = (use_par ? 2:1);

                    int epi[2][2];  // first/second, coord, begin rage end range


                    for (pp[0]=0; pp[0]<rr; pp[0]++) {
                        for (pp[1]=0; pp[1]<rr; pp[1]++) {

                            for (i=0; i<2; i++) {   // set range
                                epi[i][0]= epi[i][1]= ep_find_floor( epa[i][j],max_bits[j], use_par, pp[i]);

                                epi[i][0]-= ( (epi[i][0]  < (size>>1)-1 ? epi[i][0] : (size>>1)-1 ) )  & (~use_par);

                                epi[i][1]+= ((1<<max_bits[j])-1 - epi[i][1]  < (size>>1) ?
                                             (1<<max_bits[j])-1 - epi[i][1]  : (size>>1) ) & (~use_par);
                            }
                            int p1,p2, step=(1<<use_par);

                            ed[pp[0]][pp[1]][j]=DBL_MAX;

                            for (p1=epi[0][0]; p1<=epi[0][1]; p1+=step)
                                for (p2=epi[1][0]; p2<=epi[1][1]; p2+=step) {
                                    double *rbp = rb[p1][p2];
                                    double t=0;
                                    int    *ci=cidx;
                                    int    m =numEntries;


                                    int _mc = m;

                                    while(_mc > 0) {
                                        t += (rbp[ci[_mc-1]]-data[_mc-1][j])*(rbp[ci[_mc-1]]-data[_mc-1][j]);
                                        _mc--;
                                    }

                                    if (t<ed[pp[0]][pp[1]][j]) {
                                        ed[pp[0]][pp[1]][j]=t;
                                        epo_2_[pp[0]][pp[1]][0][j]=p1;
                                        epo_2_[pp[0]][pp[1]][1][j]=p2;
                                    }
                                }
                        }
                    }
                }

                int pn;
                int (*pv)[MAX_DIMENSION_BIG];
                for (pn=0; pn<npv_nd[dimension][type]; pn++) {
                    pv = par_vectors_nd[dimension][type][pn];
                    int j1;
                    double err_2=0;
                    for (j1=0; j1<dimension; j1++)
                        err_2+=ed[pv[0][j1]][pv[1][j1]][j1];
                    if (err_2 < err_1) {
                        err_1 = err_2;
                        for (j1=0; j1<dimension; j1++) {
                            epo_1[0][j1]=epo_2_[pv[0][j1]][pv[1][j1]][0][j1];
                            epo_1[1][j1]=epo_2_[pv[0][j1]][pv[1][j1]][1][j1];
                        }
                    }
                }


                if (err_1 <= err_0) { // we'd want to get expanded index;
                    err_0 = err_1;
                    p0=p;
                    q0=q;
                    for (j=0; j<dimension; j++) {
                        epo_0[0][j]=epo_1[0][j];
                        epo_0[1][j]=epo_1[1][j];
                    }
                }
            }


#ifdef USE_DBGTRACE
        DbgTrace(("cluster_mean_d_d [%2d] q[%d] p[%d]",cluster_mean_calls,q,p));
#endif

        // requantize
        double *r[MAX_DIMENSION_BIG];
        int idg[MAX_ENTRIES];

        double err_r=0;

        for (j=0; j<dimension; j++)
            r[j]= ramp[CLT(clog3)][BTT(max_bits[j])][epo_0[0][j]][epo_0[1][j]];

        for (i=0; i<numEntries; i++) {
            double  cmin = DBL_MAX;
            int        ci = 0;
            double    *d=data[i];

            for(j=0; j < (1<<clog3); j++) {
                double t_=0.;

                for(k=0; k<dimension; k++) {
                    t_+=(r[k][j]-d[k])*(r[k][j]-d[k]);
                }

                if(t_<cmin) {
                    cmin = t_;
                    ci = j;
                }
            }

            idg[i]=ci;
            for(k=0; k<dimension; k++) {
                outg[i][k]=r[k][ci];
            }

            err_r+=cmin;
        }

        // change/better
        change =0;
        for (k=0; k<numEntries; k++)
            change = change || (index[k] * q0 + p0!=idg[k]);

        better = err_r < err_o;

        if (better) {
            for (k=0; k<numEntries; k++) {
                index_[k]=index[k]=idg[k];
                for (j=0; j<dimension; j++) {
                    out[k][j]=outg[k][j];
                    epo_code[0][j]=epo_0[0][j];
                    epo_code[1][j]=epo_0[1][j];
                }
            }
            err_o=err_r;
        }

        done = !(change  &&  better) ;

    } while (! done && maxTry--);

    for(j=0; j<dimension; j++) {
        epo[0][j]= ramp[CLT(clog3)][BTT(max_bits[j])][epo_code[0][j]][epo_code[1][j]][0];
        epo[1][j]= ramp[CLT(clog3)][BTT(max_bits[j])][epo_code[0][j]][epo_code[1][j]][(1<<clog3)-1];
    }

    return err_o;
}


double BC7BlockEncoder::ep_shaker_d(
    double data[MAX_ENTRIES][MAX_DIMENSION_BIG],
    int numEntries,
    int index_[MAX_ENTRIES],
    double out[MAX_ENTRIES][MAX_DIMENSION_BIG],
    int epo_code[2][MAX_DIMENSION_BIG],
    int Mi_,                // last cluster
    int bits[3],            // including parity
    CMP_qt_cpu type,
    int dimension
) {
#ifdef USE_DBGTRACE
    DbgTrace(());
#endif
    //###############################
    int i,j,k;

    int use_par =(type == BCC || type == SAME_PAR);
    int bcc = (type == BCC);
    int clog4=0;
    i = Mi_+1;
    while (i>>=1)
        clog4++;

    assert((1<<clog4)== Mi_+1);

    //###########################

    double mean[MAX_DIMENSION_BIG];
    int index[MAX_ENTRIES];
    int Mi;

    int maxTry=1;

    for (k=0; k<numEntries; k++)
        index[k]=index_[k];

    int done;
    int change;

    int better;

    double err_o = DBL_MAX;

    // handled below automatically
    int alls= all_same_d(data, numEntries, dimension);

    mean_d_d(data, mean, numEntries, dimension);

    do {
        index_collapse_ (index, numEntries);

        Mi= max_i(index, numEntries);  // index can be from requantizer

        int p, q;
        int p0=-1,q0=-1;

        double err_2 = DBL_MAX;
        double out_2[MAX_ENTRIES][MAX_DIMENSION_BIG];
        int idx_2[MAX_ENTRIES];
        int    epo_2[2][MAX_DIMENSION_BIG];

        if (Mi==0) {
            double t;
            int    epo_0[2][MAX_DIMENSION_BIG];
            // either sinle point from the beginning or collapsed index
            if (alls) {
                t =  quant_single_point_d( data,numEntries,index, out_2, epo_0, Mi_, bits,  type, dimension);
            } else {
                quant_single_point_d(&mean,numEntries,index, out_2, epo_0, Mi_, bits,  type, dimension);
                t = totalError_d(data,out_2,numEntries, dimension);
            }

            if (t < err_o) {
                for (k=0; k<numEntries; k++) {
                    index_[k]=index[k];
                    for (j=0; j<dimension; j++) {
                        out[k][j]=out_2[k][j];
                        epo_code[0][j]=epo_0[0][j];
                        epo_code[1][j]=epo_0[1][j];
                    }
                };
                err_o=t;
            }
            return err_o;
        }


//          set stuff for  collapsed index shake (if needed)??
//            if it did collaps should we just check if it's better and get out ?

        int cluster_mean_calls = 0;

        for (q=1; Mi!=0 && q*Mi <= Mi_; q++) { // does not work for single point collapsed index!!!
            for (p=0; p<=Mi_-q*Mi; p++) {
                int cidx[MAX_ENTRIES];

                for (k=0; k<numEntries; k++) {
                    cidx[k]=index[k] * q + p;
                }

                double epa[2][MAX_DIMENSION_BIG];

                {
                    //
                    // solve RMS problem for center
                    //

                    double im [2][2] = {{0,0},{0,0}};   // matrix /inverse matrix
                    double rp[2][MAX_DIMENSION_BIG];            // right part for RMS fit problem

                    // get ideal clustr centers
                    double cc[MAX_CLUSTERS_BIG][MAX_DIMENSION_BIG];
                    int i_cnt[MAX_CLUSTERS_BIG]; // count of index entries
                    int i_comp[MAX_CLUSTERS_BIG];   // compacted index
                    int ncl;                        // number of unique indexes

                    cluster_mean_calls++; // used for debugging code: Remove when optimizing
                    ncl=cluster_mean_d_d (data,  cc, cidx, i_comp, i_cnt, numEntries, dimension); // unrounded

                    // round
                    for (i=0; i<ncl; i++)
                        for (j=0; j<dimension; j++)
                            cc[i_comp[i]][j]=floor(cc[i_comp[i]][j]+0.5); // more or less ideal location

                    for (j=0; j<dimension; j++) {
                        rp[0][j]=rp[1][j]=0;
                    }

                    // weight with cnt if runnning on compacted index
                    for (k=0; k<numEntries; k++) {
                        im[0][0] += (Mi_-cidx[k])* (Mi_-cidx[k]);
                        im[0][1] +=      cidx[k] * (Mi_-cidx[k]);           // im is symmetric
                        im[1][1] +=      cidx[k] *      cidx[k];

                        for (j=0; j<dimension; j++) {
                            rp[0][j]+=(Mi_-cidx[k]) * cc[cidx[k]][j];
                            rp[1][j]+=     cidx[k]  * cc[cidx[k]][j];
                        }
                    }

                    double dd = im[0][0]*im[1][1]-im[0][1]*im[0][1];

                    assert(dd !=0);

                    // dd=0 means that cidx[k] and (Mi_-cidx[k]) collinear which implies only one active index;
                    // taken care of separately

                    im[1][0]=im[0][0];
                    im[0][0]=im[1][1]/dd;
                    im[1][1]=im[1][0]/dd;
                    im[1][0]=im[0][1]=-im[0][1]/dd;

                    for (j=0; j<dimension; j++) {
                        epa[0][j]=(im[0][0]*rp[0][j]+im[0][1]*rp[1][j])*Mi_;
                        epa[1][j]=(im[1][0]*rp[0][j]+im[1][1]*rp[1][j])*Mi_;
                    }
                }
                // shake single or                                   - cartesian
                // shake odd/odd and even/even or                    - same parity
                // shake odd/odd odd/even , even/odd and even/even   - bcc
                int odd,flip1;

                double err_1 = DBL_MAX;
                double out_1[MAX_ENTRIES][MAX_DIMENSION_BIG];
                int idx_1[MAX_ENTRIES];
                int epo_1[2][MAX_DIMENSION_BIG];
                int s1 = 0;

                for (odd=0; odd<=use_par; odd++) {
                    for (flip1=0; flip1 <= bcc; flip1++) {
                        // flip partiy on the second point
                        int epi[2][MAX_DIMENSION_BIG][2];   // first second, coord, begin rage end range

                        for (j=0; j<dimension; j++) {
                            for (i=0; i<2; i++) {
                                // set range
                                epi[i][j][0]= epi[i][j][1]= ep_find_floor( epa[i][j], bits[j], use_par, (odd ^ (flip1 & i)) & 0x1);

                                epi[i][j][1]+= ((1<<bits[j])-1 - epi[i][j][1]  < (1<<use_par) ?
                                                (1<<bits[j])-1 - epi[i][j][1]  : (1<<use_par) ) & (~use_par);
                            }
                        }

                        double *r[MAX_DIMENSION_BIG];

                        double ce[MAX_ENTRIES][MAX_CLUSTERS_BIG][MAX_DIMENSION_BIG];

                        for (j=0; j<dimension; j++)
                            r[j]= ramp[CLT(clog4)][BTT(bits[j])][epi[0][j][0]][epi[1][j][0]];

                        double err_0 = 0;
                        double out_0[MAX_ENTRIES][MAX_DIMENSION_BIG];
                        int idx_0[MAX_ENTRIES];


                        for(i=0; i<numEntries; i++) {
                            double *d=data[i];
                            for(j=0; j<(1<<clog4); j++)
                                for(k=0; k<dimension; k++)
                                    ce[i][j][k] = (r[k][j]-d[k])*(r[k][j]-d[k]);
                        }

                        int s=0, p1, g;
                        int ei0=0, ei1=0;

                        for (p1 =0; p1<64 ; p1++) {
                            int j0=0;

                            // Gray code increment
                            g = p1 & (-p1);

                            err_0=0;

                            for (j=0; j<dimension; j++) {
                                if ( ((g >> (2 *j)) & 0x3) !=0) {
                                    j0 =j;
                                    // new cords
                                    ei0 = ( ( (s^g) >>(2 *j))    & 0x1);
                                    ei1 = ( ( (s^g) >>(2 *j+1))    & 0x1);
                                }
                            }
                            s = s ^ g;
                            r[j0]= ramp[CLT(clog4)][BTT(bits[j0])][epi[0][j0][ei0]][epi[1][j0][ei1]];

                            err_0 = 0;

                            for (i=0; i<numEntries; i++) {
                                double *d=data[i];
                                int    ci = 0;
                                double cmin = DBL_MAX;

                                for(j=0; j<(1<<clog4); j++) {
                                    double t_ = 0.;
                                    ce[i][j][j0] = (r[j0][j]-d[j0])*(r[j0][j]-d[j0]);

                                    for(k=0; k<dimension; k++) {
                                        t_ += ce[i][j][k];
                                    }

                                    if(t_< cmin) {
                                        cmin = t_;
                                        ci = j;
                                    }
                                }

                                idx_0[i]=ci;
                                for(k=0; k<dimension; k++) {
                                    out_0[i][k]=r[k][ci];
                                }
                                err_0+=cmin;
                            }

                            if (err_0 < err_1) {
                                // best in the curent ep cube run
                                for (i=0; i < numEntries; i++) {
                                    idx_1[i]=idx_0[i];
                                    for (j=0; j<dimension; j++)
                                        out_1[i][j]=out_0[i][j];
                                }
                                err_1=err_0;

                                s1=s; // epo coding
                            }
                        }

                        // reconstruct epo
                        for (j=0; j<dimension; j++) {
                            {
                                // new cords
                                ei0 = ( ( s1 >>(2 *j)  )    & 0x1);
                                ei1 = ( ( s1 >>(2 *j+1))    & 0x1);
                                epo_1[0][j]=epi[0][j][ei0];
                                epo_1[1][j]=epi[1][j][ei1];
                            }
                        }
                    }
                }
                if (err_1 < err_2) {
                    // best in the curent ep cube run
                    for (i=0; i < numEntries; i++) {
                        idx_2[i]=idx_1[i];
                        for (j=0; j<dimension; j++)
                            out_2[i][j]=out_1[i][j];
                    }
                    err_2=err_1;
                    for (j=0; j<dimension; j++) {
                        epo_2[0][j]=epo_1[0][j];
                        epo_2[1][j]=epo_1[1][j];
                    }
                    p0=p;
                    q0=q;
                }
            }
        }

#ifdef USE_DBGTRACE
        DbgTrace(("cluster_mean_d_d [%2d]",cluster_mean_calls));
#endif


        // change/better
        change =0;
        for (k=0; k<numEntries; k++)
            change = change || (index[k] * q0 + p0!=idx_2[k]);

        better = err_2 < err_o;

        if (better) {
            for (k=0; k<numEntries; k++) {
                index_[k]=index[k]=idx_2[k];
                for (j=0; j<dimension; j++) {
                    out[k][j]=out_2[k][j];
                    epo_code[0][j]=epo_2[0][j];
                    epo_code[1][j]=epo_2[1][j];
                }
            }
            err_o=err_2;
        }

        done = !(change  &&  better) ;

    } while (! done && maxTry--);

    return err_o;
}