TexConv/CMP_CompressonatorLib/ASTC/ASTC_Encode_Kernel.cpp

/*----------------------------------------------------------------------------*/
/**
*    This confidential and proprietary software may be used only as
*    authorised by a licensing agreement from ARM Limited
*    (C) COPYRIGHT 2011-2012 ARM Limited
*    ALL RIGHTS RESERVED
*
*    The entire notice above must be reproduced on all authorised
*    copies and copies may only be made to the extent permitted
*    by a licensing agreement from ARM Limited.
*
*/
/*----------------------------------------------------------------------------*/
//===========================================================================
// Copyright (c) 2014-2017  Advanced Micro Devices, Inc. All rights reserved.
//===========================================================================


//====================================== Kernel-Compliant Code ===========================================

#include "astc_encode_kernel.h"

namespace ASTC_Encoder {


// This is temporary and will be removed.
// __global2 is an alias for __global that marks the ASTC_Encode arrays which
// still need to be removed from, or reduced in size on, the OpenCL call stack.
#define __global2           __global

// DEBUG(x): trace hook taking a message string. The replacement text on both
// branches below is entirely commented out, so DEBUG(x) currently expands to
// nothing. The OpenCL variant, if re-enabled, would print only from the
// work-item whose global id is (0, 0).
#ifndef __OPENCL_VERSION__
#define DEBUG(x)        //printf("%s\n",x);
#else
#define DEBUG(x)        //if ((get_global_id(0) == 0) && (get_global_id(1) == 0)) printf("%s\n",x);
#endif

// astc_codec_internal_error(x): reports an internal error by passing x to
// printf as the format string.
// NOTE(review): x is used as the format string itself, so it must not contain
// stray '%' conversions; the expansion also already ends in ';'.
#define astc_codec_internal_error(x)         printf(x);

#ifndef __OPENCL_VERSION__
// Host-side stand-ins for the OpenCL built-in dot(): sum of the component-wise
// products of p and q, accumulated left to right.
float dot(float2 p, float2 q) {
    const float xx = p.x * q.x;
    const float yy = p.y * q.y;
    return xx + yy;
}

// Three-component variant.
float dot(float3 p, float3 q) {
    const float xx = p.x * q.x;
    const float yy = p.y * q.y;
    const float zz = p.z * q.z;
    return xx + yy + zz;
}

// Four-component variant.
float dot(float4 p, float4 q) {
    const float xx = p.x * q.x;
    const float yy = p.y * q.y;
    const float zz = p.z * q.z;
    const float ww = p.w * q.w;
    return xx + yy + zz + ww;
}

// Host-side stand-in for the OpenCL built-in cross(): builds the cross product
// from two swizzled component-wise products.
float3 cross(float3 p, float3 q) {
    const float3 positive_terms = p.yzx * q.zxy;
    const float3 negative_terms = p.zxy * q.yzx;
    return positive_terms - negative_terms;
}

// Host-side stand-ins for the OpenCL built-in length(): Euclidean norm,
// computed as the square root of the vector's dot product with itself.
float length(float2 p) {
    const float squared = dot(p, p);
    return (float)(sqrt(squared));
}

// Three-component variant.
float length(float3 p) {
    const float squared = dot(p, p);
    return (float)(sqrt(squared));
}

// Four-component variant.
float length(float4 p) {
    const float squared = dot(p, p);
    return (float)(sqrt(squared));
}

// Host-side stand-ins for the OpenCL built-in normalize(): divides the vector
// by its own length. No guard is applied for a zero-length input.
float2 normalize(float2 p) {
    const float magnitude = length(p);
    return p / magnitude;
}

// Three-component variant.
float3 normalize(float3 p) {
    const float magnitude = length(p);
    return p / magnitude;
}

// Four-component variant.
float4 normalize(float4 p) {
    const float magnitude = length(p);
    return p / magnitude;
}

#endif

// Returns the bit count for `items` values at quantization level `quant`.
//
// Every supported level uses the formula (scale * items + bias) / divisor with
// integer division; the switch only selects the three coefficients.
// An unrecognised level yields the sentinel 100000.
int compute_ise_bitcount(int items, quantization_method quant) {
    int scale;
    int bias;
    int divisor;

    switch (quant) {
    // Levels whose formula is a plain multiple of the item count.
    case QUANT_2:   scale = 1;  bias = 0; divisor = 1; break;
    case QUANT_4:   scale = 2;  bias = 0; divisor = 1; break;
    case QUANT_8:   scale = 3;  bias = 0; divisor = 1; break;
    case QUANT_16:  scale = 4;  bias = 0; divisor = 1; break;
    case QUANT_32:  scale = 5;  bias = 0; divisor = 1; break;
    case QUANT_64:  scale = 6;  bias = 0; divisor = 1; break;
    case QUANT_128: scale = 7;  bias = 0; divisor = 1; break;
    case QUANT_256: scale = 8;  bias = 0; divisor = 1; break;

    // Levels whose formula divides by 5 (rounded up via the +4 bias).
    case QUANT_3:   scale = 8;  bias = 4; divisor = 5; break;
    case QUANT_6:   scale = 13; bias = 4; divisor = 5; break;
    case QUANT_12:  scale = 18; bias = 4; divisor = 5; break;
    case QUANT_24:  scale = 23; bias = 4; divisor = 5; break;
    case QUANT_48:  scale = 28; bias = 4; divisor = 5; break;
    case QUANT_96:  scale = 33; bias = 4; divisor = 5; break;
    case QUANT_192: scale = 38; bias = 4; divisor = 5; break;

    // Levels whose formula divides by 3 (rounded up via the +2 bias).
    case QUANT_5:   scale = 7;  bias = 2; divisor = 3; break;
    case QUANT_10:  scale = 10; bias = 2; divisor = 3; break;
    case QUANT_20:  scale = 13; bias = 2; divisor = 3; break;
    case QUANT_40:  scale = 16; bias = 2; divisor = 3; break;
    case QUANT_80:  scale = 19; bias = 2; divisor = 3; break;
    case QUANT_160: scale = 22; bias = 2; divisor = 3; break;

    default:
        return 100000;
    }

    return (scale * items + bias) / divisor;
}

//================================
// float float_to_lns(float p)
//================================
// NaN check without library support: NaN is the only value unequal to itself.
#define astc_isnan(p) ((p)!=(p))

// Maps a float to the value this file refers to as "LNS".
//
//  * NaN, and any input not greater than 2^-26 (this includes zero and all
//    negative numbers), returns 0.
//  * Inputs with magnitude >= 65536 return 65535.
//  * Otherwise the result is exponent_bucket * 2048 + piecewise-linear
//    remapping of the mantissa + 1.
float float_to_lns(float p) {
    const float underflow_limit = 1.0f / 67108864.0f;   // 2^-26
    if (astc_isnan(p) || p <= underflow_limit)
        return 0;

    if (fabs(p) >= 65536.0f)
        return 65535;

    // frexp yields p == mantissa * 2^exponent.
    int exponent;
    const float mantissa = (float)frexp(p, &exponent);

    float mapped;
    if (exponent >= -13) {
        // Regular range: bias the exponent and spread the mantissa over 0..2048.
        exponent += 14;
        mapped = (mantissa - 0.5f) * 4096.0f;
    } else {
        // Input is below 2^-14: scale it by 2^25 and use exponent bucket 0.
        mapped = p * 33554432.0f;
        exponent = 0;
    }

    // Three-segment piecewise-linear remapping of the mantissa term.
    if (mapped < 384.0f) {
        mapped *= 4.0f / 3.0f;
    } else if (mapped <= 1408.0f) {
        mapped += 128.0f;
    } else {
        mapped = (mapped + 512.0f) * (4.0f / 5.0f);
    }

    mapped += exponent * 2048.0f;
    return mapped + 1.0f;
}

// Fills pb->deriv_data (4 floats per texel) for the first `pixelcount` texels.
//
// For a channel flagged as LNS (pb->rgb_lns[i] for R, G, B; pb->alpha_lns[i]
// for A) the stored value is a finite-difference slope of float_to_lns() at
// the original value: the input is first raised to at least 6e-5, the slope is
// measured across a 5% step, and the result is clamped to [1/32, 2^25].
// For a channel not flagged as LNS the stored value is the constant 65535,
// matching the linear scale factor applied to work_data.
//
// Only pb->orig_data is read; pb->work_data is not used by this function.
void imageblock_initialize_deriv_from_work_and_orig(imageblock * pb, int pixelcount) {
    const float deriv_floor   = 1.0f / 32.0f;
    const float deriv_ceiling = 33554432.0f;   // 2^25

    for (int i = 0; i < pixelcount; i++) {
        const float *src = pb->orig_data + 4 * i;
        float *dst = pb->deriv_data + 4 * i;

        for (int c = 0; c < 4; c++) {
            // Channels 0..2 follow the RGB flag, channel 3 follows the alpha flag.
            const int use_lns = (c < 3) ? pb->rgb_lns[i] : pb->alpha_lns[i];
            if (!use_lns) {
                dst[c] = 65535.0f;
                continue;
            }

            // Keep the sample away from zero so the relative step is meaningful.
            float v = (std::max)(src[c], 6e-5f);
            float deriv = (float_to_lns(v * 1.05f) - float_to_lns(v)) / (v * 0.05f);

            // The derivative may not take values smaller than 1/32 or larger
            // than 2^25; clamp it if it does.
            if (deriv < deriv_floor)
                deriv = deriv_floor;
            else if (deriv > deriv_ceiling)
                deriv = deriv_ceiling;

            dst[c] = deriv;
        }
    }
}

//=================================================================
// Builds pb->work_data from pb->orig_data for `pixelcount` texels,
// then refreshes pb->deriv_data to match.
//
// Per texel, RGB is passed through float_to_lns() when rgb_lns[i] is
// set and scaled by 65535 otherwise; alpha is handled the same way
// under alpha_lns[i].
//=================================================================
void imageblock_initialize_work_from_orig(imageblock * pb, int pixelcount) {
    for (int texel = 0; texel < pixelcount; ++texel) {
        const float *src = pb->orig_data + 4 * texel;
        float *dst = pb->work_data + 4 * texel;

        if (pb->rgb_lns[texel]) {
            for (int c = 0; c < 3; ++c)
                dst[c] = float_to_lns(src[c]);
        } else {
            for (int c = 0; c < 3; ++c)
                dst[c] = src[c] * 65535.0f;
        }

        if (pb->alpha_lns[texel])
            dst[3] = float_to_lns(src[3]);
        else
            dst[3] = src[3] * 65535.0f;
    }

    imageblock_initialize_deriv_from_work_and_orig(pb, pixelcount);
}

//=========================================================================
// Recomputes the summary fields of an imageblock from its work_data
// (not from orig_data):
//   * per-channel minimum and maximum over the block's texels,
//   * grayscale = 1 when every texel has red == green == blue, else 0.
// The texel count is taken from ASTC_Encode->m_texels_per_block.
//=========================================================================

void update_imageblock_flags(imageblock * pb, __global ASTC_Encode *ASTC_Encode) {
    // Index 0..3 = red, green, blue, alpha.
    float lowest[4]  = { FLOAT_38, FLOAT_38, FLOAT_38, FLOAT_38 };
    float highest[4] = { -FLOAT_38, -FLOAT_38, -FLOAT_38, -FLOAT_38 };
    int all_gray = 1;

    const int texel_total = ASTC_Encode->m_texels_per_block;
    for (int texel = 0; texel < texel_total; ++texel) {
        const float *rgba = pb->work_data + 4 * texel;

        for (int c = 0; c < 4; ++c) {
            if (rgba[c] < lowest[c])
                lowest[c] = rgba[c];
            if (rgba[c] > highest[c])
                highest[c] = rgba[c];
        }

        // A single texel with differing colour channels clears the flag for good.
        if (rgba[0] != rgba[1] || rgba[0] != rgba[2])
            all_gray = 0;
    }

    pb->red_min   = lowest[0];
    pb->red_max   = highest[0];
    pb->green_min = lowest[1];
    pb->green_max = highest[1];
    pb->blue_min  = lowest[2];
    pb->blue_max  = highest[2];
    pb->alpha_min = lowest[3];
    pb->alpha_max = highest[3];
    pb->grayscale = all_gray;
}

// Loads `pixelcount` source pixels into an imageblock and derives everything
// the encoder needs from them:
//   1. orig_data  <- pixel components divided by 255 (order x, y, z, w =
//                    red, green, blue, alpha; swizzle is fixed at {0,1,2,3}
//                    and the source is treated as 8 bits per channel),
//   2. rgb_lns / alpha_lns <- the encoder's "force HDR" settings, nan_texel <- 0,
//   3. work_data and deriv_data via imageblock_initialize_work_from_orig(),
//   4. min/max/grayscale flags via update_imageblock_flags().
void fetch_imageblock(
    astc_codec_image *input_image,
    imageblock *blk,
    unsigned int pixelcount,
    __global ASTC_Encode *ASTC_Encode
) {
    // Step 1: byte components -> floats.
    for (unsigned int texel = 0; texel < pixelcount; ++texel) {
        const unsigned int base = 4 * texel;
        blk->orig_data[base]     = input_image->pixels[texel].x / 255.0f;    // Red
        blk->orig_data[base + 1] = input_image->pixels[texel].y / 255.0f;    // Green
        blk->orig_data[base + 2] = input_image->pixels[texel].z / 255.0f;    // Blue
        blk->orig_data[base + 3] = input_image->pixels[texel].w / 255.0f;    // Alpha
    }

    // Not implemented here:
    //  * the optional sRGB-to-linear transform of the input data;
    //  * automatic LDR/HDR selection. The disabled heuristic gathered the
    //    per-channel maxima of orig_data and would have chosen
    //      rgb_lns   = (max_rgb < 0.15 || max_rgb > 1.0 || max_alpha > 1.0)
    //      alpha_lns = rgb_lns ? (max_alpha > 1.0 || max_alpha < 0.15) : 0
    //    where max_rgb is the largest of the red, green and blue maxima.
    // Instead the choice below is taken from the encoder settings.

    // Step 2: impose the same LNS choice on every texel.
    const uint8_t rgb_choice   = (uint8_t)ASTC_Encode->m_rgb_force_use_of_hdr;
    const uint8_t alpha_choice = (uint8_t)ASTC_Encode->m_alpha_force_use_of_hdr;
    for (unsigned int texel = 0; texel < pixelcount; ++texel) {
        blk->rgb_lns[texel]   = rgb_choice;
        blk->alpha_lns[texel] = alpha_choice;
        blk->nan_texel[texel] = 0;
    }

    // Steps 3 and 4.
    imageblock_initialize_work_from_orig(blk, pixelcount);
    update_imageblock_flags(blk, ASTC_Encode);
}


// Right-shifts `inp` by `shamt` bits, rounding to nearest with ties going to
// the even result. `shamt` must be below 32; a shift of 0 returns `inp`.
uint32_t rtne_shift32(uint32_t inp, uint32_t shamt) {
    const uint32_t unit = (uint32_t)(1) << shamt;   // weight of the result's LSB
    const uint32_t half = unit >> 1;                // 0.5 ulp of the result

    // Is the bit that becomes the result's LSB set? OR-ing in 1 makes the
    // answer "yes" for shamt == 0, so nothing is subtracted in that case.
    const uint32_t lsb_set = ((inp | (uint32_t)(1)) & unit) != 0 ? 1u : 0u;

    // Add half an ulp; when the kept LSB is clear (even), back off by one so
    // an exact tie rounds down instead of up.
    uint32_t biased = inp + half;
    if (!lsb_set)
        biased -= 1;

    return biased >> shamt;
}

// Right-shifts `inp` by `shamt` bits, rounding to nearest with ties going
// upwards: half of the result's LSB weight is added before shifting.
uint32_t rtna_shift32(uint32_t inp, uint32_t shamt) {
    const uint32_t half = ((uint32_t)(1) << shamt) >> 1;
    return (inp + half) >> shamt;
}

// Right-shifts `inp` by `shamt` bits, rounding up: any non-zero discarded bit
// bumps the result by one. Implemented by adding (2^shamt - 1) before shifting.
uint32_t rtup_shift32(uint32_t inp, uint32_t shamt) {
    const uint32_t all_discarded_bits = ((uint32_t)(1) << shamt) - 1;
    return (inp + all_discarded_bits) >> shamt;
}

// Converts the bit pattern of a 32-bit float (`inp`) to a 16-bit float bit
// pattern using rounding mode `rmode`.
//
// The conversion is table driven. `tab` (defined outside this block) is
// indexed by the top bits of the input (inp >> 23, i.e. sign and exponent for
// a 32-bit input) and yields a class base; adding `rmode` gives `idx`, which
// both selects the switch case below and looks up a per-case constant `vlx`
// in `tabx` (also defined outside this block).
//
// Returns 0 if `idx` matches none of the handled cases.
CGU_SHORT sf32_to_sf16(CGU_UINT inp, roundmode rmode) {

    uint32_t p;
    uint32_t idx = rmode + tab[inp >> 23];
    uint32_t vlx = tabx[idx];
    switch (idx) {
    /*
        Number which may be Infinity or NaN.
        We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa.
        (If we don't do this quieting, then a NaN that is distinguished only by having
        its low-order bits set, would be turned into an INF.) */
    case 50:
    case 51:
    case 52:
    case 53:
    case 54:
    case 55:
    case 56:
    case 57:
    case 58:
    case 59:
        /*
            the input value is 0x7F800000 or 0xFF800000 if it is INF.
            By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero.
            For NaNs, however, this operation will keep bit 23 with the value 1.
            We can then extract bit 23, and logical-OR bit 9 of the result with this
            bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit
            of the mantissa is set.)
        */
        p = (inp - 1) & (uint32_t)(0x800000);    /* zero if INF, nonzero if NaN. */
        /*
            The full 32-bit 'vlx' must be added here, exactly as in the normal-number
            path below. Narrowing it to CGU_SHORT first drops the high bits of the
            bias, and the shifted exponent bits of the input then spill into the
            sign bit of the 16-bit result.
        */
        return (CGU_SHORT)(((inp + vlx) >> 13) | (p >> 14));
    /*
        positive, exponent = 0, round-mode == UP; need to check whether number actually is 0.
        If it is, then return 0, else return 1 (the smallest representable nonzero number)
    */
    case 0:
        /*
            -inp will set the MSB if the input number is nonzero.
            Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.
        */
        return (uint32_t) (-(int32_t) inp) >> 31;

    /*
        negative, exponent = 0, round-mode == DOWN, need to check whether number is
        actually 0. If it is, return 0x8000 ( float -0.0 )
        Else return the smallest negative number ( 0x8001 ) */
    case 6:
        /*
            in this case 'vlx' is 0x80000000. By subtracting the input value from it,
            we obtain a value that is 0 if the input value is in fact zero and has
            the MSB set if it isn't. We then right-shift the value by 31 places to
            get a value that is 0 if the input is -0.0 and 1 otherwise.
        */
        return ((vlx - inp) >> 31) + (uint32_t)(0x8000);

    /*
        for all other cases involving underflow/overflow, we don't need to
        do actual tests; we just return 'vlx'.
    */
    case 1:
    case 2:
    case 3:
    case 4:
    case 5:
    case 7:
    case 8:
    case 9:
    case 10:
    case 11:
    case 12:
    case 13:
    case 14:
    case 15:
    case 16:
    case 17:
    case 18:
    case 19:
    case 40:
    case 41:
    case 42:
    case 43:
    case 44:
    case 45:
    case 46:
    case 47:
    case 48:
    case 49:
        return (CGU_SHORT)vlx;

    /*
        for normal numbers, 'vlx' is the difference between the FP32 value of a number and the
        FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is
        baked into 'vlx': for rounding-away-from zero, the constant is 2^13 - 1, causing roundoff away
        from zero. for round-to-nearest away, the constant is 2^12, causing roundoff away from zero.
        for round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even
        except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */

    /* normal number, all rounding modes except round-to-nearest-even: */
    case 30:
    case 31:
    case 32:
    case 34:
    case 35:
    case 36:
    case 37:
    case 39:
        return (CGU_SHORT)((inp + vlx) >> 13);

    /* normal number, round-to-nearest-even. */
    case 33:
    case 38:
        p = inp + vlx;
        p += (inp >> 13) & 1;    /* +1 when the bit that becomes the result's LSB is odd */
        return (CGU_SHORT)(p >> 13);

    /*
        the various denormal cases. These are not expected to be common, so their performance is a bit
        less important. For each of these cases, we need to extract an exponent and a mantissa
        (including the implicit '1'!), and then right-shift the mantissa by a shift-amount that
        depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the
        sign of the resulting denormal number.
    */
    case 21:
    case 22:
    case 25:
    case 27:
        /* denormal, round towards zero. */
        p = 126 - ((inp >> 23) & 0xFF);
        return (CGU_SHORT)((((inp & (uint32_t)(0x7FFFFF)) + (uint32_t)(0x800000)) >> p) | vlx);
    case 20:
    case 26:
        /* denormal, round away from zero. */
        p = 126 - ((inp >> 23) & 0xFF);
        return (CGU_SHORT)(rtup_shift32((inp & (uint32_t)(0x7FFFFF)) + (uint32_t)(0x800000), p) | vlx);
    case 24:
    case 29:
        /* denormal, round to nearest-away */
        p = 126 - ((inp >> 23) & 0xFF);
        return (CGU_SHORT)(rtna_shift32((inp & (uint32_t)(0x7FFFFF)) + (uint32_t)(0x800000), p) | vlx);
    case 23:
    case 28:
        /* denormal, round to nearest-even. */
        p = 126 - ((inp >> 23) & 0xFF);
        return (CGU_SHORT)(rtne_shift32((inp & (uint32_t)(0x7FFFFF)) + (uint32_t)(0x800000), p) | vlx);
    }

    return 0;
}

// Converts a float to a 16-bit float bit pattern with rounding mode `rm`.
// The float's bits are reinterpreted through the if32 union and handed to
// sf32_to_sf16().
CGU_SHORT float_to_sf16(float p, roundmode rm) {
    if32 bits;
    bits.f = p;
    return sf32_to_sf16(bits.u, rm);
}

// napatel this code needs optimization
//
// Computes the per-texel error weights of a block.
//
// For every texel (visited in x-fastest order, index `idx`):
//   1. start from the base weights (rgb_base_weight x3, alpha_base_weight);
//   2. if any mean/stdev weighting parameter differs from its neutral value,
//      fold in mean and standard-deviation terms and take the reciprocal
//      (the neighbourhood average/variance inputs are currently placeholders
//      fixed at zero -- see the notes inside);
//   3. apply the optional, compile-time selected adjustments
//      (USE_RA_NORMAL_ANGULAR_SCALE, USE_RGB_SCALE_WITH_ALPHA,
//      USE_PERFORMM_SRGB_TRANSFORM);
//   4. multiply by the per-channel colour weights and the per-texel
//      block-artifact suppression factor;
//   5. store that value in ewbo->error_weights[idx];
//   6. divide each channel by (deriv_data^2 * FLOAT_n10) and store the result
//      in ewb->error_weights[idx].
// Afterwards the single-channel and channel-combination weight arrays of `ewb`
// are derived from ewb->error_weights.
//
// ewb->contains_zeroweight_texels is set to 1 if the channel sum of any
// texel's final weight is below FLOAT_n10.
//
// Returns the sum over all texels and all four channels of the final weights.
// If ASTCEncode->m_compress_to_mono is set, returns 1.0f immediately without
// touching `ewb` or `ewbo`.
float prepare_error_weight_block(
    imageblock * blk,
    error_weight_block * ewb,
    error_weight_block_orig * ewbo,
    __global ASTC_Encode *ASTCEncode) {
    DEBUG("prepare_error_weight_block");

    if (ASTCEncode->m_compress_to_mono) {
        return 1.0f;
    }

    unsigned int x, y;
    int idx = 0;

    // Non-zero when at least one mean/stdev parameter is not at its neutral value.
    int any_mean_stdev_weight =
        ASTCEncode->m_ewp.rgb_base_weight != 1.0 || ASTCEncode->m_ewp.alpha_base_weight != 1.0 || ASTCEncode->m_ewp.rgb_mean_weight != 0.0 || ASTCEncode->m_ewp.rgb_stdev_weight != 0.0 || ASTCEncode->m_ewp.alpha_mean_weight != 0.0 || ASTCEncode->m_ewp.alpha_stdev_weight != 0.0;

    float4 color_weights = {ASTCEncode->m_ewp.rgba_weights[0],
                            ASTCEncode->m_ewp.rgba_weights[1],
                            ASTCEncode->m_ewp.rgba_weights[2],
                            ASTCEncode->m_ewp.rgba_weights[3]
                           };

//#ifdef __OPENCL_VERSION__
//    if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//        printf("color_weights %3.3f %3.3f %3.3f %3.3f\n", color_weights.x, color_weights.y, color_weights.z, color_weights.w );

    ewb->contains_zeroweight_texels = 0;
    // All-ones vector: dot(v, normals) is the sum of v's four channels.
    float4 normals = {1.0f, 1.0f, 1.0f, 1.0f};

#ifdef ENABLE_3D_SUPPORT
    for (int z = 0; z < ASTCEncode->m_zdim; z++)
#endif
        for (y = 0; y < ASTCEncode->m_ydim; y++)
            for (x = 0; x < ASTCEncode->m_xdim; x++) {
                float4 error_weight = { ASTCEncode->m_ewp.rgb_base_weight,
                                        ASTCEncode->m_ewp.rgb_base_weight,
                                        ASTCEncode->m_ewp.rgb_base_weight,
                                        ASTCEncode->m_ewp.alpha_base_weight
                                      };

                if (any_mean_stdev_weight) {
                    // napatel
                    // <=== This needs proper sizes ===> made local and should be set to image dimensions!!
                    // float4 input_averages[4][4][4];
                    //--------------------------------------

                    // Placeholder: should be input_averages[zpos][ypos][xpos].
                    float4 avg = { 0.0f, 0.0f, 0.0f, 0.0f };
                    if (avg.x < 6e-5f)
                        avg.x = 6e-5f;
                    if (avg.y < 6e-5f)
                        avg.y = 6e-5f;
                    if (avg.z < 6e-5f)
                        avg.z = 6e-5f;
                    if (avg.w < 6e-5f)
                        avg.w = 6e-5f;
                    /*
                       printf("avg: %f %f %f %f\n", avg.x, avg.y, avg.z, avg.w ); */
                    avg = avg * avg;

                    //# Check this section of code!
                    // napatel
                    // <=== This needs proper sizes ===> made local and should be set to image dimensions!!
                    // float4 input_variances[4][4][4];

                    //--------------------------------------
                    // Placeholder: should be input_variances[zpos][ypos][xpos].
                    float4 variance = { 0.0f, 0.0f, 0.0f, 0.0f };
                    variance = variance * variance;

                    // Cross-channel means used for the RGB mixing below.
                    float favg = (avg.x + avg.y + avg.z) * (1.0f / 3.0f);
                    float fvar = (variance.x + variance.y + variance.z) * (1.0f / 3.0f);

                    float mixing = ASTCEncode->m_ewp.rgb_mean_and_stdev_mixing;
                    float3 favg3 = { favg, favg, favg };
                    float3 fvar3 = { fvar, fvar, fvar };

                    // Blend each RGB channel towards the cross-channel mean of the
                    // same quantity: averages with the mean average, variances with
                    // the mean variance.
                    avg.xyz = favg3 * mixing + avg.xyz * (1.0f - mixing);
                    variance.xyz = fvar3 * mixing + variance.xyz * (1.0f - mixing);

                    float4 stdev = {(float)sqrt((std::max)(variance.x, 0.0f)),
                                    (float)sqrt((std::max)(variance.y, 0.0f)),
                                    (float)sqrt((std::max)(variance.z, 0.0f)),
                                    (float)sqrt((std::max)(variance.w, 0.0f))
                                   };

                    avg.xyz = avg.xyz * ASTCEncode->m_ewp.rgb_mean_weight;
                    avg.w = avg.w * ASTCEncode->m_ewp.alpha_mean_weight;
                    stdev.xyz = stdev.xyz * ASTCEncode->m_ewp.rgb_stdev_weight;
                    stdev.w = stdev.w * ASTCEncode->m_ewp.alpha_stdev_weight;
                    error_weight = error_weight + avg + stdev;
                    float4 onef = { 1.0f, 1.0f, 1.0f, 1.0f };
                    error_weight = onef / error_weight;
                }

#ifdef USE_RA_NORMAL_ANGULAR_SCALE
                if (ASTCEncode->m_ewp.ra_normal_angular_scale) {
                    // Red and alpha of the original data are remapped from [0,1] to [-1,1].
                    float x1 = (blk->orig_data[4 * idx] - 0.5f) * 2.0f;
                    float y1 = (blk->orig_data[4 * idx + 3] - 0.5f) * 2.0f;
                    float denom = 1.0f - x1 * x1 - y1 * y1;
                    if (denom < 0.1f)
                        denom = 0.1f;
                    denom = 1.0f / denom;
                    error_weight.x *= 1.0f + x1 * x1 * denom;
                    error_weight.w *= 1.0f + y1 * y1 * denom;
                }
#endif

#ifdef USE_RGB_SCALE_WITH_ALPHA
                if (ASTCEncode->m_ewp.enable_rgb_scale_with_alpha) {
                    // napatel
                    // <=== This needs proper sizes ===> made local and should be set to image dimensions!!
                    // float4 input_alpha_averages[12][12][12];
                    //--------------------------------------

                    float alpha_scale;
                    if (ASTCEncode->m_ewp.alpha_radius != 0)
                        alpha_scale = 0.0f; // Placeholder: should be input_alpha_averages[zpos][ypos][xpos].x
                    else
                        alpha_scale = blk->orig_data[4 * idx + 3];
                    if (alpha_scale < 0.0001f)
                        alpha_scale = 0.0001f;
                    alpha_scale *= alpha_scale;
                    error_weight.xyz = error_weight.xyz * alpha_scale;
                }
#endif


                error_weight = error_weight * color_weights;
                error_weight = error_weight * ASTCEncode->m_ewp.block_artifact_suppression_expanded[idx];

#ifdef USE_PERFORMM_SRGB_TRANSFORM
                // if we perform a conversion from linear to sRGB, then we multiply
                // the weight with the derivative of the linear->sRGB transform function.
                if (ASTCEncode->m_perform_srgb_transform) {
                    float r = blk->orig_data[4 * idx];
                    float g = blk->orig_data[4 * idx + 1];
                    float b = blk->orig_data[4 * idx + 2];
                    if (r < 0.0031308f)
                        r = 12.92f;
                    else
                        r = 0.4396f * pow(r, -0.58333f);
                    if (g < 0.0031308f)
                        g = 12.92f;
                    else
                        g = 0.4396f * pow(g, -0.58333f);
                    if (b < 0.0031308f)
                        b = 12.92f;
                    else
                        b = 0.4396f * pow(b, -0.58333f);
                    error_weight.x *= r;
                    error_weight.y *= g;
                    error_weight.z *= b;
                }
#endif

                // when we loaded the block to begin with, we applied a transfer function
                // and computed the derivative of the transfer function. However, the
                // error-weight computation so far is based on the original color values,
                // not the transfer-function values. As such, we must multiply the
                // error weights by the derivative of the inverse of the transfer function,
                // which is equivalent to dividing by the derivative of the transfer
                // function.

                // Keep the weight as it stands before the derivative correction.
                ewbo->error_weights[idx] = error_weight;

                error_weight.x /= (blk->deriv_data[4 * idx] * blk->deriv_data[4 * idx] * FLOAT_n10);
                error_weight.y /= (blk->deriv_data[4 * idx + 1] * blk->deriv_data[4 * idx + 1] * FLOAT_n10);
                error_weight.z /= (blk->deriv_data[4 * idx + 2] * blk->deriv_data[4 * idx + 2] * FLOAT_n10);
                error_weight.w /= (blk->deriv_data[4 * idx + 3] * blk->deriv_data[4 * idx + 3] * FLOAT_n10);

                /*
                    printf("--> %f %f %f %f\n", error_weight.x, error_weight.y, error_weight.z, error_weight.w );
                */

                ewb->error_weights[idx] = error_weight;
                float res = dot(error_weight, normals);
                if (res < FLOAT_n10)
                    ewb->contains_zeroweight_texels = 1;

                idx++;
            }

    int i;

    float4 error_weight_sum = { 0.0f, 0.0f, 0.0f, 0.0f };

    // Derive the per-channel and channel-combination weights (plain averages of
    // the participating channels) and accumulate the block total.
    for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
        error_weight_sum = error_weight_sum + ewb->error_weights[i];

        ewb->texel_weight_r[i] = ewb->error_weights[i].x;
        ewb->texel_weight_g[i] = ewb->error_weights[i].y;
        ewb->texel_weight_b[i] = ewb->error_weights[i].z;
        ewb->texel_weight_a[i] = ewb->error_weights[i].w;

        ewb->texel_weight_rg[i] = (ewb->error_weights[i].x + ewb->error_weights[i].y) * 0.5f;
        ewb->texel_weight_rb[i] = (ewb->error_weights[i].x + ewb->error_weights[i].z) * 0.5f;
        ewb->texel_weight_gb[i] = (ewb->error_weights[i].y + ewb->error_weights[i].z) * 0.5f;
        ewb->texel_weight_ra[i] = (ewb->error_weights[i].x + ewb->error_weights[i].w) * 0.5f;

        ewb->texel_weight_gba[i] = (ewb->error_weights[i].y + ewb->error_weights[i].z + ewb->error_weights[i].w) * 0.333333f;
        ewb->texel_weight_rba[i] = (ewb->error_weights[i].x + ewb->error_weights[i].z + ewb->error_weights[i].w) * 0.333333f;
        ewb->texel_weight_rga[i] = (ewb->error_weights[i].x + ewb->error_weights[i].y + ewb->error_weights[i].w) * 0.333333f;
        ewb->texel_weight_rgb[i] = (ewb->error_weights[i].x + ewb->error_weights[i].y + ewb->error_weights[i].z) * 0.333333f;
        ewb->texel_weight[i] = (ewb->error_weights[i].x + ewb->error_weights[i].y + ewb->error_weights[i].z + ewb->error_weights[i].w) * 0.25f;
    }

    return dot(error_weight_sum, normals);
}

// For each partition of `pi`, computes
//   error_weightings[p]   = (FLOAT_n12 + sum of ewb->error_weights over the
//                            partition's texels) / texels_per_partition[p]
//   color_scalefactors[p] = per-channel square root of error_weightings[p]
// Texel-to-partition membership comes from pi->partition_of_texel; the texel
// count comes from ASTCEncode->m_texels_per_block.
void compute_partition_error_color_weightings(
    error_weight_block * ewb,
    __global partition_info * pi,
    float4 error_weightings[4],
    float4 color_scalefactors[4],
    __global ASTC_Encode *ASTCEncode) {
    DEBUG("compute_partition_error_color_weightings");

    const int partitions = pi->partition_count;

    // Seed every accumulator with a small value rather than zero.
    float4 seed = { FLOAT_n12, FLOAT_n12, FLOAT_n12, FLOAT_n12 };
    for (int p = 0; p < partitions; ++p)
        error_weightings[p] = seed;

    // Accumulate each texel's weight into the partition that owns it.
    const int texels = ASTCEncode->m_texels_per_block;
    for (int t = 0; t < texels; ++t) {
        const int owner = pi->partition_of_texel[t];
        error_weightings[owner] = error_weightings[owner] + ewb->error_weights[t];
    }

    // Turn sums into means, then derive the scale factors from the means.
    for (int p = 0; p < partitions; ++p) {
        error_weightings[p] = error_weightings[p] * (1.0f / pi->texels_per_partition[p]);

        color_scalefactors[p].x = sqrt(error_weightings[p].x);
        color_scalefactors[p].y = sqrt(error_weightings[p].y);
        color_scalefactors[p].z = sqrt(error_weightings[p].z);
        color_scalefactors[p].w = sqrt(error_weightings[p].w);
    }

}

// For each partition of `pt`, computes from blk->work_data and the per-texel
// weights ewb->texel_weight:
//   * averages[p]: the weighted mean colour (the weight total is floored at
//     FLOAT_n7), multiplied component-wise by color_scalefactors[p];
//   * a direction estimate: the weighted deviations from the unscaled mean are
//     summed four times, once per channel, each time including only texels
//     whose deviation is positive in that channel; the sum with the largest
//     squared length wins (earlier channels win ties).
// The winning vector is written to directions_rgba[p], and its three-channel
// swizzles to directions_rgb / _rga / _rba / _gba.
void compute_averages_and_directions_rgba(
    __global partition_info * pt,
    imageblock * blk,
    error_weight_block * ewb,
    float4 * color_scalefactors,
    float4 * averages,
    float4 * directions_rgba,
    float3 * directions_gba,
    float3 * directions_rba,
    float3 * directions_rga,
    float3 * directions_rgb) {
    const int part_total = pt->partition_count;

    for (int part = 0; part < part_total; ++part) {
        const int members = pt->texels_per_partition[part];

        // Pass 1: weighted colour sum and total weight.
        float4 weighted_sum = {0.0f, 0.0f, 0.0f, 0.0f };
        float weight_total = 0.0f;

        for (int k = 0; k < members; ++k) {
            const int texel = pt->texels_of_partition[part][k];
            const float w = ewb->texel_weight[texel];
            float4 colour = {blk->work_data[4 * texel],
                             blk->work_data[4 * texel + 1],
                             blk->work_data[4 * texel + 2],
                             blk->work_data[4 * texel + 3]
                            };
            colour = colour * w;
            weight_total += w;
            weighted_sum = weighted_sum + colour;
        }

        float4 mean = (weighted_sum * 1.0f) / MAX(weight_total, FLOAT_n7);
        averages[part] = mean * color_scalefactors[part];

        // Pass 2: one accumulator per channel, fed only by texels lying on the
        // positive side of the mean in that channel.
        float4 pos_r = {0.0f, 0.0f, 0.0f, 0.0f};
        float4 pos_g = {0.0f, 0.0f, 0.0f, 0.0f};
        float4 pos_b = {0.0f, 0.0f, 0.0f, 0.0f};
        float4 pos_a = {0.0f, 0.0f, 0.0f, 0.0f};

        for (int k = 0; k < members; ++k) {
            const int texel = pt->texels_of_partition[part][k];
            const float w = ewb->texel_weight[texel];
            float4 colour = {blk->work_data[4 * texel],
                             blk->work_data[4 * texel + 1],
                             blk->work_data[4 * texel + 2],
                             blk->work_data[4 * texel + 3]
                            };
            float4 deviation = (colour - mean) * w;

            if (deviation.x > 0.0f)
                pos_r = pos_r + deviation;
            if (deviation.y > 0.0f)
                pos_g = pos_g + deviation;
            if (deviation.z > 0.0f)
                pos_b = pos_b + deviation;
            if (deviation.w > 0.0f)
                pos_a = pos_a + deviation;
        }

        // Pick the accumulator with the greatest squared length.
        const float len2_r = dot(pos_r, pos_r);
        const float len2_g = dot(pos_g, pos_g);
        const float len2_b = dot(pos_b, pos_b);
        const float len2_a = dot(pos_a, pos_a);

        float4 winner = pos_r;
        float winner_len2 = len2_r;
        if (len2_g > winner_len2) {
            winner = pos_g;
            winner_len2 = len2_g;
        }
        if (len2_b > winner_len2) {
            winner = pos_b;
            winner_len2 = len2_b;
        }
        if (len2_a > winner_len2) {
            winner = pos_a;
            winner_len2 = len2_a;
        }

        directions_rgba[part] = winner;
        directions_rgb[part] = winner.xyz;
        directions_rga[part] = winner.xyw;
        directions_rba[part] = winner.xzw;
        directions_gba[part] = winner.yzw;
    }
}

// Fits one line per partition through the (scaled) RGBA texel colors of a
// block and derives endpoints and per-texel ideal weights from that line.
//
//   pt         - partition layout; partition_count and partition_of_texel are read.
//   blk        - block whose work_data holds four floats (R,G,B,A) per texel.
//   ewb        - error weights; texel_weight is read directly, the rest is
//                forwarded to the helper routines.
//   ei         - output: ep.partition_count, ep.endpt0[]/ep.endpt1[] per
//                partition, weights[] (clamped to 0..1) and
//                weight_error_scale[] per texel.
//   ASTCEncode - supplies m_texels_per_block and is forwarded to
//                compute_partition_error_color_weightings().
void compute_endpoints_and_ideal_weights_rgba(
    __global  partition_info * pt,
    imageblock * blk,
    error_weight_block * ewb,
    endpoints_and_weights * ei,
    __global ASTC_Encode *ASTCEncode) {
    DEBUG("compute_endpoints_and_ideal_weights_rgba");

    int i;

    float *error_weights = ewb->texel_weight;

    int partition_count = pt->partition_count;

    // Publish the partition count with the endpoints, exactly as
    // compute_endpoints_and_ideal_weights_3_components() does; otherwise the
    // field keeps whatever value the caller's structure held before whenever
    // the 1-plane dispatcher selects this RGBA path.
    ei->ep.partition_count = partition_count;

    // Running min/max of the line parameter per partition; start inverted so
    // that the first accepted texel sets both.
    float lowparam[4], highparam[4];
    for (i = 0; i < partition_count; i++) {
        lowparam[i] = FLOAT_10;
        highparam[i] = -FLOAT_10;
    }

    float4 averages[4];
    float4 directions_rgba[4];
    float3 directions_gba[4];
    float3 directions_rba[4];
    float3 directions_rga[4];
    float3 directions_rgb[4];

    line4 lines[4];

    float scale[4];
    float length_squared[4];

    float4 error_weightings[4];
    float4 color_scalefactors[4];
    float4 scalefactors[4];

    compute_partition_error_color_weightings(ewb, pt, error_weightings, color_scalefactors, ASTCEncode);

    // Normalised scale factors, rescaled to length 2.
    for (i = 0; i < partition_count; i++)
        scalefactors[i] = normalize(color_scalefactors[i]) * 2.0f;


    // Only averages and directions_rgba are consumed below; the 3-component
    // direction arrays are required by the helper's signature.
    compute_averages_and_directions_rgba(pt, blk, ewb, scalefactors, averages, directions_rgba, directions_gba, directions_rba, directions_rga, directions_rgb);

    // if the direction-vector ends up pointing from light to dark, FLIP IT!
    // this will make the first endpoint the darkest one.
    float4 zerof = { 0.0f, 0.0f, 0.0f, 0.0f};
    float4 onef = { 1.0f, 1.0f, 1.0f, 1.0f };

    for (i = 0; i < partition_count; i++) {
        float4 direc = directions_rgba[i];
        if (direc.x + direc.y + direc.z < 0.0f)
            directions_rgba[i] = zerof - direc;
    }

    // Build the per-partition line: origin = average, direction = unit vector.
    // A zero direction cannot be normalised, so fall back to the diagonal.
    for (i = 0; i < partition_count; i++) {
        lines[i].a = averages[i];
        if (dot(directions_rgba[i], directions_rgba[i]) == 0.0f)
            lines[i].b = normalize(onef);
        else
            lines[i].b = normalize(directions_rgba[i]);
    }

    // Project every texel with a non-negligible error weight onto its
    // partition's line and track the parameter range per partition.
    for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
        if (error_weights[i] > FLOAT_n10) {
            int partition = pt->partition_of_texel[i];

            float4 point = { blk->work_data[4 * i], blk->work_data[4 * i + 1], blk->work_data[4 * i + 2], blk->work_data[4 * i + 3] };
            point = point * scalefactors[partition];
            line4 l = lines[partition];

            float param = dot(point - l.a, l.b);
            ei->weights[i] = param;
            if (param < lowparam[partition])
                lowparam[partition] = param;
            if (param > highparam[partition])
                highparam[partition] = param;
        } else {
            // Marker value for texels that carry (almost) no weight.
            ei->weights[i] = -FLOAT_38;
        }
    }

    for (i = 0; i < partition_count; i++) {
        float length = highparam[i] - lowparam[i];
        // Still inverted: no texel of this partition was accepted above.
        if (length < 0) {
            lowparam[i] = 0.0f;
            highparam[i] = FLOAT_n7;
        }


        // it is possible for a uniform-color partition to produce length=0; this
        // causes NaN-production and NaN-propagation later on. Set length to
        // a small value to avoid this problem.
        if (length < FLOAT_n7)
            length = FLOAT_n7;

        length_squared[i] = length * length;
        scale[i] = 1.0f / length;

        // Endpoints are the extreme points on the line, mapped back out of
        // the scaled color space.
        ei->ep.endpt0[i] = (lines[i].a + lines[i].b * lowparam[i]) / scalefactors[i];
        ei->ep.endpt1[i] = (lines[i].a + lines[i].b * highparam[i]) / scalefactors[i];
    }

    // Convert line parameters into weights in the 0..1 range.
    for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
        int partition = pt->partition_of_texel[i];
        float idx = (ei->weights[i] - lowparam[partition]) * scale[partition];
        if (idx > 1.0f)
            idx = 1.0f;
        else if (!(idx > 0.0f))     // negated form also maps NaN to 0
            idx = 0.0f;
        ei->weights[i] = idx;
        ei->weight_error_scale[i] = error_weights[i] * length_squared[partition];
        if (astc_isnan(ei->weight_error_scale[i])) {
            astc_codec_internal_error("ERROR: compute_endpoints_and_ideal_weights_rgba");
        }
    }
}

// For each partition, computes the weighted mean of the R,G,B channels and a
// dominant direction of the texel colors around that mean.
//
//   pt                 - partition layout (counts and texel lists).
//   blk                - block; work_data holds four floats per texel, of
//                        which the first three are read here.
//   ewb                - error weights; texel_weight_rgb is used.
//   color_scalefactors - per-partition scale; .xyz is applied to the mean.
//   averages           - out: scaled mean per partition.
//   directions_rgb     - out: dominant direction per partition.
//   directions_rg/rb/gb- out: the same direction restricted to two channels.
void compute_averages_and_directions_rgb(__global partition_info * pt,
        imageblock * blk,
        error_weight_block * ewb,
        float4 * color_scalefactors,
        float3 * averages,
        float3 * directions_rgb,
        float2 * directions_rg,
        float2 * directions_rb,
        float2 * directions_gb) {
    float *rgb_weights = ewb->texel_weight_rgb;
    int num_partitions = pt->partition_count;

    for (int p = 0; p < num_partitions; p++) {
        int num_texels = pt->texels_per_partition[p];
        float3 zero3 = { 0.0f, 0.0f, 0.0f };

        // Pass 1: weighted colour sum and total weight of the partition.
        float3 weighted_sum = zero3;
        float weight_total = 0.0f;

        for (int t = 0; t < num_texels; t++) {
            int texel = pt->texels_of_partition[p][t];
            float w = rgb_weights[texel];

            float3 rgb = { blk->work_data[4 * texel],
                           blk->work_data[4 * texel + 1],
                           blk->work_data[4 * texel + 2]
                         };
            weight_total += w;
            weighted_sum = weighted_sum + rgb * w;
        }

        // The divisor is clamped so an all-zero-weight partition does not
        // divide by zero.
        float4 scale4 = color_scalefactors[p];
        float3 mean = weighted_sum * 1.0f / MAX(weight_total, FLOAT_n7);
        averages[p] = mean * scale4.xyz;

        // Pass 2: for each channel, sum the weighted offsets from the mean of
        // all texels lying on the positive side of that channel.
        float3 acc_x = zero3;
        float3 acc_y = zero3;
        float3 acc_z = zero3;

        for (int t = 0; t < num_texels; t++) {
            int texel = pt->texels_of_partition[p][t];
            float w = rgb_weights[texel];
            float3 offset = { blk->work_data[4 * texel],
                              blk->work_data[4 * texel + 1],
                              blk->work_data[4 * texel + 2]
                            };
            offset = (offset - mean) * w;

            if (offset.x > 0.0f)
                acc_x = acc_x + offset;
            if (offset.y > 0.0f)
                acc_y = acc_y + offset;
            if (offset.z > 0.0f)
                acc_z = acc_z + offset;
        }

        // Keep whichever accumulated vector is longest (ties favour x, then y).
        float len2_x = dot(acc_x, acc_x);
        float len2_y = dot(acc_y, acc_y);
        float len2_z = dot(acc_z, acc_z);

        float3 dominant = acc_x;
        float dominant_len2 = len2_x;
        if (len2_y > dominant_len2) {
            dominant = acc_y;
            dominant_len2 = len2_y;
        }
        if (len2_z > dominant_len2) {
            dominant = acc_z;
        }

        directions_rgb[p] = dominant;
        directions_gb[p] = dominant.yz;
        directions_rb[p] = dominant.xz;
        directions_rg[p] = dominant.xy;
    }
}

// Three-channel variant of the average/direction computation in which the
// channels are chosen by index (0=R, 1=G, 2=B, 3=A as offsets into work_data).
//
//   pt                 - partition layout (counts and texel lists).
//   blk                - block; work_data holds four floats per texel.
//   ewb                - error weights; the array matching the component
//                        triple is used (gba / rba / rga / rgb).
//   color_scalefactors - per-partition scale applied to the mean.
//   component1..3      - channel indices; an unsupported triple reports an
//                        error and proceeds with the gba weights.
//   averages           - out: scaled mean per partition.
//   directions         - out: dominant direction per partition, replaced by
//                        (1,1,1) when its squared length is below FLOAT_n18.
void compute_averages_and_directions_3_components(__global partition_info * pt,
        imageblock * blk,
        error_weight_block * ewb,
        float3 * color_scalefactors, int component1, int component2, int component3, float3 * averages, float3 * directions) {
    DEBUG("compute_averages_and_directions_3_components");

    int num_partitions = pt->partition_count;

    // Select the error-weight array that belongs to the channel triple.
    int is_gba = (component1 == 1 && component2 == 2 && component3 == 3);
    int is_rba = (component1 == 0 && component2 == 2 && component3 == 3);
    int is_rga = (component1 == 0 && component2 == 1 && component3 == 3);
    int is_rgb = (component1 == 0 && component2 == 1 && component3 == 2);

    float *channel_weights;
    if (is_gba) {
        channel_weights = ewb->texel_weight_gba;
    } else if (is_rba) {
        channel_weights = ewb->texel_weight_rba;
    } else if (is_rga) {
        channel_weights = ewb->texel_weight_rga;
    } else if (is_rgb) {
        channel_weights = ewb->texel_weight_rgb;
    } else {
        channel_weights = ewb->texel_weight_gba;
        astc_codec_internal_error("ERROR: compute_averages_and_directions_3_components");
    }

    for (int p = 0; p < num_partitions; p++) {
        int num_texels = pt->texels_per_partition[p];

        // Pass 1: weighted colour sum and total weight of the partition.
        float3 weighted_sum = { 0.0f, 0.0f, 0.0f };
        float weight_total = 0.0f;

        for (int t = 0; t < num_texels; t++) {
            int texel = pt->texels_of_partition[p][t];
            float w = channel_weights[texel];
            float3 color = { blk->work_data[4 * texel + component1],
                             blk->work_data[4 * texel + component2],
                             blk->work_data[4 * texel + component3]
                           };
            weight_total += w;
            weighted_sum = weighted_sum + color * w;
        }

        float3 scale3 = color_scalefactors[p];

        // The divisor is clamped so an all-zero-weight partition does not
        // divide by zero.
        float3 mean = weighted_sum * 1.0f / MAX(weight_total, FLOAT_n7);
        averages[p] = mean * scale3.xyz;

        // Pass 2: per channel, sum the weighted offsets of the texels lying
        // on the positive side of the mean in that channel.
        float3 acc_x = { 0.0f, 0.0f, 0.0f };
        float3 acc_y = { 0.0f, 0.0f, 0.0f };
        float3 acc_z = { 0.0f, 0.0f, 0.0f };

        for (int t = 0; t < num_texels; t++) {
            int texel = pt->texels_of_partition[p][t];
            float w = channel_weights[texel];
            float3 offset = { blk->work_data[4 * texel + component1],
                              blk->work_data[4 * texel + component2],
                              blk->work_data[4 * texel + component3]
                            };
            offset = (offset - mean) * w;

            if (offset.x > 0.0f)
                acc_x = acc_x + offset;
            if (offset.y > 0.0f)
                acc_y = acc_y + offset;
            if (offset.z > 0.0f)
                acc_z = acc_z + offset;
        }

        // Keep whichever accumulated vector is longest (ties favour x, then y).
        float len2_x = dot(acc_x, acc_x);
        float len2_y = dot(acc_y, acc_y);
        float len2_z = dot(acc_z, acc_z);

        float3 dominant = acc_x;
        float dominant_len2 = len2_x;
        if (len2_y > dominant_len2) {
            dominant = acc_y;
            dominant_len2 = len2_y;
        }
        if (len2_z > dominant_len2) {
            dominant = acc_z;
        }

        // A (near-)zero direction is replaced by the diagonal.
        float3 diagonal = { 1.0f, 1.0f,1.0f };
        if (dot(dominant, dominant) < FLOAT_n18)
            dominant = diagonal;
        directions[p] = dominant;
    }
}

void compute_endpoints_and_ideal_weights_3_components(
    __global partition_info * pt,
    imageblock * blk,
    error_weight_block * ewb,
    endpoints_and_weights * ei,
    int component1,
    int component2,
    int component3,
    __global ASTC_Encode *ASTCEncode) {
    DEBUG("compute_endpoints_and_ideal_weights_3_components");

    int i;

    int partition_count = pt->partition_count;
    ei->ep.partition_count = partition_count;

    float4 error_weightings[4];
    float4 color_scalefactors[4];
    float3 scalefactors[4];
    float *error_weights;

    if (component1 == 1 && component2 == 2 && component3 == 3)
        error_weights = ewb->texel_weight_gba;
    else if (component1 == 0 && component2 == 2 && component3 == 3)
        error_weights = ewb->texel_weight_rba;
    else if (component1 == 0 && component2 == 1 && component3 == 3)
        error_weights = ewb->texel_weight_rga;
    else if (component1 == 0 && component2 == 1 && component3 == 2)
        error_weights = ewb->texel_weight_rgb;
    else {
        error_weights = ewb->texel_weight_gba;
        astc_codec_internal_error("ERROR: compute_endpoints_and_ideal_weights_3_components");
    }

    compute_partition_error_color_weightings(ewb, pt, error_weightings, color_scalefactors, ASTCEncode);

    for (i = 0; i < partition_count; i++) {
        float s1 = 0, s2 = 0, s3 = 0;
        switch (component1) {
        case 0:
            s1 = color_scalefactors[i].x;
            break;
        case 1:
            s1 = color_scalefactors[i].y;
            break;
        case 2:
            s1 = color_scalefactors[i].z;
            break;
        case 3:
            s1 = color_scalefactors[i].w;
            break;
        }

        switch (component2) {
        case 0:
            s2 = color_scalefactors[i].x;
            break;
        case 1:
            s2 = color_scalefactors[i].y;
            break;
        case 2:
            s2 = color_scalefactors[i].z;
            break;
        case 3:
            s2 = color_scalefactors[i].w;
            break;
        }

        switch (component3) {
        case 0:
            s3 = color_scalefactors[i].x;
            break;
        case 1:
            s3 = color_scalefactors[i].y;
            break;
        case 2:
            s3 = color_scalefactors[i].z;
            break;
        case 3:
            s3 = color_scalefactors[i].w;
            break;
        }
        float3 sf = { s1, s2, s3 };
        scalefactors[i] = normalize(sf) * 1.73205080f;
    }


    float lowparam[4], highparam[4];

    float3 averages[4];
    float3 directions[4];

    line3 lines[4];
    float scale[4];
    float length_squared[4];


    for (i = 0; i < partition_count; i++) {
        lowparam[i] = FLOAT_10;
        highparam[i] = -FLOAT_10;
    }

    compute_averages_and_directions_3_components(pt, blk, ewb, scalefactors, component1, component2, component3, averages, directions);

    float3 onef  = { 1.0f,1.0f,1.0f };
    float3 zerof = { 0.0f, 0.0f, 0.0f };

    for (i = 0; i < partition_count; i++) {
        float3 direc = directions[i];
        if (direc.x + direc.y + direc.z < 0.0f)
            directions[i] = zerof - direc;
    }

    for (i = 0; i < partition_count; i++) {
        lines[i].a = averages[i];
        if (dot(directions[i], directions[i]) == 0.0f)
            lines[i].b = normalize(onef);
        else
            lines[i].b = normalize(directions[i]);
    }


    for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
        if (error_weights[i] > FLOAT_n10) {
            int partition = pt->partition_of_texel[i];
            float3 point = { blk->work_data[4 * i + component1], blk->work_data[4 * i + component2], blk->work_data[4 * i + component3] };
            point = point * scalefactors[partition];
            line3 l = lines[partition];
            float param = dot(point - l.a, l.b);
            ei->weights[i] = param;
            if (param < lowparam[partition])
                lowparam[partition] = param;
            if (param > highparam[partition])
                highparam[partition] = param;
        } else {
            ei->weights[i] = -FLOAT_38;
        }
    }

    float3 lowvalues[4];
    float3 highvalues[4];


    for (i = 0; i < partition_count; i++) {
        float length = highparam[i] - lowparam[i];
        if (length < 0) {          // case for when none of the texels had any weight
            lowparam[i] = 0.0f;
            highparam[i] = FLOAT_n7;
        }

        // it is possible for a uniform-color partition to produce length=0; this
        // causes NaN-production and NaN-propagation later on. Set length to
        // a small value to avoid this problem.
        if (length < FLOAT_n7)
            length = FLOAT_n7;

        length_squared[i] = length * length;
        scale[i] = 1.0f / length;

        float3 ep0 = lines[i].a + lines[i].b * lowparam[i];
        float3 ep1 = lines[i].a + lines[i].b * highparam[i];

        ep0 = ep0 / scalefactors[i];
        ep1 = ep1 / scalefactors[i];


        lowvalues[i] = ep0;
        highvalues[i] = ep1;
    }


    for (i = 0; i < partition_count; i++) {
        float4 ep0f = { blk->red_min, blk->green_min, blk->blue_min, blk->alpha_min };
        float4 ep1f = { blk->red_max, blk->green_max, blk->blue_max, blk->alpha_max };
        ei->ep.endpt0[i] = ep0f;
        ei->ep.endpt1[i] = ep1f;

        float3 ep0 = lowvalues[i];
        float3 ep1 = highvalues[i];

        switch (component1) {
        case 0:
            ei->ep.endpt0[i].x = ep0.x;
            ei->ep.endpt1[i].x = ep1.x;
            break;
        case 1:
            ei->ep.endpt0[i].y = ep0.x;
            ei->ep.endpt1[i].y = ep1.x;
            break;
        case 2:
            ei->ep.endpt0[i].z = ep0.x;
            ei->ep.endpt1[i].z = ep1.x;
            break;
        case 3:
            ei->ep.endpt0[i].w = ep0.x;
            ei->ep.endpt1[i].w = ep1.x;
            break;
        }
        switch (component2) {
        case 0:
            ei->ep.endpt0[i].x = ep0.y;
            ei->ep.endpt1[i].x = ep1.y;
            break;
        case 1:
            ei->ep.endpt0[i].y = ep0.y;
            ei->ep.endpt1[i].y = ep1.y;
            break;
        case 2:
            ei->ep.endpt0[i].z = ep0.y;
            ei->ep.endpt1[i].z = ep1.y;
            break;
        case 3:
            ei->ep.endpt0[i].w = ep0.y;
            ei->ep.endpt1[i].w = ep1.y;
            break;
        }
        switch (component3) {
        case 0:
            ei->ep.endpt0[i].x = ep0.z;
            ei->ep.endpt1[i].x = ep1.z;
            break;
        case 1:
            ei->ep.endpt0[i].y = ep0.z;
            ei->ep.endpt1[i].y = ep1.z;
            break;
        case 2:
            ei->ep.endpt0[i].z = ep0.z;
            ei->ep.endpt1[i].z = ep1.z;
            break;
        case 3:
            ei->ep.endpt0[i].w = ep0.z;
            ei->ep.endpt1[i].w = ep1.z;
            break;
        }
    }

    for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
        int partition = pt->partition_of_texel[i];
        float idx = (ei->weights[i] - lowparam[partition]) * scale[partition];
        if (idx > 1.0f)
            idx = 1.0f;
        else if (!(idx > 0.0f))
            idx = 0.0f;

        ei->weights[i] = idx;
        ei->weight_error_scale[i] = length_squared[partition] * error_weights[i];
        if (astc_isnan(ei->weight_error_scale[i])) {
            astc_codec_internal_error("ERROR: compute_endpoints_and_ideal_weights_3_components: 2");
        }
    }

//#ifdef __OPENCL_VERSION__
//    if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//        printf("ei->ep.endpt0[0].x %3.3f\n", ei->ep.endpt0[0].x);
//
}

// Returns 1 when the block's alpha_min and alpha_max differ, 0 when they
// compare equal.
int imageblock_uses_alpha1(imageblock * blk) {
    DEBUG("imageblock_uses_alpha1");

    if (blk->alpha_min == blk->alpha_max)
        return 0;
    return 1;
}

// Single-plane entry point: picks the RGBA fit when imageblock_uses_alpha1()
// reports that the block uses alpha, and the three-channel fit on components
// 0, 1, 2 otherwise. All arguments are forwarded unchanged.
void compute_endpoints_and_ideal_weights_1_plane(
    __global partition_info * pt,
    imageblock * blk,
    error_weight_block * ewb,
    endpoints_and_weights * ei,
    __global ASTC_Encode *ASTCEncode) {
    DEBUG("compute_endpoints_and_ideal_weights_1_plane");

    if (!imageblock_uses_alpha1(blk)) {
        compute_endpoints_and_ideal_weights_3_components(pt, blk, ewb, ei, 0, 1, 2, ASTCEncode);
        return;
    }

    compute_endpoints_and_ideal_weights_rgba(pt, blk, ewb, ei, ASTCEncode);
}

// Accumulates weighted sine/cosine sums of the samples for every angular
// step and converts each sum's angle into an offset.
//
//   samplecount       - number of entries in samples / sample_weights.
//   samples           - sample values.
//   sample_weights    - weight of each sample.
//   max_angular_steps - number of angular steps to evaluate.
//   offsets           - out: one offset per angular step.
//   ASTCEncode        - supplies sin_table, cos_table and stepsizes.
void compute_angular_offsets(
    int samplecount,
    __global2 float *samples,
    __global2 float *sample_weights,
    int max_angular_steps,
    float *offsets,
    __global ASTC_Encode *ASTCEncode) {

    float sum_cos[ANGULAR_STEPS];
    float sum_sin[ANGULAR_STEPS];

    for (int step = 0; step < max_angular_steps; step++) {
        sum_cos[step] = 0;
        sum_sin[step] = 0;
    }

    // Accumulate the weighted sin/cos table rows selected by each sample.
    for (int s = 0; s < samplecount; s++) {
        float value = samples[s];
        float weight = sample_weights[s];

        // Adding the large constant and reading the bit pattern back gives
        // the table row in the low six bits.
        if32 bits;
        bits.f = (value * (SINCOS_STEPS - 1.0f)) + 12582912.0f;
        unsigned int row = bits.u & 0x3F;

        __global float *sin_row = ASTCEncode->sin_table[row];
        __global float *cos_row = ASTCEncode->cos_table[row];

        for (int step = 0; step < max_angular_steps; step++) {
            float c = cos_row[step];
            float sn = sin_row[step];

            sum_cos[step] += c * weight;
            sum_sin[step] += sn * weight;
        }
    }

    // Turn each accumulated (cos, sin) pair into an offset;
    // positive angle -> positive offset.
    for (int step = 0; step < max_angular_steps; step++) {
        float angle = (float)atan2(sum_sin[step], sum_cos[step]);
        offsets[step] = angle * (ASTCEncode->stepsizes[step] * (1.0f / (2.0f * (float)M_PI)));
    }
}

// For every angular step, quantizes the samples to that step's grid and
// records the range of grid indices used together with the resulting errors.
//
//   samplecount           - number of entries in samples / sample_weights.
//   samples               - sample values.
//   sample_weights        - weight of each sample.
//   max_angular_steps     - number of angular steps to evaluate.
//   offsets               - per-step offset (read only).
//   lowest_weight         - out: smallest grid index seen per step.
//   highest_weight        - out: largest grid index seen per step.
//   error                 - out: weighted squared rounding error per step.
//   cut_low_weight_error  - out: extra error when the lowest index is moved
//                           one step up.
//   cut_high_weight_error - out: extra error when the highest index is moved
//                           one step down.
//   ASTCEncode            - supplies stepsizes_sqr, used to scale all three
//                           error outputs at the end.
void compute_lowest_and_highest_weight(
    int samplecount,
    __global2 float *samples,
    __global2 float *sample_weights,
    int max_angular_steps,
    float *offsets,
    int8_t * lowest_weight,
    int8_t * highest_weight,
    float *error,
    float *cut_low_weight_error,
    float *cut_high_weight_error,
    __global ASTC_Encode *ASTCEncode) {
    int i;

    int sp;

    // Scratch accumulators indexed by the biased grid index. They are zeroed
    // once here and re-zeroed over the touched range after each pass.
    float error_from_forcing_weight_down[60];
    float error_from_forcing_weight_either_way[60];
    for (i = 0; i < 60; i++) {
        error_from_forcing_weight_down[i] = 0;
        error_from_forcing_weight_either_way[i] = 0;
    }

    for (sp = 0; sp < max_angular_steps; sp++) {
        // Indices carry a bias of 12 (removed when the results are stored).
        // NOTE(review): the initial minimum of 55 and the 60-entry scratch
        // arrays assume idxtab never yields a value above 55 - confirm.
        unsigned int minidx_bias12 = 55;
        unsigned int maxidx_bias12 = 0;

        float errval = 0.0f;

        float rcp_stepsize = angular_steppings[sp];
        float offset = offsets[sp];

        float scaled_offset = rcp_stepsize * offset;


        // Main loop, manually unrolled to handle two samples per iteration;
        // an odd trailing sample is handled separately below.
        for (i = 0; i < samplecount - 1; i += 2) {
            float wt1 = sample_weights[i];
            float wt2 = sample_weights[i + 1];
            if32 p1, p2;
            // Sample position expressed in grid steps, relative to the offset.
            float sval1 = (samples[i] * rcp_stepsize) - scaled_offset;
            float sval2 = (samples[i + 1] * rcp_stepsize) - scaled_offset;
            p1.f = sval1 + 12582912.0f;    // FP representation abuse to avoid floor() and float->int conversion
            p2.f = sval2 + 12582912.0f;    // FP representation abuse to avoid floor() and float->int conversion
            // Subtracting the constant again leaves the rounded grid position.
            float isval1 = p1.f - 12582912.0f;
            float isval2 = p2.f - 12582912.0f;
            // Signed rounding residual of each sample.
            float dif1 = sval1 - isval1;
            float dif2 = sval2 - isval2;

            errval += (dif1 * wt1) * dif1;
            errval += (dif2 * wt2) * dif2;

            // table lookups that really perform a minmax function.
            // The low 8 bits of the float's bit pattern select the entry.
            unsigned int idx1_bias12 = idxtab[p1.u & 0xFF];
            unsigned int idx2_bias12 = idxtab[p2.u & 0xFF];

            if (idx1_bias12 < minidx_bias12)
                minidx_bias12 = idx1_bias12;
            if (idx1_bias12 > maxidx_bias12)
                maxidx_bias12 = idx1_bias12;
            if (idx2_bias12 < minidx_bias12)
                minidx_bias12 = idx2_bias12;
            if (idx2_bias12 > maxidx_bias12)
                maxidx_bias12 = idx2_bias12;

            // Per-index totals: sum of weights and sum of weighted residuals.
            error_from_forcing_weight_either_way[idx1_bias12] += wt1;
            error_from_forcing_weight_down[idx1_bias12] += (dif1 * wt1);

            error_from_forcing_weight_either_way[idx2_bias12] += wt2;
            error_from_forcing_weight_down[idx2_bias12] += (dif2 * wt2);
        }

        // Odd sample count: process the last sample with the same steps.
        if (samplecount & 1) {
            i = samplecount - 1;
            float wt = sample_weights[i];
            if32 p;
            float sval = (samples[i] * rcp_stepsize) - scaled_offset;
            p.f = sval + 12582912.0f;    // FP representation abuse to avoid floor() and float->int conversion
            float isval = p.f - 12582912.0f;
            float dif = sval - isval;

            errval += (dif * wt) * dif;

            unsigned int idx_bias12 = idxtab[p.u & 0xFF];

            if (idx_bias12 < minidx_bias12)
                minidx_bias12 = idx_bias12;
            if (idx_bias12 > maxidx_bias12)
                maxidx_bias12 = idx_bias12;

            error_from_forcing_weight_either_way[idx_bias12] += wt;
            error_from_forcing_weight_down[idx_bias12] += dif * wt;
        }


        // Store the index range with the bias of 12 removed.
        lowest_weight[sp] = (int8_t)(minidx_bias12 - 12);
        highest_weight[sp] = (int8_t)(maxidx_bias12 - 12);
        error[sp] = errval;

        // the cut_(lowest/highest)_weight_error indicate the error that results from
        // forcing samples that should have had the (lowest/highest) weight value
        // one step (up/down).
        cut_low_weight_error[sp] = error_from_forcing_weight_either_way[minidx_bias12] - 2.0f * error_from_forcing_weight_down[minidx_bias12];
        cut_high_weight_error[sp] = error_from_forcing_weight_either_way[maxidx_bias12] + 2.0f * error_from_forcing_weight_down[maxidx_bias12];

        // clear out the error-from-forcing values we actually used in this pass
        // so that these are clean for the next pass.
        // The start index is rounded down to a multiple of 4 and four entries
        // are cleared per iteration.
        unsigned int ui;
        for (ui = minidx_bias12 & ~0x3; ui <= maxidx_bias12; ui += 4) {
            error_from_forcing_weight_either_way[ui] = 0;
            error_from_forcing_weight_down[ui] = 0;
            error_from_forcing_weight_either_way[ui + 1] = 0;
            error_from_forcing_weight_down[ui + 1] = 0;
            error_from_forcing_weight_either_way[ui + 2] = 0;
            error_from_forcing_weight_down[ui + 2] = 0;
            error_from_forcing_weight_either_way[ui + 3] = 0;
            error_from_forcing_weight_down[ui + 3] = 0;
        }
    }


    // The errors above were accumulated in units of grid steps; scale them
    // by the per-step factor from stepsizes_sqr.
    for (sp = 0; sp < max_angular_steps; sp++) {
        float errscale = ASTCEncode->stepsizes_sqr[sp];
        error[sp] *= errscale;
        cut_low_weight_error[sp] *= errscale;
        cut_high_weight_error[sp] *= errscale;
    }
}

// For every quantization level from 0 up to max_quantization_level, picks the
// angular step with the lowest error and derives the low/high weight values
// that step implies.
//
//   samplecount            - number of entries in samples / sample_weights.
//   samples                - sample values.
//   sample_weights         - weight of each sample.
//   max_quantization_level - highest level to produce results for; the tables
//                            below are indexed with this value + 1, so it
//                            must not exceed 11.
//   low_value / high_value - out: one entry per level 0..max_quantization_level.
//   ASTCEncode             - supplies max_angular_steps_needed_for_quant_level
//                            and stepsizes.
//
// If no angular step produced a candidate for some level, an error message is
// printed and angular step 0 is used for that level.
void compute_angular_endpoints_for_quantization_levels(
    int samplecount,
    __global2 float *samples,
    __global2 float *sample_weights,
    int max_quantization_level,
    float low_value[12],
    float high_value[12],
    __global ASTC_Encode *ASTCEncode) {
    DEBUG("compute_angular_endpoints_for_quantization_levels");

    int i;


    max_quantization_level++;    // Temporarily increase level - needs refinement

    int quantization_steps_for_level[13] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 33, 36 };
    int max_quantization_steps = quantization_steps_for_level[max_quantization_level];

    float offsets[ANGULAR_STEPS];

    int max_angular_steps = ASTCEncode->max_angular_steps_needed_for_quant_level[max_quantization_level];

    compute_angular_offsets(samplecount, samples, sample_weights, max_angular_steps, offsets, ASTCEncode);


    // the +4 offsets are to allow for vectorization within compute_lowest_and_highest_weight().
    int8_t lowest_weight[ANGULAR_STEPS + 4];
    int8_t highest_weight[ANGULAR_STEPS + 4];
    float error[ANGULAR_STEPS + 4];

    float cut_low_weight_error[ANGULAR_STEPS + 4];
    float cut_high_weight_error[ANGULAR_STEPS + 4];

    compute_lowest_and_highest_weight(samplecount, samples, sample_weights, max_angular_steps, offsets,
                                      lowest_weight, highest_weight, error, cut_low_weight_error, cut_high_weight_error,ASTCEncode);

    // for each quantization level, find the best error terms.
    // All three tables are indexed by the number of distinct weight values.
    float best_errors[40];
    int best_scale[40];
    uint8_t cut_low_weight[40];

    for (i = 0; i < (max_quantization_steps + 4); i++) {
        best_errors[i] = FLOAT_30;
        best_scale[i] = -1;    // Indicates no solution found
        cut_low_weight[i] = 0;
    }

    for (i = 0; i < max_angular_steps; i++) {
        // Number of distinct weight values this angular step spans.
        int samplecount1 = highest_weight[i] - lowest_weight[i] + 1;
        if (samplecount1 >= (max_quantization_steps + 4)) {
            continue;
        }
        // Keeps the [samplecount1 - 2] accesses below at index 0 or higher.
        if (samplecount1 < 2)
            samplecount1 = 2;

        if (best_errors[samplecount1] > error[i]) {
            best_errors[samplecount1] = error[i];
            best_scale[samplecount1] = i;
            cut_low_weight[samplecount1] = 0;
        }

        // Candidates with one or both extreme weight values removed.
        float error_cut_low = error[i] + cut_low_weight_error[i];
        float error_cut_high = error[i] + cut_high_weight_error[i];
        float error_cut_low_high = error[i] + cut_low_weight_error[i] + cut_high_weight_error[i];

        if (best_errors[samplecount1 - 1] > error_cut_low) {
            best_errors[samplecount1 - 1] = error_cut_low;
            best_scale[samplecount1 - 1] = i;
            cut_low_weight[samplecount1 - 1] = 1;
        }

        if (best_errors[samplecount1 - 1] > error_cut_high) {
            best_errors[samplecount1 - 1] = error_cut_high;
            best_scale[samplecount1 - 1] = i;
            cut_low_weight[samplecount1 - 1] = 0;
        }

        if (best_errors[samplecount1 - 2] > error_cut_low_high) {
            best_errors[samplecount1 - 2] = error_cut_low_high;
            best_scale[samplecount1 - 2] = i;
            cut_low_weight[samplecount1 - 2] = 1;
        }

    }

    // if we got a better error-value for a low samplecount than for a high one,
    // use the low-samplecount error value for the higher samplecount as well.
    for (i = 3; i <= max_quantization_steps; i++) {
        if (best_errors[i] > best_errors[i - 1]) {
            best_errors[i] = best_errors[i - 1];
            best_scale[i] = best_scale[i - 1];
            cut_low_weight[i] = cut_low_weight[i - 1];
        }
    }


    max_quantization_level--;    // Decrease level again (see corresponding ++, above)

    int ql_weights[12] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 33 };
    for (i = 0; i <= max_quantization_level; i++) {
        int q = ql_weights[i];
        int bsi = best_scale[q];

        // Did we find anything?
        if(bsi < 0) {
            printf("ERROR: Unable to find an encoding within the specified error limits. Please revise the error limit values and try again.\n");
            // Execution continues after the report, so the -1 marker must not
            // be used as an index into stepsizes / lowest_weight / offsets.
            // Fall back to angular step 0 to keep those reads in bounds.
            bsi = 0;
        }

        float stepsize = ASTCEncode->stepsizes[bsi];
        int lwi = lowest_weight[bsi] + cut_low_weight[q];
        int hwi = lwi + q - 1;
        float offset = offsets[bsi];

        low_value[i] = offset + lwi * stepsize;
        high_value[i] = offset + hwi * stepsize;
    }

}

// Interpolated value of one texel: the sum of the four weights referenced by
// it->texel_weights[texel_to_get][0..3], each multiplied by the matching
// factor in it->texel_weights_float. Terms 0+1 and 2+3 are summed pairwise
// before the two partial sums are added.
float compute_value_of_texel_flt(int texel_to_get,
                                 __global decimation_table *it,
                                 __global2 float *weights) {
    float first_pair =
        weights[it->texel_weights[texel_to_get][0]] * it->texel_weights_float[texel_to_get][0]
        + weights[it->texel_weights[texel_to_get][1]] * it->texel_weights_float[texel_to_get][1];

    float second_pair =
        weights[it->texel_weights[texel_to_get][2]] * it->texel_weights_float[texel_to_get][2]
        + weights[it->texel_weights[texel_to_get][3]] * it->texel_weights_float[texel_to_get][3];

    return first_pair + second_pair;
}

// Same computation as compute_value_of_texel_flt(), but the weight array is
// passed through a pointer without the __global2 qualifier: the sum of the
// four weights referenced by it->texel_weights[texel_to_get][0..3], each
// multiplied by the matching factor in it->texel_weights_float. Terms 0+1 and
// 2+3 are summed pairwise before the two partial sums are added.
float compute_value_of_texel_flt_localVar(
    int texel_to_get,
    __global decimation_table *it,
    float *weights) {

    float first_pair =
        weights[it->texel_weights[texel_to_get][0]] * it->texel_weights_float[texel_to_get][0]
        + weights[it->texel_weights[texel_to_get][1]] * it->texel_weights_float[texel_to_get][1];

    float second_pair =
        weights[it->texel_weights[texel_to_get][2]] * it->texel_weights_float[texel_to_get][2]
        + weights[it->texel_weights[texel_to_get][3]] * it->texel_weights_float[texel_to_get][3];

    return first_pair + second_pair;
}

// Squared difference between the interpolated value of a texel and its ideal
// weight in eai->weights, multiplied by that texel's weight_error_scale.
//inline
float compute_error_of_texel(
    endpoints_and_weights * eai,
    int texel_to_get,
    __global decimation_table * it,
    __global2 float *weights) {
    float interpolated = compute_value_of_texel_flt(texel_to_get, it, weights);
    float delta = interpolated - eai->weights[texel_to_get];
    return delta * delta * eai->weight_error_scale[texel_to_get];
}

// Total error of a weight set: the per-texel errors from
// compute_error_of_texel() summed over texels 0 .. it->num_texels - 1 in
// ascending order.
float compute_error_of_weight_set(
    endpoints_and_weights * eai,
    __global decimation_table * it,
    __global2 float *weights) {
    float total = 0.0f;
    int num_texels = it->num_texels;
    int texel = 0;

    while (texel < num_texels) {
        total += compute_error_of_texel(eai, texel, it, weights);
        texel++;
    }
    return total;
}

// Computes the change in error that results from adding each of two
// candidate perturbations to one stored weight.
//
// eai               : per-texel target weights and error scales.
// it                : decimation table; weight_num_texels / weight_texel /
//                     weights_flt list the texels touched by the weight and
//                     the contribution factor for each of them.
// infilled_weights  : current per-texel infilled weight values.
// weight_to_perturb : index of the stored weight being perturbed.
// perturbation1/2   : the two candidate deltas.
// res1/res2         : receive the error change for perturbation1/2.
//
// Two sums are accumulated over the affected texels:
//   quad_term = sum(factor^2 * error_scale)
//   lin_term  = sum((infilled - target) * factor * error_scale)
// and each result is
//   quad_term * (p / TEXEL_WEIGHT_SUM)^2 + lin_term * (2 * p / TEXEL_WEIGHT_SUM).
void compute_two_error_changes_from_perturbing_weight_infill(
    endpoints_and_weights * eai,
    __global decimation_table * it,
    float *infilled_weights, int weight_to_perturb,
    float perturbation1, float perturbation2, float *res1, float *res2) {
    float quad_term = 0.0f;
    float lin_term = 0.0f;
    int k = it->weight_num_texels[weight_to_perturb];

    // Walk the affected texels from last to first; the order is kept so the
    // float accumulation rounds the same way.
    while (k-- > 0) {
        uint8_t texel = it->weight_texel[weight_to_perturb][k];
        float factor = it->weights_flt[weight_to_perturb][k];

        float scaled_factor = eai->weight_error_scale[texel] * factor;
        float current = infilled_weights[texel];
        float target = eai->weights[texel];

        quad_term += factor * scaled_factor;
        lin_term += (current - target) * scaled_factor;
    }

    *res1 = quad_term * (perturbation1 * perturbation1 * (1.0f / (TEXEL_WEIGHT_SUM * TEXEL_WEIGHT_SUM)))
            + lin_term * (perturbation1 * (2.0f / TEXEL_WEIGHT_SUM));
    *res2 = quad_term * (perturbation2 * perturbation2 * (1.0f / (TEXEL_WEIGHT_SUM * TEXEL_WEIGHT_SUM)))
            + lin_term * (perturbation2 * (2.0f / TEXEL_WEIGHT_SUM));
}

// Derives, for every stored weight of decimation table `it`, an ideal
// (unquantized) value from the per-texel ideal weights held in `eai`.
//
// eai        : per-texel ideal weights (weights[]) and error scales
//              (weight_error_scale[]).
// it         : decimation table (num_texels, num_weights and the
//              weight -> texel contribution lists).
// weight_set : output, one ideal value per stored weight.
// weights    : output, one accumulated significance per stored weight
//              (sum of contribution factor * error scale, or the plain
//              error scale in the shortcut case).
//
// When the table has as many weights as texels the values are copied
// straight across. Otherwise a weighted average is taken as a first estimate
// and then refined; the refinement only changes `weight_set`, never `weights`.
void compute_ideal_weights_for_decimation_table(
    endpoints_and_weights * eai,
    __global decimation_table * it,
    __global2 float *weight_set,
    __global2 float *weights) {
    DEBUG("compute_ideal_weights_for_decimation_table");

    int i, j, k;
    int texels_per_block = it->num_texels;
    int weight_count = it->num_weights;

    // perform a shortcut in the case of a complete decimation table
    // (one weight per texel): each weight takes the value and error scale of
    // the first texel listed for it.
    if (texels_per_block == weight_count) {

        for (i = 0; i < it->num_texels; i++) {
            int texel = it->weight_texel[i][0];
            weight_set[i] = eai->weights[texel];
            weights[i] = eai->weight_error_scale[texel];
        }
        return;
    }


    // if the shortcut is not available, we will instead compute a simple estimate
    // and perform rounds of refinement on that estimate.
    // NOTE(review): stepsizes[] below lists three step sizes, but the
    // refinement loop only runs two rounds (j < 2), so 0.0625f is never used.
    // Confirm which is intended before changing either; altering the round
    // count changes encoder output.

    // float initial_weight_set[MAX_WEIGHTS_PER_BLOCK];
    float infilled_weights[MAX_TEXELS_PER_BLOCK];

    // compute an initial average for each weight.
    for (i = 0; i < weight_count; i++) {
        int texel_count = it->weight_num_texels[i];

        float weight_weight = FLOAT_n10;    // to avoid 0/0 later on
        float initial_weight = 0.0f;
        for (j = 0; j < texel_count; j++) {
            int texel = it->weight_texel[i][j];
            float weight = it->weights_flt[i][j];
            // contribution factor of this texel, scaled by the texel's error scale
            float contrib_weight = weight * eai->weight_error_scale[texel];
            weight_weight += contrib_weight;
            initial_weight += eai->weights[texel] * contrib_weight;
        }

        weights[i] = weight_weight;
        weight_set[i] = initial_weight / weight_weight;    // this is the 0/0 that is to be avoided.
    }


    // infill: per-texel weight values produced by the initial estimate.
    for (i = 0; i < texels_per_block; i++) {
        infilled_weights[i] = compute_value_of_texel_flt(i, it, weight_set);
    }

    float stepsizes[3] = { 0.25f, 0.125f, 0.0625f };

    // Refinement: only stepsizes[0] and stepsizes[1] are used.
    for (j = 0; j < 2; j++) {
        float stepsize = stepsizes[j];

        for (i = 0; i < weight_count; i++) {
            float weight_val = weight_set[i];
            float error_change_up, error_change_down;
            // error change for moving weight i by +stepsize and by -stepsize
            compute_two_error_changes_from_perturbing_weight_infill(eai, it, infilled_weights, i, stepsize, -stepsize, &error_change_up, &error_change_down);

            /*
                assume that the error-change function behaves like a quadratic function in the interval examined,
                with "error_change_up" and "error_change_down" defining the function at the endpoints
                of the interval. Then, find the position where the function's derivative is zero.

                The "fabs(b) >= a" check tests several conditions in one:
                    if a is negative, then the 2nd derivative of the function is negative;
                    in this case, f'(x)=0 will maximize error.
                If fabs(b) > fabs(a), then f'(x)=0 will lie outside the interval altogether.
                If a and b are both 0, then set step to 0;
                    otherwise, we end up computing 0/0, which produces a lethal NaN.
                We can get an a=b=0 situation if an error weight is 0 in the wrong place.
            */

            // step is expressed in units of stepsize and ends up in [-1, 1]
            // on every branch that clamps; it is scaled by stepsize below.
            float step;
            float a = (error_change_up + error_change_down) * 2.0f;
            float b = error_change_down - error_change_up;
            if (fabs(b) >= a) {
                if (a <= 0.0f) {
                    // non-positive curvature: move a full step towards
                    // whichever side has the lower error change, or stay put on a tie.
                    if (error_change_up < error_change_down)
                        step = 1;
                    else if (error_change_up > error_change_down)
                        step = -1;

                    else
                        step = 0;

                } else {
                    // positive curvature but the stationary point is at or
                    // beyond the interval edge: clamp to [-1, 1].
                    if (a < FLOAT_n10)
                        a = FLOAT_n10;
                    step = b / a;
                    if (step < -1.0f)
                        step = -1.0f;
                    else if (step > 1.0f)
                        step = 1.0f;
                }
            } else
                step = b / a;   // stationary point lies inside the interval


            step *= stepsize;
            float new_weight_val = weight_val + step;

            // update the weight
            weight_set[i] = new_weight_val;
            // update the infilled-weights
            int num_weights = it->weight_num_texels[i];
            float perturbation = (new_weight_val - weight_val) * (1.0f / TEXEL_WEIGHT_SUM);

            // propagate the change to every texel this weight contributes to,
            // so later weights in this round see the updated infill.
            for (k = num_weights - 1; k >= 0; k--) {
                uint8_t weight_texel = it->weight_texel[i][k];
                float weight_weight  = it->weights_flt[i][k];
                infilled_weights[weight_texel] += perturbation * weight_weight;
            }

        }

    }


    return;
}

// Fills low_value[] / high_value[] for every single-plane block mode whose
// percentile does not exceed mode_cutoff.
//
// mode_cutoff                 : decimation modes and block modes with a
//                               percentile above this value are skipped.
// decimated_quantized_weights,
// decimated_weights           : per-decimation-mode weight arrays, laid out
//                               with a stride of MAX_WEIGHTS_PER_BLOCK floats.
// low_value, high_value       : outputs, indexed by block mode. Entries for
//                               skipped block modes are left untouched.
// ASTCEncode                  : encoder state; bsd holds the decimation-mode
//                               and block-mode tables.
//
// Pass 1 computes, per usable decimation mode, a row of low/high values via
// compute_angular_endpoints_for_quantization_levels(). Pass 2 copies the
// entry selected by each block mode's (decimation_mode, quantization_mode).
//
// NOTE(review): rows of the scratch tables belonging to decimation modes
// skipped in pass 1 stay uninitialised; pass 2 presumably never selects
// them - confirm against how the bsd tables are built.
void compute_angular_endpoints_1plane(float mode_cutoff,
                                      __global2 float *decimated_quantized_weights,
                                      __global2 float *decimated_weights,
                                      float low_value[MAX_WEIGHT_MODES],
                                      float high_value[MAX_WEIGHT_MODES],
                                      __global ASTC_Encode *ASTCEncode) {
    DEBUG("compute_angular_endpoints_1plane");

    float low_values[MAX_DECIMATION_MODES][12];
    float high_values[MAX_DECIMATION_MODES][12];
    int dm;
    int bm;

    // Pass 1: one row of candidate bounds per usable decimation mode.
    for (dm = 0; dm < MAX_DECIMATION_MODES; dm++) {
        int samplecount     = ASTCEncode->bsd.decimation_mode_samples[dm];
        int quant_mode      = ASTCEncode->bsd.decimation_mode_maxprec_1plane[dm];
        float percentile    = ASTCEncode->bsd.decimation_mode_percentile[dm];
        int permit_encode   = ASTCEncode->bsd.permit_encode[dm];

        // Written as !(x > cutoff) rather than (x <= cutoff) so a NaN
        // percentile is treated exactly as before.
        if (permit_encode != 0 && samplecount >= 1 && quant_mode >= 0 && !(percentile > mode_cutoff)) {
            compute_angular_endpoints_for_quantization_levels(samplecount,
                    decimated_quantized_weights + dm * MAX_WEIGHTS_PER_BLOCK,
                    decimated_weights + dm * MAX_WEIGHTS_PER_BLOCK, quant_mode,
                    low_values[dm], high_values[dm],ASTCEncode);
        }
    }

    // Pass 2: pick the entry for each eligible single-plane block mode.
    for (bm = 0; bm < MAX_WEIGHT_MODES; bm++) {
        if (ASTCEncode->bsd.block_modes[bm].is_dual_plane == 0 && !(ASTCEncode->bsd.block_modes[bm].percentile > mode_cutoff)) {
            int quant_mode = ASTCEncode->bsd.block_modes[bm].quantization_mode;
            int decim_mode = ASTCEncode->bsd.block_modes[bm].decimation_mode;

            low_value[bm] = low_values[decim_mode][quant_mode];
            high_value[bm] = high_values[decim_mode][quant_mode];
        }
    }

}

// Quantizes an ideal weight set for decimation table `it` and, for
// decimated tables, greedily nudges individual quantized weights up or down
// one quantization step whenever that lowers the error.
//
// eai                  : per-texel ideal weights and error scales.
// it                   : decimation table.
// low_bound/high_bound : range the weights are rescaled from; replaced by
//                        0..1 when the span is not greater than 0.5.
// weight_set_in        : input ideal weights (it->num_weights entries).
// weight_set_out       : output, the unquantized float value of each chosen
//                        quantized weight, mapped back into the
//                        low_bound..high_bound range.
// quantized_weight_set : output, the chosen quantized weight per entry.
// quantization_level   : index into quant_and_xfer_tables and
//                        quantization_step_table.
void compute_ideal_quantized_weights_for_decimation_table(
    endpoints_and_weights * eai,
    __global decimation_table * it,
    float low_bound, float high_bound,
    __global2 float *weight_set_in,
    __global2 float *weight_set_out,
    __global2 uint8_t * quantized_weight_set,
    int quantization_level) {
    DEBUG("compute_ideal_quantized_weights_for_decimation_table");
    int i;
    int weight_count = it->num_weights;
    int texels_per_block = it->num_texels;

    __constant quantization_and_transfer_table *qat = &(quant_and_xfer_tables[quantization_level]);

    // quantize the weight set using both the specified low/high bounds and the
    // standard 0..1 weight bounds.

    /*
       If the supplied span is not greater than 0.5 (this also catches NaN
       bounds, because the comparison is negated), fall back to the full
       0..1 range. The original authors flagged this fallback as something
       to examine at some point.
    */

    if (!((high_bound - low_bound) > 0.5f)) {
        low_bound = 0.0f;
        high_bound = 1.0f;
    }

    float rscale = high_bound - low_bound;
    float scale = 1.0f / rscale;

    // rescale the weights so that
    // low_bound -> 0
    // high_bound -> 1
    // OK: first, subtract low_bound, then divide by (high_bound - low_bound)

    for (i = 0; i < weight_count; i++)
        weight_set_out[i] = (weight_set_in[i] - low_bound) * scale;


    // a weight is only worth perturbing if quantization moved it by more than
    // this fraction (0.333) of the quantization step.
    float quantization_cutoff = quantization_step_table[quantization_level] * 0.333f;


    int is_perturbable[MAX_WEIGHTS_PER_BLOCK];
    int perturbable_count = 0;

    // quantize the weight set
    for (i = 0; i < weight_count; i++) {
        // clamp to 0..1
        // NOTE(review): a NaN weight passes both comparisons unchanged and
        // would then reach the int conversion / table lookup below - confirm
        // callers can never hand in NaN.
        float ix0 = weight_set_out[i];
        if (ix0 < 0.0f)
            ix0 = 0.0f;
        if (ix0 > 1.0f)
            ix0 = 1.0f;
        float ix = ix0;

        // scale to 0..1024 and round to nearest to index the lookup table
        ix *= 1024.0f;
        int ix2 = (int)floor(ix + 0.5f);
        int weight = qat->closest_quantized_weight[ix2];

        ix = qat->unquantized_value_flt[weight];
        weight_set_out[i] = ix;
        quantized_weight_set[i] = (uint8_t)weight;

        // test whether the error of the weight is greater than 1/3 of the weight spacing;
        // if it is not, then it is flagged as "not perturbable". This causes a
        // quality loss of about 0.002 dB, which is totally worth the speedup we're getting.
        is_perturbable[i] = 0;
        if (fabs(ix - ix0) > quantization_cutoff) {
            is_perturbable[i] = 1;
            perturbable_count++;
        }
    }


    // if the decimation table is complete, the quantization above was all we needed to do,
    // so we can early-out.
    if (it->num_weights == it->num_texels) {
        // invert the weight-scaling that was done initially
        // 0 -> low_bound
        // 1 -> high_bound

        rscale = high_bound - low_bound;
        for (i = 0; i < weight_count; i++)
            weight_set_out[i] = (weight_set_out[i] * rscale) + low_bound;

        return;
    }


    int weights_tested = 0;

    // if no weights are flagged as perturbable, don't try to perturb them.
    // if only one weight is flagged as perturbable, perturbation is also pointless.
    if (perturbable_count > 1) {
        // local copy of the per-texel targets, rescaled into the same 0..1
        // space as weight_set_out; error scales are copied unchanged.
        endpoints_and_weights eaix;
        for (i = 0; i < texels_per_block; i++) {
            eaix.weights[i] = (eai->weights[i] - low_bound) * scale;
            eaix.weight_error_scale[i] = eai->weight_error_scale[i];
        }

        // per-texel weights produced by the current quantized set
        float infilled_weights[MAX_TEXELS_PER_BLOCK];
        for (i = 0; i < texels_per_block; i++)
            infilled_weights[i] = compute_value_of_texel_flt(i, it, weight_set_out);

        int weight_to_perturb = 0;
        int weights_since_last_perturbation = 0;
        int num_weights = it->num_weights;

        // Round-robin over the weights. Stop once num_weights consecutive
        // weights were left unchanged, or after 4 * num_weights tests.
        while (weights_since_last_perturbation < num_weights && weights_tested < num_weights * 4) {
            int do_quant_mod = 0;
            if (is_perturbable[weight_to_perturb]) {

                // current quantized value, its neighbours one step up and
                // down, and the float value of each
                int weight_val = quantized_weight_set[weight_to_perturb];
                int weight_next_up = qat->next_quantized_value[weight_val];
                int weight_next_down = qat->prev_quantized_value[weight_val];
                float flt_weight_val = qat->unquantized_value_flt[weight_val];
                float flt_weight_next_up = qat->unquantized_value_flt[weight_next_up];
                float flt_weight_next_down = qat->unquantized_value_flt[weight_next_down];


                do_quant_mod = 0;

                float error_change_up, error_change_down;

                // compute the error change from perturbing the weight either up or down.
                compute_two_error_changes_from_perturbing_weight_infill(&eaix,
                        it,
                        infilled_weights,
                        weight_to_perturb,
                        (flt_weight_next_up - flt_weight_val), (flt_weight_next_down - flt_weight_val), &error_change_up, &error_change_down);

                // accept the first direction that reduces the error; "up" is
                // tried before "down". A neighbour equal to the current value
                // is never taken.
                int new_weight_val = 0;
                float flt_new_weight_val = 0;
                if (weight_val != weight_next_up && error_change_up < 0.0f) {
                    do_quant_mod = 1;
                    new_weight_val = weight_next_up;
                    flt_new_weight_val = flt_weight_next_up;
                } else if (weight_val != weight_next_down && error_change_down < 0.0f) {
                    do_quant_mod = 1;
                    new_weight_val = weight_next_down;
                    flt_new_weight_val = flt_weight_next_down;
                }


                if (do_quant_mod) {

                    // update the weight.
                    weight_set_out[weight_to_perturb] = flt_new_weight_val;
                    quantized_weight_set[weight_to_perturb] = (uint8_t)new_weight_val;

                    // update the infilled-weights
                    int num_weights1 = it->weight_num_texels[weight_to_perturb];
                    float perturbation = (flt_new_weight_val - flt_weight_val) * (1.0f / TEXEL_WEIGHT_SUM);
                    for (i = num_weights1 - 1; i >= 0; i--) {
                        uint8_t weight_texel = it->weight_texel[weight_to_perturb][i];
                        float weights = it->weights_flt[weight_to_perturb][i];
                        infilled_weights[weight_texel] += perturbation * weights;
                    }

                }
            }

            if (do_quant_mod)
                weights_since_last_perturbation = 0;
            else
                weights_since_last_perturbation++;

            // advance to the next weight, wrapping around at the end
            weight_to_perturb++;
            if (weight_to_perturb >= num_weights)
                weight_to_perturb -= num_weights;

            weights_tested++;
        }
    }

    // invert the weight-scaling that was done initially
    // 0 -> low_bound
    // 1 -> high_bound


    for (i = 0; i < weight_count; i++)
        weight_set_out[i] = (weight_set_out[i] * rscale) + low_bound;
}

// Builds the result of "inverting" the 4x4 matrix p from 3-component
// cross/dot products of its rows, scaled by the reciprocal of `det`.
//
// No check is made for det == 0: the division then yields inf/NaN
// components. recompute_ideal_colors() in this file detects such a failure
// by NaN-testing the transformed vector.
mat4t invertMat4t(mat4t p) {
    // cross products of rows 2 and 3, one per dropped column
    float3 low_c0 = cross(p.v[2].yzw, p.v[3].yzw);
    float3 low_c1 = cross(p.v[2].xzw, p.v[3].xzw);
    float3 low_c2 = cross(p.v[2].xyw, p.v[3].xyw);
    float3 low_c3 = cross(p.v[2].xyz, p.v[3].xyz);

    // cross products of rows 0 and 1, one per dropped column
    float3 top_c0 = cross(p.v[0].yzw, p.v[1].yzw);
    float3 top_c1 = cross(p.v[0].xzw, p.v[1].xzw);
    float3 top_c2 = cross(p.v[0].xyw, p.v[1].xyw);
    float3 top_c3 = cross(p.v[0].xyz, p.v[1].xyz);

    // signed terms built against row 1; dotted with row 0 they give det
    float4 lead = { dot(low_c0, p.v[1].yzw),
                    -dot(low_c1, p.v[1].xzw),
                    dot(low_c2, p.v[1].xyw),
                    -dot(low_c3, p.v[1].xyz)
                  };

    float inv_det = 1.0f / dot(p.v[0], lead);

    // unscaled output rows; signs alternate from one row to the next
    float4 out0 = { lead.x, -dot(low_c0, p.v[0].yzw), dot(top_c0, p.v[3].yzw), -dot(top_c0, p.v[2].yzw) };
    float4 out1 = { lead.y, dot(low_c1, p.v[0].xzw), -dot(top_c1, p.v[3].xzw), dot(top_c1, p.v[2].xzw) };
    float4 out2 = { lead.z, -dot(low_c2, p.v[0].xyw), dot(top_c2, p.v[3].xyw), -dot(top_c2, p.v[2].xyw) };
    float4 out3 = { lead.w, dot(low_c3, p.v[0].xyz), -dot(top_c3, p.v[3].xyz), dot(top_c3, p.v[2].xyz) };

    mat4t result;
    result.v[0] = out0 * inv_det;
    result.v[1] = out1 * inv_det;
    result.v[2] = out2 * inv_det;
    result.v[3] = out3 * inv_det;

    return result;
}

// Matrix * vector: component k of the result is dot(p.v[k], q).
float4 transform(mat4t p, float4 q) {
    float c0 = dot(p.v[0], q);
    float c1 = dot(p.v[1], q);
    float c2 = dot(p.v[2], q);
    float c3 = dot(p.v[3], q);
    float4 product = { c0, c1, c2, c3 };
    return product;
}

// Determinant of a 2x2 matrix.
// The swizzled product pairs v[0].x with v[1].y and v[0].y with v[1].x
// (assuming float2 * float2 multiplies component-wise); the result is the
// first product minus the second.
float determinant(mat2t p) {
    float2 products = p.v[0].xy * p.v[1].yx;
    float main_diag = products.x;
    float anti_diag = products.y;
    return main_diag - anti_diag;
}

//static inline
// Sum of the squares of all four entries of a 2x2 matrix,
// accumulated in the order v[0].x, v[0].y, v[1].x, v[1].y.
float mat_square_sum(mat2t p) {
    float m00 = p.v[0].x;
    float m01 = p.v[0].y;
    float m10 = p.v[1].x;
    float m11 = p.v[1].y;

    // same left-to-right association as a single a*a + b*b + c*c + d*d
    float total = m00 * m00 + m01 * m01;
    total += m10 * m10;
    total += m11 * m11;
    return total;
}

// Inverse of a 2x2 matrix: the rows { v[1].y, -v[0].y } and
// { -v[1].x, v[0].x } are each scaled by 1 / determinant(p).
// No singularity check is performed; a zero determinant produces
// inf/NaN components.
mat2t invertMat2t(mat2t p) {
    float2 adj_row0 = { p.v[1].y, -p.v[0].y };
    float2 adj_row1 = { -p.v[1].x, p.v[0].x };
    float inv_det = 1.0f / determinant(p);

    mat2t inverse;
    inverse.v[0] = adj_row0 * inv_det;
    inverse.v[1] = adj_row1 * inv_det;
    return inverse;
}

void recompute_ideal_colors(int weight_quantization_mode,
                            endpoints * ep,                     // contains the endpoints we wish to update
                            float4 * rgbs_vectors,              // used to return RGBS-vectors. (endpoint mode #6)
                            float4 * rgbo_vectors,              // used to return RGBO-vectors. (endpoint mode #7)
                            float2 * lum_vectors,               // used to return luminance-vectors.
                            __global2 uint8_t * weight_set8,     // the current set of weight values
                            __global2 uint8_t * plane2_weight_set8,    // 0 if plane 2 is not actually used.
                            int plane2_color_component,    // color component for 2nd plane of weights; -1 if the 2nd plane of weights is not present
                            __global partition_info * pi,
                            __global decimation_table * it,
                            imageblock * blk,    // picture-block containing the actual data.
                            error_weight_block * ewb,
                            __global ASTC_Encode *ASTCEncode
                           ) {
    DEBUG("recompute_ideal_colors");

    int i, j;
    __constant quantization_and_transfer_table *qat = &(quant_and_xfer_tables[weight_quantization_mode]);

    float weight_set[MAX_WEIGHTS_PER_BLOCK];
    float plane2_weight_set[MAX_WEIGHTS_PER_BLOCK];

    for (i = 0; i < it->num_weights; i++) {

//#ifdef __OPENCL_VERSION__
//         if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//             printf("weight_set8[%d] = %d\n", i, weight_set8[i]);

        weight_set[i] = qat->unquantized_value_flt[weight_set8[i]];
    }
    if (plane2_weight_set8) {

//#ifdef __OPENCL_VERSION__
//         if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//             printf("plane2_weight_set8\n");

        for (i = 0; i < it->num_weights; i++) {
//#ifdef __OPENCL_VERSION__
//             if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//                 printf("plane2_weight_set8[%d] = %d\n", i, plane2_weight_set8[i]);
            plane2_weight_set[i] = qat->unquantized_value_flt[plane2_weight_set8[i]];
        }
    }

    int partition_count = pi->partition_count;

//#ifdef __OPENCL_VERSION__
//     if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//         printf("Low  <%g %g %g %g>\n", ep->endpt0[0].x, ep->endpt0[0].y, ep->endpt0[0].z, ep->endpt0[0].w);


    mat2t pmat1_red[4], pmat1_green[4], pmat1_blue[4], pmat1_alpha[4], pmat1_lum[4], pmat1_scale[4];    // matrices for plane of weights 1
    mat2t pmat2_red[4], pmat2_green[4], pmat2_blue[4], pmat2_alpha[4];    // matrices for plane of weights 2
    float2 red_vec[4];
    float2 green_vec[4];
    float2 blue_vec[4];
    float2 alpha_vec[4];
    float2 lum_vec[4];
    float2 scale_vec[4];
    float2 zerof = {0.0f, 0.0f};

    for (i = 0; i < partition_count; i++) {
        for (j = 0; j < 2; j++) {
            pmat1_red[i].v[j]   = zerof;
            pmat2_red[i].v[j]   = zerof;
            pmat1_green[i].v[j] = zerof;
            pmat2_green[i].v[j] = zerof;
            pmat1_blue[i].v[j]  = zerof;
            pmat2_blue[i].v[j]  = zerof;
            pmat1_alpha[i].v[j] = zerof;
            pmat2_alpha[i].v[j] = zerof;
            pmat1_lum[i].v[j]   = zerof;
            pmat1_scale[i].v[j] = zerof;
        }
        red_vec[i]   = zerof;
        green_vec[i] = zerof;
        blue_vec[i]  = zerof;
        alpha_vec[i] = zerof;
        lum_vec[i]   = zerof;
        scale_vec[i] = zerof;
    }


    float wmin1[4], wmax1[4];
    float wmin2[4], wmax2[4];
    float red_weight_sum[4];
    float green_weight_sum[4];
    float blue_weight_sum[4];
    float alpha_weight_sum[4];
    float lum_weight_sum[4];
    float scale_weight_sum[4];

    float red_weight_weight_sum[4];
    float green_weight_weight_sum[4];
    float blue_weight_weight_sum[4];

    float psum[4];                // sum of (weight * qweight^2) across (red,green,blue)
    float qsum[4];                // sum of (weight * qweight * texelval) across (red,green,blue)


    for (i = 0; i < partition_count; i++) {
        wmin1[i] = 1.0f;
        wmax1[i] = 0.0f;
        wmin2[i] = 1.0f;
        wmax2[i] = 0.0f;
        red_weight_sum[i] = FLOAT_n17;
        green_weight_sum[i] = FLOAT_n17;
        blue_weight_sum[i] = FLOAT_n17;
        alpha_weight_sum[i] = FLOAT_n17;

        lum_weight_sum[i] = FLOAT_n17;
        scale_weight_sum[i] = FLOAT_n17;

        red_weight_weight_sum[i] = FLOAT_n17;
        green_weight_weight_sum[i] = FLOAT_n17;
        blue_weight_weight_sum[i] = FLOAT_n17;

        psum[i] = FLOAT_n17;
        qsum[i] = FLOAT_n17;
    }


    // for each partition, compute the direction that an RGB-scale color endpoint pair would have.
    float3 rgb_sum[4];
    float3 rgb_weight_sum[4];
    float3 scale_directions[4];
    float scale_min[4];
    float scale_max[4];
    float lum_min[4];
    float lum_max[4];
    float3 one17f = { FLOAT_n17, FLOAT_n17, FLOAT_n17 };

    for (i = 0; i < partition_count; i++) {
        rgb_sum[i]        = one17f;
        rgb_weight_sum[i] = one17f;
    }


    for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
        float3 rgb = { blk->work_data[4 * i], blk->work_data[4 * i + 1], blk->work_data[4 * i + 2]};
        float3 rgb_weight = {ewb->texel_weight_r[i],
                             ewb->texel_weight_g[i],
                             ewb->texel_weight_b[i]
                            };

        int part = pi->partition_of_texel[i];
        rgb_sum[part] = rgb_sum[part] + (rgb * rgb_weight);
        rgb_weight_sum[part] = rgb_weight_sum[part] + rgb_weight;
    }

    for (i = 0; i < partition_count; i++) {
        scale_directions[i] = normalize(rgb_sum[i] / rgb_weight_sum[i]);
        scale_max[i] = 0.0f;
        scale_min[i] = FLOAT_10;
        lum_max[i] = 0.0f;
        lum_min[i] = FLOAT_10;
    }

    for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
        float r = blk->work_data[4 * i];
        float g = blk->work_data[4 * i + 1];
        float b = blk->work_data[4 * i + 2];
        float a = blk->work_data[4 * i + 3];

        int part = pi->partition_of_texel[i];
        float idx0 = it ? compute_value_of_texel_flt_localVar(i, it, weight_set) : weight_set[i];

//#ifdef __OPENCL_VERSION__
//         if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//             printf("i= %d om_idx0 %3.3f weight_set[i] %3.3f\n",i, idx0, weight_set[i]);


        float om_idx0 = 1.0f - idx0;

        if (idx0 > wmax1[part])
            wmax1[part] = idx0;
        if (idx0 < wmin1[part])
            wmin1[part] = idx0;

        float red_weight = ewb->texel_weight_r[i];
        float green_weight = ewb->texel_weight_g[i];
        float blue_weight = ewb->texel_weight_b[i];
        float alpha_weight = ewb->texel_weight_a[i];

        float lum_weight = (red_weight + green_weight + blue_weight);
        float scale_weight = lum_weight;

        float lum = (r * red_weight + g * green_weight + b * blue_weight) / lum_weight;
        float3 scale_direction = scale_directions[part];
        float3 rgbf = {r, g, b};
        float scale = dot(scale_direction, rgbf);
        if (lum < lum_min[part])
            lum_min[part] = scale;
        if (lum > lum_max[part])
            lum_max[part] = scale;
        if (scale < scale_min[part])
            scale_min[part] = scale;
        if (scale > scale_max[part])
            scale_max[part] = scale;


        red_weight_sum[part] += red_weight;
        green_weight_sum[part] += green_weight;
        blue_weight_sum[part] += blue_weight;
        alpha_weight_sum[part] += alpha_weight;
        lum_weight_sum[part] += lum_weight;
        scale_weight_sum[part] += scale_weight;


        pmat1_red[part].v[0].x += om_idx0 * om_idx0 * red_weight;
        pmat1_red[part].v[0].y += idx0 * om_idx0 * red_weight;
        pmat1_red[part].v[1].x += idx0 * om_idx0 * red_weight;
        pmat1_red[part].v[1].y += idx0 * idx0 * red_weight;

        pmat1_green[part].v[0].x += om_idx0 * om_idx0 * green_weight;
        pmat1_green[part].v[0].y += idx0 * om_idx0 * green_weight;
        pmat1_green[part].v[1].x += idx0 * om_idx0 * green_weight;
        pmat1_green[part].v[1].y += idx0 * idx0 * green_weight;

        pmat1_blue[part].v[0].x += om_idx0 * om_idx0 * blue_weight;
        pmat1_blue[part].v[0].y += idx0 * om_idx0 * blue_weight;
        pmat1_blue[part].v[1].x += idx0 * om_idx0 * blue_weight;
        pmat1_blue[part].v[1].y += idx0 * idx0 * blue_weight;

        pmat1_alpha[part].v[0].x += om_idx0 * om_idx0 * alpha_weight;
        pmat1_alpha[part].v[0].y += idx0 * om_idx0 * alpha_weight;
        pmat1_alpha[part].v[1].x += idx0 * om_idx0 * alpha_weight;
        pmat1_alpha[part].v[1].y += idx0 * idx0 * alpha_weight;

        pmat1_lum[part].v[0].x += om_idx0 * om_idx0 * lum_weight;
        pmat1_lum[part].v[0].y += idx0 * om_idx0 * lum_weight;
        pmat1_lum[part].v[1].x += idx0 * om_idx0 * lum_weight;
        pmat1_lum[part].v[1].y += idx0 * idx0 * lum_weight;

        pmat1_scale[part].v[0].x += om_idx0 * om_idx0 * scale_weight;
        pmat1_scale[part].v[0].y += idx0 * om_idx0 * scale_weight;
        pmat1_scale[part].v[1].x += idx0 * om_idx0 * scale_weight;
        pmat1_scale[part].v[1].y += idx0 * idx0 * scale_weight;

        float idx1 = 0.0f, om_idx1 = 0.0f;
        if (plane2_weight_set8) {
            idx1 = it ? compute_value_of_texel_flt_localVar(i, it, plane2_weight_set) : plane2_weight_set[i];
            om_idx1 = 1.0f - idx1;
            if (idx1 > wmax2[part])
                wmax2[part] = idx1;
            if (idx1 < wmin2[part])
                wmin2[part] = idx1;

            pmat2_red[part].v[0].x += om_idx1 * om_idx1 * red_weight;
            pmat2_red[part].v[0].y += idx1 * om_idx1 * red_weight;
            pmat2_red[part].v[1].x += idx1 * om_idx1 * red_weight;
            pmat2_red[part].v[1].y += idx1 * idx1 * red_weight;

            pmat2_green[part].v[0].x += om_idx1 * om_idx1 * green_weight;
            pmat2_green[part].v[0].y += idx1 * om_idx1 * green_weight;
            pmat2_green[part].v[1].x += idx1 * om_idx1 * green_weight;
            pmat2_green[part].v[1].y += idx1 * idx1 * green_weight;

            pmat2_blue[part].v[0].x += om_idx1 * om_idx1 * blue_weight;
            pmat2_blue[part].v[0].y += idx1 * om_idx1 * blue_weight;
            pmat2_blue[part].v[1].x += idx1 * om_idx1 * blue_weight;
            pmat2_blue[part].v[1].y += idx1 * idx1 * blue_weight;

            pmat2_alpha[part].v[0].x += om_idx1 * om_idx1 * alpha_weight;
            pmat2_alpha[part].v[0].y += idx1 * om_idx1 * alpha_weight;
            pmat2_alpha[part].v[1].x += idx1 * om_idx1 * alpha_weight;
            pmat2_alpha[part].v[1].y += idx1 * idx1 * alpha_weight;
        }

        float red_idx = (plane2_color_component == 0) ? idx1 : idx0;
        float green_idx = (plane2_color_component == 1) ? idx1 : idx0;
        float blue_idx = (plane2_color_component == 2) ? idx1 : idx0;
        float alpha_idx = (plane2_color_component == 3) ? idx1 : idx0;


        red_vec[part].x += (red_weight * r) * (1.0f - red_idx);
        green_vec[part].x += (green_weight * g) * (1.0f - green_idx);
        blue_vec[part].x += (blue_weight * b) * (1.0f - blue_idx);
        alpha_vec[part].x += (alpha_weight * a) * (1.0f - alpha_idx);
        lum_vec[part].x += (lum_weight * lum) * om_idx0;
        scale_vec[part].x += (scale_weight * scale) * om_idx0;

        red_vec[part].y += (red_weight * r) * red_idx;
        green_vec[part].y += (green_weight * g) * green_idx;
        blue_vec[part].y += (blue_weight * b) * blue_idx;
        alpha_vec[part].y += (alpha_weight * a) * alpha_idx;
        lum_vec[part].y += (lum_weight * lum) * idx0;
        scale_vec[part].y += (scale_weight * scale) * idx0;

        red_weight_weight_sum[part] += red_weight * red_idx;
        green_weight_weight_sum[part] += green_weight * green_idx;
        blue_weight_weight_sum[part] += blue_weight * blue_idx;

        psum[part] += red_weight * red_idx * red_idx + green_weight * green_idx * green_idx + blue_weight * blue_idx * blue_idx;

    }

    // calculations specific to mode #7, the HDR RGB-scale mode.
    float red_sum[4];
    float green_sum[4];
    float blue_sum[4];
    for (i = 0; i < partition_count; i++) {
        red_sum[i] = red_vec[i].x + red_vec[i].y;
        green_sum[i] = green_vec[i].x + green_vec[i].y;
        blue_sum[i] = blue_vec[i].x + blue_vec[i].y;
        qsum[i] = red_vec[i].y + green_vec[i].y + blue_vec[i].y;
    }

    // rgb+offset for HDR endpoint mode #7
    int rgbo_fail[4];
    for (i = 0; i < partition_count; i++) {
        mat4t mod7_mat;
        float4 matv0f = {red_weight_sum[i], 0.0f, 0.0f, red_weight_weight_sum[i]    };
        float4 matv1f = {0.0f, green_weight_sum[i], 0.0f, green_weight_weight_sum[i]};
        float4 matv2f = {0.0f, 0.0f, blue_weight_sum[i], blue_weight_weight_sum[i]  };
        float4 matv3f = {red_weight_weight_sum[i], green_weight_weight_sum[i], blue_weight_weight_sum[i], psum[i]};
        mod7_mat.v[0] = matv0f;
        mod7_mat.v[1] = matv1f;
        mod7_mat.v[2] = matv2f;
        mod7_mat.v[3] = matv3f;

        float4 vect = { red_sum[i], green_sum[i], blue_sum[i], qsum[i] };

        mat4t rmod7_mat = invertMat4t(mod7_mat);
        float4 rgbovec = transform(rmod7_mat, vect);
        rgbo_vectors[i] = rgbovec;

        // we will occasionally get a failure due to a singluar matrix. Record whether such a
        // failure has taken place; if it did, compute rgbo_vectors[] with a different method
        // later on.
        float chkval = dot(rgbovec, rgbovec);
        rgbo_fail[i] = chkval != chkval;

    }


    // initialize the luminance and scale vectors with a reasonable default,
    // just in case the subsequent calculation blows up.
    for (i = 0; i < partition_count; i++) {

        float scalediv = scale_min[i] / scale_max[i];
        if (!(scalediv > 0.0f))
            scalediv = 0.0f;    // set to zero if scalediv is zero, negative, or NaN.

        if (scalediv > 1.0f)
            scalediv = 1.0f;
        float3 tmp1 = scale_directions[i] * scale_max[i];
        float4 rgbs_vectorsf = { tmp1.x,tmp1.y, tmp1.z, scalediv};
        rgbs_vectors[i] = rgbs_vectorsf;
        float2 lumf = {lum_min[i], lum_max[i]};
        lum_vectors[i] = lumf;
    }


    for (i = 0; i < partition_count; i++) {

        if (wmin1[i] >= wmax1[i] * 0.999) {
            // if all weights in the partition were equal, then just take average
            // of all colors in the partition and use that as both endpoint colors.
            float4 avg = {(red_vec[i].x + red_vec[i].y) / red_weight_sum[i],
                          (green_vec[i].x + green_vec[i].y) / green_weight_sum[i],
                          (blue_vec[i].x + blue_vec[i].y) / blue_weight_sum[i],
                          (alpha_vec[i].x + alpha_vec[i].y) / alpha_weight_sum[i]
                         };

            if (plane2_color_component != 0 && avg.x == avg.x)
                ep->endpt0[i].x = ep->endpt1[i].x = avg.x;
            if (plane2_color_component != 1 && avg.y == avg.y)
                ep->endpt0[i].y = ep->endpt1[i].y = avg.y;
            if (plane2_color_component != 2 && avg.z == avg.z)
                ep->endpt0[i].z = ep->endpt1[i].z = avg.z;
            if (plane2_color_component != 3 && avg.w == avg.w)
                ep->endpt0[i].w = ep->endpt1[i].w = avg.w;
            float3 tmp2 = scale_directions[i] * scale_max[i];
            float4 rgbs_vectorsf = { tmp2.x,tmp2.y,tmp2.z, 1.0f};
            rgbs_vectors[i] = rgbs_vectorsf;
            float  lumval  = (red_vec[i].x + red_vec[i].y + green_vec[i].x + green_vec[i].y + blue_vec[i].x + blue_vec[i].y) / (red_weight_sum[i] + green_weight_sum[i] + blue_weight_sum[i]);
            float2 lumval2 = {lumval, lumval};
            lum_vectors[i] = lumval2;
        }

        else {

            // otherwise, complete the analytic calculation of ideal-endpoint-values
            // for the given set of texel weigths and pixel colors.

//#ifdef __OPENCL_VERSION__
//             if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//                 printf("Plane-1a partition %d pmat1_red=[%3.3f %3.3f]\n", i, pmat1_red[i].v[0].x, pmat1_red[i].v[1].x);


            float red_det1 = determinant(pmat1_red[i]);
            float green_det1 = determinant(pmat1_green[i]);
            float blue_det1 = determinant(pmat1_blue[i]);
            float alpha_det1 = determinant(pmat1_alpha[i]);
            float lum_det1 = determinant(pmat1_lum[i]);
            float scale_det1 = determinant(pmat1_scale[i]);

            float red_mss1 = mat_square_sum(pmat1_red[i]);
            float green_mss1 = mat_square_sum(pmat1_green[i]);
            float blue_mss1 = mat_square_sum(pmat1_blue[i]);
            float alpha_mss1 = mat_square_sum(pmat1_alpha[i]);
            float lum_mss1 = mat_square_sum(pmat1_lum[i]);
            float scale_mss1 = mat_square_sum(pmat1_scale[i]);

//#ifdef __OPENCL_VERSION__
//             if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//                 printf("Plane-1b partition %d determinants: R=%g G=%g B=%g A=%g L=%g S=%g\n", i, red_det1, green_det1, blue_det1, alpha_det1, lum_det1, scale_det1);

            pmat1_red[i]   = invertMat2t(pmat1_red[i]);
            pmat1_green[i] = invertMat2t(pmat1_green[i]);
            pmat1_blue[i]  = invertMat2t(pmat1_blue[i]);
            pmat1_alpha[i] = invertMat2t(pmat1_alpha[i]);
            pmat1_lum[i]   = invertMat2t(pmat1_lum[i]);
            pmat1_scale[i] = invertMat2t(pmat1_scale[i]);

            float4 ep0 = {dot(pmat1_red[i].v[0], red_vec[i]),
                          dot(pmat1_green[i].v[0], green_vec[i]),
                          dot(pmat1_blue[i].v[0], blue_vec[i]),
                          dot(pmat1_alpha[i].v[0], alpha_vec[i])
                         };

            float4 ep1 = {dot(pmat1_red[i].v[1], red_vec[i]),
                          dot(pmat1_green[i].v[1], green_vec[i]),
                          dot(pmat1_blue[i].v[1], blue_vec[i]),
                          dot(pmat1_alpha[i].v[1], alpha_vec[i])
                         };

            float lum_ep0    = dot(pmat1_lum[i].v[0], lum_vec[i]);
            float lum_ep1    = dot(pmat1_lum[i].v[1], lum_vec[i]);
            float scale_ep0  = dot(pmat1_scale[i].v[0], scale_vec[i]);
            float scale_ep1  = dot(pmat1_scale[i].v[1], scale_vec[i]);


            if (plane2_color_component != 0 && fabs(red_det1) > (red_mss1 * FLOAT_n4) && ep0.x == ep0.x && ep1.x == ep1.x) {
                ep->endpt0[i].x = ep0.x;
                ep->endpt1[i].x = ep1.x;
            }
            if (plane2_color_component != 1 && fabs(green_det1) > (green_mss1 * FLOAT_n4) && ep0.y == ep0.y && ep1.y == ep1.y) {
                ep->endpt0[i].y = ep0.y;
                ep->endpt1[i].y = ep1.y;
            }
            if (plane2_color_component != 2 && fabs(blue_det1) > (blue_mss1 * FLOAT_n4) && ep0.z == ep0.z && ep1.z == ep1.z) {
                ep->endpt0[i].z = ep0.z;
                ep->endpt1[i].z = ep1.z;
            }
            if (plane2_color_component != 3 && fabs(alpha_det1) > (alpha_mss1 * FLOAT_n4) && ep0.w == ep0.w && ep1.w == ep1.w) {
                ep->endpt0[i].w = ep0.w;
                ep->endpt1[i].w = ep1.w;
            }

            if (fabs(lum_det1) > (lum_mss1 * FLOAT_n4) && lum_ep0 == lum_ep0 && lum_ep1 == lum_ep1 && lum_ep0 < lum_ep1) {
                lum_vectors[i].x = lum_ep0;
                lum_vectors[i].y = lum_ep1;
            }
            if (fabs(scale_det1) > (scale_mss1 * FLOAT_n4) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1) {
                float scalediv = scale_ep0 / scale_ep1;
                float3 tmp3 = scale_directions[i] * scale_ep1;
                float4 rgbs_vectorsf = { tmp3.x,tmp3.y,tmp3.z, scalediv};
                rgbs_vectors[i] = rgbs_vectorsf;
            }

        }

        if (plane2_weight_set8) {
            if (wmin2[i] >= wmax2[i] * 0.999) {
                // if all weights in the partition were equal, then just take average
                // of all colors in the partition and use that as both endpoint colors.
                float4 avg = {(red_vec[i].x + red_vec[i].y) / red_weight_sum[i],
                              (green_vec[i].x + green_vec[i].y) / green_weight_sum[i],
                              (blue_vec[i].x + blue_vec[i].y) / blue_weight_sum[i],
                              (alpha_vec[i].x + alpha_vec[i].y) / alpha_weight_sum[i]
                             };

                if (plane2_color_component == 0 && avg.x == avg.x)
                    ep->endpt0[i].x = ep->endpt1[i].x = avg.x;
                if (plane2_color_component == 1 && avg.y == avg.y)
                    ep->endpt0[i].y = ep->endpt1[i].y = avg.y;
                if (plane2_color_component == 2 && avg.z == avg.z)
                    ep->endpt0[i].z = ep->endpt1[i].z = avg.z;
                if (plane2_color_component == 3 && avg.w == avg.w)
                    ep->endpt0[i].w = ep->endpt1[i].w = avg.w;
            } else {

                // otherwise, complete the analytic calculation of ideal-endpoint-values
                // for the given set of texel weights and pixel colors.
                float red_det2 = determinant(pmat2_red[i]);
                float green_det2 = determinant(pmat2_green[i]);
                float blue_det2 = determinant(pmat2_blue[i]);
                float alpha_det2 = determinant(pmat2_alpha[i]);

                float red_mss2 = mat_square_sum(pmat2_red[i]);
                float green_mss2 = mat_square_sum(pmat2_green[i]);
                float blue_mss2 = mat_square_sum(pmat2_blue[i]);
                float alpha_mss2 = mat_square_sum(pmat2_alpha[i]);

//#ifdef __OPENCL_VERSION__
//                 if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//                     printf("Plane-2 partition %d determinants: R=%g G=%g B=%g A=%g\n", i, red_det2, green_det2, blue_det2, alpha_det2);

                pmat2_red[i]   = invertMat2t(pmat2_red[i]);
                pmat2_green[i] = invertMat2t(pmat2_green[i]);
                pmat2_blue[i]  = invertMat2t(pmat2_blue[i]);
                pmat2_alpha[i] = invertMat2t(pmat2_alpha[i]);
                float4 ep0 = {dot(pmat2_red[i].v[0], red_vec[i]),
                              dot(pmat2_green[i].v[0], green_vec[i]),
                              dot(pmat2_blue[i].v[0], blue_vec[i]),
                              dot(pmat2_alpha[i].v[0], alpha_vec[i])
                             };
                float4 ep1 = {dot(pmat2_red[i].v[1], red_vec[i]),
                              dot(pmat2_green[i].v[1], green_vec[i]),
                              dot(pmat2_blue[i].v[1], blue_vec[i]),
                              dot(pmat2_alpha[i].v[1], alpha_vec[i])
                             };

                if (plane2_color_component == 0 && fabs(red_det2) > (red_mss2 * FLOAT_n4) && ep0.x == ep0.x && ep1.x == ep1.x) {
                    ep->endpt0[i].x = ep0.x;
                    ep->endpt1[i].x = ep1.x;
                }
                if (plane2_color_component == 1 && fabs(green_det2) > (green_mss2 * FLOAT_n4) && ep0.y == ep0.y && ep1.y == ep1.y) {
                    ep->endpt0[i].y = ep0.y;
                    ep->endpt1[i].y = ep1.y;
                }
                if (plane2_color_component == 2 && fabs(blue_det2) > (blue_mss2 * FLOAT_n4) && ep0.z == ep0.z && ep1.z == ep1.z) {
                    ep->endpt0[i].z = ep0.z;
                    ep->endpt1[i].z = ep1.z;
                }
                if (plane2_color_component == 3 && fabs(alpha_det2) > (alpha_mss2 * FLOAT_n4) && ep0.w == ep0.w && ep1.w == ep1.w) {
                    ep->endpt0[i].w = ep0.w;
                    ep->endpt1[i].w = ep1.w;
                }

            }
        }
    }

    // if the calculation of an RGB-offset vector failed, try to compute
    // a somewhat-sensible value anyway
    float3 onef = {1.0f,1.0f,1.0f};
    for (i = 0; i < partition_count; i++)
        if (rgbo_fail[i]) {
            float4 v0 = ep->endpt0[i];
            float4 v1 = ep->endpt1[i];
            float avgdif = dot(v1.xyz - v0.xyz, onef) * (1.0f / 3.0f);
            if (avgdif <= 0.0f)
                avgdif = 0.0f;
            float4 avg = (v0 + v1) * 0.5f;
            float4 ep0 = {avgdif, avgdif, avgdif, avgdif};
            ep0 = avg - ep0 * 0.5f;
            float3 tmp_ep0 = ep0.xyz;
            float4 tmp = { tmp_ep0.x,tmp_ep0.y,tmp_ep0.z, avgdif};
            rgbo_vectors[i] = tmp;
        }

//#ifdef __OPENCL_VERSION__
//     if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//         printf("post  <%g %g %g %g>\n", ep->endpt0[0].x, ep->endpt0[0].y, ep->endpt0[0].z, ep->endpt0[0].w);

}


// Reports whether the alpha range of the block is non-degenerate:
// returns 1 when blk->alpha_max differs from blk->alpha_min, 0 otherwise.
int imageblock_uses_alpha4(imageblock * blk) {
    if (blk->alpha_max == blk->alpha_min)
        return 0;
    return 1;
}

// For every partition listed in pt, computes the error-weighted mean of two
// colour components and a direction vector for the texels of that partition.
//
//   pt                 - partition table; supplies the partition count and the
//                        texel index list of each partition.
//   blk                - image block; work_data stores 4 floats per texel.
//   ewb                - error weights; the rg / rb / gb per-texel table is
//                        picked from (component1, component2).
//   color_scalefactors - per-partition factors multiplied into the stored mean.
//   averages           - out: weighted mean per partition, times the scale factor.
//   directions         - out: unnormalized direction per partition.
//
// Component pairs other than (0,1), (0,2) and (1,2) report an internal error
// and fall back to the rg weight table.
void compute_averages_and_directions_2_components(__global partition_info * pt,
        imageblock * blk,
        error_weight_block * ewb, float2 * color_scalefactors, int component1, int component2, float2 * averages, float2 * directions) {
    // Select the per-texel weight table for this component pair.
    float *pair_weights;
    if (component1 == 1 && component2 == 2) {
        pair_weights = ewb->texel_weight_gb;
    } else if (component1 == 0 && component2 == 2) {
        pair_weights = ewb->texel_weight_rb;
    } else {
        pair_weights = ewb->texel_weight_rg;
        if (component1 != 0 || component2 != 1) {
            // unsupported set of color components.
            astc_codec_internal_error("ERROR: compute_averages_and_directions_2_components");
        }
    }

    int region_total = pt->partition_count;
    int region;
    int k;

    for (region = 0; region < region_total; region++) {
        int member_count = pt->texels_per_partition[region];

        // First pass: accumulate the weighted sum and the total weight.
        float2 weighted_sum = { 0.0f, 0.0f };
        float weight_total = 0.0f;

        for (k = 0; k < member_count; k++) {
            int texel = pt->texels_of_partition[region][k];
            float w = pair_weights[texel];
            float2 sample = {blk->work_data[4 * texel + component1], blk->work_data[4 * texel + component2]};
            weight_total += w;
            weighted_sum = weighted_sum + sample * w;
        }

        // The divisor is clamped so an all-zero-weight partition cannot divide by zero.
        float2 mean = weighted_sum * 1.0f / MAX(weight_total, FLOAT_n7);

        float2 scale_pair = color_scalefactors[region];
        averages[region] = mean * scale_pair.xy;

        // Second pass: sum the weighted deviations from the (unscaled) mean,
        // separately for texels lying on the positive x side and on the
        // positive y side.
        float2 positive_x_sum = { 0.0f, 0.0f };
        float2 positive_y_sum = { 0.0f, 0.0f };

        for (k = 0; k < member_count; k++) {
            int texel = pt->texels_of_partition[region][k];
            float w = pair_weights[texel];
            float2 sample = {blk->work_data[4 * texel + component1], blk->work_data[4 * texel + component2]};
            float2 deviation = (sample - mean) * w;

            if (deviation.x > 0.0f)
                positive_x_sum = positive_x_sum + deviation;
            if (deviation.y > 0.0f)
                positive_y_sum = positive_y_sum + deviation;
        }

        // Keep whichever sum has the larger squared length; ties keep the x sum.
        float x_length_sq = dot(positive_x_sum, positive_x_sum);
        float y_length_sq = dot(positive_y_sum, positive_y_sum);

        if (y_length_sq > x_length_sq)
            directions[region] = positive_y_sum;
        else
            directions[region] = positive_x_sum;
    }

}

// Computes endpoints and ideal per-texel weights for a block when two colour
// components (component1, component2) are encoded against one weight plane.
//
//   pt          - partition table for the block.
//   blk         - image block; work_data stores 4 floats per texel and the
//                 *_min / *_max members seed all four endpoint channels.
//   ewb         - error weights; the rg / rb / gb per-texel table is chosen
//                 from (component1, component2).
//   ei          - out: ep.partition_count, ep.endpt0/endpt1 per partition,
//                 weights[] and weight_error_scale[] per texel.
//   ASTCEncode  - encoder state; m_texels_per_block bounds the texel loops.
//
// Component pairs other than (0,1), (0,2) and (1,2) report an internal error
// and fall back to the rg weight table.
void compute_endpoints_and_ideal_weights_2_components( __global partition_info * pt,
        imageblock * blk, error_weight_block * ewb,
        endpoints_and_weights * ei, int component1, int component2,
        __global ASTC_Encode *ASTCEncode) {
    int p;    // partition index
    int t;    // texel index

    int num_partitions = pt->partition_count;
    ei->ep.partition_count = num_partitions;

    float4 partition_error_weights[4];
    float4 partition_color_scales[4];
    float2 pair_scale[4];

    // Select the per-texel error-weight table for this component pair.
    float *pair_error_weights;
    if (component1 == 1 && component2 == 2) {
        pair_error_weights = ewb->texel_weight_gb;
    } else if (component1 == 0 && component2 == 2) {
        pair_error_weights = ewb->texel_weight_rb;
    } else {
        pair_error_weights = ewb->texel_weight_rg;
        if (component1 != 0 || component2 != 1) {
            astc_codec_internal_error("ERROR: compute_endpoints_and_ideal_weights_2_components");
        }
    }

    compute_partition_error_color_weightings(ewb, pt, partition_error_weights, partition_color_scales, ASTCEncode);

    float param_lo[4], param_hi[4];

    // Build the 2-component scale factor of each partition and reset the
    // running parameter range to an empty interval.
    for (p = 0; p < num_partitions; p++) {
        float4 csf = partition_color_scales[p];
        float scale_first = 0.0f;
        float scale_second = 0.0f;

        if (component1 == 0)
            scale_first = csf.x;
        else if (component1 == 1)
            scale_first = csf.y;
        else if (component1 == 2)
            scale_first = csf.z;
        else if (component1 == 3)
            scale_first = csf.w;

        if (component2 == 0)
            scale_second = csf.x;
        else if (component2 == 1)
            scale_second = csf.y;
        else if (component2 == 2)
            scale_second = csf.z;
        else if (component2 == 3)
            scale_second = csf.w;

        float2 raw_pair = { scale_first, scale_second };
        pair_scale[p] = normalize(raw_pair) * 1.41421356f;

        param_lo[p] = FLOAT_10;
        param_hi[p] = -FLOAT_10;
    }

    float2 pair_mean[4];
    float2 pair_dir[4];

    compute_averages_and_directions_2_components(pt, blk, ewb, pair_scale, component1, component2, pair_mean, pair_dir);

    // One fitting line per partition: through the mean, along the direction
    // (flipped so its component sum is non-negative, and replaced by the
    // diagonal when the direction is the zero vector).
    line2 fit[4];
    for (p = 0; p < num_partitions; p++) {
        float2 zero2f = {0.0f, 0.0f};
        float2 one2f = {1.0f, 1.0f};
        float2 dir = pair_dir[p];

        if (dir.x + dir.y < 0.0f)
            dir = zero2f - dir;

        fit[p].a = pair_mean[p];
        if (dot(dir, dir) == 0.0f)
            fit[p].b = normalize(one2f);
        else
            fit[p].b = normalize(dir);
    }

    // Project every weighted texel onto its partition's line and track the
    // parameter range; texels without weight get a large negative marker.
    for (t = 0; t < ASTCEncode->m_texels_per_block; t++) {
        if (!(pair_error_weights[t] > FLOAT_n10)) {
            ei->weights[t] = -FLOAT_38;
            continue;
        }

        int owner = pt->partition_of_texel[t];
        float2 sample = { blk->work_data[4 * t + component1], blk->work_data[4 * t + component2] };
        sample = sample * pair_scale[owner];
        line2 owner_line = fit[owner];
        float projection = dot(sample - owner_line.a, owner_line.b);
        ei->weights[t] = projection;
        if (projection < param_lo[owner])
            param_lo[owner] = projection;
        if (projection > param_hi[owner])
            param_hi[owner] = projection;
    }

    float span_reciprocal[4];
    float span_squared[4];

    for (p = 0; p < num_partitions; p++) {
        float span = param_hi[p] - param_lo[p];
        if (span < 0.0f) {          // case for when none of the texels had any weight
            param_lo[p] = 0.0f;
            param_hi[p] = FLOAT_n7;
        }

        // it is possible for a uniform-color partition to produce a span of 0; this
        // causes NaN-production and NaN-propagation later on. Set the span to
        // a small value to avoid this problem.
        if (span < FLOAT_n7)
            span = FLOAT_n7;

        span_squared[p] = span * span;
        span_reciprocal[p] = 1.0f / span;

        // Line positions at the two extremes, taken back out of scaled space.
        float2 low_pair = fit[p].a + fit[p].b * param_lo[p];
        float2 high_pair = fit[p].a + fit[p].b * param_hi[p];

        low_pair = low_pair / pair_scale[p];
        high_pair = high_pair / pair_scale[p];

        // Every channel starts from the block-wide min/max; the two selected
        // channels are then overwritten (component1 first, component2 second).
        float4 block_min = { blk->red_min, blk->green_min, blk->blue_min, blk->alpha_min };
        float4 block_max = { blk->red_max, blk->green_max, blk->blue_max, blk->alpha_max };
        ei->ep.endpt0[p] = block_min;
        ei->ep.endpt1[p] = block_max;

        switch (component1) {
        case 0:
            ei->ep.endpt0[p].x = low_pair.x;
            ei->ep.endpt1[p].x = high_pair.x;
            break;
        case 1:
            ei->ep.endpt0[p].y = low_pair.x;
            ei->ep.endpt1[p].y = high_pair.x;
            break;
        case 2:
            ei->ep.endpt0[p].z = low_pair.x;
            ei->ep.endpt1[p].z = high_pair.x;
            break;
        case 3:
            ei->ep.endpt0[p].w = low_pair.x;
            ei->ep.endpt1[p].w = high_pair.x;
            break;
        }

        switch (component2) {
        case 0:
            ei->ep.endpt0[p].x = low_pair.y;
            ei->ep.endpt1[p].x = high_pair.y;
            break;
        case 1:
            ei->ep.endpt0[p].y = low_pair.y;
            ei->ep.endpt1[p].y = high_pair.y;
            break;
        case 2:
            ei->ep.endpt0[p].z = low_pair.y;
            ei->ep.endpt1[p].z = high_pair.y;
            break;
        case 3:
            ei->ep.endpt0[p].w = low_pair.y;
            ei->ep.endpt1[p].w = high_pair.y;
            break;
        }
    }

    // Rescale each projection into [0,1] along its partition's range and
    // derive the per-texel error scale.
    for (t = 0; t < ASTCEncode->m_texels_per_block; t++) {
        int owner = pt->partition_of_texel[t];
        float unit_weight = (ei->weights[t] - param_lo[owner]) * span_reciprocal[owner];
        if (unit_weight > 1.0f)
            unit_weight = 1.0f;
        else if (!(unit_weight > 0.0f))    // also catches NaN
            unit_weight = 0.0f;

        ei->weights[t] = unit_weight;
        ei->weight_error_scale[t] = span_squared[owner] * pair_error_weights[t];
        if (astc_isnan(ei->weight_error_scale[t])) {
            astc_codec_internal_error("ERROR: compute_endpoints_and_ideal_weights_2_components:2");
        }
    }
}


// Computes endpoints and ideal per-texel weights for a single colour
// component (0 = red, 1 = green, 2 = blue, 3 = alpha).
//
//   pt          - partition table for the block.
//   blk         - image block; work_data stores 4 floats per texel and the
//                 *_min / *_max members seed all four endpoint channels.
//   ewb         - error weights; the r / g / b / a per-texel table is chosen
//                 from component.
//   ei          - out: ep.partition_count, ep.endpt0/endpt1 per partition,
//                 weights[] and weight_error_scale[] per texel.
//   ASTCEncode  - encoder state; m_texels_per_block bounds the texel loops.
//
// A component outside 0..3 reports an internal error and falls back to the
// red weight table.
void compute_endpoints_and_ideal_weights_1_component(
    __global partition_info * pt, imageblock * blk,
    error_weight_block * ewb, endpoints_and_weights * ei,
    int component,
    __global ASTC_Encode *ASTCEncode) {
    int p;    // partition index
    int t;    // texel index

    int num_partitions = pt->partition_count;
    ei->ep.partition_count = num_partitions;

    float range_low[4], range_high[4];
    float range_squared[4];
    float range_reciprocal[4];

    // Select the per-texel error-weight table for the requested channel.
    float *channel_weights;
    if (component == 0) {
        channel_weights = ewb->texel_weight_r;
    } else if (component == 1) {
        channel_weights = ewb->texel_weight_g;
    } else if (component == 2) {
        channel_weights = ewb->texel_weight_b;
    } else if (component == 3) {
        channel_weights = ewb->texel_weight_a;
    } else {
        channel_weights = ewb->texel_weight_r;
        astc_codec_internal_error("ERROR:1 compute_endpoints_and_ideal_weights_1_component");
    }

    // Start every partition with an empty value range.
    for (p = 0; p < num_partitions; p++) {
        range_low[p] = FLOAT_10;
        range_high[p] = -FLOAT_10;
    }

    // Find the min/max channel value per partition over texels that carry weight.
    for (t = 0; t < ASTCEncode->m_texels_per_block; t++) {
        if (!(channel_weights[t] > FLOAT_n10))
            continue;

        float sample = blk->work_data[4 * t + component];
        int owner = pt->partition_of_texel[t];
        if (sample < range_low[owner])
            range_low[owner] = sample;
        if (sample > range_high[owner])
            range_high[owner] = sample;
    }

    for (p = 0; p < num_partitions; p++) {
        float extent = range_high[p] - range_low[p];
        if (extent < 0.0f) {
            // range is still empty: no texel of this partition carried weight
            range_low[p] = 0.0f;
            range_high[p] = 0.0f;
        }
        // clamp so the reciprocal below stays finite
        if (extent < FLOAT_n7)
            extent = FLOAT_n7;
        range_squared[p] = extent * extent;
        range_reciprocal[p] = 1.0f / extent;

        // Every channel starts from the block-wide min/max; only the selected
        // channel is replaced by this partition's own range.
        float4 block_min = { blk->red_min, blk->green_min, blk->blue_min, blk->alpha_min };
        float4 block_max = { blk->red_max, blk->green_max, blk->blue_max, blk->alpha_max };

        ei->ep.endpt0[p] = block_min;
        ei->ep.endpt1[p] = block_max;

        if (component == 0) {           // red/x
            ei->ep.endpt0[p].x = range_low[p];
            ei->ep.endpt1[p].x = range_high[p];
        } else if (component == 1) {    // green/y
            ei->ep.endpt0[p].y = range_low[p];
            ei->ep.endpt1[p].y = range_high[p];
        } else if (component == 2) {    // blue/z
            ei->ep.endpt0[p].z = range_low[p];
            ei->ep.endpt1[p].z = range_high[p];
        } else if (component == 3) {    // alpha/w
            ei->ep.endpt0[p].w = range_low[p];
            ei->ep.endpt1[p].w = range_high[p];
        }
    }

    // Map every texel value into [0,1] within its partition's range and
    // derive the per-texel error scale.
    for (t = 0; t < ASTCEncode->m_texels_per_block; t++) {
        int owner = pt->partition_of_texel[t];
        float unit_weight = blk->work_data[4 * t + component];
        unit_weight -= range_low[owner];
        unit_weight *= range_reciprocal[owner];
        if (unit_weight > 1.0f)
            unit_weight = 1.0f;
        else if (!(unit_weight > 0.0f))    // also catches NaN
            unit_weight = 0.0f;

        ei->weights[t] = unit_weight;
        ei->weight_error_scale[t] = range_squared[owner] * channel_weights[t];
        if (astc_isnan(ei->weight_error_scale[t])) {
            astc_codec_internal_error("ERROR:2 compute_endpoints_and_ideal_weights_1_component");
        }
    }
}

// Computes endpoints and ideal weights for two-plane encoding.
//
// separate_component (0 = red, 1 = green, 2 = blue, 3 = alpha) receives its
// own weight plane, written to ei2 by the 1-component routine. The remaining
// components share the first plane, written to ei1:
//   - separate_component 0..2: the two other colour channels, plus alpha
//     when imageblock_uses_alpha4() reports 1 for the block;
//   - separate_component 3: red, green and blue (an internal error is
//     reported first when the block's alpha does not vary).
// Any other separate_component value leaves ei1 and ei2 untouched.
void compute_endpoints_and_ideal_weights_2_planes(
    __global partition_info * pt,
    imageblock * blk,
    error_weight_block * ewb,
    int separate_component,
    endpoints_and_weights * ei1,
    endpoints_and_weights * ei2,
    __global ASTC_Encode *ASTCEncode) {
    int alpha_varies = imageblock_uses_alpha4(blk);

    if (separate_component < 0 || separate_component > 3)
        return;

    if (separate_component == 3) {
        // separate weights for alpha
        if (alpha_varies == 0) {
            astc_codec_internal_error("ERROR: compute_endpoints_and_ideal_weights_2_planes");
        }
        compute_endpoints_and_ideal_weights_3_components(pt, blk, ewb, ei1, 0, 1, 2, ASTCEncode);
    } else {
        // separate weights for red, green or blue: the two colour channels
        // left over are (1,2), (0,2) or (0,1) respectively.
        int shared_a = (separate_component == 0) ? 1 : 0;
        int shared_b = (separate_component == 2) ? 1 : 2;

        if (alpha_varies == 1)
            compute_endpoints_and_ideal_weights_3_components(pt, blk, ewb, ei1, shared_a, shared_b, 3, ASTCEncode);
        else
            compute_endpoints_and_ideal_weights_2_components(pt, blk, ewb, ei1, shared_a, shared_b, ASTCEncode);
    }

    compute_endpoints_and_ideal_weights_1_component(pt, blk, ewb, ei2, separate_component, ASTCEncode);

}

// function to compute the error across a tile when using a particular line for
// a particular partition.
float compute_error_squared_rgb_single_partition(int partition_to_test,
        __global partition_info * pt,    // the partition that we use when computing the squared-error.
        imageblock * blk, error_weight_block * ewb,
        processed_line3 * lin,    // the line for the partition.
        __global ASTC_Encode *ASTCEncode
                                                ) {
    int i;

    float errorsum = 0.0f;

    for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
        int partition = pt->partition_of_texel[i];
        float texel_weight = ewb->texel_weight_rgb[i];
        if (partition != partition_to_test || texel_weight < FLOAT_n20)
            continue;
        float3 point = { blk->work_data[4 * i], blk->work_data[4 * i + 1], blk->work_data[4 * i + 2] };

        float param = dot(point, lin->bs);
        float3 rp1 = lin->amod + param * lin->bis;
        float3 dist = rp1 - point;
        float4 ews = ewb->error_weights[i];

        errorsum += dot(ews.xyz, dist * dist);
    }
    return errorsum;
}

// Combines two endpoint sets into res: every partition's endpoints are copied
// from ep1, then the single channel named by separate_component
// (0 = x, 1 = y, 2 = z, 3 = w) is replaced with the value from ep2.
// The partition count is taken from ep1. Any other separate_component value
// yields a plain copy of ep1.
void merge_endpoints(endpoints * ep1,    // contains three of the color components
                     endpoints * ep2,    // contains the remaining color component
                     int separate_component, endpoints * res) {
    int count = ep1->partition_count;
    int p;

    res->partition_count = count;

    for (p = 0; p < count; p++) {
        res->endpt0[p] = ep1->endpt0[p];
        res->endpt1[p] = ep1->endpt1[p];

        if (separate_component == 0) {
            res->endpt0[p].x = ep2->endpt0[p].x;
            res->endpt1[p].x = ep2->endpt1[p].x;
        } else if (separate_component == 1) {
            res->endpt0[p].y = ep2->endpt0[p].y;
            res->endpt1[p].y = ep2->endpt1[p].y;
        } else if (separate_component == 2) {
            res->endpt0[p].z = ep2->endpt0[p].z;
            res->endpt1[p].z = ep2->endpt1[p].z;
        } else if (separate_component == 3) {
            res->endpt0[p].w = ep2->endpt0[p].w;
            res->endpt1[p].w = ep2->endpt1[p].w;
        }
    }
}

// Fills eci[0 .. partition_count-1] with per-partition error estimates used to
// choose between endpoint encodings:
//   rgb_scale_error   - extra error of a line through the origin along the
//                       partition mean, relative to the uncorrelated RGB line;
//   rgb_luma_error    - extra error of a line through the mean along the
//                       colour scale-factor direction;
//   luminance_error   - extra error of a line through the origin along the
//                       colour scale-factor direction;
//   alpha_drop_error  - error of replacing alpha by its default value;
//   rgb_drop_error    - error of replacing RGB by zero;
//   can_offset_encode / can_blue_contract - eligibility flags derived from
//                       the ideal endpoints of the block.
// separate_component selects the 2-plane endpoint computation, or the 1-plane
// one when it is -1.
void compute_encoding_choice_errors(
    imageblock * pb, __global partition_info * pi, error_weight_block * ewb,
    int separate_component,    // component that is separated out in 2-plane mode, -1 in 1-plane mode
    encoding_choice_errors * eci,
    __global ASTC_Encode *ASTCEncode) {
    int p;    // partition index
    int t;    // texel index
    int k;    // candidate-line index

    int num_partitions = pi->partition_count;

    float3 mean_rgb[4];
    float3 dir_rgb[4];
    float2 dir_rg[4];
    float2 dir_rb[4];
    float2 dir_gb[4];

    float4 partition_error_weights[4];
    float4 partition_color_scales[4];

    compute_partition_error_color_weightings(ewb, pi, partition_error_weights, partition_color_scales, ASTCEncode);

    compute_averages_and_directions_rgb(pi, pb, ewb, partition_color_scales, mean_rgb, dir_rgb, dir_rg, dir_rb, dir_gb);

    float err_uncorrelated[4];
    float err_same_chroma[4];
    float err_rgb_luma[4];
    float err_luminance[4];

    for (p = 0; p < num_partitions; p++) {
        float4 csf = partition_color_scales[p];
        float4 inv_csf;
        inv_csf.x = 1.0f / MAX(csf.x, FLOAT_n7);
        inv_csf.y = 1.0f / MAX(csf.y, FLOAT_n7);
        inv_csf.z = 1.0f / MAX(csf.z, FLOAT_n7);
        inv_csf.w = 1.0f / MAX(csf.w, FLOAT_n7);

        float3 csf_rgb = csf.xyz;
        float3 inv_rgb = inv_csf.xyz;
        float3 origin = { 0.0f, 0.0f, 0.0f };
        float3 mean = mean_rgb[p];
        float3 scale_axis = normalize(csf_rgb);

        // Candidate lines, in evaluation order:
        //   0 - uncorrelated RGB
        //   1 - same chroma (for LDR-RGB-scale)
        //   2 - RGB + luma  (for HDR-RGB-scale)
        //   3 - luminance
        line3 candidate[4];

        candidate[0].a = mean;
        if (dot(dir_rgb[p], dir_rgb[p]) == 0.0f)
            candidate[0].b = scale_axis;
        else
            candidate[0].b = normalize(dir_rgb[p]);

        candidate[1].a = origin;
        if (dot(mean, mean) < FLOAT_n20)
            candidate[1].b = scale_axis;
        else
            candidate[1].b = normalize(mean);

        candidate[2].a = mean;
        candidate[2].b = scale_axis;

        candidate[3].a = origin;
        candidate[3].b = scale_axis;

        // Convert each candidate to its processed form and measure the
        // squared error of this partition against it.
        float candidate_error[4];
        for (k = 0; k < 4; k++) {
            float3 base = candidate[k].a;
            float3 axis = candidate[k].b;
            processed_line3 processed;

            processed.amod = (base - axis * dot(base, axis)) * inv_rgb;
            processed.bs = axis * csf_rgb;
            processed.bis = axis * inv_rgb;

            candidate_error[k] = compute_error_squared_rgb_single_partition(p, pi, pb, ewb, &processed, ASTCEncode);
        }

        err_uncorrelated[p] = candidate_error[0];
        err_same_chroma[p] = candidate_error[1];
        err_rgb_luma[p] = candidate_error[2];
        err_luminance[p] = candidate_error[3];
    }

    // compute the error that arises from just ditching alpha and RGB
    float err_drop_alpha[4];
    float err_drop_rgb[4];
    for (p = 0; p < num_partitions; p++) {
        err_drop_alpha[p] = 0.0f;
        err_drop_rgb[p] = 0.0f;
    }
    for (t = 0; t < ASTCEncode->m_texels_per_block; t++) {
        int owner = pi->partition_of_texel[t];
        float4 channel_weights = ewb->error_weights[t];

        float r = pb->work_data[4 * t];
        float g = pb->work_data[4 * t + 1];
        float b = pb->work_data[4 * t + 2];
        float a = pb->work_data[4 * t + 3];

        // the value alpha is compared against depends on the texel's alpha_lns flag
        float alpha_default = pb->alpha_lns[t] ? (float)0x7800 : (float)0xFFFF;
        float alpha_delta = a - alpha_default;

        err_drop_alpha[owner] += alpha_delta * alpha_delta * channel_weights.w;
        err_drop_rgb[owner] += r * r * channel_weights.x + g * g * channel_weights.y + b * b * channel_weights.z;
    }

    // check if we are eligible for blue-contraction and offset-encoding

    endpoints ideal_ep;
    if (separate_component != -1) {
        endpoints_and_weights plane1, plane2;
        compute_endpoints_and_ideal_weights_2_planes( pi, pb, ewb, separate_component, &plane1, &plane2, ASTCEncode);

        merge_endpoints(&(plane1.ep), &(plane2.ep), separate_component, &ideal_ep);
    } else {
        endpoints_and_weights single_plane;
        compute_endpoints_and_ideal_weights_1_plane(pi, pb, ewb, &single_plane, ASTCEncode);
        ideal_ep = single_plane.ep;
    }

    // finally, derive the eligibility flags and gather up our results
    for (p = 0; p < num_partitions; p++) {
        float4 low = ideal_ep.endpt0[p];
        float4 high = ideal_ep.endpt1[p];
        float4 delta = high - low;

        // offset encoding: all three colour channels must differ by less
        // than 12% of the 16-bit range between the endpoints
        int offset_ok = 0;
        if (fabs(delta.x) < (0.12 * 65535.0f) && fabs(delta.y) < (0.12 * 65535.0f) && fabs(delta.z) < (0.12 * 65535.0f))
            offset_ok = 1;

        // blue contraction: red and green, pushed away from blue by their
        // distance to it, must stay strictly inside 1%..99% of the range
        low.x += (low.x - low.z);
        low.y += (low.y - low.z);
        high.x += (high.x - high.z);
        high.y += (high.y - high.z);

        int contract_ok = 0;
        if (low.x > (0.01f * 65535.0f) && low.x < (0.99f * 65535.0f)
                && high.x > (0.01f * 65535.0f) && high.x < (0.99f * 65535.0f)
                && low.y > (0.01f * 65535.0f) && low.y < (0.99f * 65535.0f)
                && high.y > (0.01f * 65535.0f) && high.y < (0.99f * 65535.0f))
            contract_ok = 1;

        eci[p].rgb_scale_error = (err_same_chroma[p] - err_uncorrelated[p]) * 0.7f;    // empirical
        eci[p].rgb_luma_error = (err_rgb_luma[p] - err_uncorrelated[p]) * 1.5f;    // wild guess
        eci[p].luminance_error = (err_luminance[p] - err_uncorrelated[p]) * 3.0f;    // empirical
        eci[p].alpha_drop_error = err_drop_alpha[p] * 3.0f;
        eci[p].rgb_drop_error = err_drop_rgb[p] * 3.0f;
        eci[p].can_offset_encode = offset_ok;
        eci[p].can_blue_contract = contract_ok;
    }
}

// For a given partition, compute for every (integer-component-count, quantization-level)
// the estimated color error.
//
// Parameters:
//   encode_hdr_rgb    - nonzero: fill the tables with HDR endpoint formats; zero: LDR formats.
//   encode_hdr_alpha  - nonzero: alpha uses the HDR upper limit (61440) and FMT_HDR_RGBA is
//                       reported for the 8-integer HDR entry instead of FMT_HDR_RGB_LDR_ALPHA.
//   partition_index   - index of the partition to evaluate in 'pi', 'ep' and 'error_weightings'.
//   pi                - partition info; only texels_per_partition[partition_index] is read.
//   eci               - encoding-choice errors for the CURRENT partition.
//   ep                - endpoints; endpt0[partition_index] and endpt1[partition_index] are read.
//   error_weightings  - per-partition error weights; entry 'partition_index' is used.
//   best_error        - output, indexed [quantization level][k]; k = 3, 2, 1, 0 corresponds to
//                       the 8-, 6-, 4- and 2-integer encodings respectively.
//   format_of_choice  - output, same indexing; the endpoint format behind each best_error entry.
//
// Quantization levels 0..7 (HDR) or 0..3 (LDR) are filled with FLOAT_30.
//static
void compute_color_error_for_every_integer_count_and_quantization_level(int encode_hdr_rgb,    // 1 = perform HDR encoding, 0 = perform LDR encoding.
        int encode_hdr_alpha, int partition_index, __global partition_info * pi,
        encoding_choice_errors * eci,    // pointer to the structure for the CURRENT partition.
        endpoints * ep, float4 error_weightings[4],
        // arrays to return results back through.
        float best_error[21][4], int format_of_choice[21][4]) {
    int i;
    int partition_size = pi->texels_per_partition[partition_index];

    // float4 eps = ep->endpt_scale[partition_index];
    float4 ep0 = ep->endpt0[partition_index];    // / eps;
    float4 ep1 = ep->endpt1[partition_index];    // / eps;

    // Largest / smallest RGB component of each endpoint.
    // Note: only ep1_min is read further down in this function; ep0_max, ep0_min
    // and ep1_max are computed and clamped but not used afterwards.
    float ep0_max = MAX(MAX(ep0.x, ep0.y), ep0.z);
    float ep0_min = MIN(MIN(ep0.x, ep0.y), ep0.z);
    float ep1_max = MAX(MAX(ep1.x, ep1.y), ep1.z);
    float ep1_min = MIN(MIN(ep1.x, ep1.y), ep1.z);

    ep0_min = MAX(ep0_min, 0.0f);
    ep1_min = MAX(ep1_min, 0.0f);
    ep0_max = MAX(ep0_max, FLOAT_n10);
    ep1_max = MAX(ep1_max, FLOAT_n10);

    /*
    float lum_scale = MAX(ep0_max, ep1_max); float alpha_scale = MAX(ep0.w, ep1.w);
    */


    float4 error_weight = error_weightings[partition_index];


    float error_weight_rgbsum = error_weight.x + error_weight.y + error_weight.z;


    // upper limit of the representable endpoint range: 61440 for HDR, 65535 for LDR
    float range_upper_limit_rgb = encode_hdr_rgb ? 61440.0f : 65535.0f;
    float range_upper_limit_alpha = encode_hdr_alpha ? 61440.0f : 65535.0f;

    // it is possible to get endpoint colors significantly outside [0,upper-limit]
    // even if the input data are safely contained in [0,upper-limit];
    // we need to add an error term for this situation,
    float4 ep0_range_error_high;
    float4 ep1_range_error_high;
    float4 ep0_range_error_low;
    float4 ep1_range_error_low;

    // amount by which each endpoint component exceeds the upper limit (0 when inside)
    ep0_range_error_high.x = MAX(0.0f, ep0.x - range_upper_limit_rgb);
    ep0_range_error_high.y = MAX(0.0f, ep0.y - range_upper_limit_rgb);
    ep0_range_error_high.z = MAX(0.0f, ep0.z - range_upper_limit_rgb);
    ep0_range_error_high.w = MAX(0.0f, ep0.w - range_upper_limit_alpha);
    ep1_range_error_high.x = MAX(0.0f, ep1.x - range_upper_limit_rgb);
    ep1_range_error_high.y = MAX(0.0f, ep1.y - range_upper_limit_rgb);
    ep1_range_error_high.z = MAX(0.0f, ep1.z - range_upper_limit_rgb);
    ep1_range_error_high.w = MAX(0.0f, ep1.w - range_upper_limit_alpha);

    // amount by which each endpoint component falls below zero (<= 0; squared below)
    ep0_range_error_low.x = MIN(0.0f, ep0.x);
    ep0_range_error_low.y = MIN(0.0f, ep0.y);
    ep0_range_error_low.z = MIN(0.0f, ep0.z);
    ep0_range_error_low.w = MIN(0.0f, ep0.w);
    ep1_range_error_low.x = MIN(0.0f, ep1.x);
    ep1_range_error_low.y = MIN(0.0f, ep1.y);
    ep1_range_error_low.z = MIN(0.0f, ep1.z);
    ep1_range_error_low.w = MIN(0.0f, ep1.w);

    // sum of squared out-of-range amounts, weighted and scaled by half the partition size
    float4 sum_range_error =
        (ep0_range_error_low * ep0_range_error_low) + (ep1_range_error_low * ep1_range_error_low) + (ep0_range_error_high * ep0_range_error_high) + (ep1_range_error_high * ep1_range_error_high);
    float rgb_range_error = dot(sum_range_error.xyz, error_weight.xyz) * 0.5f * partition_size;
    float alpha_range_error = sum_range_error.w * error_weight.w * 0.5f * partition_size;

    if (encode_hdr_rgb) {

        // collect some statistics
        // af: largest RGB component of the high endpoint (ep1);
        // cf: difference between the endpoints in that same component.
        float af, cf;
        if (ep1.x > ep1.y && ep1.x > ep1.z) {
            af = ep1.x;
            cf = ep1.x - ep0.x;
        } else if (ep1.y > ep1.z) {
            af = ep1.y;
            cf = ep1.y - ep0.y;
        } else {
            af = ep1.z;
            cf = ep1.z - ep0.z;
        }

        float bf = af - ep1_min;    // estimate of color-component spread in high endpoint color
        float3 cf3 = { cf, cf, cf };
        float3 prd = ep1.xyz - cf3;
        float3 pdif = prd - ep0.xyz;
        // estimate of color-component spread in low endpoint color
        float df = (float)MAX(MAX(fabs(pdif.x), fabs(pdif.y)), fabs(pdif.z));

        // truncate the statistics to integers for the threshold tests below
        int b = (int)bf;
        int c = (int)cf;
        int d = (int)df;


        // determine which one of the 6 submodes is likely to be used in
        // case of an RGBO-mode
        // The tests below are not exclusive: each matching test overwrites the
        // previous choice, so the last matching test determines rgbo_mode.
        int rgbo_mode = 5;        // 7 bits per component
        // mode 4: 8 7 6
        if (b < 32768 && c < 16384)
            rgbo_mode = 4;
        // mode 3: 9 6 7
        if (b < 8192 && c < 16384)
            rgbo_mode = 3;
        // mode 2: 10 5 8
        if (b < 2048 && c < 16384)
            rgbo_mode = 2;
        // mode 1: 11 6 5
        if (b < 2048 && c < 1024)
            rgbo_mode = 1;
        // mode 0: 11 5 7
        if (b < 1024 && c < 4096)
            rgbo_mode = 0;

        // determine which one of the 9 submodes is likely to be used in
        // case of an RGB-mode.
        // As above, the last matching test determines rgb_mode.
        int rgb_mode = 8;        // 8 bits per component, except 7 bits for blue

        // mode 0: 9 7 6 7
        if (b < 16384 && c < 8192 && d < 8192)
            rgb_mode = 0;
        // mode 1: 9 8 6 6
        if (b < 32768 && c < 8192 && d < 4096)
            rgb_mode = 1;
        // mode 2: 10 6 7 7
        if (b < 4096 && c < 8192 && d < 4096)
            rgb_mode = 2;
        // mode 3: 10 7 7 6
        if (b < 8192 && c < 8192 && d < 2048)
            rgb_mode = 3;
        // mode 4: 11 8 6 5
        if (b < 8192 && c < 2048 && d < 512)
            rgb_mode = 4;
        // mode 5: 11 6 8 6
        if (b < 2048 && c < 8192 && d < 1024)
            rgb_mode = 5;
        // mode 6: 12 7 7 5
        if (b < 2048 && c < 2048 && d < 256)
            rgb_mode = 6;
        // mode 7: 12 6 7 6
        if (b < 1024 && c < 2048 && d < 512)
            rgb_mode = 7;

        // per-submode error multipliers, looked up in tables defined outside this function
        float mode7mult = rgbo_error_scales[rgbo_mode] * 0.0015f;    // empirically determined ....
        float mode11mult = rgb_error_scales[rgb_mode] * 0.010f;    // empirically determined ....

        // multiplier for the 2-integer luminance format, chosen from the
        // difference between the mean RGB values of the two endpoints
        float lum_high = (ep1.x + ep1.y + ep1.z) * (1.0f / 3.0f);
        float lum_low = (ep0.x + ep0.y + ep0.z) * (1.0f / 3.0f);
        float lumdif = lum_high - lum_low;
        float mode23mult = lumdif < 960 ? 4.0f : lumdif < 3968 ? 16.0f : 128.0f;

        mode23mult *= 0.0005f;    // empirically determined ....

        // pick among the available HDR endpoint modes
        // quantization levels 0..7 receive the FLOAT_30 error for every integer count
        for (i = 0; i < 8; i++) {
            best_error[i][3] = FLOAT_30;
            format_of_choice[i][3] = encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA;
            best_error[i][2] = FLOAT_30;
            format_of_choice[i][2] = FMT_HDR_RGB;
            best_error[i][1] = FLOAT_30;
            format_of_choice[i][1] = FMT_HDR_RGB_SCALE;
            best_error[i][0] = FLOAT_30;
            format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE;
        }


        for (i = 8; i < 21; i++) {
            // base_quant_error should depend on the scale-factor that would be used
            // during actual encode of the color value.

            float base_quant_error = baseline_quant_error[i] * partition_size * 1.0f;
            float rgb_quantization_error = error_weight_rgbsum * base_quant_error * 2.0f;
            float alpha_quantization_error = error_weight.w * base_quant_error * 2.0f;
            float rgba_quantization_error = rgb_quantization_error + alpha_quantization_error;

            // for 8 integers, we have two encodings: one with HDR alpha and another one
            // with LDR alpha.

            float full_hdr_rgba_error = rgba_quantization_error + rgb_range_error + alpha_range_error;
            best_error[i][3] = full_hdr_rgba_error;
            format_of_choice[i][3] = encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA;

            // for 6 integers, we have one HDR-RGB encoding
            float full_hdr_rgb_error = (rgb_quantization_error * mode11mult) + rgb_range_error + eci->alpha_drop_error;
            best_error[i][2] = full_hdr_rgb_error;
            format_of_choice[i][2] = FMT_HDR_RGB;

            // for 4 integers, we have one HDR-RGB-Scale encoding
            float hdr_rgb_scale_error = (rgb_quantization_error * mode7mult) + rgb_range_error + eci->alpha_drop_error + eci->rgb_luma_error;

            best_error[i][1] = hdr_rgb_scale_error;
            format_of_choice[i][1] = FMT_HDR_RGB_SCALE;

            // for 2 integers, we assume luminance-with-large-range
            float hdr_luminance_error = (rgb_quantization_error * mode23mult) + rgb_range_error + eci->alpha_drop_error + eci->luminance_error;
            best_error[i][0] = hdr_luminance_error;
            format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE;
        }
    }


    else {
        // LDR: quantization levels 0..3 receive the FLOAT_30 error for every integer count
        for (i = 0; i < 4; i++) {
            best_error[i][3] = FLOAT_30;
            best_error[i][2] = FLOAT_30;
            best_error[i][1] = FLOAT_30;
            best_error[i][0] = FLOAT_30;

            format_of_choice[i][3] = FMT_RGBA;
            format_of_choice[i][2] = FMT_RGB;
            format_of_choice[i][1] = FMT_RGB_SCALE;
            format_of_choice[i][0] = FMT_LUMINANCE;
        }


        // pick among the available LDR endpoint modes
        for (i = 4; i < 21; i++) {
            float base_quant_error = baseline_quant_error[i] * partition_size * 1.0f;
            float rgb_quantization_error = error_weight_rgbsum * base_quant_error;
            float alpha_quantization_error = error_weight.w * base_quant_error;
            float rgba_quantization_error = rgb_quantization_error + alpha_quantization_error;

            // for 8 integers, the available encodings are:
            // full LDR RGB-Alpha
            // The quantization error estimate is reduced when blue-contraction and/or
            // offset encoding are flagged as usable; the offset-encoding reduction is
            // only applied for quantization levels up to 18.
            float full_ldr_rgba_error = rgba_quantization_error;
            if (eci->can_blue_contract)
                full_ldr_rgba_error *= 0.625f;
            if (eci->can_offset_encode && i <= 18)
                full_ldr_rgba_error *= 0.5f;
            full_ldr_rgba_error += rgb_range_error + alpha_range_error;

            best_error[i][3] = full_ldr_rgba_error;
            format_of_choice[i][3] = FMT_RGBA;

            // for 6 integers, we have:
            // - an LDR-RGB encoding
            // - an RGBS + Alpha encoding (LDR)

            float full_ldr_rgb_error = rgb_quantization_error;
            if (eci->can_blue_contract)
                full_ldr_rgb_error *= 0.5f;
            if (eci->can_offset_encode && i <= 18)
                full_ldr_rgb_error *= 0.25f;
            full_ldr_rgb_error += eci->alpha_drop_error + rgb_range_error;

            float rgbs_alpha_error = rgba_quantization_error + eci->rgb_scale_error + rgb_range_error + alpha_range_error;

            // keep whichever 6-integer encoding has the lower estimate (ties go to LDR-RGB)
            if (rgbs_alpha_error < full_ldr_rgb_error) {
                best_error[i][2] = rgbs_alpha_error;
                format_of_choice[i][2] = FMT_RGB_SCALE_ALPHA;
            } else {
                best_error[i][2] = full_ldr_rgb_error;
                format_of_choice[i][2] = FMT_RGB;
            }


            // for 4 integers, we have a Luminance-Alpha encoding and the RGBS encoding
            float ldr_rgbs_error = rgb_quantization_error + eci->alpha_drop_error + eci->rgb_scale_error + rgb_range_error;

            float lum_alpha_error = rgba_quantization_error + eci->luminance_error + rgb_range_error + alpha_range_error;

            // keep whichever 4-integer encoding has the lower estimate (ties go to Luminance-Alpha)
            if (ldr_rgbs_error < lum_alpha_error) {
                best_error[i][1] = ldr_rgbs_error;
                format_of_choice[i][1] = FMT_RGB_SCALE;
            } else {
                best_error[i][1] = lum_alpha_error;
                format_of_choice[i][1] = FMT_LUMINANCE_ALPHA;
            }


            // for 2 integers, only the Luminance encoding is evaluated here.
            float luminance_error = rgb_quantization_error + eci->alpha_drop_error + eci->luminance_error + rgb_range_error;

            best_error[i][0] = luminance_error;
            format_of_choice[i][0] = FMT_LUMINANCE;
        }
    }
}

// Single-partition case: for a given bit budget, select the integer count whose
// (format, quantization level) combination has the lowest estimated error.
//
//   combined_best_error / formats_of_choice - tables indexed [quantization level][integer count - 1].
//   bits_available                          - column used in ASTCEncode->quantization_mode_table.
//   best_quantization_level                 - out: table entry for the selected integer count.
//   best_formats                            - out: format for the selection, or FMT_LUMINANCE when
//                                             the reported quantization level is negative.
//   error_of_best_combination               - out: error of the selection (FLOAT_20 if none beat it).
//static
void one_partition_find_best_combination_for_bitcount(float combined_best_error[21][4],
        int formats_of_choice[21][4], int bits_available, int *best_quantization_level, int *best_formats, float *error_of_best_combination, __global ASTC_Encode *ASTCEncode) {
    int winner = -1;
    float winner_error = FLOAT_20;

    int idx = 0;
    while (idx < 4) {
        // quantization level reachable with this number of integers in the given bit budget;
        // -1 marks a format that cannot be represented at all with this few bits.
        const int level = ASTCEncode->quantization_mode_table[idx + 1][bits_available];
        if (level != -1) {
            if (combined_best_error[level][idx] < winner_error) {
                winner_error = combined_best_error[level][idx];
                winner = idx;
            }
        }
        idx++;
    }

    // re-read the table for the winning row (row 0 when nothing was selected)
    const int chosen_level = ASTCEncode->quantization_mode_table[winner + 1][bits_available];

    *best_quantization_level = chosen_level;
    *error_of_best_combination = winner_error;

    if (chosen_level < 0) {
        *best_formats = FMT_LUMINANCE;
    } else {
        *best_formats = formats_of_choice[chosen_level][winner];
    }
}

// Two-partition case: for every quantization level and every total integer count,
// find the pair of per-partition formats with the lowest summed error.
// Only pairs whose per-partition integer counts differ by at most one are considered.
//static
void two_partitions_find_best_combination_for_every_quantization_and_integer_count(float best_error[2][21][4],    // indexed by (partition, quant-level, integer-pair-count-minus-1)
        int format_of_choice[2][21][4],
        float combined_best_error[21][7],    // indexed by (quant-level, integer-pair-count-minus-2)
        int formats_of_choice[21][7][2]) {
    int q, slot;
    int first, second;

    // every slot starts out with the FLOAT_30 "nothing found" error
    for (q = 0; q < 21; q++) {
        for (slot = 0; slot < 7; slot++) {
            combined_best_error[q][slot] = FLOAT_30;
        }
    }

    // quantization levels below 5 are left at FLOAT_30
    for (q = 5; q < 21; q++) {
        for (first = 0; first < 4; first++) {       // integer-count for first endpoint-pair
            for (second = 0; second < 4; second++) {  // integer-count for second endpoint-pair
                int gap = first - second;
                if (gap > 1 || gap < -1)
                    continue;

                slot = first + second;
                float candidate = MIN(best_error[0][q][first] + best_error[1][q][second], FLOAT_10);
                // "<=" so that a later pair with an equal error replaces an earlier one
                if (candidate <= combined_best_error[q][slot]) {
                    combined_best_error[q][slot] = candidate;
                    formats_of_choice[q][slot][0] = format_of_choice[0][q][first];
                    formats_of_choice[q][slot][1] = format_of_choice[1][q][second];
                }
            }
        }
    }
}

// Two-partition case: for a given bit budget, select the total integer count whose
// combination (two formats + a quantization level) has the lowest estimated error.
//
//   combined_best_error / formats_of_choice - tables indexed [quantization level][integer count - 2].
//   best_quantization_level      - out: quantization_mode_table entry at 'bits_available'.
//   best_quantization_level_mod  - out: quantization_mode_table entry at 'bits_available + 2'.
//   best_formats                 - out: two formats; both FMT_LUMINANCE when the level is negative.
//   error_of_best_combination    - out: error of the selection (FLOAT_20 if none beat it).
//static
void two_partitions_find_best_combination_for_bitcount(float combined_best_error[21][7],
        int formats_of_choice[21][7][2],
        int bits_available, int
        *best_quantization_level,
        int *best_quantization_level_mod,
        int *best_formats,
        float *error_of_best_combination,
        __global ASTC_Encode *ASTCEncode) {
    int winner = 0;
    float winner_error = FLOAT_20;

    int pairs = 2;
    while (pairs <= 8) {
        // quantization level reachable with this number of integers in the given bit budget
        int level = ASTCEncode->quantization_mode_table[pairs][bits_available];
        if (level == -1)
            break;                // not enough bits to represent this endpoint format at all; stop scanning.

        float candidate = combined_best_error[level][pairs - 2];
        if (candidate < winner_error) {
            winner_error = candidate;
            winner = pairs;
        }
        pairs++;
    }

    int chosen_level = ASTCEncode->quantization_mode_table[winner][bits_available];
    int chosen_level_mod = ASTCEncode->quantization_mode_table[winner][bits_available + 2];

    *best_quantization_level = chosen_level;
    *best_quantization_level_mod = chosen_level_mod;
    *error_of_best_combination = winner_error;

    int p;
    if (chosen_level < 0) {
        for (p = 0; p < 2; p++)
            best_formats[p] = FMT_LUMINANCE;
    } else {
        for (p = 0; p < 2; p++)
            best_formats[p] = formats_of_choice[chosen_level][winner - 2][p];
    }
}

#ifdef ENABLE_3_PARTITION_CODE
// 4
// Three-partition case: for every quantization level and every total integer count,
// find the triple of per-partition formats with the lowest summed error.
// Only triples whose per-partition integer counts span at most one step are considered.
static void three_partitions_find_best_combination_for_every_quantization_and_integer_count(float best_error[3][21][4],    // indexed by (partition, quant-level, integer-count)
        int format_of_choice[3][21][4], float combined_best_error[21][10], int formats_of_choice[21][10][3]) {
    int q, slot;
    int a, b, c;

    // every slot starts out with the FLOAT_30 "nothing found" error
    for (q = 0; q < 21; q++) {
        for (slot = 0; slot < 10; slot++) {
            combined_best_error[q][slot] = FLOAT_30;
        }
    }

    // quantization levels below 5 are left at FLOAT_30
    for (q = 5; q < 21; q++) {
        for (a = 0; a < 4; a++) {          // integer-count for first endpoint-pair
            for (b = 0; b < 4; b++) {      // integer-count for second endpoint-pair
                int lo_ab = (a < b) ? a : b;
                int hi_ab = (a < b) ? b : a;
                if (hi_ab - lo_ab > 1)
                    continue;

                for (c = 0; c < 4; c++) {  // integer-count for third endpoint-pair
                    int lo_abc = (c < lo_ab) ? c : lo_ab;
                    int hi_abc = (c > hi_ab) ? c : hi_ab;
                    if (hi_abc - lo_abc > 1)
                        continue;

                    slot = a + b + c;
                    float candidate = MIN(best_error[0][q][a] + best_error[1][q][b] + best_error[2][q][c], FLOAT_10);
                    // "<=" so that a later triple with an equal error replaces an earlier one
                    if (candidate <= combined_best_error[q][slot]) {
                        combined_best_error[q][slot] = candidate;
                        formats_of_choice[q][slot][0] = format_of_choice[0][q][a];
                        formats_of_choice[q][slot][1] = format_of_choice[1][q][b];
                        formats_of_choice[q][slot][2] = format_of_choice[2][q][c];
                    }
                }
            }
        }
    }
}


// 4
// Three-partition case: for a given bit budget, select the total integer count whose
// combination (three formats + a quantization level) has the lowest estimated error.
//
//   combined_best_error / formats_of_choice - tables indexed [quantization level][integer count - 3].
//   best_quantization_level      - out: quantization_mode_table entry at 'bits_available'.
//   best_quantization_level_mod  - out: quantization_mode_table entry at 'bits_available + 5'.
//   best_formats                 - out: three formats; all FMT_LUMINANCE when the level is negative.
//   error_of_best_combination    - out: error of the selection (FLOAT_20 if none beat it).
static void three_partitions_find_best_combination_for_bitcount(float combined_best_error[21][10],
        int formats_of_choice[21][10][3],
        int bits_available, int *best_quantization_level, int *best_quantization_level_mod, int *best_formats, float *error_of_best_combination, __global ASTC_Encode *ASTCEncode) {
    int winner = 0;
    float winner_error = FLOAT_20;

    int pairs = 3;
    while (pairs <= 9) {
        // quantization level reachable with this number of integers in the given bit budget
        int level = ASTCEncode->quantization_mode_table[pairs][bits_available];
        if (level == -1)
            break;                // not enough bits to represent this endpoint format at all; stop scanning.

        float candidate = combined_best_error[level][pairs - 3];
        if (candidate < winner_error) {
            winner_error = candidate;
            winner = pairs;
        }
        pairs++;
    }

    int chosen_level = ASTCEncode->quantization_mode_table[winner][bits_available];
    int chosen_level_mod = ASTCEncode->quantization_mode_table[winner][bits_available + 5];

    *best_quantization_level = chosen_level;
    *best_quantization_level_mod = chosen_level_mod;
    *error_of_best_combination = winner_error;

    int p;
    if (chosen_level < 0) {
        for (p = 0; p < 3; p++)
            best_formats[p] = FMT_LUMINANCE;
    } else {
        for (p = 0; p < 3; p++)
            best_formats[p] = formats_of_choice[chosen_level][winner - 3][p];
    }
}
#endif

#ifdef ENABLE_4_PARTITION_CODE
// 4
// Four-partition case: for every quantization level and every total integer count,
// find the quadruple of per-partition formats with the lowest summed error.
// Only quadruples whose per-partition integer counts span at most one step are considered.
static void four_partitions_find_best_combination_for_every_quantization_and_integer_count(float best_error[4][21][4],    // indexed by (partition, quant-level, integer-count)
        int format_of_choice[4][21][4], float combined_best_error[21][13], int formats_of_choice[21][13][4]) {
    int q, slot;
    int a, b, c, d;

    // every slot starts out with the FLOAT_30 "nothing found" error
    for (q = 0; q < 21; q++) {
        for (slot = 0; slot < 13; slot++) {
            combined_best_error[q][slot] = FLOAT_30;
        }
    }

    // quantization levels below 5 are left at FLOAT_30
    for (q = 5; q < 21; q++) {
        for (a = 0; a < 4; a++) {              // integer-count for first endpoint-pair
            for (b = 0; b < 4; b++) {          // integer-count for second endpoint-pair
                int lo_ab = (a < b) ? a : b;
                int hi_ab = (a < b) ? b : a;
                if (hi_ab - lo_ab > 1)
                    continue;

                for (c = 0; c < 4; c++) {      // integer-count for third endpoint-pair
                    int lo_abc = (c < lo_ab) ? c : lo_ab;
                    int hi_abc = (c > hi_ab) ? c : hi_ab;
                    if (hi_abc - lo_abc > 1)
                        continue;

                    for (d = 0; d < 4; d++) {  // integer-count for fourth endpoint-pair
                        int lo_all = (d < lo_abc) ? d : lo_abc;
                        int hi_all = (d > hi_abc) ? d : hi_abc;
                        if (hi_all - lo_all > 1)
                            continue;

                        slot = a + b + c + d;
                        float candidate = MIN(best_error[0][q][a] + best_error[1][q][b] + best_error[2][q][c] + best_error[3][q][d], FLOAT_10);
                        // "<=" so that a later quadruple with an equal error replaces an earlier one
                        if (candidate <= combined_best_error[q][slot]) {
                            combined_best_error[q][slot] = candidate;
                            formats_of_choice[q][slot][0] = format_of_choice[0][q][a];
                            formats_of_choice[q][slot][1] = format_of_choice[1][q][b];
                            formats_of_choice[q][slot][2] = format_of_choice[2][q][c];
                            formats_of_choice[q][slot][3] = format_of_choice[3][q][d];
                        }
                    }
                }
            }
        }
    }
}

// 4
// Four-partition case: for a given bit budget, select the total integer count whose
// combination (four formats + a quantization level) has the lowest estimated error.
//
//   combined_best_error / formats_of_choice - tables indexed [quantization level][integer count - 4].
//   best_quantization_level      - out: quantization_mode_table entry at 'bits_available'.
//   best_quantization_level_mod  - out: quantization_mode_table entry at 'bits_available + 8'.
//   best_formats                 - out: four formats; all FMT_LUMINANCE when the level is negative.
//   error_of_best_combination    - out: error of the selection (FLOAT_20 if none beat it).
static void four_partitions_find_best_combination_for_bitcount(float combined_best_error[21][13],
        int formats_of_choice[21][13][4],
        int bits_available, int *best_quantization_level, int *best_quantization_level_mod, int *best_formats, float *error_of_best_combination, __global ASTC_Encode *ASTCEncode) {
    int winner = -4;
    float winner_error = FLOAT_20;

    int pairs = 4;
    while (pairs <= 9) {
        // quantization level reachable with this number of integers in the given bit budget
        int level = ASTCEncode->quantization_mode_table[pairs][bits_available];
        if (level == -1)
            break;                // not enough bits to represent this endpoint format at all; stop scanning.

        float candidate = combined_best_error[level][pairs - 4];
        if (candidate < winner_error) {
            winner_error = candidate;
            winner = pairs;
        }
        pairs++;
    }

    // nothing selected: fall back to row 0 of the quantization mode table
    if (winner < 0)
        winner = 0;

    int chosen_level = ASTCEncode->quantization_mode_table[winner][bits_available];
    int chosen_level_mod = ASTCEncode->quantization_mode_table[winner][bits_available + 8];

    *best_quantization_level = chosen_level;
    *best_quantization_level_mod = chosen_level_mod;
    *error_of_best_combination = winner_error;

    int p;
    if (chosen_level < 0) {
        for (p = 0; p < 4; p++)
            best_formats[p] = FMT_LUMINANCE;
    } else {
        for (p = 0; p < 4; p++)
            best_formats[p] = formats_of_choice[chosen_level][winner - 4][p];
    }
}
#endif

// Pick up to four weight modes, together with the endpoint quantization level and
// per-partition endpoint formats that give each of them the lowest estimated error.
//
// Inputs:
//   pt, blk, ewb        - partition info, image block and error-weight block of the block being encoded.
//   ep                  - endpoints, forwarded to the per-partition color-error estimate.
//   separate_component  - separate color component for 2-plane mode; -1 for single-plane mode.
//   qwt_bitcounts       - per weight mode: value passed as 'bits_available' to the *_for_bitcount helpers.
//   qwt_errors          - per weight mode: error added to the endpoint error; modes with an
//                         error >= FLOAT_29 are rejected.
// Outputs (entry i describes the i-th best candidate, i = 0..3):
//   quantized_weight[i]        - index of the selected weight mode, or -1 when no candidate was left.
//   quantization_level[i], quantization_level_mod[i],
//   partition_format_specifiers[i][0..partition_count-1]
//                              - written only when quantized_weight[i] >= 0.
//
// If partition_count is not handled by any compiled-in branch (for example 3 or 4 with the
// matching ENABLE_*_PARTITION_CODE macro undefined), every weight mode is rejected and all
// four quantized_weight entries are set to -1.
void determine_optimal_set_of_endpoint_formats_to_use(
    __global partition_info * pt,
    imageblock * blk,
    error_weight_block * ewb,
    endpoints * ep,
    int separate_component,    // separate color component for 2-plane mode; -1 for single-plane mode
    // bitcounts and errors computed for the various quantization methods
    int *qwt_bitcounts, float *qwt_errors,
    // output data
    int partition_format_specifiers[4][4],
    int quantized_weight[4],
    int quantization_level[4], int quantization_level_mod[4],
    __global ASTC_Encode *ASTCEncode) {
    int i, j;
    int partition_count = pt->partition_count;

    int encode_hdr_rgb = blk->rgb_lns[0];
    int encode_hdr_alpha = blk->alpha_lns[0];


    // call a helper function to compute the errors that result from various
    // encoding choices (such as using luminance instead of RGB, discarding Alpha,
    // using RGB-scale in place of two separate RGB endpoints and so on)
    encoding_choice_errors eci[4];
    compute_encoding_choice_errors(blk, pt, ewb, separate_component, eci, ASTCEncode);

    // for each partition, compute the error weights to apply for that partition.
    float4 error_weightings[4];
    float4 dummied_color_scalefactors[4];    // only used to receive data
    compute_partition_error_color_weightings(ewb, pt, error_weightings, dummied_color_scalefactors, ASTCEncode);


    // per-partition tables of (quantization level, integer count) -> error / format
    float best_error[4][21][4];
    int format_of_choice[4][21][4];
    for (i = 0; i < partition_count; i++)
        compute_color_error_for_every_integer_count_and_quantization_level(encode_hdr_rgb, encode_hdr_alpha, i, pt, &(eci[i]), ep, error_weightings, best_error[i], format_of_choice[i]);

    // per-weight-mode results; entries of rejected modes only have their error set
    float errors_of_best_combination[MAX_WEIGHT_MODES];
    int best_quantization_levels[MAX_WEIGHT_MODES];
    int best_quantization_levels_mod[MAX_WEIGHT_MODES];
    int best_ep_formats[MAX_WEIGHT_MODES][4];

    // code for the case where the block contains 1 partition
    if (partition_count == 1) {
        int best_quantization_level;
        int best_format;
        float error_of_best_combination;
        for (i = 0; i < MAX_WEIGHT_MODES; i++) {
            if (qwt_errors[i] >= FLOAT_29) {
                errors_of_best_combination[i] = FLOAT_30;
                continue;
            }

            one_partition_find_best_combination_for_bitcount(best_error[0], format_of_choice[0], qwt_bitcounts[i],
                    &best_quantization_level, &best_format, &error_of_best_combination,ASTCEncode);
            error_of_best_combination += qwt_errors[i];

            errors_of_best_combination[i] = error_of_best_combination;
            best_quantization_levels[i] = best_quantization_level;
            // with a single partition the "mod" level is the same as the plain level
            best_quantization_levels_mod[i] = best_quantization_level;
            best_ep_formats[i][0] = best_format;
        }
    }

    // code for the case where the block contains 2 partitions
    else if (partition_count == 2) {
        int best_quantization_level;
        int best_quantization_level_mod;
        int best_formats[2];
        float error_of_best_combination;

        float combined_best_error[21][7];
        int formats_of_choice[21][7][2];

        two_partitions_find_best_combination_for_every_quantization_and_integer_count(best_error, format_of_choice, combined_best_error, formats_of_choice);


        for (i = 0; i < MAX_WEIGHT_MODES; i++) {
            if (qwt_errors[i] >= FLOAT_29) {
                errors_of_best_combination[i] = FLOAT_30;
                continue;
            }

            two_partitions_find_best_combination_for_bitcount(combined_best_error, formats_of_choice, qwt_bitcounts[i],
                    &best_quantization_level, &best_quantization_level_mod, best_formats, &error_of_best_combination,
                    ASTCEncode);

            error_of_best_combination += qwt_errors[i];

            errors_of_best_combination[i] = error_of_best_combination;
            best_quantization_levels[i] = best_quantization_level;
            best_quantization_levels_mod[i] = best_quantization_level_mod;
            best_ep_formats[i][0] = best_formats[0];
            best_ep_formats[i][1] = best_formats[1];
        }
    }

#ifdef ENABLE_3_PARTITION_CODE
    // code for the case where the block contains 3 partitions
    else if (partition_count == 3) {
        int best_quantization_level;
        int best_quantization_level_mod;
        int best_formats[3];
        float error_of_best_combination;

        float combined_best_error[21][10];
        int formats_of_choice[21][10][3];

        three_partitions_find_best_combination_for_every_quantization_and_integer_count(best_error, format_of_choice, combined_best_error, formats_of_choice);

        for (i = 0; i < MAX_WEIGHT_MODES; i++) {
            if (qwt_errors[i] >= FLOAT_29) {
                errors_of_best_combination[i] = FLOAT_30;
                continue;
            }

            three_partitions_find_best_combination_for_bitcount(combined_best_error,
                    formats_of_choice, qwt_bitcounts[i], &best_quantization_level, &best_quantization_level_mod, best_formats,
                    &error_of_best_combination, ASTCEncode);
            error_of_best_combination += qwt_errors[i];

            errors_of_best_combination[i] = error_of_best_combination;
            best_quantization_levels[i] = best_quantization_level;
            best_quantization_levels_mod[i] = best_quantization_level_mod;
            best_ep_formats[i][0] = best_formats[0];
            best_ep_formats[i][1] = best_formats[1];
            best_ep_formats[i][2] = best_formats[2];
        }
    }
#endif

#ifdef ENABLE_4_PARTITION_CODE
    // code for the case where the block contains 4 partitions
    else if (partition_count == 4) {
        int best_quantization_level;
        int best_quantization_level_mod;
        int best_formats[4];
        float error_of_best_combination;

        float combined_best_error[21][13];
        int formats_of_choice[21][13][4];

        four_partitions_find_best_combination_for_every_quantization_and_integer_count(best_error, format_of_choice, combined_best_error, formats_of_choice);

        for (i = 0; i < MAX_WEIGHT_MODES; i++) {
            if (qwt_errors[i] >= FLOAT_29) {
                errors_of_best_combination[i] = FLOAT_30;
                continue;
            }
            four_partitions_find_best_combination_for_bitcount(combined_best_error,
                    formats_of_choice, qwt_bitcounts[i], &best_quantization_level, &best_quantization_level_mod, best_formats,
                    &error_of_best_combination, ASTCEncode);
            error_of_best_combination += qwt_errors[i];

            errors_of_best_combination[i] = error_of_best_combination;
            best_quantization_levels[i] = best_quantization_level;
            best_quantization_levels_mod[i] = best_quantization_level_mod;
            best_ep_formats[i][0] = best_formats[0];
            best_ep_formats[i][1] = best_formats[1];
            best_ep_formats[i][2] = best_formats[2];
            best_ep_formats[i][3] = best_formats[3];
        }
    }
#endif

    // partition count not handled by any compiled-in branch: reject every weight mode
    // so the selection below never reads uninitialised results.
    else {
        for (i = 0; i < MAX_WEIGHT_MODES; i++)
            errors_of_best_combination[i] = FLOAT_30;
    }

    // finally, go through the results and pick the 4 best-looking modes.

    int best_error_weights[4];

    for (i = 0; i < 4; i++) {
        float best_ep_error = FLOAT_30;
        int best_error_index = -1;
        for (j = 0; j < MAX_WEIGHT_MODES; j++) {
            // The error test comes first: rejected modes carry FLOAT_30, which never passes
            // it, so their (unset) quantization level is not read.
            // Modes whose quantization level is below 5 are not eligible.
            if (errors_of_best_combination[j] < best_ep_error && best_quantization_levels[j] >= 5) {
                best_ep_error = errors_of_best_combination[j];
                best_error_index = j;
            }
        }
        best_error_weights[i] = best_error_index;

        // knock the winner out so the next pass finds the next-best mode
        if(best_error_index >= 0) {
            errors_of_best_combination[best_error_index] = FLOAT_30;
        }
    }

    for (i = 0; i < 4; i++) {
        quantized_weight[i] = best_error_weights[i];
        if (quantized_weight[i] >= 0) {
            quantization_level[i] = best_quantization_levels[best_error_weights[i]];
            quantization_level_mod[i] = best_quantization_levels_mod[best_error_weights[i]];
            for (j = 0; j < partition_count; j++) {
                partition_format_specifiers[i][j] = best_ep_formats[best_error_weights[i]][j];
            }
        }
    }
}

//==================================================================================

/*
     quantize an LDR RGB color. Since this is a fallback encoding, we cannot actually
     fail but must just go on until we can produce a sensible result.

     Due to how this encoding works, color0 cannot be larger than color1; as such,
     if color0 is actually larger than color1, then color0 is reduced and color1 is
     increased until color0 is no longer larger than color1.
*/
// Table lookup helper: saturates 'value' to the valid index range [0,255] and
// returns the matching entry of color_quantization_tables for the given level.
int cqt_lookup(int quantization_level, int value) {
    int index = value;

    if (index > 255) {
        index = 255;
    }
    if (index < 0) {
        index = 0;
    }

    return color_quantization_tables[quantization_level][index];
}

// Saturate a value to the range [0,255]. NaN maps to 0.
float clamp255(float val) {
    // The negated comparison is true for NaN as well as for anything <= 0,
    // so both cases collapse to zero here. Do not rewrite it as (val <= 0.0f).
    if (!(val > 0.0f)) {
        return 0.0f;
    }

    return (val > 255.0f) ? 255.0f : val;
}

// Saturate a value to the range [0,1]. NaN maps to 0.
float clamp01(float val) {
    // The negated comparison is true for NaN as well as for anything <= 0,
    // so both cases collapse to zero here. Do not rewrite it as (val <= 0.0f).
    if (!(val > 0.0f)) {
        return 0.0f;
    }

    return (val > 1.0f) ? 1.0f : val;
}

// Quantize a pair of LDR RGB endpoints (fallback encoding, never fails).
//
// The inputs are scaled by 1/257 and clamped to [0,255]. The encoded endpoint 0
// must not unquantize to a larger R+G+B sum than endpoint 1, so the rounding
// bias of endpoint 0 is lowered and that of endpoint 1 raised (0.2 per round)
// until the ordering holds.
//
// output[] receives: r0, r1, g0, g1, b0, b1 (quantized codes).
void quantize_rgb(
    float4 color0,    // LDR: 0=lowest, 255=highest
    float4 color1,
    int output[6],
    int quantization_level) {
    int ch;
    int low_code[3];
    int high_code[3];
    float low_chan[3];
    float high_chan[3];
    float low_bias = 0.5f;
    float high_bias = 0.5f;

    color0.xyz = color0.xyz * (1.0f / 257.0f);
    color1.xyz = color1.xyz * (1.0f / 257.0f);

    low_chan[0] = clamp255(color0.x);
    low_chan[1] = clamp255(color0.y);
    low_chan[2] = clamp255(color0.z);

    high_chan[0] = clamp255(color1.x);
    high_chan[1] = clamp255(color1.y);
    high_chan[2] = clamp255(color1.z);

    for (;;) {
        int low_sum = 0;
        int high_sum = 0;

        for (ch = 0; ch < 3; ch++) {
            int low_unq;
            int high_unq;

            // cqt_lookup saturates the index, which the growing bias may push out of range
            low_code[ch] = cqt_lookup(quantization_level, (int)floor(low_chan[ch] + low_bias));
            high_code[ch] = cqt_lookup(quantization_level, (int)floor(high_chan[ch] + high_bias));

            low_unq = color_unquantization_tables[quantization_level][low_code[ch]];
            high_unq = color_unquantization_tables[quantization_level][high_code[ch]];

            low_sum += low_unq;
            high_sum += high_unq;
        }

        // done as soon as endpoint 0 is no longer brighter than endpoint 1
        if (low_sum <= high_sum) {
            break;
        }

        low_bias -= 0.2f;
        high_bias += 0.2f;
    }

    for (ch = 0; ch < 3; ch++) {
        output[2 * ch] = low_code[ch];
        output[2 * ch + 1] = high_code[ch];
    }
}

/*
    Quantize a pair of LDR RGBA endpoints.
    output[0..5] are filled by quantize_rgb(); output[6] and output[7] receive
    the quantized alpha of color0 and color1 respectively.
*/
void quantize_rgba(
    float4 color0,
    float4 color1,
    int output[8], int quantization_level) {
    float alpha_low;
    float alpha_high;

    color0.w = color0.w * (1.0f / 257.0f);
    color1.w = color1.w * (1.0f / 257.0f);

    alpha_low = clamp255(color0.w);
    alpha_high = clamp255(color1.w);

    // clamp255 guarantees the rounded index is within 0..255
    output[6] = color_quantization_tables[quantization_level][(int)floor(alpha_low + 0.5f)];
    output[7] = color_quantization_tables[quantization_level][(int)floor(alpha_high + 0.5f)];

    quantize_rgb(color0, color1, output, quantization_level);
}

/*
    Attempt to quantize RGB endpoint values with blue-contraction.

    Returns 1 on success, in which case output[0..5] holds the codes in the
    order r1, r0, g1, g0, b1, b0 (the endpoints are stored swapped).
    Returns 0 on failure, in which case output[] is left untouched.
*/
int try_quantize_rgb_blue_contract(
    float4 color0,    // assumed to be the smaller color
    float4 color1,    // assumed to be the larger color
    int output[6], int quantization_level) {
    color0.xyz = color0.xyz * (1.0f / 257.0f);
    color1.xyz = color1.xyz * (1.0f / 257.0f);

    float r0 = color0.x;
    float g0 = color0.y;
    float b0 = color0.z;

    float r1 = color1.x;
    float g1 = color1.y;
    float b1 = color1.z;

    // inverse blue-contraction. This can produce an overflow;
    // just bail out immediately if this is the case.
    r0 += (r0 - b0);
    g0 += (g0 - b0);
    r1 += (r1 - b1);
    g1 += (g1 - b1);

    // The range test is written in its positive form on purpose: every comparison
    // with NaN is false, so a NaN component fails the test and we bail out instead
    // of converting NaN to an integer table index below.
    if (!(r0 >= 0.0f && r0 <= 255.0f) || !(g0 >= 0.0f && g0 <= 255.0f) || !(b0 >= 0.0f && b0 <= 255.0f) ||
            !(r1 >= 0.0f && r1 <= 255.0f) || !(g1 >= 0.0f && g1 <= 255.0f) || !(b1 >= 0.0f && b1 <= 255.0f)) {
        return 0;
    }

    // quantize the inverse-blue-contracted color
    int ri0 = color_quantization_tables[quantization_level][(int)floor(r0 + 0.5f)];
    int gi0 = color_quantization_tables[quantization_level][(int)floor(g0 + 0.5f)];
    int bi0 = color_quantization_tables[quantization_level][(int)floor(b0 + 0.5f)];
    int ri1 = color_quantization_tables[quantization_level][(int)floor(r1 + 0.5f)];
    int gi1 = color_quantization_tables[quantization_level][(int)floor(g1 + 0.5f)];
    int bi1 = color_quantization_tables[quantization_level][(int)floor(b1 + 0.5f)];

    // then unquantize again
    int ru0 = color_unquantization_tables[quantization_level][ri0];
    int gu0 = color_unquantization_tables[quantization_level][gi0];
    int bu0 = color_unquantization_tables[quantization_level][bi0];
    int ru1 = color_unquantization_tables[quantization_level][ri1];
    int gu1 = color_unquantization_tables[quantization_level][gi1];
    int bu1 = color_unquantization_tables[quantization_level][bi1];

    // if color #1 is not larger than color #0, then blue-contraction is not a valid approach.
    // note that blue-contraction and quantization may itself change this order, which is why
    // we must only test AFTER blue-contraction.
    if (ru1 + gu1 + bu1 <= ru0 + gu0 + bu0)
        return 0;

    // endpoints are emitted in swapped order (color1 first)
    output[0] = ri1;
    output[1] = ri0;
    output[2] = gi1;
    output[3] = gi0;
    output[4] = bi1;
    output[5] = bi0;

    return 1;
}

/*
    Attempt to quantize a pair of RGBA endpoints with blue-contraction.
    The alpha codes are always written (swapped: color1 -> output[6],
    color0 -> output[7]); the return value is that of
    try_quantize_rgb_blue_contract() for the RGB part.
*/
int try_quantize_rgba_blue_contract(float4 color0, float4 color1, int output[8], int quantization_level) {
    float alpha_first;
    float alpha_second;
    int idx_first;
    int idx_second;

    color0.w = color0.w * (1.0f / 257.0f);
    color1.w = color1.w * (1.0f / 257.0f);

    alpha_first = clamp255(color0.w);
    alpha_second = clamp255(color1.w);

    idx_first = (int)floor(alpha_first + 0.5f);
    idx_second = (int)floor(alpha_second + 0.5f);

    // alpha is stored in swapped order, matching the swapped RGB endpoints
    output[7] = color_quantization_tables[quantization_level][idx_first];
    output[6] = color_quantization_tables[quantization_level][idx_second];

    return try_quantize_rgb_blue_contract(color0, color1, output, quantization_level);
}

// Delta encoding of an RGB endpoint pair.
//
// At decode time one bit is moved from the offset to the base and another bit of
// the offset is used as a sign bit; both values are then unquantized as if they
// carried one extra bit. A regular (non blue-contracted) delta requires the sum
// of the three decoded offsets to be nonnegative.
//
// Returns 1 on success with output[] = { r_base, r_offset, g_base, g_offset,
// b_base, b_offset }; returns 0 (output[] untouched) if the pair is not encodable.
int try_quantize_rgb_delta(float4 color0, float4 color1, int output[6], int quantization_level) {
    int ch;
    int offset_total = 0;
    int flipped_bits = 0;
    float base_f[3];
    float target_f[3];
    int base_code[3];     // quantized base value
    int base_u9[3];       // base after quantize/unquantize, as a 9-bit value
    int delta[3];         // offset as it is handed to the quantizer
    int delta_code[3];    // quantized offset
    int delta_unq[3];     // offset after quantize/unquantize

    color0.xyz = color0.xyz * (1.0f / 257.0f);
    color1.xyz = color1.xyz * (1.0f / 257.0f);

    base_f[0] = clamp255(color0.x);
    base_f[1] = clamp255(color0.y);
    base_f[2] = clamp255(color0.z);

    target_f[0] = clamp255(color1.x);
    target_f[1] = clamp255(color1.y);
    target_f[2] = clamp255(color1.z);

    // Base value: widen to 9 bits, quantize/unquantize the low 8 bits and put
    // the 9th bit back. Differences are taken against this reconstructed value.
    for (ch = 0; ch < 3; ch++) {
        int wide = ((int)floor(base_f[ch] + 0.5f)) << 1;
        int unq;

        base_code[ch] = color_quantization_tables[quantization_level][wide & 0xFF];
        unq = color_unquantization_tables[quantization_level][base_code[ch]];
        base_u9[ch] = unq | (wide & 0x100);
    }

    // Offset of the second endpoint; it must fit the signed range [-64, 63].
    for (ch = 0; ch < 3; ch++) {
        delta[ch] = (((int)floor(target_f[ch] + 0.5f)) << 1) - base_u9[ch];
        if (delta[ch] > 63 || delta[ch] < -64)
            return 0;
    }

    // Put the top bit of the base into bit 7 of the offset, then quantize and
    // unquantize it.
    for (ch = 0; ch < 3; ch++) {
        delta[ch] = (delta[ch] & 0x7F) | ((base_u9[ch] & 0x100) >> 1);
        delta_code[ch] = color_quantization_tables[quantization_level][delta[ch]];
        delta_unq[ch] = color_unquantization_tables[quantization_level][delta_code[ch]];
        flipped_bits |= delta[ch] ^ delta_unq[ch];
    }

    // If either of the top two bits flipped, the top bit of the base or the sign
    // bit of the offset has been corrupted and encoding fails.
    if (flipped_bits & 0xC0)
        return 0;

    // Sign-extend the 7-bit offsets; their sum must be nonnegative.
    for (ch = 0; ch < 3; ch++) {
        delta_unq[ch] &= 0x7f;
        if (delta_unq[ch] & 0x40)
            delta_unq[ch] -= 0x80;
        offset_total += delta_unq[ch];
    }
    if (offset_total < 0)
        return 0;

    // base + offset must also be a legitimate 9-bit value.
    for (ch = 0; ch < 3; ch++) {
        int reconstructed = delta_unq[ch] + base_u9[ch];
        if (reconstructed < 0 || reconstructed > 0x1FF)
            return 0;
    }

    // All checks passed; emit the codes.
    for (ch = 0; ch < 3; ch++) {
        output[2 * ch] = base_code[ch];
        output[2 * ch + 1] = delta_code[ch];
    }

    return 1;
}

// Attempt to quantize an RGB endpoint pair with delta encoding combined with
// blue-contraction.
//
// The endpoints are swapped up front (color1 becomes the base), inverse
// blue-contraction is applied, and the result is delta-encoded exactly like
// try_quantize_rgb_delta() except that the sum of the decoded offsets must be
// NEGATIVE.
//
// Returns 1 on success with output[] = { r_base, r_offset, g_base, g_offset,
// b_base, b_offset }; returns 0 (output[] untouched) if the pair is not encodable.
int try_quantize_rgb_delta_blue_contract(float4 color0, float4 color1, int output[6], int quantization_level) {
    int ch;
    int offset_total = 0;
    int flipped_bits = 0;
    float base_f[3];
    float target_f[3];
    int base_code[3];     // quantized base value
    int base_u9[3];       // base after quantize/unquantize, as a 9-bit value
    int delta[3];         // offset as it is handed to the quantizer
    int delta_code[3];    // quantized offset
    int delta_unq[3];     // offset after quantize/unquantize

    color0.xyz = color0.xyz * (1.0f / 257.0f);
    color1.xyz = color1.xyz * (1.0f / 257.0f);

    // switch around endpoint colors already at start.
    base_f[0] = color1.x;
    base_f[1] = color1.y;
    base_f[2] = color1.z;

    target_f[0] = color0.x;
    target_f[1] = color0.y;
    target_f[2] = color0.z;

    // inverse blue-contraction. This step can perform an overflow, in which case
    // we will bail out immediately.
    base_f[0] += (base_f[0] - base_f[2]);
    base_f[1] += (base_f[1] - base_f[2]);
    target_f[0] += (target_f[0] - target_f[2]);
    target_f[1] += (target_f[1] - target_f[2]);

    // The range test is written in its positive form on purpose: every comparison
    // with NaN is false, so a NaN component fails the test and we bail out instead
    // of converting NaN to an integer below.
    for (ch = 0; ch < 3; ch++) {
        if (!(base_f[ch] >= 0.0f && base_f[ch] <= 255.0f))
            return 0;
        if (!(target_f[ch] >= 0.0f && target_f[ch] <= 255.0f))
            return 0;
    }

    // Base value: widen to 9 bits, quantize/unquantize the low 8 bits and put
    // the 9th bit back. Differences are taken against this reconstructed value.
    for (ch = 0; ch < 3; ch++) {
        int wide = ((int)floor(base_f[ch] + 0.5f)) << 1;
        int unq;

        base_code[ch] = color_quantization_tables[quantization_level][wide & 0xFF];
        unq = color_unquantization_tables[quantization_level][base_code[ch]];
        base_u9[ch] = unq | (wide & 0x100);
    }

    // Offset of the second endpoint; it must fit the signed range [-64, 63].
    for (ch = 0; ch < 3; ch++) {
        delta[ch] = (((int)floor(target_f[ch] + 0.5f)) << 1) - base_u9[ch];
        if (delta[ch] > 63 || delta[ch] < -64)
            return 0;
    }

    // Put the top bit of the base into bit 7 of the offset, then quantize and
    // unquantize it.
    for (ch = 0; ch < 3; ch++) {
        delta[ch] = (delta[ch] & 0x7F) | ((base_u9[ch] & 0x100) >> 1);
        delta_code[ch] = color_quantization_tables[quantization_level][delta[ch]];
        delta_unq[ch] = color_unquantization_tables[quantization_level][delta_code[ch]];
        flipped_bits |= delta[ch] ^ delta_unq[ch];
    }

    // If either of the top two bits flipped, the top bit of the base or the sign
    // bit of the offset has been corrupted and encoding fails.
    if (flipped_bits & 0xC0)
        return 0;

    // Sign-extend the 7-bit offsets. Their sum must be negative; note that this
    // is the inverse of the test for non-blue-contracted RGB.
    for (ch = 0; ch < 3; ch++) {
        delta_unq[ch] &= 0x7f;
        if (delta_unq[ch] & 0x40)
            delta_unq[ch] -= 0x80;
        offset_total += delta_unq[ch];
    }
    if (offset_total >= 0)
        return 0;

    // base + offset must also be a legitimate 9-bit value.
    for (ch = 0; ch < 3; ch++) {
        int reconstructed = delta_unq[ch] + base_u9[ch];
        if (reconstructed < 0 || reconstructed > 0x1FF)
            return 0;
    }

    // All checks passed; emit the codes.
    for (ch = 0; ch < 3; ch++) {
        output[2 * ch] = base_code[ch];
        output[2 * ch + 1] = delta_code[ch];
    }

    return 1;
}

// Attempt to delta-encode the alpha channel of an endpoint pair. The steps
// mirror the RGB delta encoder, applied to a single channel and without any
// constraint on the sign of the offset.
//
// Returns 1 on success with output[6] = base code and output[7] = offset code;
// returns 0 (output[] untouched) if the pair is not encodable.
int try_quantize_alpha_delta(float4 color0, float4 color1, int output[8], int quantization_level) {
    float base_f;
    float target_f;
    int base_wide;
    int base_code;
    int base_u9;
    int delta;
    int delta_code;
    int delta_unq;
    int reconstructed;

    color0.w = color0.w * (1.0f / 257.0f);
    color1.w = color1.w * (1.0f / 257.0f);

    base_f = clamp255(color0.w);
    target_f = clamp255(color1.w);

    // base: widen to 9 bits, quantize/unquantize the low 8 bits, restore bit 8
    base_wide = ((int)floor(base_f + 0.5f)) << 1;
    base_code = color_quantization_tables[quantization_level][base_wide & 0xFF];
    base_u9 = color_unquantization_tables[quantization_level][base_code];
    base_u9 |= base_wide & 0x100;

    // offset against the reconstructed base; must fit [-64, 63]
    delta = (((int)floor(target_f + 0.5f)) << 1) - base_u9;
    if (delta < -64 || delta > 63)
        return 0;

    // bit 7 of the offset carries the top bit of the base
    delta = (delta & 0x7F) | ((base_u9 & 0x100) >> 1);
    delta_code = color_quantization_tables[quantization_level][delta];
    delta_unq = color_unquantization_tables[quantization_level][delta_code];

    // quantization must leave the top two bits intact
    if ((delta ^ delta_unq) & 0xC0)
        return 0;

    // sign-extend the 7-bit offset and verify that base + offset stays in 9 bits
    reconstructed = delta_unq & 0x7F;
    if (reconstructed & 0x40)
        reconstructed -= 0x80;
    reconstructed += base_u9;
    if (reconstructed < 0 || reconstructed > 0x1FF)
        return 0;

    output[6] = base_code;
    output[7] = delta_code;
    return 1;
}

// Attempt to delta-encode a luminance+alpha endpoint pair. Luminance is the
// average of R, G and B. Both channels go through the same steps as the alpha
// delta encoder.
//
// Returns 1 on success with output[] = { lum_base, lum_offset, alpha_base,
// alpha_offset }; returns 0 (output[] untouched) if the pair is not encodable.
int try_quantize_luminance_alpha_delta(float4 color0, float4 color1, int output[8], int quantization_level) {
    int k;                // 0 = luminance, 1 = alpha
    float base_f[2];
    float target_f[2];
    int base_code[2];
    int base_u9[2];
    int delta[2];
    int delta_code[2];
    int delta_unq[2];

    base_f[0] = clamp255((color0.x + color0.y + color0.z) * ((1.0f / 3.0f) * (1.0f / 257.0f)));
    target_f[0] = clamp255((color1.x + color1.y + color1.z) * ((1.0f / 3.0f) * (1.0f / 257.0f)));
    base_f[1] = clamp255(color0.w * (1.0f / 257.0f));
    target_f[1] = clamp255(color1.w * (1.0f / 257.0f));

    // base: widen to 9 bits, quantize/unquantize the low 8 bits, restore bit 8
    for (k = 0; k < 2; k++) {
        int wide = ((int)floor(base_f[k] + 0.5f)) << 1;
        int unq;

        base_code[k] = color_quantization_tables[quantization_level][wide & 0xFF];
        unq = color_unquantization_tables[quantization_level][base_code[k]];
        base_u9[k] = unq | (wide & 0x100);
    }

    // offsets against the reconstructed bases; each must fit [-64, 63]
    for (k = 0; k < 2; k++) {
        delta[k] = (((int)floor(target_f[k] + 0.5f)) << 1) - base_u9[k];
        if (delta[k] > 63 || delta[k] < -64)
            return 0;
    }

    // bit 7 of each offset carries the top bit of its base
    for (k = 0; k < 2; k++) {
        delta[k] = (delta[k] & 0x7F) | ((base_u9[k] & 0x100) >> 1);
        delta_code[k] = color_quantization_tables[quantization_level][delta[k]];
        delta_unq[k] = color_unquantization_tables[quantization_level][delta_code[k]];
    }

    // quantization must leave the top two bits of both offsets intact
    for (k = 0; k < 2; k++) {
        if ((delta[k] ^ delta_unq[k]) & 0xC0)
            return 0;
    }

    // sign-extend the 7-bit offsets and verify that base + offset stays in 9 bits
    for (k = 0; k < 2; k++) {
        int reconstructed = delta_unq[k] & 0x7F;
        if (reconstructed & 0x40)
            reconstructed -= 0x80;
        reconstructed += base_u9[k];
        if (reconstructed < 0 || reconstructed > 0x1FF)
            return 0;
    }

    output[0] = base_code[0];
    output[1] = delta_code[0];
    output[2] = base_code[1];
    output[3] = delta_code[1];

    return 1;
}

// Attempt to delta-encode an RGBA endpoint pair: alpha first (output[6..7]),
// then RGB (output[0..5]). Returns 1 only if both parts succeed. If alpha
// fails, the RGB part is not attempted.
int try_quantize_rgba_delta(float4 color0, float4 color1, int output[8], int quantization_level) {
    if (!try_quantize_alpha_delta(color0, color1, output, quantization_level)) {
        return 0;
    }

    return try_quantize_rgb_delta(color0, color1, output, quantization_level);
}

// Attempt to delta-encode an RGBA endpoint pair with blue-contraction.
// Returns 1 only if both the alpha part and the RGB part succeed. If alpha
// fails, the RGB part is not attempted.
int try_quantize_rgba_delta_blue_contract(float4 color0, float4 color1, int output[8], int quantization_level) {
    // Blue-contraction swaps the two endpoints, so the alpha encoder is handed
    // color1 and color0 in swapped order to stay consistent with the RGB part.
    if (!try_quantize_alpha_delta(color1, color0, output, quantization_level)) {
        return 0;
    }

    return try_quantize_rgb_delta_blue_contract(color0, color1, output, quantization_level);
}

// Quantize an RGB color plus a scale factor.
// output[0..2] receive the quantized R, G, B codes. output[3] receives the
// quantized scale, corrected by the ratio between the input color sum and the
// sum of the unquantized codes.
void quantize_rgbs_new(float4 rgbs_color,    // W component is a desired-scale to apply, in the range 0..1
                       int output[4], int quantization_level) {
    int ch;
    int code[3];
    int unquantized_total = 0;
    float chan[3];

    rgbs_color.xyz = rgbs_color.xyz * (1.0f / 257.0f);

    chan[0] = clamp255(rgbs_color.x);
    chan[1] = clamp255(rgbs_color.y);
    chan[2] = clamp255(rgbs_color.z);

    for (ch = 0; ch < 3; ch++) {
        int unq;

        code[ch] = color_quantization_tables[quantization_level][(int)floor(chan[ch] + 0.5f)];
        unq = color_unquantization_tables[quantization_level][code[ch]];
        unquantized_total += unq;
    }

    // note: the "old" sum uses the scaled but unclamped components
    float oldcolorsum = rgbs_color.x + rgbs_color.y + rgbs_color.z;
    float newcolorsum = (float)unquantized_total;

    // FLOAT_n10 is added to numerator and denominator (presumably a tiny epsilon against 0/0)
    float scale = clamp01(rgbs_color.w * (oldcolorsum + FLOAT_n10) / (newcolorsum + FLOAT_n10));

    int scale_idx = (int)floor(scale * 256.0f + 0.5f);

    // scale == 1.0 yields 256, which has to be pulled back into the table range
    if (scale_idx > 255) {
        scale_idx = 255;
    } else if (scale_idx < 0) {
        scale_idx = 0;
    }

    for (ch = 0; ch < 3; ch++) {
        output[ch] = code[ch];
    }
    output[3] = color_quantization_tables[quantization_level][scale_idx];
}

// Quantize an RGB+scale color together with two alpha endpoints.
// output[0..3] are produced by quantize_rgbs_new() from rgbs_color;
// output[4] and output[5] receive the quantized alpha of color0 and color1.
void quantize_rgbs_alpha_new(float4 color0, float4 color1, float4 rgbs_color, int output[6], int quantization_level) {
    float alpha_low;
    float alpha_high;

    color0.w = color0.w * (1.0f / 257.0f);
    color1.w = color1.w * (1.0f / 257.0f);

    alpha_low = clamp255(color0.w);
    alpha_high = clamp255(color1.w);

    // clamp255 guarantees the rounded index is within 0..255
    output[4] = color_quantization_tables[quantization_level][(int)floor(alpha_low + 0.5f)];
    output[5] = color_quantization_tables[quantization_level][(int)floor(alpha_high + 0.5f)];

    quantize_rgbs_new(rgbs_color, output, quantization_level);
}

// Quantize a pair of endpoints as luminance only (average of R, G and B).
// If endpoint 0 is brighter than endpoint 1, both are replaced by their mean.
// output[0] and output[1] receive the two quantized luminance codes.
void quantize_luminance(float4 color0, float4 color1, int output[2], int quantization_level) {
    float lum_low;
    float lum_high;

    color0.xyz = color0.xyz * (1.0f / 257.0f);
    color1.xyz = color1.xyz * (1.0f / 257.0f);

    lum_low = clamp255((color0.x + color0.y + color0.z) * (1.0f / 3.0f));
    lum_high = clamp255((color1.x + color1.y + color1.z) * (1.0f / 3.0f));

    if (lum_low > lum_high) {
        // wrong order: collapse both endpoints onto the midpoint
        float midpoint = (lum_low + lum_high) * 0.5f;
        lum_low = midpoint;
        lum_high = midpoint;
    }

    output[0] = color_quantization_tables[quantization_level][(int)floor(lum_low + 0.5f)];
    output[1] = color_quantization_tables[quantization_level][(int)floor(lum_high + 0.5f)];
}

// Quantize a pair of endpoints as luminance + alpha.
// output[] receives: lum0, lum1, alpha0, alpha1 (quantized codes).
void quantize_luminance_alpha(float4 color0, float4 color1, int output[4], int quantization_level) {
    int k;                // 0 = luminance, 1 = alpha
    float first[2];
    float second[2];

    color0 = color0 * (1.0f / 257.0f);
    color1 = color1 * (1.0f / 257.0f);

    first[0] = clamp255((color0.x + color0.y + color0.z) * (1.0f / 3.0f));
    second[0] = clamp255((color1.x + color1.y + color1.z) * (1.0f / 3.0f));
    first[1] = clamp255(color0.w);
    second[1] = clamp255(color1.w);

    // If the endpoints are *really* close, pull them apart slightly;
    // this affords >8 bits of precision for normal maps.
    if (quantization_level > 18) {
        for (k = 0; k < 2; k++) {
            if (fabs(first[k] - second[k]) < 3.0f) {
                if (first[k] < second[k]) {
                    first[k] = clamp255(first[k] - 0.5f);
                    second[k] = clamp255(second[k] + 0.5f);
                } else {
                    first[k] = clamp255(first[k] + 0.5f);
                    second[k] = clamp255(second[k] - 0.5f);
                }
            }
        }
    }

    for (k = 0; k < 2; k++) {
        output[2 * k] = color_quantization_tables[quantization_level][(int)floor(first[k] + 0.5f)];
        output[2 * k + 1] = color_quantization_tables[quantization_level][(int)floor(second[k] + 0.5f)];
    }
}

// Fill all eight endpoint codes with zero.
void quantize0(int output[8]) {
    int remaining = 8;

    while (remaining > 0) {
        remaining--;
        output[remaining] = 0;
    }
}

// Quantize and unquantize a number while making sure that the top two bits of
// the unquantized result agree with the top two bits of the value that was
// actually quantized.
//
// Whenever the quantize/unquantize round trip changes the top two bits (in
// either direction), the input value is decremented by one and the round trip
// is retried.
//static inline
void quantize_and_unquantize_retain_top_two_bits(int quantization_level, int value_to_quantize,    // 0 to 255.
        int *quantized_value, int *unquantized_value) {

    int code;
    int roundtrip;

    for (;;) {
        code = color_quantization_tables[quantization_level][value_to_quantize];
        roundtrip = color_unquantization_tables[quantization_level][code];

        // stop once bits 7..6 survived the round trip
        if (((value_to_quantize ^ roundtrip) & 0xC0) == 0) {
            break;
        }

        // rounded up or down across a top-two-bits boundary: step the input down and retry
        value_to_quantize--;
    }

    *quantized_value = code;
    *unquantized_value = roundtrip;
}

// Quantize and unquantize a number while making sure that the top four bits of
// the unquantized result agree with the top four bits of the value that was
// actually quantized.
//
// Whenever the quantize/unquantize round trip changes the top four bits (in
// either direction), the input value is decremented by one and the round trip
// is retried.
//static inline
void quantize_and_unquantize_retain_top_four_bits(int quantization_level, int value_to_quantize,    // 0 to 255.
        int *quantized_value, int *unquantized_value) {

    int code;
    int roundtrip;

    for (;;) {
        code = color_quantization_tables[quantization_level][value_to_quantize];
        roundtrip = color_unquantization_tables[quantization_level][code];

        // stop once bits 7..4 survived the round trip
        if (((value_to_quantize ^ roundtrip) & 0xF0) == 0) {
            break;
        }

        // rounded up or down across a top-four-bits boundary: step the input down and retry
        value_to_quantize--;
    }

    *quantized_value = code;
    *unquantized_value = roundtrip;
}

// Quantize a pair of HDR RGB endpoints into six endpoint codes.
//
// The colors are re-expressed relative to the largest component of color1 as
// the values A, B0, B1, C, D0 and D1. Modes 7 down to 0 are tried in turn; the
// first mode in which every value fits is used, and output[] then holds the
// codes in the order A, C, B0, B1, D0, D1. If no mode fits, a flat fallback
// representation of the original (unswizzled) colors is written instead.
//
// Per-mode limits and scales come from the mode_cutoffs, mode_scales,
// mode_rscales and mode_bits tables, which are defined outside this function.
void quantize_hdr_rgb3(float4 color0, float4 color1, int output[6], int quantization_level) {
    // Clamp every RGB component to [0, 65535]. The negated comparison is also
    // true for NaN, so NaN inputs are turned into 0.
    if (!(color0.x > 0.0f))
        color0.x = 0.0f;
    else if (color0.x > 65535.0f)
        color0.x = 65535.0f;

    if (!(color0.y > 0.0f))
        color0.y = 0.0f;
    else if (color0.y > 65535.0f)
        color0.y = 65535.0f;

    if (!(color0.z > 0.0f))
        color0.z = 0.0f;
    else if (color0.z > 65535.0f)
        color0.z = 65535.0f;

    if (!(color1.x > 0.0f))
        color1.x = 0.0f;
    else if (color1.x > 65535.0f)
        color1.x = 65535.0f;

    if (!(color1.y > 0.0f))
        color1.y = 0.0f;
    else if (color1.y > 65535.0f)
        color1.y = 65535.0f;

    if (!(color1.z > 0.0f))
        color1.z = 0.0f;
    else if (color1.z > 65535.0f)
        color1.z = 65535.0f;

    // keep the clamped, unswizzled colors for the flat fallback at the end
    float4 color0_bak = color0;
    float4 color1_bak = color1;

    // pick the major component of color1; red only wins if it is strictly
    // larger than both others, otherwise green wins if it is larger than blue.
    int majcomp;
    if (color1.x > color1.y && color1.x > color1.z)
        majcomp = 0;            // red is largest
    else if (color1.y > color1.z)
        majcomp = 1;            // green is largest
    else
        majcomp = 2;            // blue is largest

    // swizzle the components
    // (after this, the major component sits in .x of both colors)
    switch (majcomp) {
    case 1:                    // red-green swap
        color0 = color0.yxzw;
        color1 = color1.yxzw;
        break;
    case 2:                    // red-blue swap
        color0 = color0.zyxw;
        color1 = color1.zyxw;
        break;
    default:
        break;
    }

    // A is the major component of color1
    float a_base = color1.x;
    if (a_base < 0.0f)
        a_base = 0.0f;
    else if (a_base > 65535.0f)
        a_base = 65535.0f;


    // B0/B1: distance from A to the two minor components of color1.
    // C: distance from A to the major component of color0.
    // D0/D1: residuals of the two minor components of color0.
    float b0_base = a_base - color1.y;
    float b1_base = a_base - color1.z;
    float c_base = a_base - color0.x;
    float d0_base = a_base - b0_base - c_base - color0.y;
    float d1_base = a_base - b1_base - c_base - color0.z;


    // try modes one by one, with the highest-precision mode first.
    int mode;
    for (mode = 7; mode >= 0; mode--) {
        // for each mode, test if we can in fact accommodate
        // the computed b,c,d values. If we clearly can't, then we skip to the next mode.

        float b_cutoff = mode_cutoffs[mode][0];
        float c_cutoff = mode_cutoffs[mode][1];
        float d_cutoff = mode_cutoffs[mode][2];

        if (b0_base > b_cutoff || b1_base > b_cutoff || c_base > c_cutoff || fabs(d0_base) > d_cutoff || fabs(d1_base) > d_cutoff) {
            continue;
        }

        float mode_scale = mode_scales[mode];
        float mode_rscale = mode_rscales[mode];

        // integer limits for this mode; D is signed, hence one bit less
        int b_intcutoff = 1 << mode_bits[mode][1];
        int c_intcutoff = 1 << mode_bits[mode][2];
        int d_intcutoff = 1 << (mode_bits[mode][3] - 1);

        // first, quantize and unquantize A, with the assumption that its high bits can be handled safely.
        int a_intval = (int)floor(a_base * mode_scale + 0.5f);
        int a_lowbits = a_intval & 0xFF;

        int a_quantval = color_quantization_tables[quantization_level][a_lowbits];
        int a_uquantval = color_unquantization_tables[quantization_level][a_quantval];
        a_intval = (a_intval & ~0xFF) | a_uquantval;
        float a_fval = a_intval * mode_rscale;

        // next, recompute C, then quantize and unquantize it
        float c_fval = a_fval - color0.x;
        if (c_fval < 0.0f)
            c_fval = 0.0f;
        else if (c_fval > 65535.0f)
            c_fval = 65535.0f;

        int c_intval = (int)floor(c_fval * mode_scale + 0.5f);

        if (c_intval >= c_intcutoff) {
            continue;
        }

        // C code byte: bits 5..0 = low bits of C, bit 6 = bit 8 of A, bit 7 = mode bit 0
        int c_lowbits = c_intval & 0x3f;

        c_lowbits |= (mode & 1) << 7;
        c_lowbits |= (a_intval & 0x100) >> 2;

        int c_quantval;
        int c_uquantval;
        quantize_and_unquantize_retain_top_two_bits(quantization_level, c_lowbits, &c_quantval, &c_uquantval);
        c_intval = (c_intval & ~0x3F) | (c_uquantval & 0x3F);
        c_fval = c_intval * mode_rscale;


        // next, recompute B0 and B1, then quantize and unquantize them
        float b0_fval = a_fval - color1.y;
        float b1_fval = a_fval - color1.z;
        if (b0_fval < 0.0f)
            b0_fval = 0.0f;
        else if (b0_fval > 65535.0f)
            b0_fval = 65535.0f;
        if (b1_fval < 0.0f)
            b1_fval = 0.0f;
        else if (b1_fval > 65535.0f)
            b1_fval = 65535.0f;

        int b0_intval = (int)floor(b0_fval * mode_scale + 0.5f);
        int b1_intval = (int)floor(b1_fval * mode_scale + 0.5f);

        if (b0_intval >= b_intcutoff || b1_intval >= b_intcutoff) {
            continue;
        }


        int b0_lowbits = b0_intval & 0x3f;
        int b1_lowbits = b1_intval & 0x3f;

        // bit 6 of the B0/B1 code bytes carries a mode-dependent extra bit
        int bit0 = 0;
        int bit1 = 0;
        switch (mode) {
        case 0:
        case 1:
        case 3:
        case 4:
        case 6:
            bit0 = (b0_intval >> 6) & 1;
            break;
        case 2:
        case 5:
        case 7:
            bit0 = (a_intval >> 9) & 1;
            break;
        }

        switch (mode) {
        case 0:
        case 1:
        case 3:
        case 4:
        case 6:
            bit1 = (b1_intval >> 6) & 1;
            break;
        case 2:
            bit1 = (c_intval >> 6) & 1;
            break;
        case 5:
        case 7:
            bit1 = (a_intval >> 10) & 1;
            break;
        }

        b0_lowbits |= bit0 << 6;
        b1_lowbits |= bit1 << 6;

        // bit 7 of the B0/B1 code bytes carries mode bits 1 and 2
        b0_lowbits |= ((mode >> 1) & 1) << 7;
        b1_lowbits |= ((mode >> 2) & 1) << 7;

        int b0_quantval;
        int b1_quantval;
        int b0_uquantval;
        int b1_uquantval;

        quantize_and_unquantize_retain_top_two_bits(quantization_level, b0_lowbits, &b0_quantval, &b0_uquantval);

        quantize_and_unquantize_retain_top_two_bits(quantization_level, b1_lowbits, &b1_quantval, &b1_uquantval);

        b0_intval = (b0_intval & ~0x3f) | (b0_uquantval & 0x3f);
        b1_intval = (b1_intval & ~0x3f) | (b1_uquantval & 0x3f);
        b0_fval = b0_intval * mode_rscale;
        b1_fval = b1_intval * mode_rscale;


        // finally, recompute D0 and D1, then quantize and unquantize them
        float d0_fval = a_fval - b0_fval - c_fval - color0.y;
        float d1_fval = a_fval - b1_fval - c_fval - color0.z;

        if (d0_fval < -65535.0f)
            d0_fval = -65535.0f;
        else if (d0_fval > 65535.0f)
            d0_fval = 65535.0f;

        if (d1_fval < -65535.0f)
            d1_fval = -65535.0f;
        else if (d1_fval > 65535.0f)
            d1_fval = 65535.0f;

        int d0_intval = (int)floor(d0_fval * mode_scale + 0.5f);
        int d1_intval = (int)floor(d1_fval * mode_scale + 0.5f);

        if (abs(d0_intval) >= d_intcutoff || abs(d1_intval) >= d_intcutoff)
            continue;

        // d0_intval += mode_dbiases[mode];
        // d1_intval += mode_dbiases[mode];

        int d0_lowbits = d0_intval & 0x1f;
        int d1_lowbits = d1_intval & 0x1f;

        // bits 6 and 5 of the D0/D1 code bytes carry mode-dependent extra bits
        int bit2 = 0;
        int bit3 = 0;
        int bit4;
        int bit5;
        switch (mode) {
        case 0:
        case 2:
            bit2 = (d0_intval >> 6) & 1;
            break;
        case 1:
        case 4:
            bit2 = (b0_intval >> 7) & 1;
            break;
        case 3:
            bit2 = (a_intval >> 9) & 1;
            break;
        case 5:
            bit2 = (c_intval >> 7) & 1;
            break;
        case 6:
        case 7:
            bit2 = (a_intval >> 11) & 1;
            break;
        }
        switch (mode) {
        case 0:
        case 2:
            bit3 = (d1_intval >> 6) & 1;
            break;
        case 1:
        case 4:
            bit3 = (b1_intval >> 7) & 1;
            break;
        case 3:
        case 5:
        case 6:
        case 7:
            bit3 = (c_intval >> 6) & 1;
            break;
        }

        switch (mode) {
        case 4:
        case 6:
            bit4 = (a_intval >> 9) & 1;
            bit5 = (a_intval >> 10) & 1;
            break;
        default:
            bit4 = (d0_intval >> 5) & 1;
            bit5 = (d1_intval >> 5) & 1;
            break;
        }

        d0_lowbits |= bit2 << 6;
        d1_lowbits |= bit3 << 6;
        d0_lowbits |= bit4 << 5;
        d1_lowbits |= bit5 << 5;

        // bit 7 of the D0/D1 code bytes carries the two bits of the major-component index
        d0_lowbits |= (majcomp & 1) << 7;
        d1_lowbits |= ((majcomp >> 1) & 1) << 7;

        int d0_quantval;
        int d1_quantval;
        int d0_uquantval;
        int d1_uquantval;

        quantize_and_unquantize_retain_top_four_bits(quantization_level, d0_lowbits, &d0_quantval, &d0_uquantval);

        quantize_and_unquantize_retain_top_four_bits(quantization_level, d1_lowbits, &d1_quantval, &d1_uquantval);

        // this mode fits: emit the six codes and stop
        output[0] = a_quantval;
        output[1] = c_quantval;
        output[2] = b0_quantval;
        output[3] = b1_quantval;
        output[4] = d0_quantval;
        output[5] = d1_quantval;
        return;
    }

    // neither of the modes fit? In this case, we will use a flat representation
    // for storing data, using 8 bits for red and green, and 7 bits for blue.
    // This gives color accuracy roughly similar to LDR 4:4:3 which is not at all great
    // but usable. This representation is used if the light color is more than 4x the
    // color value of the dark color.
    int i;
    float vals[6];
    vals[0] = color0_bak.x;
    vals[1] = color1_bak.x;
    vals[2] = color0_bak.y;
    vals[3] = color1_bak.y;
    vals[4] = color0_bak.z;
    vals[5] = color1_bak.z;


    // clamp to 65020 so that the rounded indices below stay within 0..255
    for (i = 0; i < 6; i++) {
        if (vals[i] < 0.0f)
            vals[i] = 0.0f;
        else if (vals[i] > 65020.0f)
            vals[i] = 65020.0f;
    }
    // red and green: 8 bits each
    for (i = 0; i < 4; i++) {
        int idx = (int)floor(vals[i] * 1.0f / 256.0f + 0.5f);
        output[i] = color_quantization_tables[quantization_level][idx];
    }
    // blue: 7 bits, with bit 7 of the value forced to 1 (+128); the unquantized result is not needed
    for (i = 4; i < 6; i++) {
        int dummy;
        int idx = (int)floor(vals[i] * 1.0f / 512.0f + 0.5f) + 128;
        quantize_and_unquantize_retain_top_two_bits(quantization_level, idx, &(output[i]), &dummy);
    }

    return;
}

// Quantizes a pair of HDR alpha endpoints into two integers.
//
//   alpha0, alpha1     - endpoint alpha values; clamped to [0, 65280] before use.
//   output             - receives the two quantized values in output[0] and output[1].
//   quantization_level - row selector for color_quantization_tables and
//                        color_unquantization_tables.
//
// Three delta submodes are attempted, from i == 2 down to i == 0. A submode is
// accepted only when its marker bits survive a quantize/unquantize round trip
// and the endpoint difference fits the submode's delta field. If every submode
// is rejected, both endpoints are stored as flat values with bit 7 set.
void quantize_hdr_alpha3(float alpha0, float alpha1, int output[2], int quantization_level) {
    int i;

    // Clamp to [0, 65280]. The lower bound is tested as !(x > 0) so that a NaN
    // input is mapped to 0 instead of slipping past both comparisons; a NaN
    // reaching the float-to-int conversion below would be undefined behavior
    // and could yield negative table indices. quantize_hdr_rgbo3 clamps its
    // inputs with the same pattern.
    if (!(alpha0 > 0.0f))
        alpha0 = 0.0f;
    else if (alpha0 > 65280.0f)
        alpha0 = 65280.0f;

    if (!(alpha1 > 0.0f))
        alpha1 = 0.0f;
    else if (alpha1 > 65280.0f)
        alpha1 = 65280.0f;

    // round to nearest integer
    int ialpha0 = (int)floor(alpha0 + 0.5f);
    int ialpha1 = (int)floor(alpha1 + 0.5f);

    int val0, val1, diffval;
    int v6, v7;
    int v6e, v7e;
    int v6d, v7d;

    // try to encode one of the delta submodes, in decreasing-precision order.
    for (i = 2; i >= 0; i--) {
        // reduce both endpoints by (8 - i) bits, with rounding
        val0 = (ialpha0 + (128 >> i)) >> (8 - i);
        val1 = (ialpha1 + (128 >> i)) >> (8 - i);

        // first value: low 7 bits of val0, with bit 0 of the submode index in bit 7
        v6 = (val0 & 0x7F) | ((i & 1) << 7);
        v6e = color_quantization_tables[quantization_level][v6];
        v6d = color_unquantization_tables[quantization_level][v6e];

        // the submode bit must survive quantization
        if ((v6 ^ v6d) & 0x80)
            continue;

        // continue with the low bits as they will actually decode
        val0 = (val0 & ~0x7f) | (v6d & 0x7f);
        diffval = val1 - val0;
        int cutoff = 32 >> i;
        int mask = 2 * cutoff - 1;

        // the signed difference must fit in the delta field of this submode
        if (diffval < -cutoff || diffval >= cutoff)
            continue;

        // second value: bit 1 of the submode index in bit 7, then the remaining
        // high bits of val0, then the masked difference
        v7 = ((i & 2) << 6) | ((val0 >> 7) << (6 - i)) | (diffval & mask);
        v7e = color_quantization_tables[quantization_level][v7];
        v7d = color_unquantization_tables[quantization_level][v7e];


        // the bits selected by testbits[i] must also survive quantization
        if ((v7 ^ v7d) & testbits[i])
            continue;

        output[0] = v6e;
        output[1] = v7e;
        return;
    }

    // could not encode any of the delta modes; instead encode a flat value
    val0 = (ialpha0 + 256) >> 9;
    val1 = (ialpha1 + 256) >> 9;
    v6 = val0 | 0x80;
    v7 = val1 | 0x80;

    v6e = color_quantization_tables[quantization_level][v6];
    v7e = color_quantization_tables[quantization_level][v7];
    output[0] = v6e;
    output[1] = v7e;

    return;
}

// Quantizes an endpoint pair with HDR RGB and LDR alpha.
// output[0..5] are produced by quantize_hdr_rgb3; output[6] and output[7]
// receive the quantized alpha of color0 and color1 respectively.
void quantize_hdr_rgb_ldr_alpha3(float4 color0, float4 color1, int output[8], int quantization_level) {
    // scale alpha by 1/257 before anything else uses it
    color0.w = color0.w * (1.0f / 257.0f);
    color1.w = color1.w * (1.0f / 257.0f);

    // RGB part (receives the colors with the rescaled alpha)
    quantize_hdr_rgb3(color0, color1, output, quantization_level);

    // alpha part: clamp, round to nearest, then look up the quantized value
    float alpha_first = clamp255(color0.w);
    float alpha_second = clamp255(color1.w);
    int index_first = (int)floor(alpha_first + 0.5f);
    int index_second = (int)floor(alpha_second + 0.5f);
    int quant_first = color_quantization_tables[quantization_level][index_first];
    int quant_second = color_quantization_tables[quantization_level][index_second];

    output[6] = quant_first;
    output[7] = quant_second;
}

// Quantizes an endpoint pair with HDR RGB and HDR alpha:
// output[0..5] come from quantize_hdr_rgb3, output[6..7] from quantize_hdr_alpha3.
void quantize_hdr_rgb_alpha3(float4 color0, float4 color1, int output[8], int quantization_level) {
    // RGB endpoints first
    quantize_hdr_rgb3(color0, color1, output, quantization_level);

    // then the two alpha values, stored right after the six RGB values
    quantize_hdr_alpha3(color0.w, color1.w, &output[6], quantization_level);
}

/* HDR color encoding, take #3 */
// Quantizes an HDR color given as RGB plus an offset (the .w component) into
// four integers written to output[0..3].
//
//   color              - x/y/z are the color, w is the offset; w is added to x/y/z
//                        on entry, then all four components are clamped to [0, 65535].
//   output             - receives the four quantized values.
//   quantization_level - row selector passed on to the quantize/unquantize helpers.
//
// Modes 0..4 are tried in order; each stores the largest component directly and
// the other two as differences from it. The first mode whose values all fit is
// written out. If none fits, a flat fallback encoding is used.
void quantize_hdr_rgbo3(float4 color, int output[4], int quantization_level) {
    color.xyz = color.xyz + color.www;

    // Clamp every component to [0, 65535]. The lower bound is written as
    // !(x > 0), so any value that does not compare greater than zero
    // (including NaN) becomes 0.
    if (!(color.x > 0.0f))
        color.x = 0.0f;
    else if (color.x > 65535.0f)
        color.x = 65535.0f;

    if (!(color.y > 0.0f))
        color.y = 0.0f;
    else if (color.y > 65535.0f)
        color.y = 65535.0f;

    if (!(color.z > 0.0f))
        color.z = 0.0f;
    else if (color.z > 65535.0f)
        color.z = 65535.0f;

    if (!(color.w > 0.0f))
        color.w = 0.0f;
    else if (color.w > 65535.0f)
        color.w = 65535.0f;

    // keep the clamped, un-swizzled color for the fallback path at the end
    float4 color_bak = color;
    int majcomp;
    if (color.x > color.y && color.x > color.z)
        majcomp = 0;            // red is largest component
    else if (color.y > color.z)
        majcomp = 1;            // green is largest component
    else
        majcomp = 2;            // blue is largest component

    // swap around the red component and the largest component.
    switch (majcomp) {
    case 1:
        color = color.yxzw;
        break;
    case 2:
        color = color.zyxw;
        break;
    default:
        break;
    }


    // After the swap, x holds the major component. The other two are expressed
    // as differences from it; w is the offset.
    float r_base = color.x;
    float g_base = color.x - color.y;
    float b_base = color.x - color.z;
    float s_base = color.w;

    int mode;
    for (mode = 0; mode < 5; mode++) {
        // skip modes whose cutoffs are exceeded by the differences or the offset
        if (g_base > mode_cutoffs2[mode][0] || b_base > mode_cutoffs2[mode][0] || s_base > mode_cutoffs2[mode][1]) {
            continue;
        }

        // encode the mode into a 4-bit vector.
        int mode_enc = mode < 4 ? (mode | (majcomp << 2)) : (majcomp | 0xC);

        float mode_scale = mode_scales2[mode];
        float mode_rscale = mode_rscales2[mode];

        // exclusive upper limits for the integer G/B differences and the offset
        int gb_intcutoff = 1 << mode_bits2[mode][1];
        int s_intcutoff = 1 << mode_bits2[mode][2];

        // first, quantize and unquantize R.
        int r_intval = (int)floor(r_base * mode_scale + 0.5f);

        int r_lowbits = r_intval & 0x3f;

        // the two low bits of mode_enc go into bits 6..7 of the first value
        r_lowbits |= (mode_enc & 3) << 6;

        int r_quantval;
        int r_uquantval;
        quantize_and_unquantize_retain_top_two_bits(quantization_level, r_lowbits, &r_quantval, &r_uquantval);

        // replace the low 6 bits of R with what they unquantize to
        r_intval = (r_intval & ~0x3f) | (r_uquantval & 0x3f);
        float r_fval = r_intval * mode_rscale;


        // next, recompute G and B, then quantize and unquantize them.
        float g_fval = r_fval - color.y;
        float b_fval = r_fval - color.z;
        if (g_fval < 0.0f)
            g_fval = 0.0f;
        else if (g_fval > 65535.0f)
            g_fval = 65535.0f;
        if (b_fval < 0.0f)
            b_fval = 0.0f;
        else if (b_fval > 65535.0f)
            b_fval = 65535.0f;

        int g_intval = (int)floor(g_fval * mode_scale + 0.5f);
        int b_intval = (int)floor(b_fval * mode_scale + 0.5f);


        // differences too large for this mode: try the next one
        if (g_intval >= gb_intcutoff || b_intval >= gb_intcutoff) {
            continue;
        }

        int g_lowbits = g_intval & 0x1f;
        int b_lowbits = b_intval & 0x1f;

        // bit0..bit3 are mode-dependent: each carries either a high bit of
        // r_intval or bit 5/6 of g_intval / b_intval.
        // NOTE: the `case 5` labels in the switches below cannot be reached
        // from this loop, because mode is always < 5 here.
        int bit0 = 0;
        int bit1 = 0;
        int bit2 = 0;
        int bit3 = 0;

        switch (mode) {
        case 0:
        case 2:
            bit0 = (r_intval >> 9) & 1;
            break;
        case 1:
        case 3:
            bit0 = (r_intval >> 8) & 1;
            break;
        case 4:
        case 5:
            bit0 = (g_intval >> 6) & 1;
            break;
        }

        switch (mode) {
        case 0:
        case 1:
        case 2:
        case 3:
            bit2 = (r_intval >> 7) & 1;
            break;
        case 4:
        case 5:
            bit2 = (b_intval >> 6) & 1;
            break;
        }

        switch (mode) {
        case 0:
        case 2:
            bit1 = (r_intval >> 8) & 1;
            break;
        case 1:
        case 3:
        case 4:
        case 5:
            bit1 = (g_intval >> 5) & 1;
            break;
        }

        switch (mode) {
        case 0:
            bit3 = (r_intval >> 10) & 1;
            break;
        case 2:
            bit3 = (r_intval >> 6) & 1;
            break;
        case 1:
        case 3:
        case 4:
        case 5:
            bit3 = (b_intval >> 5) & 1;
            break;
        }

        // mode_enc bit 2 lands in bit 7 of the G value, mode_enc bit 3 in bit 7 of the B value
        g_lowbits |= (mode_enc & 0x4) << 5;
        b_lowbits |= (mode_enc & 0x8) << 4;

        g_lowbits |= bit0 << 6;
        g_lowbits |= bit1 << 5;
        b_lowbits |= bit2 << 6;
        b_lowbits |= bit3 << 5;

        int g_quantval;
        int b_quantval;
        int g_uquantval;
        int b_uquantval;

        quantize_and_unquantize_retain_top_four_bits(quantization_level, g_lowbits, &g_quantval, &g_uquantval);

        quantize_and_unquantize_retain_top_four_bits(quantization_level, b_lowbits, &b_quantval, &b_uquantval);

        // replace the low 5 bits of G and B with what they unquantize to
        g_intval = (g_intval & ~0x1f) | (g_uquantval & 0x1f);
        b_intval = (b_intval & ~0x1f) | (b_uquantval & 0x1f);

        g_fval = g_intval * mode_rscale;
        b_fval = b_intval * mode_rscale;


        // finally, recompute the scale value, based on the errors
        // introduced to red, green and blue.

        // If the error is positive, then the R,G,B errors combined have raised the color
        // value overall; as such, the scale value needs to be increased.
        float rgb_errorsum = (r_fval - color.x) + (r_fval - g_fval - color.y) + (r_fval - b_fval - color.z);

        float s_fval = s_base + rgb_errorsum * (1.0f / 3.0f);
        if (s_fval < 0.0f)
            s_fval = 0.0f;
        else if (s_fval > FLOAT_9)
            s_fval = FLOAT_9;

        int s_intval = (int)floor(s_fval * mode_scale + 0.5f);

        // offset too large for this mode: try the next one
        if (s_intval >= s_intcutoff) {
            continue;
        }

        int s_lowbits = s_intval & 0x1f;

        // bit4..bit6 are mode-dependent: each carries either a bit of s_intval
        // or a bit of r_intval.
        int bit4;
        int bit5;
        int bit6;
        switch (mode) {
        case 1:
            bit6 = (r_intval >> 9) & 1;
            break;
        default:
            bit6 = (s_intval >> 5) & 1;
            break;
        }

        switch (mode) {
        case 4:
            bit5 = (r_intval >> 7) & 1;
            break;
        case 1:
            bit5 = (r_intval >> 10) & 1;
            break;
        default:
            bit5 = (s_intval >> 6) & 1;
            break;
        }

        switch (mode) {
        case 2:
            bit4 = (s_intval >> 7) & 1;
            break;
        default:
            bit4 = (r_intval >> 6) & 1;
            break;
        }


        s_lowbits |= bit6 << 5;
        s_lowbits |= bit5 << 6;
        s_lowbits |= bit4 << 7;

        int s_quantval;
        int s_uquantval;

        quantize_and_unquantize_retain_top_four_bits(quantization_level, s_lowbits, &s_quantval, &s_uquantval);

        // NOTE: the refreshed s_intval / s_fval are not read again before the return.
        s_intval = (s_intval & ~0x1f) | (s_uquantval & 0x1f);
        s_fval = s_intval * mode_rscale;
        output[0] = r_quantval;
        output[1] = g_quantval;
        output[2] = b_quantval;
        output[3] = s_quantval;

        return;
    }

    // failed to encode any of the modes above? In that case,
    // encode using mode #5.
    int i;

    // the fallback works on the clamped color in its original component order
    float vals[4];
    int ivals[4];
    vals[0] = color_bak.x;
    vals[1] = color_bak.y;
    vals[2] = color_bak.z;
    vals[3] = color_bak.w;

    float cvals[3];

    // clamp R, G, B to [0, 65020], reduce to units of 512 (rounded), and keep
    // the reconstructed value in cvals[] for the error estimate below
    for (i = 0; i < 3; i++) {
        if (vals[i] < 0.0f)
            vals[i] = 0.0f;
        else if (vals[i] > 65020.0f)
            vals[i] = 65020.0f;

        ivals[i] = (int)floor(vals[i] * (1.0f / 512.0f) + 0.5f);
        cvals[i] = ivals[i] * 512.0f;
    }

    // shift the offset by the mean rounding error of the three color components
    float rgb_errorsum = (cvals[0] - vals[0]) + (cvals[1] - vals[1]) + (cvals[2] - vals[2]);
    vals[3] += rgb_errorsum * (1.0f / 3.0f);

    if (vals[3] < 0.0f)
        vals[3] = 0.0f;
    else if (vals[3] > 65020.0f)
        vals[3] = 65020.0f;

    ivals[3] = (int)floor(vals[3] * (1.0f / 512.0f) + 0.5f);

    int encvals[4];

    // Fixed marker bits: 0xC0 on the first value, 0x80 on the second and third.
    // Bit 6 of ivals[0] is carried in bit 7 of the fourth value.
    encvals[0] = (ivals[0] & 0x3f) | 0xC0;
    encvals[1] = (ivals[1] & 0x7f) | 0x80;
    encvals[2] = (ivals[2] & 0x7f) | 0x80;
    encvals[3] = (ivals[3] & 0x7f) | ((ivals[0] & 0x40) << 1);

    for (i = 0; i < 4; i++) {
        int dummy;      // unquantized result is not needed here
        quantize_and_unquantize_retain_top_four_bits(quantization_level, encvals[i], &(output[i]), &dummy);
    }

    return;
}

// Attempts to encode a pair of HDR luminance endpoints in the small-range format.
// Luminance is the mean of x, y and z of each color. On success the two encoded
// values are written to output[0..1] and 1 is returned; otherwise 0 is returned
// and output is left untouched.
int try_quantize_hdr_luminance_small_range3(float4 color0, float4 color1, int output[2], int quantization_level) {
    float lum_high = (color1.x + color1.y + color1.z) * (1.0f / 3.0f);
    float lum_low = (color0.x + color0.y + color0.z) * (1.0f / 3.0f);

    // endpoints in the wrong order collapse onto their average
    if (lum_high < lum_low) {
        float mid = (lum_low + lum_high) * 0.5f;
        lum_low = mid;
        lum_high = mid;
    }

    const int ihigh = (int)floor(lum_high + 0.5f);
    const int ilow = (int)floor(lum_low + 0.5f);

    // difference of more than a factor-of-2 results in immediate failure.
    if (ihigh - ilow > 2048)
        return 0;

    // ---- high-precision submode: 11-bit values, 4-bit delta ----
    {
        int base = (ilow + 16) >> 5;
        int top = (ihigh + 16) >> 5;
        base = base < 0 ? 0 : (base > 2047 ? 2047 : base);
        top = top < 0 ? 0 : (top > 2047 ? 2047 : top);

        const int first = base & 0x7F;
        const int first_enc = color_quantization_tables[quantization_level][first];
        const int first_dec = color_unquantization_tables[quantization_level][first_enc];

        // bit 7 must decode as clear for this submode
        if (!(first_dec & 0x80)) {
            base = (base & ~0x7F) | (first_dec & 0x7F);
            const int delta = top - base;
            if (delta >= 0 && delta <= 15) {
                const int second = ((base >> 3) & 0xF0) | delta;
                const int second_enc = color_quantization_tables[quantization_level][second];
                const int second_dec = color_unquantization_tables[quantization_level][second_enc];
                // the top four bits must survive quantization
                if (((second_dec ^ second) & 0xF0) == 0) {
                    output[0] = first_enc;
                    output[1] = second_enc;
                    return 1;
                }
            }
        }
    }

    // ---- low-precision submode: 10-bit values, 5-bit delta ----
    int base = (ilow + 32) >> 6;
    int top = (ihigh + 32) >> 6;
    base = base < 0 ? 0 : (base > 1023 ? 1023 : base);
    top = top < 0 ? 0 : (top > 1023 ? 1023 : top);

    const int first = (base & 0x7F) | 0x80;
    const int first_enc = color_quantization_tables[quantization_level][first];
    const int first_dec = color_unquantization_tables[quantization_level][first_enc];

    // bit 7 must decode as set for this submode
    if (!(first_dec & 0x80))
        return 0;

    base = (base & ~0x7F) | (first_dec & 0x7F);
    const int delta = top - base;
    if (delta < 0 || delta > 31)
        return 0;

    const int second = ((base >> 2) & 0xE0) | delta;
    const int second_enc = color_quantization_tables[quantization_level][second];
    const int second_dec = color_unquantization_tables[quantization_level][second_enc];

    // the top three bits must survive quantization
    if (((second_dec ^ second) & 0xE0) != 0)
        return 0;

    output[0] = first_enc;
    output[1] = second_enc;
    return 1;
}

// Encodes a pair of HDR luminance endpoints in the large-range format.
// Luminance is the mean of x, y and z of each color. Two candidate encodings
// are built; the one with the smaller squared reconstruction error is quantized
// into output[0..1].
void quantize_hdr_luminance_large_range3(float4 color0, float4 color1, int output[2], int quantization_level) {
    float lum_high = (color1.x + color1.y + color1.z) * (1.0f / 3.0f);
    float lum_low = (color0.x + color0.y + color0.z) * (1.0f / 3.0f);

    // endpoints in the wrong order collapse onto their average
    if (lum_high < lum_low) {
        float mid = (lum_low + lum_high) * 0.5f;
        lum_low = mid;
        lum_high = mid;
    }

    const int ihigh = (int)floor(lum_high + 0.5f);
    const int ilow = (int)floor(lum_low + 0.5f);

    // closest encodable point in the upper half of the code-point space
    int up_first = (ilow + 128) >> 8;
    int up_second = (ihigh + 128) >> 8;
    up_first = up_first < 0 ? 0 : (up_first > 255 ? 255 : up_first);
    up_second = up_second < 0 ? 0 : (up_second > 255 ? 255 : up_second);

    // closest encodable point in the lower half of the code-point space
    // (the first stored value derives from the high endpoint here)
    int low_first = (ihigh + 256) >> 8;
    int low_second = ilow >> 8;
    low_first = low_first < 0 ? 0 : (low_first > 255 ? 255 : low_first);
    low_second = low_second < 0 ? 0 : (low_second > 255 ? 255 : low_second);

    // signed distance between each candidate's decoded value and the input
    const int up_err_low = (up_first << 8) - ilow;
    const int up_err_high = (up_second << 8) - ihigh;
    const int low_err_low = ((low_second << 8) + 128) - ilow;
    const int low_err_high = ((low_first << 8) - 128) - ihigh;

    const int up_cost = (up_err_low * up_err_low) + (up_err_high * up_err_high);
    const int low_cost = (low_err_low * low_err_low) + (low_err_high * low_err_high);

    // the lower-half candidate wins ties
    const int take_upper = up_cost < low_cost;
    const int pick_first = take_upper ? up_first : low_first;
    const int pick_second = take_upper ? up_second : low_second;

    // OK; encode.
    output[0] = color_quantization_tables[quantization_level][pick_first];
    output[1] = color_quantization_tables[quantization_level][pick_second];
}

// Quantizes a pair of color endpoints for the requested endpoint format.
//
//   color0, color1     - the two endpoints; negative components are raised to 0.
//   rgbs_color         - input for the RGB-scale formats.
//   rgbo_color         - input for the HDR RGB-scale format.
//   format             - requested FMT_* endpoint format.
//   output             - receives the quantized endpoint integers.
//   quantization_level - quantization level handed to the per-format quantizers.
//
// Returns the FMT_* format actually written, which can differ from the one
// requested (e.g. a delta variant is returned when that encoding succeeded).
int pack_color_endpoints(float4 color0, float4 color1, float4 rgbs_color, float4 rgbo_color,
                         int format, int *output, int quantization_level) {
    DEBUG("pack_color_endpoints");
//#ifdef __OPENCL_VERSION__
//    if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//        printf("color0 %3.3f %3.3f %3.3f  color1 %3.3f %3.3f %3.3f\n", color0.x, color0.y, color0.z, color1.x, color1.y, color1.z);

    // we do not support negative colors.
    color0.x = (std::max)(color0.x, 0.0f);
    color0.y = (std::max)(color0.y, 0.0f);
    color0.z = (std::max)(color0.z, 0.0f);
    color0.w = (std::max)(color0.w, 0.0f);
    color1.x = (std::max)(color1.x, 0.0f);
    color1.y = (std::max)(color1.y, 0.0f);
    color1.z = (std::max)(color1.z, 0.0f);
    color1.w = (std::max)(color1.w, 0.0f);

    switch (format) {
    case FMT_RGB:
        // delta encodings are only attempted up to quantization level 18
        if (quantization_level <= 18) {
            if (try_quantize_rgb_delta_blue_contract(color0, color1, output, quantization_level))
                return FMT_RGB_DELTA;
            if (try_quantize_rgb_delta(color0, color1, output, quantization_level))
                return FMT_RGB_DELTA;
        }
        if (try_quantize_rgb_blue_contract(color0, color1, output, quantization_level))
            return FMT_RGB;
        quantize_rgb(color0, color1, output, quantization_level);
        return FMT_RGB;

    case FMT_RGBA:
        if (quantization_level <= 18) {
            if (try_quantize_rgba_delta_blue_contract(color0, color1, output, quantization_level))
                return FMT_RGBA_DELTA;
            if (try_quantize_rgba_delta(color0, color1, output, quantization_level))
                return FMT_RGBA_DELTA;
        }
        if (try_quantize_rgba_blue_contract(color0, color1, output, quantization_level))
            return FMT_RGBA;
        quantize_rgba(color0, color1, output, quantization_level);
        return FMT_RGBA;

    case FMT_RGB_SCALE:
        quantize_rgbs_new(rgbs_color, output, quantization_level);
        // quantize_rgbs( color0, color1, output, quantization_level );
        return FMT_RGB_SCALE;

    case FMT_HDR_RGB_SCALE:
        quantize_hdr_rgbo3(rgbo_color, output, quantization_level);
        return FMT_HDR_RGB_SCALE;

    case FMT_HDR_RGB:
        quantize_hdr_rgb3(color0, color1, output, quantization_level);
        return FMT_HDR_RGB;

    case FMT_RGB_SCALE_ALPHA:
        quantize_rgbs_alpha_new(color0, color1, rgbs_color, output, quantization_level);
        return FMT_RGB_SCALE_ALPHA;

    case FMT_HDR_LUMINANCE_SMALL_RANGE:
    case FMT_HDR_LUMINANCE_LARGE_RANGE:
        // either request tries the small-range form first, then the large-range form
        if (try_quantize_hdr_luminance_small_range3(color0, color1, output, quantization_level))
            return FMT_HDR_LUMINANCE_SMALL_RANGE;
        quantize_hdr_luminance_large_range3(color0, color1, output, quantization_level);
        return FMT_HDR_LUMINANCE_LARGE_RANGE;

    case FMT_LUMINANCE:
        quantize_luminance(color0, color1, output, quantization_level);
        return FMT_LUMINANCE;

    case FMT_LUMINANCE_ALPHA:
        if (quantization_level <= 18) {
            if (try_quantize_luminance_alpha_delta(color0, color1, output, quantization_level))
                return FMT_LUMINANCE_ALPHA_DELTA;
        }
        quantize_luminance_alpha(color0, color1, output, quantization_level);
        return FMT_LUMINANCE_ALPHA;

    case FMT_HDR_RGB_LDR_ALPHA:
        quantize_hdr_rgb_ldr_alpha3(color0, color1, output, quantization_level);
        return FMT_HDR_RGB_LDR_ALPHA;

    case FMT_HDR_RGBA:
        quantize_hdr_rgb_alpha3(color0, color1, output, quantization_level);
        return FMT_HDR_RGBA;

    default:
        // unknown format: report it and fall back to quantize0 / FMT_LUMINANCE
        astc_codec_internal_error("ERROR: pack_color_endpoints");
        quantize0(output);
        return FMT_LUMINANCE;
    }
}

//============ UNPACKING CODE ==========
void luminance_unpack(int input[2], int quantization_level, ushort4 * output0, ushort4 * output1) {
    int lum0 = color_unquantization_tables[quantization_level][input[0]];
    int lum1 = color_unquantization_tables[quantization_level][input[1]];
    ushort4 um1 = { (ushort)lum0, (ushort)lum0, (ushort)lum0, (ushort)255 };
    ushort4 um2 = { (ushort)lum1, (ushort)lum1, (ushort)lum1, (ushort)255 };
    *output0 = um1;
    *output1 = um2;
}

void luminance_delta_unpack(int input[2], int quantization_level, ushort4 * output0, ushort4 * output1) {
    int v0 = color_unquantization_tables[quantization_level][input[0]];
    int v1 = color_unquantization_tables[quantization_level][input[1]];
    int l0 = (v0 >> 2) | (v1 & 0xC0);
    int l1 = l0 + (v1 & 0x3F);

    if (l1 > 255)
        l1 = 255;

    ushort4 lu40 = { (ushort)l0, (ushort)l0, (ushort)l0, (ushort)255 };
    ushort4 lu41 = { (ushort)l1, (ushort)l1, (ushort)l1, (ushort)255 };

    *output0 = lu40;
    *output1 = lu41;
}

void hdr_luminance_small_range_unpack(int input[2], int quantization_level, ushort4 * output0, ushort4 * output1) {
    int v0 = color_unquantization_tables[quantization_level][input[0]];
    int v1 = color_unquantization_tables[quantization_level][input[1]];

    int y0, y1;
    if (v0 & 0x80) {
        y0 = ((v1 & 0xE0) << 4) | ((v0 & 0x7F) << 2);
        y1 = (v1 & 0x1F) << 2;
    } else {
        y0 = ((v1 & 0xF0) << 4) | ((v0 & 0x7F) << 1);
        y1 = (v1 & 0xF) << 1;
    }

    y1 += y0;
    if (y1 > 0xFFF)
        y1 = 0xFFF;

    ushort4 uy0 = { (ushort)(y0 << 4), (ushort)(y0 << 4), (ushort)(y0 << 4), 0x7800 };
    ushort4 uy1 = { (ushort)(y1 << 4), (ushort)(y1 << 4), (ushort)(y1 << 4), 0x7800 };

    *output0 = uy0;
    *output1 = uy1;
}

void hdr_luminance_large_range_unpack(int input[2], int quantization_level, ushort4 * output0, ushort4 * output1) {
    int v0 = color_unquantization_tables[quantization_level][input[0]];
    int v1 = color_unquantization_tables[quantization_level][input[1]];

    int y0, y1;
    if (v1 >= v0) {
        y0 = v0 << 4;
        y1 = v1 << 4;
    } else {
        y0 = (v1 << 4) + 8;
        y1 = (v0 << 4) - 8;
    }

    ushort4 uy0 = { (ushort)(y0 << 4), (ushort)(y0 << 4), (ushort)(y0 << 4), (ushort)0x7800 };
    ushort4 uy1 = { (ushort)(y1 << 4), (ushort)(y1 << 4), (ushort)(y1 << 4), (ushort)0x7800 };

    *output0 = uy0;
    *output1 = uy1;
}

void luminance_alpha_unpack(int input[4], int quantization_level, ushort4 * output0, ushort4 * output1) {
    int lum0 = color_unquantization_tables[quantization_level][input[0]];
    int lum1 = color_unquantization_tables[quantization_level][input[1]];
    int alpha0 = color_unquantization_tables[quantization_level][input[2]];
    int alpha1 = color_unquantization_tables[quantization_level][input[3]];

    ushort4 lu0 = { (ushort)lum0, (ushort)lum0, (ushort)lum0, (ushort)alpha0 };
    ushort4 lu1 = { (ushort)lum1, (ushort)lum1, (ushort)lum1, (ushort)alpha1 };

    *output0 = lu0;
    *output1 = lu1;
}

void luminance_alpha_delta_unpack(int input[4], int quantization_level, ushort4 * output0, ushort4 * output1) {
    int lum0 = color_unquantization_tables[quantization_level][input[0]];
    int lum1 = color_unquantization_tables[quantization_level][input[1]];
    int alpha0 = color_unquantization_tables[quantization_level][input[2]];
    int alpha1 = color_unquantization_tables[quantization_level][input[3]];

    lum0 |= (lum1 & 0x80) << 1;
    alpha0 |= (alpha1 & 0x80) << 1;
    lum1 &= 0x7F;
    alpha1 &= 0x7F;
    if (lum1 & 0x40)
        lum1 -= 0x80;
    if (alpha1 & 0x40)
        alpha1 -= 0x80;

    lum0 >>= 1;
    lum1 >>= 1;
    alpha0 >>= 1;
    alpha1 >>= 1;
    lum1 += lum0;
    alpha1 += alpha0;

    if (lum1 < 0)
        lum1 = 0;
    else if (lum1 > 255)
        lum1 = 255;

    if (alpha1 < 0)
        alpha1 = 0;
    else if (alpha1 > 255)
        alpha1 = 255;

    ushort4 lu0 = { (ushort)lum0, (ushort)lum0, (ushort)lum0, (ushort)alpha0 };
    ushort4 lu1 = { (ushort)lum1, (ushort)lum1, (ushort)lum1, (ushort)alpha1 };

    *output0 = lu0;
    *output1 = lu1;
}

void rgb_scale_unpack(int input[4], int quantization_level, ushort4 * output0, ushort4 * output1) {
    int ir = color_unquantization_tables[quantization_level][input[0]];
    int ig = color_unquantization_tables[quantization_level][input[1]];
    int ib = color_unquantization_tables[quantization_level][input[2]];

    int iscale = color_unquantization_tables[quantization_level][input[3]];

    ushort4 i0 = { (ushort)ir, (ushort)ig, (ushort)ib, (ushort)255 };
    ushort4 i1 = { (ushort)((ir * iscale) >> 8), (ushort)((ig * iscale) >> 8), (ushort)((ib * iscale) >> 8), (ushort)255 };

    *output1 = i0;
    *output0 = i1;
}

// Unpacks an RGB + scale endpoint with separate alpha values.
// The RGB part is handled by rgb_scale_unpack; input[4] and input[5] then
// supply the w component of *output0 and *output1 respectively.
void rgb_scale_alpha_unpack(int input[6], int quantization_level, ushort4 * output0, ushort4 * output1) {
    // color components first
    rgb_scale_unpack(input, quantization_level, output0, output1);

    // then overwrite the alpha of each endpoint
    (*output0).w = color_unquantization_tables[quantization_level][input[4]];

    (*output1).w = color_unquantization_tables[quantization_level][input[5]];
}

// RGB-offset format
// Unpacks an HDR RGB + offset endpoint.
//
//   input              - four quantized endpoint integers.
//   quantization_level - row selector for color_unquantization_tables.
//   output0, output1   - receive the two endpoints. *output1 is the decoded
//                        color; *output0 is that color minus the decoded scale
//                        value. Both have w set to 0x7800.
void hdr_rgbo_unpack3(int input[4], int quantization_level, ushort4 * output0, ushort4 * output1) {
    int v0 = color_unquantization_tables[quantization_level][input[0]];
    int v1 = color_unquantization_tables[quantization_level][input[1]];
    int v2 = color_unquantization_tables[quantization_level][input[2]];
    int v3 = color_unquantization_tables[quantization_level][input[3]];

    // 4-bit mode value: bits 0..1 from the top two bits of v0, bit 2 from the
    // top bit of v1, bit 3 from the top bit of v2
    int modeval = ((v0 & 0xC0) >> 6) | (((v1 & 0x80) >> 7) << 2) | (((v2 & 0x80) >> 7) << 3);

    // split the mode value into the major component index and the mode number
    int majcomp;
    int mode;
    if ((modeval & 0xC) != 0xC) {
        majcomp = modeval >> 2;
        mode = modeval & 3;
    } else if (modeval != 0xF) {
        majcomp = modeval & 3;
        mode = 4;
    } else {
        majcomp = 0;
        mode = 5;
    }

    // fixed-placement low bits of each field
    int red = v0 & 0x3F;
    int green = v1 & 0x1F;
    int blue = v2 & 0x1F;
    int scale = v3 & 0x1F;

    // seven variable-placement bits; their destination depends on the mode
    int bit0 = (v1 >> 6) & 1;
    int bit1 = (v1 >> 5) & 1;
    int bit2 = (v2 >> 6) & 1;
    int bit3 = (v2 >> 5) & 1;
    int bit4 = (v3 >> 7) & 1;
    int bit5 = (v3 >> 6) & 1;
    int bit6 = (v3 >> 5) & 1;

    // one-hot encoding of the mode; each mask below lists the modes in which
    // the given bit is routed to the given position
    int ohcomp = 1 << mode;

    if (ohcomp & 0x30)
        green |= bit0 << 6;
    if (ohcomp & 0x3A)
        green |= bit1 << 5;
    if (ohcomp & 0x30)
        blue |= bit2 << 6;
    if (ohcomp & 0x3A)
        blue |= bit3 << 5;

    if (ohcomp & 0x3D)
        scale |= bit6 << 5;
    if (ohcomp & 0x2D)
        scale |= bit5 << 6;
    if (ohcomp & 0x04)
        scale |= bit4 << 7;

    if (ohcomp & 0x3B)
        red |= bit4 << 6;
    if (ohcomp & 0x04)
        red |= bit3 << 6;

    if (ohcomp & 0x10)
        red |= bit5 << 7;
    if (ohcomp & 0x0F)
        red |= bit2 << 7;

    if (ohcomp & 0x05)
        red |= bit1 << 8;
    if (ohcomp & 0x0A)
        red |= bit0 << 8;

    if (ohcomp & 0x05)
        red |= bit0 << 9;
    if (ohcomp & 0x02)
        red |= bit6 << 9;

    if (ohcomp & 0x01)
        red |= bit3 << 10;
    if (ohcomp & 0x02)
        red |= bit5 << 10;


    // expand to 12 bits.

    int shamt = shamts[mode];
    red <<= shamt;
    green <<= shamt;
    blue <<= shamt;
    scale <<= shamt;

    // on modes 0 to 4, the values stored for "green" and "blue" are differentials,
    // not absolute values.
    if (mode != 5) {
        green = red - green;
        blue = red - blue;
    }

    // switch around components.
    int temp;
    switch (majcomp) {
    case 1:
        temp = red;
        red = green;
        green = temp;
        break;
    case 2:
        temp = red;
        red = blue;
        blue = temp;
        break;
    default:
        break;
    }


    // first endpoint: the color with the scale value subtracted
    int red0 = red - scale;
    int green0 = green - scale;
    int blue0 = blue - scale;

    // clamp to [0,0xFFF].
    // NOTE: only the lower bound is enforced by the checks below.
    if (red < 0)
        red = 0;
    if (green < 0)
        green = 0;
    if (blue < 0)
        blue = 0;

    if (red0 < 0)
        red0 = 0;
    if (green0 < 0)
        green0 = 0;
    if (blue0 < 0)
        blue0 = 0;

    // shift the results up by 4 bits for the 16-bit output
    ushort4 rgb0 = { (ushort)(red0 << 4), (ushort)(green0 << 4), (ushort)(blue0 << 4), (ushort)0x7800 };
    ushort4 rgb1 = { (ushort)(red << 4), (ushort)(green << 4), (ushort)(blue << 4), (ushort)0x7800 };

    *output0 = rgb0;
    *output1 = rgb1;
}

int rgb_unpack(int input[6], int quantization_level, ushort4 * output0, ushort4 * output1) {

    int ri0b = color_unquantization_tables[quantization_level][input[0]];
    int ri1b = color_unquantization_tables[quantization_level][input[1]];
    int gi0b = color_unquantization_tables[quantization_level][input[2]];
    int gi1b = color_unquantization_tables[quantization_level][input[3]];
    int bi0b = color_unquantization_tables[quantization_level][input[4]];
    int bi1b = color_unquantization_tables[quantization_level][input[5]];

    if (ri0b + gi0b + bi0b > ri1b + gi1b + bi1b) {
        // blue-contraction
        ri0b = (ri0b + bi0b) >> 1;
        gi0b = (gi0b + bi0b) >> 1;
        ri1b = (ri1b + bi1b) >> 1;
        gi1b = (gi1b + bi1b) >> 1;

        (*output0).x = (ushort)ri1b;
        (*output0).y = (ushort)gi1b;
        (*output0).z = (ushort)bi1b;
        (*output0).w = (ushort)255;

        (*output1).x = (ushort)ri0b;
        (*output1).y = (ushort)gi0b;
        (*output1).z = (ushort)bi0b;
        (*output1).w = (ushort)255;
        return 1;
    } else {
        (*output0).x = (ushort)ri0b;
        (*output0).y = (ushort)gi0b;
        (*output0).z = (ushort)bi0b;
        (*output0).w = (ushort)255;

        (*output1).x = (ushort)ri1b;
        (*output1).y = (ushort)gi1b;
        (*output1).z = (ushort)bi1b;
        (*output1).w = (ushort)255;
        return 0;
    }
}

int rgb_delta_unpack(int input[6], int quantization_level, ushort4 * output0, ushort4 * output1) {
    // unquantize the color endpoints
    int r0 = color_unquantization_tables[quantization_level][input[0]];
    int g0 = color_unquantization_tables[quantization_level][input[2]];
    int b0 = color_unquantization_tables[quantization_level][input[4]];

    int r1 = color_unquantization_tables[quantization_level][input[1]];
    int g1 = color_unquantization_tables[quantization_level][input[3]];
    int b1 = color_unquantization_tables[quantization_level][input[5]];

    // perform the bit-transfer procedure
    r0 |= (r1 & 0x80) << 1;
    g0 |= (g1 & 0x80) << 1;
    b0 |= (b1 & 0x80) << 1;
    r1 &= 0x7F;
    g1 &= 0x7F;
    b1 &= 0x7F;
    if (r1 & 0x40)
        r1 -= 0x80;
    if (g1 & 0x40)
        g1 -= 0x80;
    if (b1 & 0x40)
        b1 -= 0x80;

    r0 >>= 1;
    g0 >>= 1;
    b0 >>= 1;
    r1 >>= 1;
    g1 >>= 1;
    b1 >>= 1;

    int rgbsum = r1 + g1 + b1;

    r1 += r0;
    g1 += g0;
    b1 += b0;


    int retval;

    int r0e, g0e, b0e;
    int r1e, g1e, b1e;

    if (rgbsum >= 0) {
        r0e = r0;
        g0e = g0;
        b0e = b0;

        r1e = r1;
        g1e = g1;
        b1e = b1;

        retval = 0;
    } else {
        r0e = (r1 + b1) >> 1;
        g0e = (g1 + b1) >> 1;
        b0e = b1;

        r1e = (r0 + b0) >> 1;
        g1e = (g0 + b0) >> 1;
        b1e = b0;

        retval = 1;
    }

    if (r0e < 0)
        r0e = 0;
    else if (r0e > 255)
        r0e = 255;

    if (g0e < 0)
        g0e = 0;
    else if (g0e > 255)
        g0e = 255;

    if (b0e < 0)
        b0e = 0;
    else if (b0e > 255)
        b0e = 255;

    if (r1e < 0)
        r1e = 0;
    else if (r1e > 255)
        r1e = 255;

    if (g1e < 0)
        g1e = 0;
    else if (g1e > 255)
        g1e = 255;

    if (b1e < 0)
        b1e = 0;
    else if (b1e > 255)
        b1e = 255;

    (*output0).x = (ushort)r0e;
    (*output0).y = (ushort)g0e;
    (*output0).z = (ushort)b0e;
    (*output0).w = (ushort)0xFF;

    (*output1).x = (ushort)r1e;
    (*output1).y = (ushort)g1e;
    (*output1).z = (ushort)b1e;
    (*output1).w = (ushort)0xFF;

    return retval;
}

// Decodes an HDR RGB endpoint pair from six quantized input values.
//
//   input              - six quantized endpoint values (input[0..5])
//   quantization_level - selects the row of color_unquantization_tables used to unquantize them
//   output0, output1   - receive the two decoded endpoints; .w is always written as 0x7800
//                        NOTE(review): 0x7800 is presumably the HDR encoding of "alpha = 1.0" - confirm.
//
// The six unquantized bytes carry a 3-bit mode (top bits of v1..v3) and a 2-bit
// "major component" selector (top bits of v4, v5). Selector value 3 is a direct
// encoding; any other value uses the a/b/c/d base-plus-offset encoding below.
void hdr_rgb_unpack3(int input[6], int quantization_level, ushort4 * output0, ushort4 * output1) {

    // unquantize the six endpoint values
    int v0 = color_unquantization_tables[quantization_level][input[0]];
    int v1 = color_unquantization_tables[quantization_level][input[1]];
    int v2 = color_unquantization_tables[quantization_level][input[2]];
    int v3 = color_unquantization_tables[quantization_level][input[3]];
    int v4 = color_unquantization_tables[quantization_level][input[4]];
    int v5 = color_unquantization_tables[quantization_level][input[5]];

    // extract all the fixed-placement bitfields
    // modeval: 3-bit value assembled from bit 7 of v1 (LSB), v2 and v3 (MSB)
    int modeval = ((v1 & 0x80) >> 7) | (((v2 & 0x80) >> 7) << 1) | (((v3 & 0x80) >> 7) << 2);

    // majcomp: 2-bit value assembled from bit 7 of v4 (LSB) and v5 (MSB)
    int majcomp = ((v4 & 0x80) >> 7) | (((v5 & 0x80) >> 7) << 1);

    // majcomp == 3: direct encoding. v0/v2/v4 form endpoint 0 and v1/v3/v5 form endpoint 1;
    // the third channel only has 7 payload bits, hence the extra shift (<< 9 instead of << 8).
    if (majcomp == 3) {
        ushort4 uv0 = { (ushort)(v0 << 8), (ushort)(v2 << 8), (ushort)((v4 & 0x7F) << 9), (ushort)0x7800 };
        ushort4 uv1 = { (ushort)(v1 << 8), (ushort)(v3 << 8), (ushort)((v5 & 0x7F) << 9), (ushort)0x7800 };

        *output0 = uv0;
        *output1 = uv1;
        return;
    }

    // fixed-placement parts of the six fields: base value 'a', offsets 'b0'/'b1',
    // offset 'c' and the signed offsets 'd0'/'d1'
    int a = v0 | ((v1 & 0x40) << 2);
    int b0 = v2 & 0x3f;
    int b1 = v3 & 0x3f;
    int c = v1 & 0x3f;
    int d0 = v4 & 0x7f;
    int d1 = v5 & 0x7f;


    // number of significant bits in 'd0' and 'd1' for this mode (used for sign extension below)
    int dbits = dbits_tab[modeval];

    // extract six variable-placement bits
    int bit0 = (v2 >> 6) & 1;
    int bit1 = (v3 >> 6) & 1;

    int bit2 = (v4 >> 6) & 1;
    int bit3 = (v5 >> 6) & 1;
    int bit4 = (v4 >> 5) & 1;
    int bit5 = (v5 >> 5) & 1;


    // and prepend the variable-placement bits depending on mode.
    // Each mask below is a set of modes: bit N of the mask is set when mode N uses that placement.
    int ohmod = 1 << modeval;    // one-hot-mode
    // bit 9 of 'a'
    if (ohmod & 0xA4)
        a |= bit0 << 9;
    if (ohmod & 0x8)
        a |= bit2 << 9;
    if (ohmod & 0x50)
        a |= bit4 << 9;

    // bit 10 of 'a'
    if (ohmod & 0x50)
        a |= bit5 << 10;
    if (ohmod & 0xA0)
        a |= bit1 << 10;

    // bit 11 of 'a'
    if (ohmod & 0xC0)
        a |= bit2 << 11;

    // bit 6 of 'c'
    if (ohmod & 0x4)
        c |= bit1 << 6;
    if (ohmod & 0xE8)
        c |= bit3 << 6;

    // bit 7 of 'c'
    if (ohmod & 0x20)
        c |= bit2 << 7;


    // bit 6 of 'b0' / 'b1'
    if (ohmod & 0x5B)
        b0 |= bit0 << 6;
    if (ohmod & 0x5B)
        b1 |= bit1 << 6;

    // bit 7 of 'b0' / 'b1'
    if (ohmod & 0x12)
        b0 |= bit2 << 7;
    if (ohmod & 0x12)
        b1 |= bit3 << 7;

    // bits 5 and 6 of 'd0' / 'd1'
    if (ohmod & 0xAF)
        d0 |= bit4 << 5;
    if (ohmod & 0xAF)
        d1 |= bit5 << 5;
    if (ohmod & 0x5)
        d0 |= bit2 << 6;
    if (ohmod & 0x5)
        d1 |= bit3 << 6;

    // sign-extend 'd0' and 'd1'
    // note: this code assumes that signed right-shift actually sign-fills, not zero-fills.
    // The value is shifted up so that bit (dbits - 1) lands in bit 31, then shifted back down.
    int32_t d0x = d0;
    int32_t d1x = d1;
    int sx_shamt = 32 - dbits;
    d0x <<= sx_shamt;
    d0x >>= sx_shamt;
    d1x <<= sx_shamt;
    d1x >>= sx_shamt;
    d0 = d0x;
    d1 = d1x;

    // expand all values to 12 bits, with left-shift as needed.
    // shift amount is 3 for modes 0-1, 2 for modes 2-3, 1 for modes 4-5 and 0 for modes 6-7
    int val_shamt = (modeval >> 1) ^ 3;
    a <<= val_shamt;
    b0 <<= val_shamt;
    b1 <<= val_shamt;
    c <<= val_shamt;
    d0 <<= val_shamt;
    d1 <<= val_shamt;

    // then compute the actual color values.
    // endpoint 1 is derived from 'a' and the 'b' offsets; endpoint 0 additionally subtracts 'c' and 'd'.
    int red1 = a;
    int green1 = a - b0;
    int blue1 = a - b1;
    int red0 = a - c;
    int green0 = a - b0 - c - d0;
    int blue0 = a - b1 - c - d1;

    // clamp the color components to [0,2^12 - 1]
    if (red0 < 0)
        red0 = 0;
    else if (red0 > 0xFFF)
        red0 = 0xFFF;

    if (green0 < 0)
        green0 = 0;
    else if (green0 > 0xFFF)
        green0 = 0xFFF;

    if (blue0 < 0)
        blue0 = 0;
    else if (blue0 > 0xFFF)
        blue0 = 0xFFF;

    if (red1 < 0)
        red1 = 0;
    else if (red1 > 0xFFF)
        red1 = 0xFFF;

    if (green1 < 0)
        green1 = 0;
    else if (green1 > 0xFFF)
        green1 = 0xFFF;

    if (blue1 < 0)
        blue1 = 0;
    else if (blue1 > 0xFFF)
        blue1 = 0xFFF;


    // switch around the color components
    // (the values above were computed as if red were the major component)
    int temp0, temp1;
    switch (majcomp) {
    case 1:                    // switch around red and green
        temp0 = red0;
        temp1 = red1;
        red0 = green0;
        red1 = green1;
        green0 = temp0;
        green1 = temp1;
        break;
    case 2:                    // switch around red and blue
        temp0 = red0;
        temp1 = red1;
        red0 = blue0;
        red1 = blue1;
        blue0 = temp0;
        blue1 = temp1;
        break;
    case 0:                    // no switch
        break;
    }

    // widen the clamped 12-bit values to 16 bits; .w gets the fixed constant 0x7800
    ushort4 rgb0 = { (ushort)(red0 << 4), (ushort)(green0 << 4), (ushort)(blue0 << 4), (ushort)0x7800 };
    ushort4 rgb1 = { (ushort)(red1 << 4), (ushort)(green1 << 4), (ushort)(blue1 << 4), (ushort)0x7800 };

    *output0 = rgb0;
    *output1 = rgb1;
}

// Decodes an RGBA endpoint pair: the colour channels come from rgb_unpack() and the
// alpha channels from input[6] / input[7].
// rgb_unpack() returns an ordering value; when it is non-zero the two alpha values are
// exchanged between output0 and output1.
void rgba_unpack(int input[8], int quantization_level, ushort4 * output0, ushort4 * output1) {
    const int exchanged = (rgb_unpack(input, quantization_level, output0, output1) != 0);

    // pick which of the two alpha slots feeds which endpoint
    const int alpha_slot0 = exchanged ? 7 : 6;
    const int alpha_slot1 = exchanged ? 6 : 7;

    (*output0).w = color_unquantization_tables[quantization_level][input[alpha_slot0]];
    (*output1).w = color_unquantization_tables[quantization_level][input[alpha_slot1]];
}

// Decodes an endpoint pair with HDR colour channels and separately stored alpha:
// RGB is produced by hdr_rgb_unpack3(), alpha is the plain unquantized value of
// input[6] (endpoint 0) and input[7] (endpoint 1).
void hdr_rgb_ldr_alpha_unpack3(int input[8], int quantization_level, ushort4 * output0, ushort4 * output1) {
    // look up both alpha values first; the table read does not depend on the RGB decode
    const int alpha_first  = color_unquantization_tables[quantization_level][input[6]];
    const int alpha_second = color_unquantization_tables[quantization_level][input[7]];

    // fills x/y/z and a fixed .w, which is overwritten just below
    hdr_rgb_unpack3(input, quantization_level, output0, output1);

    (*output0).w = (ushort)alpha_first;
    (*output1).w = (ushort)alpha_second;
}

// Decodes an RGBA endpoint pair stored in base + delta form.
// Alpha is reconstructed here from input[6] (base) and input[7] (delta); the colour
// channels are delegated to rgb_delta_unpack(), whose return value tells whether the
// two alpha values have to be exchanged between the outputs.
void rgba_delta_unpack(int input[8], int quantization_level, ushort4 * output0, ushort4 * output1) {
    int alpha_base  = color_unquantization_tables[quantization_level][input[6]];
    int alpha_delta = color_unquantization_tables[quantization_level][input[7]];

    // bit transfer: the top bit of the delta byte becomes bit 8 of the base
    alpha_base |= (alpha_delta & 0x80) << 1;
    alpha_delta &= 0x7F;

    // the remaining 7-bit delta is signed; bit 6 is its sign bit
    if (alpha_delta & 0x40)
        alpha_delta -= 0x80;

    alpha_base >>= 1;
    alpha_delta >>= 1;

    // second alpha = base + delta, clamped to [0, 255]
    int alpha_sum = alpha_delta + alpha_base;
    alpha_sum = (alpha_sum < 0) ? 0 : ((alpha_sum > 255) ? 255 : alpha_sum);

    if (rgb_delta_unpack(input, quantization_level, output0, output1) != 0) {
        // colour decode reported the exchanged ordering: alpha follows it
        (*output0).w = (ushort)alpha_sum;
        (*output1).w = (ushort)alpha_base;
    } else {
        (*output0).w = (ushort)alpha_base;
        (*output1).w = (ushort)alpha_sum;
    }
}

// Decodes a pair of HDR alpha values from two quantized inputs.
//
//   input              - two quantized values (input[0], input[1])
//   quantization_level - selects the row of color_unquantization_tables
//   a0, a1             - receive the decoded values (already shifted left by 4 on return)
//
// Bit 7 of each unquantized byte forms a 2-bit mode. Mode 3 stores both values
// directly; modes 0..2 store a base value plus a signed offset whose width shrinks
// as the mode number grows.
void hdr_alpha_unpack(int input[2], int quantization_level, int *a0, int *a1) {

    int base_val   = color_unquantization_tables[quantization_level][input[0]];
    int offset_val = color_unquantization_tables[quantization_level][input[1]];

    const int mode = ((base_val >> 7) & 1) | ((offset_val >> 6) & 2);
    base_val   &= 0x7F;
    offset_val &= 0x7F;

    if (mode != 3) {
        const int sign_bit = 32 >> mode;    // sign bit position of the offset for this mode
        const int widen    = 4 - mode;      // left shift that brings both values to 12 bits

        // the upper bits of the offset byte extend the base value
        base_val |= (offset_val << (mode + 1)) & 0x780;

        // keep the offset bits and sign-extend them (xor/subtract trick)
        offset_val &= (0x3f >> mode);
        offset_val ^= sign_bit;
        offset_val -= sign_bit;

        base_val   <<= widen;
        offset_val <<= widen;

        // second value = base + offset, clamped to 12 bits
        offset_val += base_val;
        if (offset_val < 0)
            offset_val = 0;
        else if (offset_val > 0xFFF)
            offset_val = 0xFFF;

        *a0 = base_val;
        *a1 = offset_val;
    } else {
        // direct mode: two independent 7-bit values
        *a0 = base_val << 5;
        *a1 = offset_val << 5;
    }

    // final widening from 12 to 16 bits
    *a0 <<= 4;
    *a1 <<= 4;
}

void hdr_rgb_hdr_alpha_unpack3(int input[8], int quantization_level, ushort4 * output0, ushort4 * output1) {
    hdr_rgb_unpack3(input, quantization_level, output0, output1);

    int alpha0, alpha1;
    hdr_alpha_unpack(input + 6, quantization_level, &alpha0, &alpha1);

    (*output0).w = (ushort)alpha0;
    (*output1).w = (ushort)alpha1;
}

void unpack_color_endpoints(
    int format,
    int quantization_level,
    int *input,
    int *rgb_hdr,                           // out
    int *alpha_hdr,                         // out
    int *nan_endpoint,                      // out
    ushort4 * output0, ushort4 * output1,   // out
    __global ASTC_Encode *ASTCEncode) {
    DEBUG("unpack_color_endpoints");
    *nan_endpoint = 0;

//#ifdef __OPENCL_VERSION__
//    if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//        printf("format %d quantization_level %d *input %d\n",  format, quantization_level, *input);


    switch (format) {
    case FMT_LUMINANCE:
        *rgb_hdr = 0;
        *alpha_hdr = 0;
        luminance_unpack(input, quantization_level, output0, output1);
        break;

    case FMT_LUMINANCE_DELTA:
        *rgb_hdr = 0;
        *alpha_hdr = 0;
        luminance_delta_unpack(input, quantization_level, output0, output1);
        break;

    case FMT_HDR_LUMINANCE_SMALL_RANGE:
        *rgb_hdr = 1;
        *alpha_hdr = -1;
        hdr_luminance_small_range_unpack(input, quantization_level, output0, output1);
        break;

    case FMT_HDR_LUMINANCE_LARGE_RANGE:
        *rgb_hdr = 1;
        *alpha_hdr = -1;
        hdr_luminance_large_range_unpack(input, quantization_level, output0, output1);
        break;

    case FMT_LUMINANCE_ALPHA:
        *rgb_hdr = 0;
        *alpha_hdr = 0;
        luminance_alpha_unpack(input, quantization_level, output0, output1);
        break;

    case FMT_LUMINANCE_ALPHA_DELTA:
        *rgb_hdr = 0;
        *alpha_hdr = 0;
        luminance_alpha_delta_unpack(input, quantization_level, output0, output1);
        break;

    case FMT_RGB_SCALE:
        *rgb_hdr = 0;
        *alpha_hdr = 0;
        rgb_scale_unpack(input, quantization_level, output0, output1);
        break;

    case FMT_RGB_SCALE_ALPHA:
        *rgb_hdr = 0;
        *alpha_hdr = 0;
        rgb_scale_alpha_unpack(input, quantization_level, output0, output1);
        break;

    case FMT_HDR_RGB_SCALE:
        *rgb_hdr = 1;
        *alpha_hdr = -1;
        hdr_rgbo_unpack3(input, quantization_level, output0, output1);
        break;

    case FMT_RGB:
        *rgb_hdr = 0;
        *alpha_hdr = 0;
        rgb_unpack(input, quantization_level, output0, output1);
        break;

    case FMT_RGB_DELTA:
        *rgb_hdr = 0;
        *alpha_hdr = 0;
        rgb_delta_unpack(input, quantization_level, output0, output1);
        break;

    case FMT_HDR_RGB:
        *rgb_hdr = 1;
        *alpha_hdr = -1;
        hdr_rgb_unpack3(input, quantization_level, output0, output1);
        break;

    case FMT_RGBA:
        *rgb_hdr = 0;
        *alpha_hdr = 0;
        rgba_unpack(input, quantization_level, output0, output1);
        break;

    case FMT_RGBA_DELTA:
        *rgb_hdr = 0;
        *alpha_hdr = 0;
        rgba_delta_unpack(input, quantization_level, output0, output1);
        break;

    case FMT_HDR_RGB_LDR_ALPHA:
        *rgb_hdr = 1;
        *alpha_hdr = 0;
        hdr_rgb_ldr_alpha_unpack3(input, quantization_level, output0, output1);
        break;

    case FMT_HDR_RGBA:
        *rgb_hdr = 1;
        *alpha_hdr = 1;
        hdr_rgb_hdr_alpha_unpack3(input, quantization_level, output0, output1);
        break;

    default:
        astc_codec_internal_error("ERROR: unpack_color_endpoints");
        break;
    }


    if (*alpha_hdr == -1) {
        if (ASTCEncode->m_alpha_force_use_of_hdr) {
            (*output0).w = 0x7800;
            (*output1).w = 0x7800;
            *alpha_hdr = 1;
        } else {
            (*output0).w = 0x00FF;
            (*output1).w = 0x00FF;
            *alpha_hdr = 0;
        }
    }


    switch (ASTCEncode->m_decode_mode) {
    case DECODE_LDR_SRGB:
        if (*rgb_hdr == 1) {
            (*output0).x = 0xFF00;
            (*output0).y = 0x0000;
            (*output0).z = 0xFF00;
            (*output0).w = 0xFF00;
            (*output1).x = 0xFF00;
            (*output1).y = 0x0000;
            (*output1).z = 0xFF00;
            (*output1).w = 0xFF00;
        } else {
            (*output0).x *= 257;
            (*output0).y *= 257;
            (*output0).z *= 257;
            (*output0).w *= 257;
            (*output1).x *= 257;
            (*output1).y *= 257;
            (*output1).z *= 257;
            (*output1).w *= 257;
        }
        *rgb_hdr = 0;
        *alpha_hdr = 0;
        break;

    case DECODE_LDR:
        if (*rgb_hdr == 1) {
            (*output0).x = 0xFFFF;
            (*output0).y = 0xFFFF;
            (*output0).z = 0xFFFF;
            (*output0).w = 0xFFFF;
            (*output1).x = 0xFFFF;
            (*output1).y = 0xFFFF;
            (*output1).z = 0xFFFF;
            (*output1).w = 0xFFFF;
            *nan_endpoint = 1;
        } else {
            (*output0).x *= 257;
            (*output0).y *= 257;
            (*output0).z *= 257;
            (*output0).w *= 257;
            (*output1).x *= 257;
            (*output1).y *= 257;
            (*output1).z *= 257;
            (*output1).w *= 257;
        }
        *rgb_hdr = 0;
        *alpha_hdr = 0;
        break;

    case DECODE_HDR:

        if (*rgb_hdr == 0) {
            (*output0).x *= 257;
            (*output0).y *= 257;
            (*output0).z *= 257;
            (*output1).x *= 257;
            (*output1).y *= 257;
            (*output1).z *= 257;
        }
        if (*alpha_hdr == 0) {
            (*output0).w *= 257;
            (*output1).w *= 257;
        }
        break;
    }

//#ifdef __OPENCL_VERSION__
//    if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//        printf("(*output0) %d %d %d %d (*output1) %d %d %d %d \n", (*output0).x, (*output0).y, (*output0).z, (*output0).w, (*output1).x, (*output1).y, (*output1).z, (*output1).w);
//


}

// Integer interpolation between two endpoints.
//
//   color0, color1 - the two endpoints
//   tmp_weight1    - per-channel weight of color1; color0 is weighted with (64 - tmp_weight1)
//   ASTCEncode     - supplies m_decode_mode
//
// Returns (color0 * (64 - w) + color1 * w + 32) >> 6 per channel. In DECODE_LDR_SRGB mode
// the endpoints are first reduced to their upper 8 bits and the result is expanded again
// by OR-ing it with itself shifted left by 8.
ushort4 lerp_color_int(
    ushort4 color0,
    ushort4 color1,
    int4 tmp_weight1,
    __global ASTC_Encode *ASTCEncode
) {
    DEBUG("lerp_color_int");
    const int srgb_decode = (ASTCEncode->m_decode_mode == DECODE_LDR_SRGB);

    int4 endpoint_lo = { color0.x, color0.y, color0.z, color0.w };
    int4 endpoint_hi = { color1.x, color1.y, color1.z, color1.w };
    if (srgb_decode) {
        endpoint_lo = endpoint_lo >> 8;
        endpoint_hi = endpoint_hi >> 8;
    }

    // complementary weight for the first endpoint
    int4 weight_total = { 64, 64, 64, 64 };
    int4 weight_lo = weight_total - tmp_weight1;

    // rounding term for the final division by 64
    int4 rounding = { 32, 32, 32, 32 };

    int4 term_lo = endpoint_lo * weight_lo;
    int4 term_hi = endpoint_hi * tmp_weight1;

    int4 blended = term_lo + term_hi + rounding;
    blended = blended >> 6;
    if (srgb_decode)
        blended = blended | (blended << 8);

    ushort4 result = { (ushort)blended.x, (ushort)blended.y, (ushort)blended.z, (ushort)blended.w };
    return result;
}

// Builds the per-channel integer weights for one texel and interpolates its colour.
//
//   color0, color1         - endpoints of the texel's partition
//   plane1_weight          - weight used for every channel by default (floored to int)
//   plane2_weight          - weight used for the channel selected below (floored to int)
//   plane2_color_component - 0..3 selects x/y/z/w for plane2_weight; any other value
//                            leaves all four channels on plane1_weight
//   ASTCEncode             - forwarded to lerp_color_int()
ushort4 COMPUTE_LRP_COLOR(
    ushort4                 color0,
    ushort4                 color1,
    float                   plane1_weight,
    float                   plane2_weight,
    int                     plane2_color_component,
    __global ASTC_Encode    *ASTCEncode
) {
    DEBUG("COMPUTE_LRP_COLOR");
    const int first_plane_w  = (int)floor(plane1_weight);
    const int second_plane_w = (int)floor(plane2_weight);

    //# Patch for LLVM intrinsic space error: the vector is filled component by component
    int4 channel_weights;
    channel_weights.x = first_plane_w;
    channel_weights.y = first_plane_w;
    channel_weights.z = first_plane_w;
    channel_weights.w = first_plane_w;

    // the channel assigned to the second plane takes its own weight
    if (plane2_color_component == 0) {
        channel_weights.x = second_plane_w;
    } else if (plane2_color_component == 1) {
        channel_weights.y = second_plane_w;
    } else if (plane2_color_component == 2) {
        channel_weights.z = second_plane_w;
    } else if (plane2_color_component == 3) {
        channel_weights.w = second_plane_w;
    }

//#ifdef __OPENCL_VERSION__
//    if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//        printf("color0 %d %d %d %d color1 %d %d %d %d \n", color0.x, color0.y, color0.z, color0.w, color1.x, color1.y, color1.z, color1.w);

//#ifdef __OPENCL_VERSION__
//    if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//        printf("tmp_weight %d %d %d %d\n", channel_weights.x, channel_weights.y, channel_weights.z, channel_weights.w);

    return lerp_color_int(color0, color1, channel_weights, ASTCEncode);
}

// for each weight, unquantize the weight, use it to compute a color and a color error.
// then, increment the weight until the color error stops decreasing
// then, decrement the weight until the color error stops increasing
void COMPUTE_ERROR(
    float                   *errorvar,
    int                     texels_to_evaluate,
    __global decimation_table        *it,
    __global partition_info          *pt,
    float                   uq_plane1_weights[MAX_WEIGHTS_PER_BLOCK],
    float                   uq_plane2_weights[MAX_WEIGHTS_PER_BLOCK],
    int                     is_dual_plane,
    ushort4                 color_endpoint0[4],
    ushort4                 color_endpoint1[4],
    int                     i,
    imageblock              *blk,
    int                     plane2_color_component,
    error_weight_block      *ewb,
    __global ASTC_Encode    *ASTCEncode) {
    DEBUG("COMPUTE_ERROR");
    *errorvar = 0.0f;
    for (int j = 0; j < texels_to_evaluate; j++) {
        int texel = it->weight_texel[i][j];
        int partition = pt->partition_of_texel[texel];
        float plane1_weight = compute_value_of_texel_flt_localVar(texel, it, uq_plane1_weights);
        float plane2_weight = 0.0f;
        if (is_dual_plane) {
            plane2_weight = compute_value_of_texel_flt_localVar(texel, it, uq_plane2_weights);
        }

//#ifdef __OPENCL_VERSION__
//        if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//        printf("plane1_weight %3.3f plane2_weight %3.3f  plane2_color_component %d\n", plane1_weight, plane2_weight, plane2_color_component);

        ushort4 lrp_color = COMPUTE_LRP_COLOR(
                                color_endpoint0[partition],
                                color_endpoint1[partition],
                                plane1_weight,
                                plane2_weight,
                                plane2_color_component,
                                ASTCEncode);

        float4 color = { (float)lrp_color.x, (float)lrp_color.y, (float)lrp_color.z, (float)lrp_color.w };
        float4 origcolor = {
            blk->work_data[4 * texel],
            blk->work_data[4 * texel + 1],
            blk->work_data[4 * texel + 2],
            blk->work_data[4 * texel + 3]
        };

//#ifdef __OPENCL_VERSION__
//        if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//            printf("color %3.3f %3.3f %3.3f %3.3f origcolor %3.3f %3.3f %3.3f %3.3f \n", color.x, color.y, color.z, color.w, origcolor.x, origcolor.y, origcolor.z, origcolor.w);

        float4 error_weight = ewb->error_weights[texel];
        float4 colordiff = color - origcolor;
        *errorvar += dot(colordiff*colordiff, error_weight);

//#ifdef __OPENCL_VERSION__
//        if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//            printf("%3.3f\n", *errorvar);

    }


}

// Greedy per-weight refinement of an already quantized weight set.
//
// For every weight the current colour error of its texels is measured with COMPUTE_ERROR().
// The weight is then stepped to the next quantized value for as long as the error strictly
// decreases, and afterwards to the previous quantized value under the same rule. In
// dual-plane mode the same procedure is repeated for the second weight plane.
//
//   blk                - image block being encoded (source colours)
//   ewb                - per-texel error weights
//   scb                - symbolic block; supplies block mode, partitioning and colour endpoints
//   weight_set8        - in/out: quantized weights of plane 1
//   plane2_weight_set8 - in/out: quantized weights of plane 2 (touched only in dual-plane mode)
//   ASTCEncode         - encoder state (block size descriptor, partition tables, decode mode)
//
// Returns the number of weight steps that were accepted.
int realign_weights(
    imageblock * blk,
    error_weight_block * ewb,
    symbolic_compressed_block * scb,
    __global2 uint8_t * weight_set8,
    __global2 uint8_t * plane2_weight_set8,
    __global ASTC_Encode *ASTCEncode) {
    DEBUG("realign_weights");
    int i;

    // get the appropriate partition descriptor.
    int partition_count = scb->partition_count;

    // get the appropriate block descriptor
    int modeindex = ASTCEncode->bsd.block_modes[scb->block_mode].decimation_mode;

    int is_dual_plane = ASTCEncode->bsd.block_modes[scb->block_mode].is_dual_plane;

    // get quantization-parameters
    int weight_quantization_level = ASTCEncode->bsd.block_modes[scb->block_mode].quantization_mode;

    // the decimation table and partition table do not change while refining; look them up once
    __global decimation_table *it = &ASTCEncode->bsd.decimation_tables[modeindex];
    __global partition_info *pt = &ASTCEncode->partition_tables[partition_count][scb->partition_index];

    // decode the color endpoints
    ushort4 color_endpoint0[4];
    ushort4 color_endpoint1[4];
    int rgb_hdr[4];
    int alpha_hdr[4];
    int nan_endpoint[4];

    for (i = 0; i < partition_count; i++)
        unpack_color_endpoints(
            scb->color_formats[i], scb->color_quantization_level, scb->color_values[i], &rgb_hdr[i], &alpha_hdr[i],
            &nan_endpoint[i], &(color_endpoint0[i]), &(color_endpoint1[i]), ASTCEncode);

    float uq_plane1_weights[MAX_WEIGHTS_PER_BLOCK];
    float uq_plane2_weights[MAX_WEIGHTS_PER_BLOCK];
    int weight_count = it->num_weights;

    // read and unquantize the weights.
    __constant quantization_and_transfer_table *qat = &(quant_and_xfer_tables[weight_quantization_level]);

    for (i = 0; i < weight_count; i++) {
        uq_plane1_weights[i] = qat->unquantized_value_flt[weight_set8[i]];
    }
    if (is_dual_plane) {
        for (i = 0; i < weight_count; i++)
            uq_plane2_weights[i] = qat->unquantized_value_flt[plane2_weight_set8[i]];
    }


    int plane2_color_component = is_dual_plane ? scb->plane2_color_component : -1;

    // for each weight, unquantize the weight, use it to compute a color and a color error.
    // then, increment the weight until the color error stops decreasing
    // then, decrement the weight until the color error stops decreasing

    int adjustments = 0;

    // processing of the first plane of weights
    for (i = 0; i < weight_count; i++) {
        uint8_t current_wt = weight_set8[i];
        int texels_to_evaluate = it->weight_num_texels[i];

        float current_error;

        COMPUTE_ERROR(
            &current_error,
            texels_to_evaluate,
            it,
            pt,
            uq_plane1_weights,
            uq_plane2_weights,
            is_dual_plane,
            color_endpoint0,
            color_endpoint1,
            i,
            blk,
            plane2_color_component,
            ewb,
            ASTCEncode);

        // increment until error starts increasing.
        while (1) {
            uint8_t next_wt = qat->next_quantized_value[current_wt];
            if (next_wt == current_wt)
                break;                  // already at the largest quantized value
            uq_plane1_weights[i] = qat->unquantized_value_flt[next_wt];
            float next_error;

            COMPUTE_ERROR(
                &next_error,
                texels_to_evaluate,
                it,
                pt,
                uq_plane1_weights,
                uq_plane2_weights,
                is_dual_plane,
                color_endpoint0,
                color_endpoint1,
                i,
                blk,
                plane2_color_component,
                ewb,
                ASTCEncode);

            if (next_error < current_error) {
                // succeeded, increment the weight
                current_wt = next_wt;
                current_error = next_error;
                adjustments++;
            } else {
                // failed, back out the attempted increment
                uq_plane1_weights[i] = qat->unquantized_value_flt[current_wt];
                break;
            }
        }
        // decrement until error starts increasing
        while (1) {
            uint8_t prev_wt = qat->prev_quantized_value[current_wt];
            if (prev_wt == current_wt)
                break;                  // already at the smallest quantized value
            uq_plane1_weights[i] = qat->unquantized_value_flt[prev_wt];
            float prev_error;
            COMPUTE_ERROR(
                &prev_error,
                texels_to_evaluate,
                it,
                pt,
                uq_plane1_weights,
                uq_plane2_weights,
                is_dual_plane,
                color_endpoint0,
                color_endpoint1,
                i,
                blk,
                plane2_color_component,
                ewb,
                ASTCEncode);
            if (prev_error < current_error) {
                // succeeded, decrement the weight
                current_wt = prev_wt;
                current_error = prev_error;
                adjustments++;
            } else {
                // failed, back out the attempted decrement
                uq_plane1_weights[i] = qat->unquantized_value_flt[current_wt];
                break;
            }
        }

        weight_set8[i] = current_wt;
    }

    if (!is_dual_plane)
        return adjustments;

    // processing of the second plane of weights
    for (i = 0; i < weight_count; i++) {
        uint8_t current_wt = plane2_weight_set8[i];
        int texels_to_evaluate = it->weight_num_texels[i];

        float current_error;

        COMPUTE_ERROR(
            &current_error,
            texels_to_evaluate,
            it,
            pt,
            uq_plane1_weights,
            uq_plane2_weights,
            is_dual_plane,
            color_endpoint0,
            color_endpoint1,
            i,
            blk,
            plane2_color_component,
            ewb,
            ASTCEncode);

        // increment until error starts increasing.
        while (1) {
            uint8_t next_wt = qat->next_quantized_value[current_wt];
            if (next_wt == current_wt)
                break;
            uq_plane2_weights[i] = qat->unquantized_value_flt[next_wt];
            float next_error;
            COMPUTE_ERROR(
                &next_error,
                texels_to_evaluate,
                it,
                pt,
                uq_plane1_weights,
                uq_plane2_weights,
                is_dual_plane,
                color_endpoint0,
                color_endpoint1,
                i,
                blk,
                plane2_color_component,
                ewb,
                ASTCEncode);

            if (next_error < current_error) {
                // succeeded, increment the weight
                current_wt = next_wt;
                current_error = next_error;
                adjustments++;
            } else {
                // failed, back out the attempted increment
                uq_plane2_weights[i] = qat->unquantized_value_flt[current_wt];
                break;
            }
        }
        // decrement until error starts increasing
        while (1) {
            uint8_t prev_wt = qat->prev_quantized_value[current_wt];
            if (prev_wt == current_wt)
                break;
            // The trial value must go into the plane-2 array: this loop refines plane 2, and the
            // back-out below restores uq_plane2_weights[i] only. Writing it to the plane-1 array
            // would both evaluate the wrong plane and leave the refined plane-1 weight corrupted.
            uq_plane2_weights[i] = qat->unquantized_value_flt[prev_wt];
            float prev_error;
            COMPUTE_ERROR(
                &prev_error,
                texels_to_evaluate,
                it,
                pt,
                uq_plane1_weights,
                uq_plane2_weights,
                is_dual_plane,
                color_endpoint0,
                color_endpoint1,
                i,
                blk,
                plane2_color_component,
                ewb,
                ASTCEncode);

            if (prev_error < current_error) {
                // succeeded, decrement the weight
                current_wt = prev_wt;
                current_error = prev_error;
                adjustments++;
            } else {
                // failed, back out the attempted decrement
                uq_plane2_weights[i] = qat->unquantized_value_flt[current_wt];
                break;
            }
        }

        plane2_weight_set8[i] = current_wt;
    }

    return adjustments;
}

void compress_symbolic_block_fixed_partition_1_plane(
    float mode_cutoff,
    int max_refinement_iters,
    int partition_count, int partition_index,
    imageblock * blk,
    error_weight_block * ewb,
    symbolic_compressed_block * scb,
    endpoints_and_weights *ei1,
    endpoints_and_weights *eix1,
    __global2 float   *decimated_weights,
    __global2 uint8_t *u8_quantized_decimated_quantized_weights,
    __global2 float   *decimated_quantized_weights,
    __global2 float   *flt_quantized_decimated_quantized_weights,
    __global ASTC_Encode *ASTCEncode) {
    DEBUG("compress_symbolic_block_fixed_partition_1_plane");

    int i, j, k;
    int free_bits_for_partition_count[5] = { 0, 115 - 4, 111 - 4 - PARTITION_BITS, 108 - 4 - PARTITION_BITS, 105 - 4 - PARTITION_BITS };

    // first, compute ideal weights and endpoint colors, under thre assumption that
    // there is no quantization or decimation going on.
    compute_endpoints_and_ideal_weights_1_plane(&ASTCEncode->partition_tables[partition_count][partition_index], blk, ewb, ei1, ASTCEncode);

    // for each decimation mode, compute an ideal set of weights
    // (that is, weights computed with the assumption that they are not quantized)
    for (i = 0; i < MAX_DECIMATION_MODES; i++) {
        if (ASTCEncode->bsd.permit_encode[i] == 0 ||
                ASTCEncode->bsd.decimation_mode_maxprec_1plane[i] < 0 ||
                ASTCEncode->bsd.decimation_mode_percentile[i] > mode_cutoff)
            continue;
        eix1[i] = *ei1;
        compute_ideal_weights_for_decimation_table(&(eix1[i]),
                &ASTCEncode->bsd.decimation_tables[i],
                decimated_quantized_weights + i * MAX_WEIGHTS_PER_BLOCK,
                decimated_weights + i * MAX_WEIGHTS_PER_BLOCK);
    }

    // compute maximum colors for the endpoints and ideal weights.
    // for each endpoint-and-ideal-weight pair, compute the smallest weight value
    // that will result in a color value greater than 1.

    float4 min_ep = { 10.0f, 10.0f, 10.0f, 10.0f };
    float4 onef   = { 1.0f, 1.0f, 1.0f, 1.0f };

    for (i = 0; i < partition_count; i++) {
        float4 ep = (onef - ei1->ep.endpt0[i]) / (ei1->ep.endpt1[i] - ei1->ep.endpt0[i]);
        if (ep.x > 0.5f && ep.x < min_ep.x)
            min_ep.x = ep.x;
        if (ep.y > 0.5f && ep.y < min_ep.y)
            min_ep.y = ep.y;
        if (ep.z > 0.5f && ep.z < min_ep.z)
            min_ep.z = ep.z;
        if (ep.w > 0.5f && ep.w < min_ep.w)
            min_ep.w = ep.w;
    }

    float min_wt_cutoff = (float)(std::min)((std::min)(min_ep.x, min_ep.y), (std::min)(min_ep.z, min_ep.w));

    // for each mode, use the angular method to compute a shift.
    float weight_low_value[MAX_WEIGHT_MODES];
    float weight_high_value[MAX_WEIGHT_MODES];

    compute_angular_endpoints_1plane(mode_cutoff,
                                     decimated_quantized_weights,
                                     decimated_weights,
                                     weight_low_value, weight_high_value,
                                     ASTCEncode);

    // for each mode (which specifies a decimation and a quantization):
    // * compute number of bits needed for the quantized weights.
    // * generate an optimized set of quantized weights.
    // * compute quantization errors for the mode.

    int qwt_bitcounts[MAX_WEIGHT_MODES];
    float qwt_errors[MAX_WEIGHT_MODES];

    for (i = 0; i < MAX_WEIGHT_MODES; i++) {
        if (ASTCEncode->bsd.block_modes[i].permit_encode == 0 || ASTCEncode->bsd.block_modes[i].is_dual_plane != 0 || ASTCEncode->bsd.block_modes[i].percentile > mode_cutoff) {
            qwt_errors[i] = FLOAT_38;
            continue;
        }
        if (weight_high_value[i] > 1.02f * min_wt_cutoff)
            weight_high_value[i] = 1.0f;

        int decimation_mode = ASTCEncode->bsd.block_modes[i].decimation_mode;
        if (ASTCEncode->bsd.decimation_mode_percentile[decimation_mode] > mode_cutoff) {
            astc_codec_internal_error("ERROR: compress_symbolic_block_fixed_partition_1_plane");
        }

        // compute weight bitcount for the mode
        uint8_t qmode = ASTCEncode->bsd.block_modes[i].quantization_mode;
        int nweights = ASTCEncode->bsd.decimation_tables[decimation_mode].num_weights;
        int bits_used_by_weights = compute_ise_bitcount(nweights, (quantization_method)qmode);

        int bitcount = free_bits_for_partition_count[partition_count] - bits_used_by_weights;
        if (bitcount <= 0 || bits_used_by_weights < 24 || bits_used_by_weights > 96) {
            qwt_errors[i] = FLOAT_38;
            continue;
        }
        qwt_bitcounts[i] = bitcount;


        // then, generate the optimized set of weights for the weight mode.
        compute_ideal_quantized_weights_for_decimation_table(&(eix1[decimation_mode]),
                &ASTCEncode->bsd.decimation_tables[decimation_mode],
                weight_low_value[i], weight_high_value[i],
                decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * decimation_mode,
                flt_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * i,
                u8_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * i,
                ASTCEncode->bsd.block_modes[i].quantization_mode);
//#ifdef __OPENCL_VERSION__
//         if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//             printf("A1:eix1[decimation_mode].ep.endpt0[0].x %3.3f\n", eix1[decimation_mode].ep.endpt0[0].x);

        // then, compute weight-errors for the weight mode.

        qwt_errors[i] = compute_error_of_weight_set(&(eix1[decimation_mode]),
                        &ASTCEncode->bsd.decimation_tables[decimation_mode],
                        flt_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * i);
//#ifdef __OPENCL_VERSION__
//         if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//             printf("A2:eix1[decimation_mode].ep.endpt0[0].x %3.3f\n", eix1[decimation_mode].ep.endpt0[0].x);


    }

    // for each weighting mode, determine the optimal combination of color endpoint encodings
    // and weight encodings; return results for the 4 best-looking modes.

    int partition_format_specifiers[4][4];
    int quantized_weight[4];
    int color_quantization_level[4];
    int color_quantization_level_mod[4];

    determine_optimal_set_of_endpoint_formats_to_use( &ASTCEncode->partition_tables[partition_count][partition_index],
            blk,
            ewb,
            &(ei1->ep),
            -1,    // used to flag that we are in single-weight mode
            qwt_bitcounts,
            qwt_errors,
            partition_format_specifiers,
            quantized_weight,
            color_quantization_level,
            color_quantization_level_mod,
            ASTCEncode);

    // then iterate over the 4 believed-to-be-best modes to find out which one is
    // actually best.
    for (i = 0; i < 4; i++) {
        __global2 uint8_t *u8_weight_src;
        int weights_to_copy;

        if (quantized_weight[i] < 0) {
            scb->error_block = 1;
            scb++;
            continue;
        }

        int decimation_mode = ASTCEncode->bsd.block_modes[quantized_weight[i]].decimation_mode;
        int weight_quantization_mode = ASTCEncode->bsd.block_modes[quantized_weight[i]].quantization_mode;

        u8_weight_src = u8_quantized_decimated_quantized_weights + (MAX_WEIGHTS_PER_BLOCK * quantized_weight[i]);

        weights_to_copy = ASTCEncode->bsd.decimation_tables[decimation_mode].num_weights;

        // recompute the ideal color endpoints before storing them.
        float4 rgbs_colors[4];
        float4 rgbo_colors[4];
        float2 lum_intervals[4];

        int l;
        for (l = 0; l < max_refinement_iters; l++) {
//#ifdef __OPENCL_VERSION__
//             if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//             printf("B1:eix1[decimation_mode].ep.endpt0[0].x %3.3f\n", eix1[decimation_mode].ep.endpt0[0].x);

            recompute_ideal_colors(weight_quantization_mode,
                                   &(eix1[decimation_mode].ep),
                                   rgbs_colors, rgbo_colors,
                                   lum_intervals, u8_weight_src,
                                   0, -1, &ASTCEncode->partition_tables[partition_count][partition_index],
                                   &ASTCEncode->bsd.decimation_tables[decimation_mode],
                                   blk, ewb, ASTCEncode);

            // quantize the chosen color
//#ifdef __OPENCL_VERSION__
//             if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//                 printf("B2:eix1[decimation_mode].ep.endpt0[0].x %3.3f\n", eix1[decimation_mode].ep.endpt0[0].x);

            // store the colors for the block
            for (j = 0; j < partition_count; j++) {
                scb->color_formats[j] = pack_color_endpoints(eix1[decimation_mode].ep.endpt0[j],
                                        eix1[decimation_mode].ep.endpt1[j],
                                        rgbs_colors[j], rgbo_colors[j], partition_format_specifiers[i][j],
                                        scb->color_values[j], color_quantization_level[i]);
            }


            // if all the color endpoint modes are the same, we get a few more
            // bits to store colors; let's see if we can take advantage of this:
            // requantize all the colors and see if the endpoint modes remain the same;
            // if they do, then exploit it.
            scb->color_formats_matched = 0;

            if (
                (partition_count >= 2)
                &&  ((scb->color_formats[0]) == scb->color_formats[1])
                // &&  (color_quantization_level != color_quantization_level_mod)  ASTC bug needs fixing
                &&  (color_quantization_level[i] != color_quantization_level_mod[i])  // Added [i] as a tmp fix!
                &&  ((partition_count == 2) || (scb->color_formats[0] == scb->color_formats[2]))
                &&  ((partition_count == 3)  || (scb->color_formats[0] == scb->color_formats[3]))
            ) {
                int colorvals[4][12];
                int color_formats_mod[4];
                for (j = 0; j < partition_count; j++) {
                    color_formats_mod[j] = pack_color_endpoints(eix1[decimation_mode].ep.endpt0[j],
                                           eix1[decimation_mode].ep.endpt1[j],
                                           rgbs_colors[j], rgbo_colors[j],
                                           partition_format_specifiers[i][j], colorvals[j], color_quantization_level_mod[i]);
                }


                if (color_formats_mod[0] == color_formats_mod[1]
                        && (partition_count == 2 || (color_formats_mod[0] == color_formats_mod[2] && (partition_count == 3 || (color_formats_mod[0] == color_formats_mod[3]))))) {
                    scb->color_formats_matched = 1;
                    for (j = 0; j < 4; j++)
                        for (k = 0; k < 12; k++)
                            scb->color_values[j][k] = colorvals[j][k];
                    for (j = 0; j < 4; j++)
                        scb->color_formats[j] = color_formats_mod[j];
                }
            }


            // store header fields
            scb->partition_count = partition_count;
            scb->partition_index = partition_index;
            scb->color_quantization_level = scb->color_formats_matched ? color_quantization_level_mod[i] : color_quantization_level[i];
            scb->block_mode = quantized_weight[i];
            scb->error_block = 0;

            if (scb->color_quantization_level < 4) {
                scb->error_block = 1;    // should never happen, but cannot prove it impossible.
            }

            // perform a final pass over the weights to try to improve them.
            int adjustments = realign_weights(
                                  blk, ewb, scb,
                                  u8_weight_src,
                                  0, ASTCEncode);

            if (adjustments == 0)
                break;
        }

        for (j = 0; j < weights_to_copy; j++)
            scb->plane1_weights[j] = u8_weight_src[j];

        scb++;
    }

}

// Reconstructs the integer weight of a single texel from a decimated weight grid.
//   texel_to_get : index of the texel inside the block
//   it           : decimation table listing, per texel, which grid weights
//                  contribute (texel_weights) and with which integer factor
//                  (texel_weights_int)
//   weights      : the grid weights
// The accumulator starts at 8 and the total is shifted right by 4, so the
// factor-weighted sum is divided by 16 with the bias of 8 applied first.
int compute_value_of_texel_int(int texel_to_get, decimation_table * it, int *weights) {
    int contributors = it->texel_num_weights[texel_to_get];
    int acc = 8;
    int n = 0;
    while (n < contributors) {
        int grid_index = it->texel_weights[texel_to_get][n];
        int factor = it->texel_weights_int[texel_to_get][n];
        acc += weights[grid_index] * factor;
        n++;
    }
    return acc >> 4;
}

int compute_value_of_texel_global(int texel_to_get, __global decimation_table * it, int *weights) {
    int i;
    int summed_value = 8;
    int weights_to_evaluate = it->texel_num_weights[texel_to_get];
    for (i = 0; i < weights_to_evaluate; i++) {
        summed_value += weights[it->texel_weights[texel_to_get][i]] * it->texel_weights_int[texel_to_get][i];
    }
    return summed_value >> 4;
}

/* 32-bit count-leading-zeroes, portable fallback version.
   The input is narrowed to its most significant non-zero byte (16 bits, then
   8 bits at a time); clz_table supplies the count for that remaining byte. */
uint32_t clz32(uint32_t inp) {
    uint32_t shifted_out = 0;   /* number of low bits discarded so far */

    if ((inp >> 16) != 0) {
        inp >>= 16;
        shifted_out = 16;
    }
    if ((inp >> 8) != 0) {
        inp >>= 8;
        shifted_out += 8;
    }

    /* inp is now below 0x100 and can be used as a table index */
    return (24 - shifted_out) + clz_table[inp];
}

/* Convert an FP16 bit pattern to the equivalent FP32 bit pattern.
 *
 *   inp     : 16-bit half-float encoding (sign, 5 exponent bits, 10 mantissa bits)
 *   returns : 32-bit single-float encoding of the same value
 *
 * tbl2 is indexed by the top 6 bits of the input (sign + exponent). Adding the
 * table entry to the input yields a value whose sign bit is set exactly for
 * the cases that need special handling below.
 */
CGU_UINT sf16_to_sf32(CGU_SHORT inp) {
    /* Keep only the 16 payload bits. If CGU_SHORT is a signed type (its
       typedef is outside this block), a set sign bit would otherwise be
       sign-extended into bits 16..31, and inpx >> 10 would index far outside
       tbl2. The mask is a no-op for an unsigned 16-bit type. */
    uint32_t inpx = (uint32_t)inp & (uint32_t)(0xFFFF);

    int32_t res = tbl2[inpx >> 10];
    res += inpx;

    /* All shifts below are done on the unsigned bit pattern: left-shifting a
       negative (or overflowing) signed value is not well defined, while the
       unsigned shift produces the intended bits. */

    /* the normal cases: the MSB of 'res' is not set. */
    if (res >= 0)                /* signed compare */
        return (uint32_t)res << 13;

    /* Infinity and Zero: the bottom 10 bits of 'res' are clear. */
    if ((res & (uint32_t)(0x3FF)) == 0)
        return (uint32_t)res << 13;

    /* NaN: the exponent field of 'inp' is not zero; NaNs must be quietened. */
    if ((inpx & 0x7C00) != 0)
        return ((uint32_t)res << 13) | (uint32_t)(0x400000);

    /* the remaining cases are Denormals: normalise the mantissa and derive the
       FP32 exponent from how far it had to be shifted. */
    {
        uint32_t sign = (inpx & (uint32_t)(0x8000)) << 16;
        uint32_t mskval = inpx & (uint32_t)(0x7FFF);
        uint32_t leadingzeroes = clz32(mskval);
        mskval <<= leadingzeroes;
        return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign;
    }
}

/* Convert an FP16 bit pattern to a native float: widen the encoding to FP32
   bits with sf16_to_sf32, then reinterpret those bits through the if32 union. */
float sf16_to_float(CGU_SHORT p) {
    if32 bits;
    bits.u = sf16_to_sf32(p);
    return bits.f;
}

// Converts a 16-bit UNORM (LDR) value to an FP16 bit pattern.
// LDR interpolation alone never produces a denormal result, but very small
// UNORM16 values can still arrive through a constant-colour block, so inputs
// below 4 are handled separately.
uint16_t unorm16_to_sf16(uint16_t p) {
    // tiny inputs: returned as p << 8
    if (p < 4)
        return p << 8;

    // full scale maps to exactly 1.0
    if (p == 0xFFFF)
        return 0x3C00;

    // number of leading zero bits within the 16-bit value
    int leading = clz32(p) - 16;

    // shift the leading one out of the 16-bit field, then keep the top 10 bits
    uint16_t mantissa = (uint16_t)(p << (leading + 1));
    mantissa = (uint16_t)(mantissa >> 6);

    uint16_t exponent = (uint16_t)((14 - leading) << 10);
    return (uint16_t)(mantissa | exponent);
}

// Converts a 16-bit LNS-coded value to an FP16 bit pattern.
// The top 5 bits of the input are used as the exponent field and the low 11
// bits are mapped to a mantissa through a three-segment piecewise-linear
// function. The result is clamped to 0x7BFF.
uint16_t lns_to_sf16(uint16_t p) {
    const uint16_t mantissa_code = p & 0x7FF;
    const uint16_t exponent_code = p >> 11;

    // piecewise-linear mantissa expansion
    uint16_t mantissa;
    if (mantissa_code >= 1536)
        mantissa = 5 * mantissa_code - 2048;
    else if (mantissa_code >= 512)
        mantissa = 4 * mantissa_code - 512;
    else
        mantissa = 3 * mantissa_code;

    const uint16_t packed = (exponent_code << 10) | (mantissa >> 3);

    // never return anything above 0x7BFF
    return (packed < 0x7BFF) ? packed : (uint16_t)0x7BFF;
}

// Rebuilds blk->orig_data (float colours) from blk->work_data for the first
// 'pixelcount' texels, four channels per texel.
// For each texel the RGB channels are decoded as LNS when rgb_lns[] is set and
// as UNORM16 otherwise; alpha is chosen the same way from alpha_lns[].
// Finishes by calling imageblock_initialize_deriv_from_work_and_orig.
void imageblock_initialize_orig_from_work(imageblock * blk, int pixelcount) {
    DEBUG("imageblock_initialize_orig_from_work");
    int texel;
    int channel;

    for (texel = 0; texel < pixelcount; texel++) {
        float *dst = blk->orig_data;
        float *src = blk->work_data;
        dst += 4 * texel;
        src += 4 * texel;

        // colour channels share one encoding flag
        for (channel = 0; channel < 3; channel++) {
            if (blk->rgb_lns[texel])
                dst[channel] = sf16_to_float(lns_to_sf16((uint16_t)src[channel]));
            else
                dst[channel] = sf16_to_float(unorm16_to_sf16((uint16_t)src[channel]));
        }

        // alpha has its own encoding flag
        if (blk->alpha_lns[texel])
            dst[3] = sf16_to_float(lns_to_sf16((uint16_t)src[3]));
        else
            dst[3] = sf16_to_float(unorm16_to_sf16((uint16_t)src[3]));
    }

    imageblock_initialize_deriv_from_work_and_orig(blk, pixelcount);
}

// Decodes a symbolic compressed block into an image block.
//   scb        : symbolic block to decode (read only here)
//   blk        : output; orig_data/work_data, rgb_lns, alpha_lns and nan_texel
//                are filled for ASTCEncode->m_texels_per_block texels
//   ASTCEncode : encoder state supplying the decode mode, block-size
//                descriptor (bsd) and partition tables
// Three paths, each ending with update_imageblock_flags():
//   1. error block    -> every texel gets a fixed error colour
//   2. block_mode < 0 -> constant-colour block
//   3. otherwise      -> endpoints and weights are unpacked and interpolated
void decompress_symbolic_block(
    symbolic_compressed_block * scb,
    imageblock * blk,
    __global ASTC_Encode *ASTCEncode) {
    DEBUG("decompress_symbolic_block");
    int i;

    // if we detected an error-block, blow up immediately.
    if (scb->error_block) {
        if (ASTCEncode->m_decode_mode == DECODE_LDR_SRGB) {
            // sRGB decode: error colour is (1, 0, 1, 1), no NaN flag.
            for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
                blk->orig_data[4 * i] = 1.0f;
                blk->orig_data[4 * i + 1] = 0.0f;
                blk->orig_data[4 * i + 2] = 1.0f;
                blk->orig_data[4 * i + 3] = 1.0f;
                blk->rgb_lns[i] = 0;
                blk->alpha_lns[i] = 0;
                blk->nan_texel[i] = 0;
            }
        } else {
            // all other decode modes: zero colour with the texel flagged as NaN.
            for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
                blk->orig_data[4 * i] = 0.0f;
                blk->orig_data[4 * i + 1] = 0.0f;
                blk->orig_data[4 * i + 2] = 0.0f;
                blk->orig_data[4 * i + 3] = 0.0f;
                blk->rgb_lns[i] = 0;
                blk->alpha_lns[i] = 0;
                blk->nan_texel[i] = 1;
            }
        }

        imageblock_initialize_work_from_orig(blk, ASTCEncode->m_texels_per_block);
        update_imageblock_flags(blk, ASTCEncode);
        return;
    }

    // negative block modes denote constant-colour blocks.
    if (scb->block_mode < 0) {
        float red = 0, green = 0, blue = 0, alpha = 0;
        int use_lns = 0;
        int use_nan = 0;

        if (scb->block_mode == -2) {
            // block_mode -2: the constant colour is stored as UNORM16.
            // For sRGB decoding, we should return only the top 8 bits.
            int mask = (ASTCEncode->m_decode_mode == DECODE_LDR_SRGB) ? 0xFF00 : 0xFFFF;

            red = sf16_to_float(unorm16_to_sf16((uint16_t)scb->constant_color[0] & mask));
            green = sf16_to_float(unorm16_to_sf16((uint16_t)scb->constant_color[1] & mask));
            blue = sf16_to_float(unorm16_to_sf16((uint16_t)scb->constant_color[2] & mask));
            alpha = sf16_to_float(unorm16_to_sf16((uint16_t)scb->constant_color[3] & mask));
            use_lns = 0;
            use_nan = 0;
        } else {
            // any other negative block mode: the result depends on the decode
            // mode. Only the HDR case reads constant_color (as FP16); the two
            // LDR cases use the same fixed colours as the error-block path.
            // A decode mode not listed here leaves the zero defaults in place.
            switch (ASTCEncode->m_decode_mode) {
            case DECODE_LDR_SRGB:
                red = 1.0f;
                green = 0.0f;
                blue = 1.0f;
                alpha = 1.0f;
                use_lns = 0;
                use_nan = 0;
                break;
            case DECODE_LDR:
                red = 0.0f;
                green = 0.0f;
                blue = 0.0f;
                alpha = 0.0f;
                use_lns = 0;
                use_nan = 1;
                break;
            case DECODE_HDR:
                // constant-color block; unpack from FP16 to FP32.
                red = sf16_to_float((CGU_SHORT)scb->constant_color[0]);
                green = sf16_to_float((CGU_SHORT)scb->constant_color[1]);
                blue = sf16_to_float((CGU_SHORT)scb->constant_color[2]);
                alpha = sf16_to_float((CGU_SHORT)scb->constant_color[3]);
                use_lns = 1;
                use_nan = 0;
                break;
            }
        }

        // replicate the constant colour and flags to every texel.
        for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
            blk->orig_data[4 * i] = red;
            blk->orig_data[4 * i + 1] = green;
            blk->orig_data[4 * i + 2] = blue;
            blk->orig_data[4 * i + 3] = alpha;
            blk->rgb_lns[i] = (uint8_t)use_lns;
            blk->alpha_lns[i] = (uint8_t)use_lns;
            blk->nan_texel[i] = (uint8_t)use_nan;
        }


        imageblock_initialize_work_from_orig(blk, ASTCEncode->m_texels_per_block);
        update_imageblock_flags(blk, ASTCEncode);
        return;
    }

    // get the appropriate partition-table entry
    int partition_count = scb->partition_count;

    // get the appropriate block descriptor
    int is_dual_plane = ASTCEncode->bsd.block_modes[scb->block_mode].is_dual_plane;
    int weight_quantization_level = ASTCEncode->bsd.block_modes[scb->block_mode].quantization_mode;


    // decode the color endpoints
    // (only the first partition_count entries of these arrays are written)
    ushort4 color_endpoint0[4];
    ushort4 color_endpoint1[4];
    int rgb_hdr_endpoint[4];
    int alpha_hdr_endpoint[4];
    int nan_endpoint[4];

    for (i = 0; i < partition_count; i++)
        unpack_color_endpoints(
            scb->color_formats[i],
            scb->color_quantization_level, scb->color_values[i], &(rgb_hdr_endpoint[i]), &(alpha_hdr_endpoint[i]),
            &(nan_endpoint[i]), &(color_endpoint0[i]), &(color_endpoint1[i]), ASTCEncode);

    // first unquantize the weights
    int uq_plane1_weights[MAX_WEIGHTS_PER_BLOCK];
    int uq_plane2_weights[MAX_WEIGHTS_PER_BLOCK];
    int weight_count = ASTCEncode->bsd.decimation_tables[ASTCEncode->bsd.block_modes[scb->block_mode].decimation_mode].num_weights;

    // lookup table for the block mode's weight quantization level
    __constant quantization_and_transfer_table *qat = &(quant_and_xfer_tables[weight_quantization_level]);

    for (i = 0; i < weight_count; i++) {
        uq_plane1_weights[i] = qat->unquantized_value[scb->plane1_weights[i]];
    }
    if (is_dual_plane) {
        for (i = 0; i < weight_count; i++)
            uq_plane2_weights[i] = qat->unquantized_value[scb->plane2_weights[i]];
    }

    // then un-decimate them.
    float weights[MAX_TEXELS_PER_BLOCK];
    float plane2_weights[MAX_TEXELS_PER_BLOCK];

    for (i = 0; i < ASTCEncode->m_texels_per_block; i++)
        weights[i] = (float)compute_value_of_texel_global(i, &ASTCEncode->bsd.decimation_tables[ASTCEncode->bsd.block_modes[scb->block_mode].decimation_mode], uq_plane1_weights);

    // NOTE(review): plane2_weights is only filled for dual-plane blocks, yet
    // plane2_weights[i] is passed to COMPUTE_LRP_COLOR below in every case.
    // Presumably it is ignored when the plane-2 component argument is -1 - confirm.
    if (is_dual_plane)
        for (i = 0; i < ASTCEncode->m_texels_per_block; i++)
            plane2_weights[i] = (float)compute_value_of_texel_global(i, &ASTCEncode->bsd.decimation_tables[ASTCEncode->bsd.block_modes[scb->block_mode].decimation_mode], uq_plane2_weights);

    int plane2_color_component = scb->plane2_color_component;

    // now that we have endpoint colors and weights, we can unpack actual colors for
    // each texel.
    ushort4 color;
    for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
        int partition = ASTCEncode->partition_tables[partition_count][scb->partition_index].partition_of_texel[i];
        // clamp so the index stays inside the 4-entry endpoint arrays
        if (partition > 3) partition = 3;


        color = COMPUTE_LRP_COLOR(
                    color_endpoint0[partition],
                    color_endpoint1[partition],
                    weights[i],
                    plane2_weights[i],
                    is_dual_plane ? plane2_color_component : -1,
                    ASTCEncode );

        // per-texel flags are inherited from the endpoints of the texel's partition
        blk->rgb_lns[i] = (uint8_t)rgb_hdr_endpoint[partition];
        blk->alpha_lns[i] = (uint8_t)alpha_hdr_endpoint[partition];
        blk->nan_texel[i] = (uint8_t)nan_endpoint[partition];

        blk->work_data[4 * i] = color.x;
        blk->work_data[4 * i + 1] = color.y;
        blk->work_data[4 * i + 2] = color.z;
        blk->work_data[4 * i + 3] = color.w;
    }

    // this path produced work_data, so orig_data is derived from it
    // (the two early-return paths above go the other way round).
    imageblock_initialize_orig_from_work(blk, ASTCEncode->m_texels_per_block);

    update_imageblock_flags(blk, ASTCEncode);
}

// Returns the error-weighted squared difference between the work_data of two
// image blocks, summed over ASTCEncode->m_texels_per_block texels.
// Each absolute channel difference is capped at FLOAT_15 before squaring, and
// each squared channel term is scaled by the matching component of
// ewb->error_weights[texel].
float compute_imageblock_difference(imageblock * p1, imageblock * p2, error_weight_block * ewb,
                                    __global ASTC_Encode *ASTCEncode) {
    DEBUG("compute_imageblock_difference");
    float *lhs = p1->work_data;
    float *rhs = p2->work_data;
    float total = 0.0f;
    int texel;

    for (texel = 0; texel < ASTCEncode->m_texels_per_block; texel++) {
        int base = 4 * texel;

        // absolute per-channel deltas
        float dr = (float)fabs(lhs[base]     - rhs[base]);
        float dg = (float)fabs(lhs[base + 1] - rhs[base + 1]);
        float db = (float)fabs(lhs[base + 2] - rhs[base + 2]);
        float da = (float)fabs(lhs[base + 3] - rhs[base + 3]);

        // cap each delta
        dr = MIN(dr, FLOAT_15);
        dg = MIN(dg, FLOAT_15);
        db = MIN(db, FLOAT_15);
        da = MIN(da, FLOAT_15);

        total += dr * dr * ewb->error_weights[texel].x + dg * dg * ewb->error_weights[texel].y + db * db * ewb->error_weights[texel].z + da * da * ewb->error_weights[texel].w;
    }

    return total;
}

// Computes the weighted 4x4 covariance matrix of the R, G, B, A channels of
// blk->work_data, using ewb->texel_weight[] as the per-texel weight.
// The four rows are written to cov_matrix->v[0..3]. A negative texel weight
// is reported through astc_codec_internal_error but processing continues.
//static
void compute_covariance_matrix(imageblock * blk, error_weight_block * ewb, mat4t * cov_matrix, __global ASTC_Encode *ASTCEncode) {
    DEBUG("compute_covariance_matrix");

    // weighted first moments
    float sum_r = 0.0f;
    float sum_g = 0.0f;
    float sum_b = 0.0f;
    float sum_a = 0.0f;

    // weighted second moments (upper triangle only; the matrix is symmetric)
    float sum_rr = 0.0f;
    float sum_rg = 0.0f;
    float sum_rb = 0.0f;
    float sum_ra = 0.0f;
    float sum_gg = 0.0f;
    float sum_gb = 0.0f;
    float sum_ga = 0.0f;
    float sum_bb = 0.0f;
    float sum_ba = 0.0f;
    float sum_aa = 0.0f;

    float total_weight = 0.0f;

    int texel;
    for (texel = 0; texel < ASTCEncode->m_texels_per_block; texel++) {
        float w = ewb->texel_weight[texel];
        if (w < 0.0f) {
            astc_codec_internal_error("ERROR: compute_covariance_matrix");
        }
        total_weight += w;

        float r = blk->work_data[4 * texel];
        float g = blk->work_data[4 * texel + 1];
        float b = blk->work_data[4 * texel + 2];
        float a = blk->work_data[4 * texel + 3];

        // each channel scaled by the texel weight, reused for all its products
        float rw = r * w;
        float gw = g * w;
        float bw = b * w;
        float aw = a * w;

        sum_r += rw;
        sum_rr += r * rw;
        sum_rg += g * rw;
        sum_rb += b * rw;
        sum_ra += a * rw;

        sum_g += gw;
        sum_gg += g * gw;
        sum_gb += b * gw;
        sum_ga += a * gw;

        sum_b += bw;
        sum_bb += b * bw;
        sum_ba += a * bw;

        sum_a += aw;
        sum_aa += a * aw;
    }

    // reciprocal of the total weight, guarded against division by ~zero
    float inv_weight = 1.0f / MAX(total_weight, FLOAT_n7);

    // covariance = second moment - (product of first moments) / total weight
    float cov_rr = sum_rr - sum_r * sum_r * inv_weight;
    float cov_rg = sum_rg - sum_r * sum_g * inv_weight;
    float cov_rb = sum_rb - sum_r * sum_b * inv_weight;
    float cov_ra = sum_ra - sum_r * sum_a * inv_weight;
    float cov_gg = sum_gg - sum_g * sum_g * inv_weight;
    float cov_gb = sum_gb - sum_g * sum_b * inv_weight;
    float cov_ga = sum_ga - sum_g * sum_a * inv_weight;
    float cov_bb = sum_bb - sum_b * sum_b * inv_weight;
    float cov_ba = sum_ba - sum_b * sum_a * inv_weight;
    float cov_aa = sum_aa - sum_a * sum_a * inv_weight;

    float4 row_r = {cov_rr, cov_rg, cov_rb, cov_ra};
    float4 row_g = {cov_rg, cov_gg, cov_gb, cov_ga};
    float4 row_b = {cov_rb, cov_gb, cov_bb, cov_ba};
    float4 row_a = {cov_ra, cov_ga, cov_ba, cov_aa};

    cov_matrix->v[0] = row_r;
    cov_matrix->v[1] = row_g;
    cov_matrix->v[2] = row_b;
    cov_matrix->v[3] = row_a;
}

// Gathers two statistics for an image block:
//   *lowest_correl : smallest absolute correlation coefficient among the six
//                    channel pairs (RG, RB, RA, GB, GA, BA)
//   *is_normal_map : 1 when the block's RGB values, remapped from [0,1] to
//                    [-1,1], have on average a squared length within 0.2 of 1
void prepare_block_statistics(
    imageblock * blk,
    error_weight_block * ewb,
    int *is_normal_map,
    float *lowest_correl,
    __global ASTC_Encode *ASTCEncode) {
    DEBUG("prepare_block_statistics");
    int texel;

    mat4t cov_matrix;

    compute_covariance_matrix( blk, ewb, &cov_matrix, ASTCEncode);

    // per-channel variances sit on the diagonal of the covariance matrix
    float var_r = cov_matrix.v[0].x;
    float var_g = cov_matrix.v[1].y;
    float var_b = cov_matrix.v[2].z;
    float var_a = cov_matrix.v[3].w;

    // correlation coefficient for each channel pair; the denominator is
    // clamped from below, and a NaN result is treated as full correlation.
    float corr_rg = cov_matrix.v[0].y / sqrt(MAX(var_r * var_g, FLOAT_n30));
    if (astc_isnan(corr_rg))
        corr_rg = 1.0f;

    float corr_rb = cov_matrix.v[0].z / sqrt(MAX(var_r * var_b, FLOAT_n30));
    if (astc_isnan(corr_rb))
        corr_rb = 1.0f;

    float corr_ra = cov_matrix.v[0].w / sqrt(MAX(var_r * var_a, FLOAT_n30));
    if (astc_isnan(corr_ra))
        corr_ra = 1.0f;

    float corr_gb = cov_matrix.v[1].z / sqrt(MAX(var_g * var_b, FLOAT_n30));
    if (astc_isnan(corr_gb))
        corr_gb = 1.0f;

    float corr_ga = cov_matrix.v[1].w / sqrt(MAX(var_g * var_a, FLOAT_n30));
    if (astc_isnan(corr_ga))
        corr_ga = 1.0f;

    float corr_ba = cov_matrix.v[2].w / sqrt(MAX(var_b * var_a, FLOAT_n30));
    if (astc_isnan(corr_ba))
        corr_ba = 1.0f;

    // weakest (smallest magnitude) of the six correlations
    float weakest = MIN(fabs(corr_rg), fabs(corr_rb));
    weakest = MIN(weakest, fabs(corr_ra));
    weakest = MIN(weakest, fabs(corr_gb));
    weakest = MIN(weakest, fabs(corr_ga));
    weakest = MIN(weakest, fabs(corr_ba));
    *lowest_correl = weakest;

    // compute a "normal-map" factor
    // this factor should be exactly 0.0 for a normal map, while it may be all over the
    // place for anything that is NOT a normal map. We can probably assume that a factor
    // of less than 0.2f represents a normal map.

    float deviation_total = 0.0f;

    float3 center = {0.5f, 0.5f, 0.5f};
    for (texel = 0; texel <  ASTCEncode->m_texels_per_block; texel++) {
        float3 val = {blk->orig_data[4 * texel],
                      blk->orig_data[4 * texel + 1],
                      blk->orig_data[4 * texel + 2]
                     };
        val = (val - center) * 2.0f;
        float length_squared = dot(val, val);
        float deviation = fabs(length_squared - 1.0f);
        deviation_total += deviation;
    }

    float deviation_avg = deviation_total / ASTCEncode->m_texels_per_block;
    *is_normal_map = deviation_avg < 0.2;
}

// Computes low/high weight-range values for every dual-plane block mode.
// Step 1: for each usable decimation mode, run
//         compute_angular_endpoints_for_quantization_levels once per plane.
//         Plane 1 data lives at slot 2*mode and plane 2 data at slot 2*mode+1
//         of the weight buffers (slot size MAX_WEIGHTS_PER_BLOCK).
// Step 2: for each dual-plane block mode within the percentile cutoff, copy
//         the entry for its (decimation mode, quantization mode) pair into the
//         four output arrays.
// Output entries of block modes that are skipped are left untouched.
void compute_angular_endpoints_2planes(float mode_cutoff,
                                       __global2 float *decimated_quantized_weights,
                                       __global2 float *decimated_weights,
                                       float low_value1[MAX_WEIGHT_MODES],
                                       float high_value1[MAX_WEIGHT_MODES],
                                       float low_value2[MAX_WEIGHT_MODES],
                                       float high_value2[MAX_WEIGHT_MODES],
                                       __global ASTC_Encode *ASTCEncode) {
    DEBUG("compute_angular_endpoints_2planes");
    int mode;

    // per decimation mode, one low/high pair per quantization level, per plane
    float plane1_low[MAX_DECIMATION_MODES][12];
    float plane1_high[MAX_DECIMATION_MODES][12];
    float plane2_low[MAX_DECIMATION_MODES][12];
    float plane2_high[MAX_DECIMATION_MODES][12];

    for (mode = 0; mode < MAX_DECIMATION_MODES; mode++) {
        int samplecount     = ASTCEncode->bsd.decimation_mode_samples[mode];
        int quant_mode      = ASTCEncode->bsd.decimation_mode_maxprec_2planes[mode];
        float percentile    = ASTCEncode->bsd.decimation_mode_percentile[mode];
        int permit_encode   = ASTCEncode->bsd.permit_encode[mode];

        // only decimation modes that are permitted, non-empty, usable with
        // two planes and within the percentile cutoff are processed
        if (permit_encode != 0 && samplecount >= 1 && quant_mode >= 0 && !(percentile > mode_cutoff)) {
            __global2 float *quantized_plane1 = decimated_quantized_weights + 2 * mode * MAX_WEIGHTS_PER_BLOCK;
            __global2 float *ideal_plane1     = decimated_weights + 2 * mode * MAX_WEIGHTS_PER_BLOCK;
            __global2 float *quantized_plane2 = decimated_quantized_weights + (2 * mode + 1) * MAX_WEIGHTS_PER_BLOCK;
            __global2 float *ideal_plane2     = decimated_weights + (2 * mode + 1) * MAX_WEIGHTS_PER_BLOCK;

            compute_angular_endpoints_for_quantization_levels(samplecount,
                    quantized_plane1, ideal_plane1,
                    quant_mode, plane1_low[mode], plane1_high[mode], ASTCEncode);

            compute_angular_endpoints_for_quantization_levels(samplecount,
                    quantized_plane2, ideal_plane2,
                    quant_mode, plane2_low[mode], plane2_high[mode], ASTCEncode);
        }
    }

    for (mode = 0; mode < MAX_WEIGHT_MODES; mode++) {
        if (ASTCEncode->bsd.block_modes[mode].is_dual_plane == 1 && !(ASTCEncode->bsd.block_modes[mode].percentile > mode_cutoff)) {
            int quant_mode = ASTCEncode->bsd.block_modes[mode].quantization_mode;
            int decim_mode = ASTCEncode->bsd.block_modes[mode].decimation_mode;

            low_value1[mode]  = plane1_low[decim_mode][quant_mode];
            high_value1[mode] = plane1_high[decim_mode][quant_mode];
            low_value2[mode]  = plane2_low[decim_mode][quant_mode];
            high_value2[mode] = plane2_high[decim_mode][quant_mode];
        }
    }
}

void compress_symbolic_block_fixed_partition_2_planes(
    float mode_cutoff,
    int max_refinement_iters,
    int partition_count,
    int partition_index,
    int separate_component,
    imageblock * blk,
    error_weight_block * ewb,
    symbolic_compressed_block * scb,
    endpoints_and_weights *ei1,
    endpoints_and_weights *ei2,
    endpoints_and_weights *eix1,
    endpoints_and_weights *eix2,
    __global2 float   *decimated_weights,
    __global2 uint8_t *u8_quantized_decimated_quantized_weights,
    __global2 float   *decimated_quantized_weights,
    __global2 float   *flt_quantized_decimated_quantized_weights,
    __global ASTC_Encode *ASTCEncode) {
    DEBUG("compress_symbolic_block_fixed_partition_2_planes");
    int i, j, k;
    int free_bits_for_partition_count[5] =
    { 0, 113 - 4, 109 - 4 - PARTITION_BITS, 106 - 4 - PARTITION_BITS, 103 - 4 - PARTITION_BITS };

    // first, compute ideal weights and endpoint colors
    compute_endpoints_and_ideal_weights_2_planes(&ASTCEncode->partition_tables[partition_count][partition_index], blk, ewb, separate_component, ei1, ei2, ASTCEncode);

    // for each decimation mode, compute an ideal set of weights
    for (i = 0; i < MAX_DECIMATION_MODES; i++) {
        if (ASTCEncode->bsd.permit_encode[i] == 0 || ASTCEncode->bsd.decimation_mode_maxprec_2planes[i] < 0 || ASTCEncode->bsd.decimation_mode_percentile[i] > mode_cutoff)
            continue;

        eix1[i] = *ei1;
        eix2[i] = *ei2;
        compute_ideal_weights_for_decimation_table(&(eix1[i]),
                &ASTCEncode->bsd.decimation_tables[i],
                decimated_quantized_weights + (2 * i) * MAX_WEIGHTS_PER_BLOCK,
                decimated_weights + (2 * i) * MAX_WEIGHTS_PER_BLOCK);
        compute_ideal_weights_for_decimation_table(&(eix2[i]),
                &ASTCEncode->bsd.decimation_tables[i],
                decimated_quantized_weights + (2 * i + 1) * MAX_WEIGHTS_PER_BLOCK,
                decimated_weights + (2 * i + 1) * MAX_WEIGHTS_PER_BLOCK);
    }

    // compute maximum colors for the endpoints and ideal weights.
    // for each endpoint-and-ideal-weight pair, compute the smallest weight value
    // that will result in a color value greater than 1.

    float4 min_ep1 = {10.0f, 10.0f, 10.0f, 10.0f};
    float4 min_ep2 = {10.0f, 10.0f, 10.0f, 10.0f};
    for (i = 0; i < partition_count; i++) {
        float4 onef = {1.0f, 1.0f, 1.0f, 1.0f};
        float4 ep1 = (onef - ei1->ep.endpt0[i]) / (ei1->ep.endpt1[i] - ei1->ep.endpt0[i]);
        if (ep1.x > 0.5f && ep1.x < min_ep1.x)
            min_ep1.x = ep1.x;
        if (ep1.y > 0.5f && ep1.y < min_ep1.y)
            min_ep1.y = ep1.y;
        if (ep1.z > 0.5f && ep1.z < min_ep1.z)
            min_ep1.z = ep1.z;
        if (ep1.w > 0.5f && ep1.w < min_ep1.w)
            min_ep1.w = ep1.w;
        float4 ep2 = (onef - ei2->ep.endpt0[i]) / (ei2->ep.endpt1[i] - ei2->ep.endpt0[i]);
        if (ep2.x > 0.5f && ep2.x < min_ep2.x)
            min_ep2.x = ep2.x;
        if (ep2.y > 0.5f && ep2.y < min_ep2.y)
            min_ep2.y = ep2.y;
        if (ep2.z > 0.5f && ep2.z < min_ep2.z)
            min_ep2.z = ep2.z;
        if (ep2.w > 0.5f && ep2.w < min_ep2.w)
            min_ep2.w = ep2.w;
    }

    float min_wt_cutoff1, min_wt_cutoff2;
    switch (separate_component) {
    case 0:
        min_wt_cutoff2 = min_ep2.x;
        min_ep1.x = FLOAT_30;
        break;
    case 1:
        min_wt_cutoff2 = min_ep2.y;
        min_ep1.y = FLOAT_30;
        break;
    case 2:
        min_wt_cutoff2 = min_ep2.z;
        min_ep1.z = FLOAT_30;
        break;
    case 3:
        min_wt_cutoff2 = min_ep2.w;
        min_ep1.w = FLOAT_30;
        break;
    default:
        min_wt_cutoff2 = FLOAT_30;
    }

    min_wt_cutoff1 = MIN(MIN(min_ep1.x, min_ep1.y), MIN(min_ep1.z, min_ep1.w));

    float weight_low_value1[MAX_WEIGHT_MODES];
    float weight_high_value1[MAX_WEIGHT_MODES];
    float weight_low_value2[MAX_WEIGHT_MODES];
    float weight_high_value2[MAX_WEIGHT_MODES];

    compute_angular_endpoints_2planes(mode_cutoff,
                                      decimated_quantized_weights,
                                      decimated_weights, weight_low_value1,
                                      weight_high_value1, weight_low_value2, weight_high_value2,
                                      ASTCEncode);

    // for each mode (which specifies a decimation and a quantization):
    // * generate an optimized set of quantized weights.
    // * compute quantization errors for each mode
    // * compute number of bits needed for the quantized weights.

    int qwt_bitcounts[MAX_WEIGHT_MODES];
    float qwt_errors[MAX_WEIGHT_MODES];
    for (i = 0; i < MAX_WEIGHT_MODES; i++) {
        if (ASTCEncode->bsd.block_modes[i].permit_encode == 0 || ASTCEncode->bsd.block_modes[i].is_dual_plane != 1 || ASTCEncode->bsd.block_modes[i].percentile > mode_cutoff) {
            qwt_errors[i] = FLOAT_38;
            continue;
        }
        int decimation_mode = ASTCEncode->bsd.block_modes[i].decimation_mode;

        if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
            weight_high_value1[i] = 1.0f;
        if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
            weight_high_value2[i] = 1.0f;

        // compute weight bitcount for the mode
        int bits_used_by_weights = compute_ise_bitcount(2 * ASTCEncode->bsd.decimation_tables[decimation_mode].num_weights,
                                   (quantization_method)ASTCEncode->bsd.block_modes[i].quantization_mode);
        int bitcount = free_bits_for_partition_count[partition_count] - bits_used_by_weights;
        if (bitcount <= 0 || bits_used_by_weights < 24 || bits_used_by_weights > 96) {
            qwt_errors[i] = FLOAT_38;
            continue;
        }
        qwt_bitcounts[i] = bitcount;


        // then, generate the optimized set of weights for the mode.
        compute_ideal_quantized_weights_for_decimation_table(&(eix1[decimation_mode]),
                &ASTCEncode->bsd.decimation_tables[decimation_mode],
                weight_low_value1[i],
                weight_high_value1[i],
                decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * decimation_mode),
                flt_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * i),
                u8_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * i),
                ASTCEncode->bsd.block_modes[i].quantization_mode);

        compute_ideal_quantized_weights_for_decimation_table(&(eix2[decimation_mode]),
                &ASTCEncode->bsd.decimation_tables[decimation_mode],
                weight_low_value2[i],
                weight_high_value2[i],
                decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * decimation_mode + 1),
                flt_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * i + 1),
                u8_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * i + 1),
                ASTCEncode->bsd.block_modes[i].quantization_mode);


        // then, compute quantization errors for the block mode.
        qwt_errors[i] =
            compute_error_of_weight_set(&(eix1[decimation_mode]),
                                        &ASTCEncode->bsd.decimation_tables[decimation_mode],
                                        flt_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * i))
            + compute_error_of_weight_set(&(eix2[decimation_mode]),
                                          &ASTCEncode->bsd.decimation_tables[decimation_mode],
                                          flt_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * i + 1));
    }


    // decide the optimal combination of color endpoint encodings and weight encoodings.
    int partition_format_specifiers[4][4];
    int quantized_weight[4];
    int color_quantization_level[4];
    int color_quantization_level_mod[4];

    endpoints epm;
    merge_endpoints(&(ei1->ep), &(ei2->ep), separate_component, &epm);

    determine_optimal_set_of_endpoint_formats_to_use(
        &ASTCEncode->partition_tables[partition_count][partition_index],
        blk,
        ewb,
        &epm, separate_component, qwt_bitcounts,
        qwt_errors,
        partition_format_specifiers,
        quantized_weight,
        color_quantization_level, color_quantization_level_mod,
        ASTCEncode);

    for (i = 0; i < 4; i++) {
        if (quantized_weight[i] < 0) {
            scb->error_block = 1;
            scb++;
            continue;
        }

        __global2 uint8_t *u8_weight1_src;
        __global2 uint8_t *u8_weight2_src;
        int weights_to_copy;

        int decimation_mode = ASTCEncode->bsd.block_modes[quantized_weight[i]].decimation_mode;
        int weight_quantization_mode = ASTCEncode->bsd.block_modes[quantized_weight[i]].quantization_mode;

        int weight1_offfset = MAX_WEIGHTS_PER_BLOCK * (2 * quantized_weight[i]);
        int weight2_offfset = MAX_WEIGHTS_PER_BLOCK * (2 * quantized_weight[i] + 1);
        u8_weight1_src = u8_quantized_decimated_quantized_weights + weight1_offfset;
        u8_weight2_src = u8_quantized_decimated_quantized_weights + weight2_offfset;


        weights_to_copy = ASTCEncode->bsd.decimation_tables[decimation_mode].num_weights;

        // recompute the ideal color endpoints before storing them.
        merge_endpoints(&(eix1[decimation_mode].ep), &(eix2[decimation_mode].ep), separate_component, &epm);

        float4 rgbs_colors[4];
        float4 rgbo_colors[4];
        float2 lum_intervals[4];

        int l;
        for (l = 0; l < max_refinement_iters; l++) {
            recompute_ideal_colors(weight_quantization_mode,
                                   &epm, rgbs_colors, rgbo_colors, lum_intervals,
                                   u8_weight1_src,
                                   u8_weight2_src,
                                   separate_component, &ASTCEncode->partition_tables[partition_count][partition_index],
                                   &ASTCEncode->bsd.decimation_tables[decimation_mode],
                                   blk, ewb,
                                   ASTCEncode);

            // store the colors for the block
            for (j = 0; j < partition_count; j++) {
                scb->color_formats[j] = pack_color_endpoints(epm.endpt0[j],
                                        epm.endpt1[j],
                                        rgbs_colors[j], rgbo_colors[j],  partition_format_specifiers[i][j], scb->color_values[j], color_quantization_level[i]);
            }
            scb->color_formats_matched = 0;

            if ((partition_count >= 2 && scb->color_formats[0] == scb->color_formats[1]
//#                && color_quantization_level != color_quantization_level_mod
                )
                    && (partition_count == 2 || (scb->color_formats[0] == scb->color_formats[2] && (partition_count == 3 || (scb->color_formats[0] == scb->color_formats[3]))))) {
                int colorvals[4][12];
                int color_formats_mod[4];
                for (j = 0; j < partition_count; j++) {
                    color_formats_mod[j] = pack_color_endpoints(epm.endpt0[j],
                                           epm.endpt1[j],
                                           rgbs_colors[j], rgbo_colors[j],  partition_format_specifiers[i][j], colorvals[j], color_quantization_level_mod[i]);
                }
                if (color_formats_mod[0] == color_formats_mod[1]
                        && (partition_count == 2 || (color_formats_mod[0] == color_formats_mod[2] && (partition_count == 3 || (color_formats_mod[0] == color_formats_mod[3]))))) {
                    scb->color_formats_matched = 1;
                    for (j = 0; j < 4; j++)
                        for (k = 0; k < 12; k++)
                            scb->color_values[j][k] = colorvals[j][k];
                    for (j = 0; j < 4; j++)
                        scb->color_formats[j] = color_formats_mod[j];
                }
            }


            // store header fields
            scb->partition_count = partition_count;
            scb->partition_index = partition_index;
            scb->color_quantization_level = scb->color_formats_matched ? color_quantization_level_mod[i] : color_quantization_level[i];
            scb->block_mode = quantized_weight[i];
            scb->plane2_color_component = separate_component;
            scb->error_block = 0;

            if (scb->color_quantization_level < 4) {
                scb->error_block = 1;    // should never happen, but cannot prove it impossible
            }

            int adjustments = realign_weights(
                                  blk, ewb, scb,
                                  u8_weight1_src,
                                  u8_weight2_src,
                                  ASTCEncode);

            if (adjustments == 0)
                break;
        }

        for (j = 0; j < weights_to_copy; j++) {
            scb->plane1_weights[j] = u8_weight1_src[j];
            scb->plane2_weights[j] = u8_weight2_src[j];
        }

        scb++;
    }

}

// for k++ means, we need pseudorandom numbers, however using random numbers directly
// results in irreproducible encoding results. As such, we will instead
// just supply a handful of numbers from random.org, and apply an algorithm similar
// to XKCD #221. (http://xkcd.com/221/)
// cluster the texels using the k++ means clustering initialization algorithm.
void kpp_initialize(int partition_count, imageblock * blk, float4 * cluster_centers, __global ASTC_Encode *ASTCEncode) {
    DEBUG("kpp_initialize");
    int i;

    int cluster_center_samples[4];
    // pick a random sample as first center-point.
    cluster_center_samples[0] = 145897 /* number from random.org */ % ASTCEncode->m_texels_per_block;
    int samples_selected = 1;

    float distances[MAX_TEXELS_PER_BLOCK];

    // compute the distance to the first point.
    int sample = cluster_center_samples[0];
    float4 center_color = { blk->work_data[4 * sample],
                            blk->work_data[4 * sample + 1],
                            blk->work_data[4 * sample + 2],
                            blk->work_data[4 * sample + 3]
                          };

    float distance_sum = 0.0f;
    for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
        float4 color = { blk->work_data[4 * i],
                         blk->work_data[4 * i + 1],
                         blk->work_data[4 * i + 2],
                         blk->work_data[4 * i + 3]
                       };
        float4 diff = color - center_color;
        float distance = dot(diff, diff);
        distance_sum += distance;
        distances[i] = distance;
    }


    bool isTrue = true;
    while (isTrue) {
        // pick a point in a weighted-random fashion.
        float summa = 0.0f;
        float distance_cutoff = distance_sum * cluster_cutoffs[samples_selected + 5 * partition_count];
        for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
            summa += distances[i];
            if (summa >= distance_cutoff)
                break;
        }
        sample = i;
        if (sample >= ASTCEncode->m_texels_per_block)
            sample = ASTCEncode->m_texels_per_block - 1;


        cluster_center_samples[samples_selected] = sample;
        samples_selected++;
        if (samples_selected >= partition_count)
            break;

        // update the distances with the new point.
        float4 cr = { blk->work_data[4 * sample], blk->work_data[4 * sample + 1], blk->work_data[4 * sample + 2], blk->work_data[4 * sample + 3] };
        center_color = cr;

        distance_sum = 0.0f;
        for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
            float4 color = { blk->work_data[4 * i],
                             blk->work_data[4 * i + 1],
                             blk->work_data[4 * i + 2],
                             blk->work_data[4 * i + 3]
                           };
            float4 diff = color - center_color;
            float distance = dot(diff, diff);
            distance = MIN(distance, distances[i]);
            distance_sum += distance;
            distances[i] = distance;
        }
    }

    // finally, gather up the results.
    for (i = 0; i < partition_count; i++) {
        int sample1 = cluster_center_samples[i];
        float4 color = { blk->work_data[4 * sample1],
                         blk->work_data[4 * sample1 + 1],
                         blk->work_data[4 * sample1 + 2],
                         blk->work_data[4 * sample1 + 3]
                       };
        cluster_centers[i] = color;
    }
}

// basic K-means clustering: given a set of cluster centers,
// assign each texel to a partition
void basic_kmeans_assign_pass(int partition_count, imageblock * blk, float4 * cluster_centers, int *partition_of_texel, __global ASTC_Encode *ASTCEncode) {
    DEBUG("basic_kmeans_assign_pass");
    int i, j;

    float distances[MAX_TEXELS_PER_BLOCK];
    float4 center_color = cluster_centers[0];

    int texels_per_partition[4];

    texels_per_partition[0] = ASTCEncode->m_texels_per_block;
    for (i = 1; i < partition_count; i++)
        texels_per_partition[i] = 0;


    for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
        float4 color = { blk->work_data[4 * i],
                         blk->work_data[4 * i + 1],
                         blk->work_data[4 * i + 2],
                         blk->work_data[4 * i + 3]
                       };
        float4 diff = color - center_color;
        float distance = dot(diff, diff);
        distances[i] = distance;
        partition_of_texel[i] = 0;
    }


    for (j = 1; j < partition_count; j++) {
        float4 center_color1 = cluster_centers[j];

        for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
            float4 color = { blk->work_data[4 * i],
                             blk->work_data[4 * i + 1],
                             blk->work_data[4 * i + 2],
                             blk->work_data[4 * i + 3]
                           };
            float4 diff = color - center_color1;
            float distance = dot(diff, diff);
            if (distance < distances[i]) {
                distances[i] = distance;
                texels_per_partition[partition_of_texel[i]]--;
                texels_per_partition[j]++;
                partition_of_texel[i] = j;
            }
        }
    }

    // it is possible to get a situation where one of the partitions ends up
    // without any texels. In this case, we assign texel N to partition N;
    // this is silly, but ensures that every partition retains at least one texel.
    // Reassigning a texel in this manner may cause another partition to go empty,
    // so if we actually did a reassignment, we run the whole loop over again.
    int problem_case;
    do {
        problem_case = 0;
        for (i = 0; i < partition_count; i++) {
            if (texels_per_partition[i] == 0) {
                texels_per_partition[partition_of_texel[i]]--;
                texels_per_partition[i]++;
                partition_of_texel[i] = i;
                problem_case = 1;
            }
        }
    } while (problem_case != 0);

}

// basic k-means clustering: given a set of cluster assignments
// for the texels, find the center position of each cluster.
void basic_kmeans_update(int partition_count, imageblock * blk, int *partition_of_texel, float4 * cluster_centers, __global ASTC_Encode *ASTCEncode) {
    DEBUG("basic_kmeans_update");
    int i;

    float4 color_sum[4];
    int weight_sum[4];
    float4 zero4f = { 0, 0, 0, 0 };
    for (i = 0; i < partition_count; i++) {
        color_sum[i] = zero4f;
        weight_sum[i] = 0;
    }


    // first, find the center-of-gravity in each cluster
    for (i = 0; i < ASTCEncode->m_texels_per_block; i++) {
        float4 color = { blk->work_data[4 * i],
                         blk->work_data[4 * i + 1],
                         blk->work_data[4 * i + 2],
                         blk->work_data[4 * i + 3]
                       };
        int part = partition_of_texel[i];
        color_sum[part] = color_sum[part] + color;
        weight_sum[part]++;
    }

    for (i = 0; i < partition_count; i++) {
        cluster_centers[i] = color_sum[i] * (1.0f / weight_sum[i]);
    }
}


// after a few rounds of k-means-clustering, we should have a set of 2, 3 or 4 partitions;
// we  turn this set into 2, 3 or 4 bitmaps. Then, for each of the ( 64 bit uses 1024) (32 bit uses ???) partitions,
// we try to match the bitmaps as well as possible.
//# maybe an issue for GPU to work in 64bits
//static inline
// Population count of p using the parallel (SWAR) bit-summing technique.
// With ENABLE_64Bit_Support the whole 64-bit value is counted, either directly
// on 64-bit words or as two 32-bit halves depending on the pointer size.
// Without ENABLE_64Bit_Support only the low 32 bits of p are counted.
int bitcount(uint64_cl p) {
#ifdef ENABLE_64Bit_Support
    // pointer width decides between the 64-bit and the split 32-bit variant
    bool wide_target = sizeof(void *) > 4;
    if (wide_target) {
        uint64_cl pair_mask   = 0x5555555555555555ULL;
        uint64_cl nibble_mask = 0x3333333333333333ULL;
        uint64_cl byte_mask   = 0x0F0F0F0F0F0F0F0FULL;
        // 2-bit sums, then 4-bit sums, then one sum per byte
        p = p - ((p >> 1) & pair_mask);
        p = (p & nibble_mask) + ((p >> 2) & nibble_mask);
        p = (p + (p >> 4)) & byte_mask;
        // the multiply accumulates all byte sums into the top byte
        p = p * 0x0101010101010101ULL;
        p = p >> 56;
        return (int)p;
    } else
#endif
    {
        uint32_t pair_mask   = 0x55555555U;
        uint32_t nibble_mask = 0x33333333U;
        uint32_t byte_mask   = 0x0F0F0F0FU;

        // low half of the argument, reduced to one sum per byte
        uint32_t lo = (uint32_t)p;

        lo = lo - ((lo >> 1) & pair_mask);
        lo = (lo & nibble_mask) + ((lo >> 2) & nibble_mask);
        lo = (lo + (lo >> 4)) & byte_mask;

#ifdef ENABLE_64Bit_Support
        // high half gets the same treatment and is merged byte-wise
        uint32_t hi = (uint32_t)(p >> 32);
        hi = hi - ((hi >> 1) & pair_mask);
        hi = (hi & nibble_mask) + ((hi >> 2) & nibble_mask);
        hi = (hi + (hi >> 4)) & byte_mask;
        lo = lo + hi;
#endif
        // the multiply accumulates all byte sums into the top byte
        lo = lo * 0x01010101U;
        lo = lo >> 24;
        return (int)lo;
    }
}

//static inline
// Smallest of three integers.
int MIN3(int a, int b, int c) {
    int smallest = MIN(a, b);
    smallest = MIN(c, smallest);
    return smallest;
}

// Bit-mismatch between two 2-partition bitmap pairs: the number of differing
// bits under the better of the two possible pairings (a0-b0/a1-b1 or
// a0-b1/a1-b0).
//# maybe an issue for GPU to work in 64bits
//static inline
int partition_mismatch2(uint64_cl a0, uint64_cl a1, uint64_cl b0, uint64_cl b1) {
    int straight = bitcount(a0 ^ b0) + bitcount(a1 ^ b1);
    int crossed  = bitcount(a0 ^ b1) + bitcount(a1 ^ b0);
    return (crossed < straight) ? crossed : straight;
}

#ifdef ENABLE_3_PARTITION_CODE
// Bit-mismatch between two 3-partition bitmap sets: the smallest total number
// of differing bits over all six ways of pairing a0..a2 with b0..b2.
//# maybe an issue for GPU to work in 64bits
static inline int partition_mismatch3(uint64_cl a0, uint64_cl a1, uint64_cl a2, uint64_cl b0, uint64_cl b1, uint64_cl b2) {
    // dRC = differing bits between bitmap a<R> and bitmap b<C>
    int d00 = bitcount(a0 ^ b0);
    int d01 = bitcount(a0 ^ b1);
    int d02 = bitcount(a0 ^ b2);

    int d10 = bitcount(a1 ^ b0);
    int d11 = bitcount(a1 ^ b1);
    int d12 = bitcount(a1 ^ b2);

    int d20 = bitcount(a2 ^ b0);
    int d21 = bitcount(a2 ^ b1);
    int d22 = bitcount(a2 ^ b2);

    // a0 paired with b0: a1/a2 share b1/b2 in the cheaper order
    int keep12 = d11 + d22;
    int swap12 = d12 + d21;
    int rest_b0 = MIN(keep12, swap12);
    int total_b0 = rest_b0 + d00;

    // a0 paired with b1: a1/a2 share b0/b2
    int keep02 = d10 + d22;
    int swap02 = d12 + d20;
    int rest_b1 = MIN(keep02, swap02);
    int total_b1 = rest_b1 + d01;

    // a0 paired with b2: a1/a2 share b0/b1
    int keep01 = d10 + d21;
    int swap01 = d11 + d20;
    int rest_b2 = MIN(keep01, swap01);
    int total_b2 = rest_b2 + d02;

    int best = MIN(total_b0, total_b1);
    best = MIN(best, total_b2);

    // 9 add, 5 MIN

    return best;
}
#endif

#ifdef ENABLE_4_PARTITION_CODE
// Bit-mismatch between two 4-partition bitmap sets: the smallest total number
// of differing bits over all ways of pairing a0..a3 with b0..b3.
//# maybe an issue for GPU to work in 64bits
static inline int partition_mismatch4(uint64_cl a0, uint64_cl a1, uint64_cl a2, uint64_cl a3, uint64_cl b0, uint64_cl b1, uint64_cl b2, uint64_cl b3) {
    // dRC = differing bits between bitmap a<R> and bitmap b<C>
    int d00 = bitcount(a0 ^ b0);
    int d01 = bitcount(a0 ^ b1);
    int d02 = bitcount(a0 ^ b2);
    int d03 = bitcount(a0 ^ b3);

    int d10 = bitcount(a1 ^ b0);
    int d11 = bitcount(a1 ^ b1);
    int d12 = bitcount(a1 ^ b2);
    int d13 = bitcount(a1 ^ b3);

    int d20 = bitcount(a2 ^ b0);
    int d21 = bitcount(a2 ^ b1);
    int d22 = bitcount(a2 ^ b2);
    int d23 = bitcount(a2 ^ b3);

    int d30 = bitcount(a3 ^ b0);
    int d31 = bitcount(a3 ^ b1);
    int d32 = bitcount(a3 ^ b2);
    int d33 = bitcount(a3 ^ b3);

    // tailXY = cheapest way to pair a2 and a3 with the two bitmaps b<X>, b<Y>
    int tail23 = MIN(d22 + d33, d23 + d32);
    int tail13 = MIN(d21 + d33, d23 + d31);
    int tail12 = MIN(d21 + d32, d22 + d31);
    int tail03 = MIN(d20 + d33, d23 + d30);
    int tail02 = MIN(d20 + d32, d22 + d30);
    int tail01 = MIN(d21 + d30, d20 + d31);

    // fix the partner of a0, then pick the best partner for a1 among the
    // remaining three columns; a2/a3 take whatever tail is left.
    int with_b0 = d00 + MIN3(d11 + tail23, d12 + tail13, d13 + tail12);
    int with_b1 = d01 + MIN3(d10 + tail23, d12 + tail03, d13 + tail02);
    int with_b2 = d02 + MIN3(d11 + tail03, d10 + tail13, d13 + tail01);
    int with_b3 = d03 + MIN3(d11 + tail02, d12 + tail01, d10 + tail12);

    int low_pair = MIN(with_b0, with_b1);
    int high_pair = MIN(with_b2, with_b3);
    return MIN(low_pair, high_pair);

    // 16 bitcount, 17 MIN, 28 ADD
}
#endif

// For every entry of the partition table selected by partition_count, compute
// how many bits of the entry's coverage bitmaps disagree with the supplied
// k-means bitmaps.
//   partition_count - number of partitions (2, and 3/4 when the corresponding
//                     ENABLE_*_PARTITION_CODE macro is defined)
//   bitmaps         - one bitmap per k-means cluster
//   bitcounts       - output; mismatch count per table entry, or 255 for
//                     entries that cannot be used
//   ASTCEncode      - supplies partition_tables
// bitcounts is always fully written: callers use the values as indices into a
// 256-entry histogram, so leaving entries uninitialized is not acceptable.
//# maybe an issue for GPU to work in 64bits
void count_partition_mismatch_bits( int partition_count,
                                    uint64_cl bitmaps[4],
                                    int bitcounts[PARTITION_COUNT],
                                    __global ASTC_Encode *ASTCEncode) {
    DEBUG("count_partition_mismatch_bits");
    int i;

    if (partition_count == 2) {
        uint64_cl bm0 = bitmaps[0];
        uint64_cl bm1 = bitmaps[1];
        for (i = 0; i < PARTITION_COUNT; i++) {
            if (ASTCEncode->partition_tables[partition_count][i].partition_count == 2) {
                bitcounts[i] = partition_mismatch2(bm0, bm1,
                                                   ASTCEncode->partition_tables[partition_count][i].coverage_bitmaps[0],
                                                   ASTCEncode->partition_tables[partition_count][i].coverage_bitmaps[1]);
            } else
                bitcounts[i] = 255;
        }
    }
#ifdef ENABLE_3_PARTITION_CODE
    else if (partition_count == 3) {
        uint64_cl bm0 = bitmaps[0];
        uint64_cl bm1 = bitmaps[1];
        uint64_cl bm2 = bitmaps[2];
        for (i = 0; i < PARTITION_COUNT; i++) {
            if (ASTCEncode->partition_tables[partition_count][i].partition_count == 3) {
                bitcounts[i] = partition_mismatch3(bm0, bm1, bm2,
                                                   ASTCEncode->partition_tables[partition_count][i].coverage_bitmaps[0],
                                                   ASTCEncode->partition_tables[partition_count][i].coverage_bitmaps[1],
                                                   ASTCEncode->partition_tables[partition_count][i].coverage_bitmaps[2]);
            } else
                bitcounts[i] = 255;
        }
    }
#endif
#ifdef ENABLE_4_PARTITION_CODE
    else if (partition_count == 4) {
        uint64_cl bm0 = bitmaps[0];
        uint64_cl bm1 = bitmaps[1];
        uint64_cl bm2 = bitmaps[2];
        uint64_cl bm3 = bitmaps[3];
        for (i = 0; i < PARTITION_COUNT; i++) {
            if (ASTCEncode->partition_tables[partition_count][i].partition_count == 4) {
                bitcounts[i] = partition_mismatch4(bm0, bm1, bm2, bm3,
                                                   ASTCEncode->partition_tables[partition_count][i].coverage_bitmaps[0],
                                                   ASTCEncode->partition_tables[partition_count][i].coverage_bitmaps[1],
                                                   ASTCEncode->partition_tables[partition_count][i].coverage_bitmaps[2],
                                                   ASTCEncode->partition_tables[partition_count][i].coverage_bitmaps[3]);
            } else
                bitcounts[i] = 255;
        }
    }
#endif
    else {
        // Unsupported partition count (or its code path is compiled out):
        // mark every entry with the same "unusable" value the branches above
        // use, instead of handing back uninitialized counts.
        for (i = 0; i < PARTITION_COUNT; i++)
            bitcounts[i] = 255;
    }
}

// Order the partition indices by ascending mismatch count with a stable
// counting sort.
//   mismatch_bits      - key per partition; used directly as an index into a
//                        256-entry table
//   partition_ordering - output; partition indices sorted by key, ties kept
//                        in ascending index order
void get_partition_ordering_by_mismatch_bits(int mismatch_bits[PARTITION_COUNT], int partition_ordering[PARTITION_COUNT]) {
    DEBUG("get_partition_ordering_by_mismatch_bits");
    int n;

    // histogram of the keys
    int slot_of_key[256];
    for (n = 0; n < 256; n++)
        slot_of_key[n] = 0;

    for (n = 0; n < PARTITION_COUNT; n++)
        slot_of_key[mismatch_bits[n]] += 1;

    // exclusive prefix sum: first output slot of every key
    int next_free = 0;
    for (n = 0; n < 256; n++) {
        int population = slot_of_key[n];
        slot_of_key[n] = next_free;
        next_free += population;
    }

    // scatter the partition indices into their slots
    for (n = 0; n < PARTITION_COUNT; n++) {
        int key = mismatch_bits[n];
        int dest = slot_of_key[key];
        slot_of_key[key] = dest + 1;
        partition_ordering[dest] = n;
    }
}

void kmeans_compute_partition_ordering(int partition_count, imageblock * blk, int *ordering, __global ASTC_Encode *ASTCEncode) {
    DEBUG("kmeans_compute_partition_ordering");
    int i;
    float4 cluster_centers[4];
    int partition_of_texel[MAX_TEXELS_PER_BLOCK];

    // 3 passes of plain k-means partitioning
    for (i = 0; i < 3; i++) {
        if (i == 0)
            kpp_initialize(partition_count, blk, cluster_centers, ASTCEncode);
        else
            basic_kmeans_update(partition_count, blk, partition_of_texel, cluster_centers, ASTCEncode);

        basic_kmeans_assign_pass( partition_count, blk, cluster_centers, partition_of_texel, ASTCEncode);
    }

    // at this point, we have a near-ideal partitioning.

//#  // construct bitmaps
    uint64_cl shiftbit = 1;
    uint64_cl bitmaps[4];
    for (i = 0; i < 4; i++)
        bitmaps[i] = 0;
//# need to check the limit of 64bit when used in GPU
    int texels_to_process = ASTCEncode->bsd.texelcount_for_bitmap_partitioning;
    if (texels_to_process > COVERAGE_BITMAPS_MAX)
        texels_to_process = COVERAGE_BITMAPS_MAX;
    for (i = 0; i < texels_to_process; i++) {
        int idx = ASTCEncode->bsd.texels_for_bitmap_partitioning[i];
        bitmaps[partition_of_texel[idx]] |= shiftbit << i;
    }

    int bitcounts[PARTITION_COUNT];
    // for each entry in the partition table, count bits of partition-mismatch.
    count_partition_mismatch_bits( partition_count, bitmaps, bitcounts, ASTCEncode);

    // finally, sort the partitions by bits-of-partition-mismatch
    get_partition_ordering_by_mismatch_bits(bitcounts, ordering);

}

// Weighted squared error of a block when every partition is approximated by
// its 4-component line from plines. Each texel is compared against the point
// l.amod + dot(texel, l.bs) * l.bis and the squared difference is weighted by
// the texel's error weights.
// When ewb->contains_zeroweight_texels is set, texels whose texel_weight is
// not above FLOAT_n20 are ignored.
// Returns the error sum; length_of_lines[partition] receives the spread of the
// line parameter (highest - lowest), never less than FLOAT_n7.
float compute_error_squared_rgba(__global  partition_info * pt,    // the partition that we use when computing the squared-error.
                                 imageblock * blk, error_weight_block * ewb, processed_line4 * plines, float *length_of_lines) {
    DEBUG("compute_error_squared_rgba");
    int n;
    int part;

    float total_error = 0.0f;
    for (part = 0; part < pt->partition_count; part++) {
        processed_line4 line = plines[part];
        int members = pt->texels_per_partition[part];
        float param_min = FLOAT_10;
        float param_max = -FLOAT_10;

        for (n = 0; n < members; n++) {
            int texel = pt->texels_of_partition[part][n];

            // zero-weight texels only get filtered when the block has any
            if (ewb->contains_zeroweight_texels && !(ewb->texel_weight[texel] > FLOAT_n20))
                continue;

            float4 point = { blk->work_data[4 * texel],
                             blk->work_data[4 * texel + 1],
                             blk->work_data[4 * texel + 2],
                             blk->work_data[4 * texel + 3]
                           };
            float param = dot(point, line.bs);
            float4 on_line = line.amod + param * line.bis;
            float4 offset = on_line - point;
            float4 weights = ewb->error_weights[texel];
            total_error += dot(weights, offset * offset);

            param_min = (param < param_min) ? param : param_min;
            param_max = (param > param_max) ? param : param_max;
        }

        // keep the reported length strictly positive (also covers the case
        // where no texel was processed)
        float span = param_max - param_min;
        if (span > FLOAT_n7)
            length_of_lines[part] = span;
        else
            length_of_lines[part] = FLOAT_n7;
    }

    return total_error;
}

// Find the lowest and highest alpha value (work_data component 3) per
// partition, looking only at texels whose texel_weight exceeds FLOAT_n10.
//   alpha_min / alpha_max - outputs, one entry per partition
// A partition whose minimum is not below its maximum is reported as the range
// [0, FLOAT_n10].
void compute_alpha_minmax(__global partition_info * pt, imageblock * blk, error_weight_block * ewb, float *alpha_min, float *alpha_max, __global ASTC_Encode *ASTCEncode) {
    DEBUG("compute_alpha_minmax");
    int n;
    int partition_count = pt->partition_count;

    // start from an inverted range so the first texel sets both bounds
    for (n = 0; n < partition_count; n++) {
        alpha_max[n] = -FLOAT_38;
        alpha_min[n] = FLOAT_38;
    }

    for (n = 0; n < ASTCEncode->m_texels_per_block; n++) {
        if (!(ewb->texel_weight[n] > FLOAT_n10))
            continue;

        int owner = pt->partition_of_texel[n];
        float alpha = blk->work_data[4 * n + 3];
        if (alpha > alpha_max[owner])
            alpha_max[owner] = alpha;
        if (alpha < alpha_min[owner])
            alpha_min[owner] = alpha;
    }

    // replace empty or degenerate ranges by a tiny valid one
    for (n = 0; n < partition_count; n++) {
        if (!(alpha_min[n] >= alpha_max[n]))
            continue;
        alpha_min[n] = 0;
        alpha_max[n] = FLOAT_n10;
    }
}

// Token pasting helpers. PASTE goes through XPASTE so that macro arguments are
// expanded before being concatenated with ##; the error-function generator
// macros use it to build member names such as texel_weight_<suffix>.
#define XPASTE(x,y) x##y
#define PASTE(x,y) XPASTE(x,y)

// Generates a function that computes the weighted squared error of a block
// when every partition is approximated by a 2-component line, mirroring
// compute_error_squared_rgba for a pair of color components.
//   funcname       - name of the generated function
//   c0_iwt, c1_iwt - offsets of the two components within a texel's 4 floats
//                    in blk->work_data
//   c01_name       - swizzle applied to the texel's float4 error weights
//   c01_rname      - suffix selecting the ewb->texel_weight_<suffix> array
// The generated function returns the error sum and stores, per partition, the
// spread of the line parameter (never less than FLOAT_n7) in length_of_lines.
// When ewb->contains_zeroweight_texels is set, texels whose weight is not
// above FLOAT_n20 are skipped.
// Note: only /* */ comments may be used inside the macro body, because of the
// line continuations.
#define TWO_COMPONENT_ERROR_FUNC( funcname, c0_iwt, c1_iwt, c01_name, c01_rname ) \
float funcname( \
    __global partition_info *pt, \
    imageblock *blk, \
    error_weight_block *ewb, \
    processed_line2 *plines, \
    float *length_of_lines \
    ) \
    { \
    int i; \
    float errorsum = 0.0f; \
    int partition; \
    for(partition=0; partition<pt->partition_count; partition++) \
        { \
        int texelcount = pt->texels_per_partition[ partition ]; \
        float lowparam = FLOAT_10; \
        float highparam = -FLOAT_10; \
        processed_line2 l = plines[partition]; \
        if( ewb->contains_zeroweight_texels ) \
            { \
            for(i=0;i<texelcount;i++) \
                { \
                int iwt = pt->texels_of_partition[ partition ][i]; \
                /* look the weight up by texel index (iwt), not by the position */ \
                /* within the partition, matching compute_error_squared_rgba    */ \
                float texel_weight = ewb-> PASTE(texel_weight_ , c01_rname) [iwt]; \
                if( texel_weight > FLOAT_n20 ) \
                    { \
                    float2 point = {blk->work_data[4*iwt + c0_iwt], blk->work_data[4*iwt + c1_iwt] }; \
                    float param = dot( point, l.bs ); \
                    float2 rp1 = l.amod + param*l.bis; \
                    float2 dist = rp1 - point; \
                    float4 ews = ewb->error_weights[iwt]; \
                    errorsum += dot( ews. c01_name, dist*dist ); \
                    if( param < lowparam ) lowparam = param; \
                    if( param > highparam ) highparam = param; \
                    } \
                } \
            } \
        else \
            { \
            for(i=0;i<texelcount;i++) \
                { \
                int iwt = pt->texels_of_partition[ partition ][i]; \
                float2 point = {blk->work_data[4*iwt + c0_iwt], blk->work_data[4*iwt + c1_iwt]}; \
                float param = dot( point, l.bs ); \
                float2 rp1 = l.amod + param*l.bis; \
                float2 dist = rp1 - point; \
                float4 ews = ewb->error_weights[iwt]; \
                errorsum += dot( ews. c01_name, dist*dist ); \
                if( param < lowparam ) lowparam = param; \
                if( param > highparam ) highparam = param; \
                } \
            } \
        /* keep the reported line length strictly positive */ \
        float linelen = highparam - lowparam; \
        if( !(linelen > FLOAT_n7) ) \
            linelen = FLOAT_n7; \
        length_of_lines[partition] = linelen; \
        } \
    return errorsum; \
    }

// Instantiate the 2-component error functions. Arguments: function name, the
// two work_data component offsets, the error-weight swizzle and the
// texel_weight_<suffix> selector.
// NOTE(review): compute_error_squared_ra reads components 0 and 3 but selects
// the error weights with the `zw` swizzle (components 2 and 3); `xw` looks like
// the intended pair - confirm, and check that the host-side float4 type
// provides an `xw` swizzle before changing it.
TWO_COMPONENT_ERROR_FUNC(compute_error_squared_rg, 0, 1, xy, rg)
TWO_COMPONENT_ERROR_FUNC(compute_error_squared_rb, 0, 2, xz, rb)
TWO_COMPONENT_ERROR_FUNC(compute_error_squared_gb, 1, 2, yz, gb)
TWO_COMPONENT_ERROR_FUNC(compute_error_squared_ra, 0, 3, zw, ra)

// Generates a function that computes the weighted squared error across a block
// when the texels of every partition are approximated by one three-component
// color-space line, and that also reports the length of each of those lines.
//
// Macro parameters:
//   funcname                 name of the generated function
//   c0_iwt, c1_iwt, c2_iwt   channel offsets used as blk->work_data[4*texel + offset]
//   c012_name                swizzle applied to the texel's float4 error weight
//   c012_rname               suffix selecting the ewb->texel_weight_<suffix> array
//
// Generated function:
//   pt               partitioning to evaluate
//   blk              block whose work_data supplies the texel colors
//   ewb              per-texel error weights
//   plines           one processed line per partition
//   length_of_lines  output; for each partition the spread (max - min) of the
//                    line parameter over the counted texels, clamped to at
//                    least FLOAT_n7 (also used when no texel was counted)
//   returns          sum over all counted texels of dot(error weight, dist*dist)
//
// When ewb->contains_zeroweight_texels is set, texels whose channel weight is
// not above FLOAT_n20 are left out of both the error sum and the line length.
// Only /* */ comments may be used inside the macro body because of the line
// continuations.

#define THREE_COMPONENT_ERROR_FUNC( funcname, c0_iwt, c1_iwt, c2_iwt, c012_name, c012_rname ) \
float funcname( \
    __global partition_info *pt, \
    imageblock *blk, \
    error_weight_block *ewb, \
    processed_line3 *plines, \
    float *length_of_lines \
    ) \
    { \
    int i; \
    float errorsum = 0.0f; \
    int partition; \
    for(partition=0; partition<pt->partition_count; partition++) \
        { \
        int texelcount = pt->texels_per_partition[ partition ]; \
        float lowparam = FLOAT_10; \
        float highparam = -FLOAT_10; \
        processed_line3 l = plines[partition]; \
        if( ewb->contains_zeroweight_texels ) \
            { \
            for(i=0;i<texelcount;i++) \
                { \
                int iwt = pt->texels_of_partition[ partition ][i]; \
                /* look the weight up by texel index, not by position in the partition list */ \
                float texel_weight = ewb-> PASTE(texel_weight_ , c012_rname) [iwt]; \
                if( texel_weight > FLOAT_n20 ) \
                    { \
                    float3 point = {blk->work_data[4*iwt + c0_iwt], blk->work_data[4*iwt + c1_iwt], blk->work_data[4*iwt + c2_iwt]}; \
                    float param = dot( point, l.bs ); \
                    float3 rp1 = l.amod + param*l.bis; \
                    float3 dist = rp1 - point; \
                    float4 ews = ewb->error_weights[iwt]; \
                    errorsum += dot( ews. c012_name, dist*dist ); \
                    if( param < lowparam ) lowparam = param; \
                    if( param > highparam ) highparam = param; \
                    } \
                } \
            } \
        else \
            { \
            for(i=0;i<texelcount;i++) \
                { \
                int iwt = pt->texels_of_partition[ partition ][i]; \
                float3 point = {blk->work_data[4*iwt + c0_iwt], blk->work_data[4*iwt + c1_iwt], blk->work_data[4*iwt + c2_iwt]}; \
                float param = dot( point, l.bs ); \
                float3 rp1 = l.amod + param*l.bis; \
                float3 dist = rp1 - point; \
                float4 ews = ewb->error_weights[iwt]; \
                errorsum += dot( ews. c012_name, dist*dist ); \
                if( param < lowparam ) lowparam = param; \
                if( param > highparam ) highparam = param; \
                } \
            } \
        /* written so that a NaN or negative spread is also replaced by the floor value */ \
        float linelen = highparam - lowparam; \
        if( !(linelen > FLOAT_n7) ) \
            linelen = FLOAT_n7; \
        length_of_lines[partition] = linelen; \
        } \
    return errorsum; \
    }

// Instantiate the three-component error functions, one per channel triple.
// Arguments: generated function name, the three channel offsets read from
// blk->work_data[4*texel + offset], the swizzle applied to the texel's float4
// error weight, and the suffix selecting ewb->texel_weight_<suffix>.
// In every line the swizzle names the same channels as the three offsets
// (0 = x, 1 = y, 2 = z, 3 = w).
THREE_COMPONENT_ERROR_FUNC(compute_error_squared_gba, 1, 2, 3, yzw, gba)
THREE_COMPONENT_ERROR_FUNC(compute_error_squared_rba, 0, 2, 3, xzw, rba)
THREE_COMPONENT_ERROR_FUNC(compute_error_squared_rga, 0, 1, 3, xyw, rga)
THREE_COMPONENT_ERROR_FUNC(compute_error_squared_rgb, 0, 1, 2, xyz, rgb)


// Determines, for each partition of `pt`, the smallest and largest red, green
// and blue value found in blk->work_data among the texels whose
// ewb->texel_weight exceeds FLOAT_n10.
//
//   pt           partitioning; supplies partition_count and partition_of_texel
//   blk          block whose work_data holds 4 floats per texel (r, g, b at +0, +1, +2)
//   ewb          per-texel weights used to skip texels that do not count
//   *_min/*_max  outputs, one entry per partition
//   ASTCEncode   supplies m_texels_per_block, the number of texels scanned
//
// A channel whose range ends up empty or zero-width (min >= max) is reported
// as the range [0, FLOAT_n10].
void compute_rgb_minmax(
    __global partition_info * pt,
    imageblock * blk,
    error_weight_block * ewb,
    float *red_min, float *red_max,
    float *green_min, float *green_max,
    float *blue_min, float *blue_max,
    __global ASTC_Encode *ASTCEncode) {
    DEBUG("compute_rgb_minmax");
    const int num_partitions = pt->partition_count;
    int part;
    int texel;

    // Start every range inverted so that the first counted texel sets both ends.
    for (part = 0; part < num_partitions; part++) {
        red_min[part] = green_min[part] = blue_min[part] = FLOAT_38;
        red_max[part] = green_max[part] = blue_max[part] = -FLOAT_38;
    }

    for (texel = 0; texel < ASTCEncode->m_texels_per_block; texel++) {
        // Texels without a meaningful weight do not take part in the ranges.
        if (!(ewb->texel_weight[texel] > FLOAT_n10))
            continue;

        const int owner = pt->partition_of_texel[texel];
        const float r = blk->work_data[4 * texel];
        const float g = blk->work_data[4 * texel + 1];
        const float b = blk->work_data[4 * texel + 2];

        if (r > red_max[owner])
            red_max[owner] = r;
        if (r < red_min[owner])
            red_min[owner] = r;

        if (g > green_max[owner])
            green_max[owner] = g;
        if (g < green_min[owner])
            green_min[owner] = g;

        if (b > blue_max[owner])
            blue_max[owner] = b;
        if (b < blue_min[owner])
            blue_min[owner] = b;
    }

    // Replace degenerate ranges (nothing counted, or a single distinct value)
    // with a tiny non-empty one.
    for (part = 0; part < num_partitions; part++) {
        if (red_max[part] <= red_min[part]) {
            red_min[part] = 0.0f;
            red_max[part] = FLOAT_n10;
        }
        if (green_max[part] <= green_min[part]) {
            green_min[part] = 0.0f;
            green_max[part] = FLOAT_n10;
        }
        if (blue_max[part] <= blue_min[part]) {
            blue_min[part] = 0.0f;
            blue_max[part] = FLOAT_n10;
        }
    }
}

void find_best_partitionings(int partition_search_limit, int partition_count,
                             imageblock * pb, error_weight_block * ewb, int candidates_to_return,
                             int *best_partitions_uncorrellated,     // best partitionings to use if the endpoint colors are assumed to be uncorrellated
                             int *best_partitions_samechroma,        // best partitionings to use if the endpoint colors have the same chroma
                             int *best_partitions_dual_weight_planes,// best partitionings to use if using dual plane of weightss
                             __global ASTC_Encode *ASTCEncode) {
//#
    DEBUG("find_best_partitionings");
    int i, j;

    // constant used to estimate quantization error for a given partitioning;
    // the optimal value for this constant depends on bitrate.
    // These constants have been determined empirically.

    float weight_imprecision_estim = 100;
    if (ASTCEncode->m_texels_per_block <= 20)
        weight_imprecision_estim = 0.03f;
    else if (ASTCEncode->m_texels_per_block <= 31)
        weight_imprecision_estim = 0.04f;
    else if (ASTCEncode->m_texels_per_block <= 41)
        weight_imprecision_estim = 0.05f;
    else
        weight_imprecision_estim = 0.055f;

    int partition_sequence[PARTITION_COUNT];

    kmeans_compute_partition_ordering(partition_count, pb, partition_sequence, ASTCEncode);

    float weight_imprecision_estim_squared = weight_imprecision_estim * weight_imprecision_estim;

    int uses_alpha = imageblock_uses_alpha1(pb);

    // partitioning errors assuming uncorrelated-chrominance endpoints
    float uncorr_errors[PARTITION_COUNT];
    // partitioning errors assuming same-chrominance endpoints
    float samechroma_errors[PARTITION_COUNT];

    // partitioning errors assuming that one of the color channels
    // is uncorrelated with all the other ones
    float separate_errors[4 * PARTITION_COUNT];
    float *separate_red_errors = separate_errors;
    float *separate_green_errors = separate_errors + PARTITION_COUNT;
    float *separate_blue_errors = separate_errors + 2 * PARTITION_COUNT;
    float *separate_alpha_errors = separate_errors + 3 * PARTITION_COUNT;

    int defacto_search_limit = PARTITION_COUNT - 1;

    if (uses_alpha) {

        for (i = 0; i < PARTITION_COUNT; i++) {
            int partition = partition_sequence[i];
            int bk_partition_count = ASTCEncode->partition_tables[partition_count][partition].partition_count;

            if (bk_partition_count < partition_count) {
                uncorr_errors[i] = FLOAT_35;
                samechroma_errors[i] = FLOAT_35;
                separate_red_errors[i] = FLOAT_35;
                separate_green_errors[i] = FLOAT_35;
                separate_blue_errors[i] = FLOAT_35;
                separate_alpha_errors[i] = FLOAT_35;
                continue;
            }
            // the sentinel value for partitions above the search limit must be smaller
            // than the sentinel value for invalid partitions
            if (i >= partition_search_limit) {
                defacto_search_limit = i;

                uncorr_errors[i] = FLOAT_34;
                samechroma_errors[i] = FLOAT_34;
                separate_red_errors[i] = FLOAT_34;
                separate_green_errors[i] = FLOAT_34;
                separate_blue_errors[i] = FLOAT_34;
                separate_alpha_errors[i] = FLOAT_34;
                break;
            }

            // compute the weighting to give to each color channel
            // in each partition.
            float4 error_weightings[4];
            float4 color_scalefactors[4];
            float4 inverse_color_scalefactors[4];
            compute_partition_error_color_weightings(ewb,
                    &ASTCEncode->partition_tables[partition_count][partition],
                    error_weightings, color_scalefactors, ASTCEncode);

            for (j = 0; j < partition_count; j++) {
                inverse_color_scalefactors[j].x = 1.0f / (std::max)(color_scalefactors[j].x, FLOAT_n7);
                inverse_color_scalefactors[j].y = 1.0f / (std::max)(color_scalefactors[j].y, FLOAT_n7);
                inverse_color_scalefactors[j].z = 1.0f / (std::max)(color_scalefactors[j].z, FLOAT_n7);
                inverse_color_scalefactors[j].w = 1.0f / (std::max)(color_scalefactors[j].w, FLOAT_n7);
            }

            float4 averages[4];
            float4 directions_rgba[4];
            float3 directions_gba[4];
            float3 directions_rba[4];
            float3 directions_rga[4];
            float3 directions_rgb[4];

            compute_averages_and_directions_rgba(
                &ASTCEncode->partition_tables[partition_count][partition],
                pb, ewb, color_scalefactors, averages, directions_rgba, directions_gba, directions_rba, directions_rga, directions_rgb);

            line4 uncorr_lines[4];
            line4 samechroma_lines[4];
            line3 separate_red_lines[4];
            line3 separate_green_lines[4];
            line3 separate_blue_lines[4];
            line3 separate_alpha_lines[4];

            processed_line4 proc_uncorr_lines[4];
            processed_line4 proc_samechroma_lines[4];
            processed_line3 proc_separate_red_lines[4];
            processed_line3 proc_separate_green_lines[4];
            processed_line3 proc_separate_blue_lines[4];
            processed_line3 proc_separate_alpha_lines[4];

            float uncorr_linelengths[4];
            float samechroma_linelengths[4];
            float separate_red_linelengths[4];
            float separate_green_linelengths[4];
            float separate_blue_linelengths[4];
            float separate_alpha_linelengths[4];

            float3 one3f  = { 1.0f,1.0f,1.0f};
            float4 zero4f = { 0.0f,0.0f,0.0f,0.0f };
            float4 one4f  = { 1.0f,1.0f,1.0f,1.0f };

            for (j = 0; j < partition_count; j++) {
                uncorr_lines[j].a = averages[j];
                if (dot(directions_rgba[j], directions_rgba[j]) == 0.0f) {
                    uncorr_lines[j].b = normalize(one4f);
                } else
                    uncorr_lines[j].b = normalize(directions_rgba[j]);

                proc_uncorr_lines[j].amod = (uncorr_lines[j].a - uncorr_lines[j].b * dot(uncorr_lines[j].a, uncorr_lines[j].b)) * inverse_color_scalefactors[j];
                proc_uncorr_lines[j].bs = (uncorr_lines[j].b * color_scalefactors[j]);
                proc_uncorr_lines[j].bis = (uncorr_lines[j].b * inverse_color_scalefactors[j]);


                samechroma_lines[j].a = zero4f;
                if (dot(averages[j], averages[j]) == 0) {
                    samechroma_lines[j].b = normalize(one4f);
                } else
                    samechroma_lines[j].b = normalize(averages[j]);

                proc_samechroma_lines[j].amod = (samechroma_lines[j].a - samechroma_lines[j].b * dot(samechroma_lines[j].a, samechroma_lines[j].b)) * inverse_color_scalefactors[j];
                proc_samechroma_lines[j].bs = (samechroma_lines[j].b * color_scalefactors[j]);
                proc_samechroma_lines[j].bis = (samechroma_lines[j].b * inverse_color_scalefactors[j]);

                separate_red_lines[j].a = averages[j].yzw;
                if (dot(directions_gba[j], directions_gba[j]) == 0.0f)
                    separate_red_lines[j].b = normalize(one3f);
                else
                    separate_red_lines[j].b = normalize(directions_gba[j]);

                separate_green_lines[j].a = averages[j].xzw;
                if (dot(directions_rba[j], directions_rba[j]) == 0.0f)
                    separate_green_lines[j].b = normalize(one3f);
                else
                    separate_green_lines[j].b = normalize(directions_rba[j]);

                separate_blue_lines[j].a = averages[j].xyw;
                if (dot(directions_rga[j], directions_rga[j]) == 0.0f)
                    separate_blue_lines[j].b = normalize(one3f);
                else
                    separate_blue_lines[j].b = normalize(directions_rga[j]);

                separate_alpha_lines[j].a = averages[j].xyz;
                if (dot(directions_rgb[j], directions_rgb[j]) == 0.0f) {
                    separate_alpha_lines[j].b = normalize(one3f);
                } else
                    separate_alpha_lines[j].b = normalize(directions_rgb[j]);

                proc_separate_red_lines[j].amod = (separate_red_lines[j].a - separate_red_lines[j].b * dot(separate_red_lines[j].a, separate_red_lines[j].b)) * inverse_color_scalefactors[j].yzw;
                proc_separate_red_lines[j].bs = (separate_red_lines[j].b * color_scalefactors[j].yzw);
                proc_separate_red_lines[j].bis = (separate_red_lines[j].b * inverse_color_scalefactors[j].yzw);

                proc_separate_green_lines[j].amod =
                    (separate_green_lines[j].a - separate_green_lines[j].b * dot(separate_green_lines[j].a, separate_green_lines[j].b)) * inverse_color_scalefactors[j].xzw;
                proc_separate_green_lines[j].bs = (separate_green_lines[j].b * color_scalefactors[j].xzw);
                proc_separate_green_lines[j].bis = (separate_green_lines[j].b * inverse_color_scalefactors[j].xzw);

                proc_separate_blue_lines[j].amod = (separate_blue_lines[j].a - separate_blue_lines[j].b * dot(separate_blue_lines[j].a, separate_blue_lines[j].b)) * inverse_color_scalefactors[j].xyw;
                proc_separate_blue_lines[j].bs = (separate_blue_lines[j].b * color_scalefactors[j].xyw);
                proc_separate_blue_lines[j].bis = (separate_blue_lines[j].b * inverse_color_scalefactors[j].xyw);

                proc_separate_alpha_lines[j].amod =
                    (separate_alpha_lines[j].a - separate_alpha_lines[j].b * dot(separate_alpha_lines[j].a, separate_alpha_lines[j].b)) * inverse_color_scalefactors[j].xyz;
                proc_separate_alpha_lines[j].bs = (separate_alpha_lines[j].b * color_scalefactors[j].xyz);
                proc_separate_alpha_lines[j].bis = (separate_alpha_lines[j].b * inverse_color_scalefactors[j].xyz);

            }

            float uncorr_error = compute_error_squared_rgba(
                                     &ASTCEncode->partition_tables[partition_count][partition],
                                     pb,
                                     ewb,
                                     proc_uncorr_lines,
                                     uncorr_linelengths);
            float samechroma_error = compute_error_squared_rgba(
                                         &ASTCEncode->partition_tables[partition_count][partition],
                                         pb,
                                         ewb,
                                         proc_samechroma_lines,
                                         samechroma_linelengths);

            float separate_red_error  = compute_error_squared_gba(
                                            &ASTCEncode->partition_tables[partition_count][partition],
                                            pb,
                                            ewb,
                                            proc_separate_red_lines,
                                            separate_red_linelengths);

            float separate_green_error = compute_error_squared_rba(
                                             &ASTCEncode->partition_tables[partition_count][partition],
                                             pb,
                                             ewb,
                                             proc_separate_green_lines,
                                             separate_green_linelengths);

            float separate_blue_error = compute_error_squared_rga(
                                            &ASTCEncode->partition_tables[partition_count][partition],
                                            pb,
                                            ewb,
                                            proc_separate_blue_lines,
                                            separate_blue_linelengths);

            float separate_alpha_error = compute_error_squared_rgb(
                                             &ASTCEncode->partition_tables[partition_count][partition],
                                             pb,
                                             ewb,
                                             proc_separate_alpha_lines,
                                             separate_alpha_linelengths);

            // compute minimum & maximum alpha and red/green/blue values in each partition
            float red_min[4], red_max[4];
            float green_min[4], green_max[4];
            float blue_min[4], blue_max[4];
            float alpha_min[4], alpha_max[4];
            compute_alpha_minmax(
                &ASTCEncode->partition_tables[partition_count][partition],
                pb, ewb, alpha_min, alpha_max, ASTCEncode);

            compute_rgb_minmax(
                &ASTCEncode->partition_tables[partition_count][partition],
                pb, ewb, red_min, red_max, green_min, green_max, blue_min, blue_max, ASTCEncode);

            /*
               Compute an estimate of error introduced by weight quantization imprecision.
               This error is computed as follows, for each partition
               1: compute the principal-axis vector (full length) in error-space
               2: convert the principal-axis vector to regular RGB-space
               3: scale the vector by a constant that estimates average quantization error
               4: for each texel, square the vector, then do a dot-product with the texel's error weight;
                  sum up the results across all texels.
               4(optimized): square the vector once, then do a dot-product with the average texel error,
                  then multiply by the number of texels.
             */

            for (j = 0; j < partition_count; j++) {
                float tpp = (float)(ASTCEncode->partition_tables[partition_count][partition].texels_per_partition[j]);

                float4 ics = inverse_color_scalefactors[j];
                float4 error_weights = error_weightings[j] * (tpp * weight_imprecision_estim_squared);

                float4 uncorr_vector = (uncorr_lines[j].b * uncorr_linelengths[j]) * ics;
                float4 samechroma_vector = (samechroma_lines[j].b * samechroma_linelengths[j]) * ics;
                float3 separate_red_vector = (separate_red_lines[j].b * separate_red_linelengths[j]) * ics.yzw;
                float3 separate_green_vector = (separate_green_lines[j].b * separate_green_linelengths[j]) * ics.xzw;
                float3 separate_blue_vector = (separate_blue_lines[j].b * separate_blue_linelengths[j]) * ics.xyw;
                float3 separate_alpha_vector = (separate_alpha_lines[j].b * separate_alpha_linelengths[j]) * ics.xyz;

                uncorr_vector = uncorr_vector * uncorr_vector;
                samechroma_vector = samechroma_vector * samechroma_vector;
                separate_red_vector = separate_red_vector * separate_red_vector;
                separate_green_vector = separate_green_vector * separate_green_vector;
                separate_blue_vector = separate_blue_vector * separate_blue_vector;
                separate_alpha_vector = separate_alpha_vector * separate_alpha_vector;

                uncorr_error += dot(uncorr_vector, error_weights);
                samechroma_error += dot(samechroma_vector, error_weights);
                separate_red_error += dot(separate_red_vector, error_weights.yzw);
                separate_green_error += dot(separate_green_vector, error_weights.xzw);
                separate_blue_error += dot(separate_blue_vector, error_weights.xyw);
                separate_alpha_error += dot(separate_alpha_vector, error_weights.xyz);

                float red_scalar = (red_max[j] - red_min[j]);
                float green_scalar = (green_max[j] - green_min[j]);
                float blue_scalar = (blue_max[j] - blue_min[j]);
                float alpha_scalar = (alpha_max[j] - alpha_min[j]);
                red_scalar *= red_scalar;
                green_scalar *= green_scalar;
                blue_scalar *= blue_scalar;
                alpha_scalar *= alpha_scalar;
                separate_red_error += red_scalar * error_weights.x;
                separate_green_error += green_scalar * error_weights.y;
                separate_blue_error += blue_scalar * error_weights.z;
                separate_alpha_error += alpha_scalar * error_weights.w;
            }

            uncorr_errors[i] = uncorr_error;
            samechroma_errors[i] = samechroma_error;
            separate_red_errors[i] = separate_red_error;
            separate_green_errors[i] = separate_green_error;
            separate_blue_errors[i] = separate_blue_error;
            separate_alpha_errors[i] = separate_alpha_error;

        }
    } else {

        for (i = 0; i < PARTITION_COUNT; i++) {

            int partition = partition_sequence[i];

            int bk_partition_count = ASTCEncode->partition_tables[partition_count][partition].partition_count;
            if (bk_partition_count < partition_count) {

                uncorr_errors[i] = FLOAT_35;
                samechroma_errors[i] = FLOAT_35;
                separate_red_errors[i] = FLOAT_35;
                separate_green_errors[i] = FLOAT_35;
                separate_blue_errors[i] = FLOAT_35;
                continue;
            }

            // the sentinel value for valid partitions above the search limit must be smaller
            // than the sentinel value for invalid partitions
            if (i >= partition_search_limit) {

                defacto_search_limit = i;
                uncorr_errors[i] = FLOAT_34;
                samechroma_errors[i] = FLOAT_34;
                separate_red_errors[i] = FLOAT_34;
                separate_green_errors[i] = FLOAT_34;
                separate_blue_errors[i] = FLOAT_34;
                break;

            }

            // compute the weighting to give to each color channel
            // in each partition.
            float4 error_weightings[4];
            float4 color_scalefactors[4];
            float4 inverse_color_scalefactors[4];

            compute_partition_error_color_weightings( ewb,
                    &ASTCEncode->partition_tables[partition_count][partition],
                    error_weightings, color_scalefactors, ASTCEncode);

            for (j = 0; j < partition_count; j++) {
                inverse_color_scalefactors[j].x = 1.0f / (std::max)(color_scalefactors[j].x, FLOAT_n7);
                inverse_color_scalefactors[j].y = 1.0f / (std::max)(color_scalefactors[j].y, FLOAT_n7);
                inverse_color_scalefactors[j].z = 1.0f / (std::max)(color_scalefactors[j].z, FLOAT_n7);
                inverse_color_scalefactors[j].w = 1.0f / (std::max)(color_scalefactors[j].w, FLOAT_n7);
            }

            float3 averages[4];
            float3 directions_rgb[4];
            float2 directions_rg[4];
            float2 directions_rb[4];
            float2 directions_gb[4];

            compute_averages_and_directions_rgb(
                &ASTCEncode->partition_tables[partition_count][partition],
                pb, ewb, color_scalefactors, averages, directions_rgb, directions_rg, directions_rb, directions_gb);

            line3 uncorr_lines[4];
            line3 samechroma_lines[4];
            line2 separate_red_lines[4];
            line2 separate_green_lines[4];
            line2 separate_blue_lines[4];

            processed_line3 proc_uncorr_lines[4];
            processed_line3 proc_samechroma_lines[4];

            processed_line2 proc_separate_red_lines[4];
            processed_line2 proc_separate_green_lines[4];
            processed_line2 proc_separate_blue_lines[4];

            float uncorr_linelengths[4];
            float samechroma_linelengths[4];
            float separate_red_linelengths[4];
            float separate_green_linelengths[4];
            float separate_blue_linelengths[4];

            float2 one2f  = { 1.0f,1.0f };
            float3 one3f  = { 1.0f,1.0f,1.0f };
            float3 zero3f = { 0.0f,0.0f,0.0f };

            for (j = 0; j < partition_count; j++) {
                uncorr_lines[j].a = averages[j];
                if (dot(directions_rgb[j], directions_rgb[j]) == 0.0f)
                    uncorr_lines[j].b = normalize(one3f);
                else
                    uncorr_lines[j].b = normalize(directions_rgb[j]);


                samechroma_lines[j].a = zero3f;

                if (dot(averages[j], averages[j]) == 0.0f)
                    samechroma_lines[j].b = normalize(one3f);
                else
                    samechroma_lines[j].b = normalize(averages[j]);

                proc_uncorr_lines[j].amod = (uncorr_lines[j].a - uncorr_lines[j].b * dot(uncorr_lines[j].a, uncorr_lines[j].b)) * inverse_color_scalefactors[j].xyz;
                proc_uncorr_lines[j].bs = (uncorr_lines[j].b * color_scalefactors[j].xyz);
                proc_uncorr_lines[j].bis = (uncorr_lines[j].b * inverse_color_scalefactors[j].xyz);

                proc_samechroma_lines[j].amod = (samechroma_lines[j].a - samechroma_lines[j].b * dot(samechroma_lines[j].a, samechroma_lines[j].b)) * inverse_color_scalefactors[j].xyz;
                proc_samechroma_lines[j].bs = (samechroma_lines[j].b * color_scalefactors[j].xyz);
                proc_samechroma_lines[j].bis = (samechroma_lines[j].b * inverse_color_scalefactors[j].xyz);

                separate_red_lines[j].a = averages[j].yz;
                if (dot(directions_gb[j], directions_gb[j]) == 0.0f)
                    separate_red_lines[j].b = normalize(one2f);
                else
                    separate_red_lines[j].b = normalize(directions_gb[j]);

                separate_green_lines[j].a = averages[j].xz;
                if (dot(directions_rb[j], directions_rb[j]) == 0.0f)
                    separate_green_lines[j].b = normalize(one2f);
                else
                    separate_green_lines[j].b = normalize(directions_rb[j]);

                separate_blue_lines[j].a = averages[j].xy;
                if (dot(directions_rg[j], directions_rg[j]) == 0.0f)
                    separate_blue_lines[j].b = normalize(one2f);
                else
                    separate_blue_lines[j].b = normalize(directions_rg[j]);

                proc_separate_red_lines[j].amod = (separate_red_lines[j].a - separate_red_lines[j].b * dot(separate_red_lines[j].a, separate_red_lines[j].b)) * inverse_color_scalefactors[j].yz;
                proc_separate_red_lines[j].bs = (separate_red_lines[j].b * color_scalefactors[j].yz);
                proc_separate_red_lines[j].bis = (separate_red_lines[j].b * inverse_color_scalefactors[j].yz);

                proc_separate_green_lines[j].amod =
                    (separate_green_lines[j].a - separate_green_lines[j].b * dot(separate_green_lines[j].a, separate_green_lines[j].b)) * inverse_color_scalefactors[j].xz;
                proc_separate_green_lines[j].bs = (separate_green_lines[j].b * color_scalefactors[j].xz);
                proc_separate_green_lines[j].bis = (separate_green_lines[j].b * inverse_color_scalefactors[j].xz);

                proc_separate_blue_lines[j].amod = (separate_blue_lines[j].a - separate_blue_lines[j].b * dot(separate_blue_lines[j].a, separate_blue_lines[j].b)) * inverse_color_scalefactors[j].xy;
                proc_separate_blue_lines[j].bs = (separate_blue_lines[j].b * color_scalefactors[j].xy);
                proc_separate_blue_lines[j].bis = (separate_blue_lines[j].b * inverse_color_scalefactors[j].xy);

            }

            float uncorr_error = compute_error_squared_rgb(
                                     &ASTCEncode->partition_tables[partition_count][partition],
                                     pb,
                                     ewb,
                                     proc_uncorr_lines,
                                     uncorr_linelengths);
            float samechroma_error = compute_error_squared_rgb(
                                         &ASTCEncode->partition_tables[partition_count][partition],
                                         pb,
                                         ewb,
                                         proc_samechroma_lines,
                                         samechroma_linelengths);

            float separate_red_error = compute_error_squared_gb(
                                           &ASTCEncode->partition_tables[partition_count][partition],
                                           pb,
                                           ewb,
                                           proc_separate_red_lines,
                                           separate_red_linelengths);

            float separate_green_error = compute_error_squared_rb(
                                             &ASTCEncode->partition_tables[partition_count][partition],
                                             pb,
                                             ewb,
                                             proc_separate_green_lines,
                                             separate_green_linelengths);

            float separate_blue_error = compute_error_squared_rg(
                                            &ASTCEncode->partition_tables[partition_count][partition],
                                            pb,
                                            ewb,
                                            proc_separate_blue_lines,
                                            separate_blue_linelengths);

            float red_min[4], red_max[4];
            float green_min[4], green_max[4];
            float blue_min[4], blue_max[4];

            compute_rgb_minmax(
                &ASTCEncode->partition_tables[partition_count][partition],
                pb, ewb, red_min, red_max, green_min, green_max, blue_min, blue_max, ASTCEncode);

            /*
               compute an estimate of error introduced by weight imprecision.
               This error is computed as follows, for each partition
               1: compute the principal-axis vector (full length) in error-space
               2: convert the principal-axis vector to regular RGB-space
               3: scale the vector by a constant that estimates average quantization error.
               4: for each texel, square the vector, then do a dot-product with the texel's error weight;
                  sum up the results across all texels.
               4(optimized): square the vector once, then do a dot-product with the average texel error,
                 then multiply by the number of texels.
             */

            for (j = 0; j < partition_count; j++) {
                float tpp = (float)(ASTCEncode->partition_tables[partition_count][partition].texels_per_partition[j]);

                float3 ics = inverse_color_scalefactors[j].xyz;
                float3 error_weights = error_weightings[j].xyz * (tpp * weight_imprecision_estim_squared);

                float3 uncorr_vector = (uncorr_lines[j].b * uncorr_linelengths[j]) * ics;
                float3 samechroma_vector = (samechroma_lines[j].b * samechroma_linelengths[j]) * ics;

                float2 separate_red_vector = (separate_red_lines[j].b * separate_red_linelengths[j]) * ics.yz;
                float2 separate_green_vector = (separate_green_lines[j].b * separate_green_linelengths[j]) * ics.xz;
                float2 separate_blue_vector = (separate_blue_lines[j].b * separate_blue_linelengths[j]) * ics.xy;

                uncorr_vector = uncorr_vector * uncorr_vector;
                samechroma_vector = samechroma_vector * samechroma_vector;
                separate_red_vector = separate_red_vector * separate_red_vector;
                separate_green_vector = separate_green_vector * separate_green_vector;
                separate_blue_vector = separate_blue_vector * separate_blue_vector;

                uncorr_error += dot(uncorr_vector, error_weights);
                samechroma_error += dot(samechroma_vector, error_weights);
                separate_red_error += dot(separate_red_vector, error_weights.yz);
                separate_green_error += dot(separate_green_vector, error_weights.xz);
                separate_blue_error += dot(separate_blue_vector, error_weights.xy);

                float red_scalar = (red_max[j] - red_min[j]);
                float green_scalar = (green_max[j] - green_min[j]);
                float blue_scalar = (blue_max[j] - blue_min[j]);

                red_scalar *= red_scalar;
                green_scalar *= green_scalar;
                blue_scalar *= blue_scalar;

                separate_red_error += red_scalar * error_weights.x;
                separate_green_error += green_scalar * error_weights.y;
                separate_blue_error += blue_scalar * error_weights.z;
            }

            uncorr_errors[i] = uncorr_error;
            samechroma_errors[i] = samechroma_error;

            separate_red_errors[i] = separate_red_error;
            separate_green_errors[i] = separate_green_error;
            separate_blue_errors[i] = separate_blue_error;
        }
    }

    for (i = 0; i < candidates_to_return; i++) {
        int best_uncorr_partition = 0;
        int best_samechroma_partition = 0;
        float best_uncorr_error = FLOAT_30;
        float best_samechroma_error = FLOAT_30;
        for (j = 0; j <= defacto_search_limit; j++) {
            if (uncorr_errors[j] < best_uncorr_error) {
                best_uncorr_partition = j;
                best_uncorr_error = uncorr_errors[j];
            }
        }
        best_partitions_uncorrellated[i] = partition_sequence[best_uncorr_partition];
        uncorr_errors[best_uncorr_partition] = FLOAT_30;
        samechroma_errors[best_uncorr_partition] = FLOAT_30;

        for (j = 0; j <= defacto_search_limit; j++) {
            if (samechroma_errors[j] < best_samechroma_error) {
                best_samechroma_partition = j;
                best_samechroma_error = samechroma_errors[j];
            }
        }
        best_partitions_samechroma[i] = partition_sequence[best_samechroma_partition];
        samechroma_errors[best_samechroma_partition] = FLOAT_30;
        uncorr_errors[best_samechroma_partition] = FLOAT_30;
    }

    for (i = 0; i < 2 * candidates_to_return; i++) {
        int best_partition = 0;
        float best_partition_error = FLOAT_30;

        for (j = 0; j < defacto_search_limit; j++) {

            if (separate_errors[j] < best_partition_error) {
                best_partition = j;
                best_partition_error = separate_errors[j];
            }
            if (separate_errors[j + PARTITION_COUNT] < best_partition_error) {
                best_partition = j + PARTITION_COUNT;
                best_partition_error = separate_errors[j + PARTITION_COUNT];
            }
            if (separate_errors[j + 2 * PARTITION_COUNT] < best_partition_error) {
                best_partition = j + 2 * PARTITION_COUNT;
                best_partition_error = separate_errors[j + 2 * PARTITION_COUNT];
            }

            if (uses_alpha) {
                if (separate_errors[j + 3 * PARTITION_COUNT] < best_partition_error) {
                    best_partition = j + 3 * PARTITION_COUNT;
                    best_partition_error = separate_errors[j + 3 * PARTITION_COUNT];
                }
            }
        }

        separate_errors[best_partition] = FLOAT_30;
        best_partition = ((best_partition >> PARTITION_BITS) << PARTITION_BITS) | partition_sequence[best_partition & (PARTITION_COUNT - 1)];
        best_partitions_dual_weight_planes[i] = best_partition;
    }

}

// Chooses the best symbolic encoding for one image block.
//
// A block whose four channels are all constant is emitted directly as a
// constant-color block. Otherwise candidate encodings are tried in this order:
//   1. one partition, one weight plane (tried twice: mode cutoff 0, then the
//      configured cutoff),
//   2. one partition, two weight planes (one attempt per color component),
//   3. two..max_partitions partitions, with one and then two weight planes.
// Every candidate is decompressed and compared against the source block; the
// candidate with the lowest error is copied into *scb. The search returns early
// as soon as the best error divided by the block's error-weight sum drops below
// m_ewp.texel_avg_error_limit.
//
//   blk        : the image block to compress
//   scb        : receives the selected symbolic block
//   ASTCEncode : encoder settings, tables and scratch weight buffers
//
// Returns 0 for a constant-color block, otherwise the error of the selected
// candidate divided by m_texels_per_block.
float compress_symbolic_block(
    imageblock * blk,
    symbolic_compressed_block * scb,
    __global ASTC_Encode *  ASTCEncode
) {
    DEBUG("compress_symbolic_block");
    int i, j;
    imageblock temp;
    error_weight_block ewb;
    error_weight_block_orig ewbo;
    symbolic_compressed_block tempblocks[4];

    endpoints_and_weights ei1;
    endpoints_and_weights ei2;
    endpoints_and_weights eix1[MAX_DECIMATION_MODES];
    endpoints_and_weights eix2[MAX_DECIMATION_MODES];

    // Scratch buffers owned by the encoder context and shared by all candidate searches below.
    __global2 float   *decimated_weights                           = ASTCEncode->decimated_weights;
    __global2 uint8_t *u8_quantized_decimated_quantized_weights    = ASTCEncode->u8_quantized_decimated_quantized_weights;
    __global2 float   *decimated_quantized_weights                 = ASTCEncode->decimated_quantized_weights;
    __global2 float   *flt_quantized_decimated_quantized_weights   = ASTCEncode->flt_quantized_decimated_quantized_weights;

    if (blk->red_min == blk->red_max && blk->green_min == blk->green_max && blk->blue_min == blk->blue_max && blk->alpha_min == blk->alpha_max) {
        // detected a constant-color block. Encode as FP16 if using HDR
        scb->error_block = 0;

        if (ASTCEncode->m_rgb_force_use_of_hdr) {
            scb->block_mode = -1;
            scb->partition_count = 0;
            scb->constant_color[0] = float_to_sf16(blk->orig_data[0], SF_NEARESTEVEN);
            scb->constant_color[1] = float_to_sf16(blk->orig_data[1], SF_NEARESTEVEN);
            scb->constant_color[2] = float_to_sf16(blk->orig_data[2], SF_NEARESTEVEN);
            scb->constant_color[3] = float_to_sf16(blk->orig_data[3], SF_NEARESTEVEN);
        } else {
            // Encode as UNORM16 if NOT using HDR.
            scb->block_mode = -2;
            scb->partition_count = 0;
            float red = blk->orig_data[0];
            float green = blk->orig_data[1];
            float blue = blk->orig_data[2];
            float alpha = blk->orig_data[3];
            // clamp every channel to [0, 1] before scaling to 16 bits
            if (red < 0)
                red = 0;
            else if (red > 1)
                red = 1;
            if (green < 0)
                green = 0;
            else if (green > 1)
                green = 1;
            if (blue < 0)
                blue = 0;
            else if (blue > 1)
                blue = 1;
            if (alpha < 0)
                alpha = 0;
            else if (alpha > 1)
                alpha = 1;
            scb->constant_color[0] = (int)floor(red     * 65535.0f + 0.5f);
            scb->constant_color[1] = (int)floor(green   * 65535.0f + 0.5f);
            scb->constant_color[2] = (int)floor(blue    * 65535.0f + 0.5f);
            scb->constant_color[3] = (int)floor(alpha   * 65535.0f + 0.5f);
        }
        return 0.0f;
    }


    float error_weight_sum = prepare_error_weight_block(blk, &ewb, &ewbo,ASTCEncode);

//#ifdef __OPENCL_VERSION__
//      if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//          printf("error_weight_sum %3.3f\n", error_weight_sum);


    float error_of_best_block = FLOAT_20;

    // Best error seen per tested configuration:
    //   [0]                        1 partition, 1 plane
    //   [1..4]                     1 partition, 2 planes (one per color component)
    //   [4*(pc-2) + 5 + i]         pc partitions, 1 plane, candidate i
    //   [4*(pc-2) + 5 + 2 + i]     pc partitions, 2 planes, candidate i
    float best_errorvals_in_modes[17];
    for (i = 0; i < 17; i++)
        best_errorvals_in_modes[i] = FLOAT_30;

    int uses_alpha = imageblock_uses_alpha1(blk);
    float mode_cutoff = ASTCEncode->m_ewp.block_mode_cutoff;

    // next, test mode #0. This mode uses 1 plane of weights and 1 partition.
    // we test it twice, first with a modecutoff of 0, then with the specified mode-cutoff.
    // This causes an early-out that speeds up encoding of "easy" content.

    float modecutoffs[2];
    float errorval_mult[2] = { 2.5f, 1.0f };
    modecutoffs[0] = 0;
    modecutoffs[1] = mode_cutoff;

    // compute ideal weights and endpoint colors for every decimation.

    float best_errorval_in_mode;
    for (i = 0; i < 2; i++) {
        compress_symbolic_block_fixed_partition_1_plane(
            modecutoffs[i],
            ASTCEncode->m_ewp.max_refinement_iters,
            1,    // partition count
            0,    // partition index
            blk,
            &ewb,
            tempblocks,
            &ei1,
            eix1,
            decimated_weights,
            u8_quantized_decimated_quantized_weights,
            decimated_quantized_weights,
            flt_quantized_decimated_quantized_weights,
            ASTCEncode
        );

//#ifdef __OPENCL_VERSION__
//        if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//        printf("1_plane : ei1.ep.endpt0[0].x %3.3f\n", ei1.ep.endpt0[0].x);


        best_errorval_in_mode = FLOAT_30;
        for (j = 0; j < 4; j++) {
            if (tempblocks[j].error_block)
                continue;

            decompress_symbolic_block(tempblocks + j, &temp, ASTCEncode);
            float errorval = compute_imageblock_difference(blk, &temp, &ewb, ASTCEncode) * errorval_mult[i];
            if (errorval < best_errorval_in_mode)
                best_errorval_in_mode = errorval;

            if (errorval < error_of_best_block) {
                error_of_best_block = errorval;
                *scb = tempblocks[j];
            }

        }


        best_errorvals_in_modes[0] = best_errorval_in_mode;
        if ((error_of_best_block / error_weight_sum) < ASTCEncode->m_ewp.texel_avg_error_limit) {
            // mean squared error per color component.
            return (error_of_best_block / ASTCEncode->m_texels_per_block);
        }
    }

    int is_normal_map;
    float lowest_correl;
    prepare_block_statistics(blk, &ewb, &is_normal_map, &lowest_correl, ASTCEncode);

    if (is_normal_map && lowest_correl < 0.99f)
        lowest_correl = 0.99f;

    // next, test the four possible 1-partition, 2-planes modes
    for (i = 0; i < 4; i++) {

        if (lowest_correl > ASTCEncode->m_ewp.lowest_correlation_cutoff)
            continue;

        if (blk->grayscale && i != 3)
            continue;

        if (!uses_alpha && i == 3)
            continue;

        compress_symbolic_block_fixed_partition_2_planes(  mode_cutoff,
                ASTCEncode->m_ewp.max_refinement_iters,
                1,    // partition count
                0,    // partition index
                i,    // the color component to test a separate plane of weights for.
                blk,
                &ewb,
                tempblocks,
                &ei1,
                &ei2,
                eix1,
                eix2,
                decimated_weights,
                u8_quantized_decimated_quantized_weights,
                decimated_quantized_weights,
                flt_quantized_decimated_quantized_weights,
                ASTCEncode
                                                        );

//#ifdef __OPENCL_VERSION__
//        if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
//#endif
//            printf("2_plane : ei1.ep.endpt0[0].x %3.3f  ei2.ep.endpt0[0].x %3.3f\n", ei1.ep.endpt0[0].x, ei2.ep.endpt0[0].x);


        // start from a clean slate so that the value recorded for this mode
        // is not polluted by the previously tested mode.
        best_errorval_in_mode = FLOAT_30;
        for (j = 0; j < 4; j++) {
            if (tempblocks[j].error_block)
                continue;
            decompress_symbolic_block(tempblocks + j, &temp, ASTCEncode);
            float errorval = compute_imageblock_difference(blk, &temp, &ewb, ASTCEncode);
            if (errorval < best_errorval_in_mode)
                best_errorval_in_mode = errorval;

            if (errorval < error_of_best_block) {
                error_of_best_block = errorval;
                *scb = tempblocks[j];
            }
        }

        best_errorvals_in_modes[i + 1] = best_errorval_in_mode;

        if ((error_of_best_block / error_weight_sum) < ASTCEncode->m_ewp.texel_avg_error_limit) {
            // mean squared error per color component.
            return (error_of_best_block / ASTCEncode->m_texels_per_block);
        }
    }


    // find best blocks for 2, 3 and 4 partitions
    int partition_count;
    int max_partitions = 2;

#ifdef ENABLE_3_PARTITION_CODE
    max_partitions++;
#endif

#ifdef ENABLE_4_PARTITION_CODE
    max_partitions++;
#endif

    for (partition_count = 2; partition_count <= max_partitions; partition_count++) {
        int partition_indices_1plane[2];
        int partition_indices_2planes[2];

        find_best_partitionings(ASTCEncode->m_ewp.partition_search_limit,
                                partition_count, blk, &ewb, 1,
                                &(partition_indices_1plane[0]), &(partition_indices_1plane[1]), &(partition_indices_2planes[0]),ASTCEncode);

        for (i = 0; i < 2; i++) {
            compress_symbolic_block_fixed_partition_1_plane(
                mode_cutoff,
                ASTCEncode->m_ewp.max_refinement_iters,
                partition_count,
                partition_indices_1plane[i],
                blk,
                &ewb,
                tempblocks,
                &ei1,
                eix1,
                decimated_weights,
                u8_quantized_decimated_quantized_weights,
                decimated_quantized_weights,
                flt_quantized_decimated_quantized_weights,
                ASTCEncode
            );

            best_errorval_in_mode = FLOAT_30;
            for (j = 0; j < 4; j++) {
                if (tempblocks[j].error_block)
                    continue;
                decompress_symbolic_block(tempblocks + j, &temp, ASTCEncode);
                float errorval = compute_imageblock_difference(blk, &temp, &ewb, ASTCEncode);
                if (errorval < best_errorval_in_mode)
                    best_errorval_in_mode = errorval;

                if (errorval < error_of_best_block) {
                    error_of_best_block = errorval;
                    *scb = tempblocks[j];
                }
            }

            // each of the two candidate partitionings gets its own slot; the
            // 1-to-2 partition early-out below compares slots 5 and 6.
            best_errorvals_in_modes[4 * (partition_count - 2) + 5 + i] = best_errorval_in_mode;

            if ((error_of_best_block / error_weight_sum) < ASTCEncode->m_ewp.texel_avg_error_limit) {
                // mean squared error per color component.
                return (error_of_best_block / ASTCEncode->m_texels_per_block);
            }
        }


        // give up on more partitions when neither 2-partition candidate improved
        // enough over the 1-partition result.
        if (partition_count == 2 && !is_normal_map && (std::min)(best_errorvals_in_modes[5], best_errorvals_in_modes[6]) > (best_errorvals_in_modes[0] * ASTCEncode->m_ewp.partition_1_to_2_limit)) {
            // mean squared error per color component.
            return (error_of_best_block / ASTCEncode->m_texels_per_block);
        }

        // don't bother to check 4 partitions for dual plane of weights, ever.
        if (partition_count == 4)
            break;

        for (i = 0; i < 2; i++) {
            if (lowest_correl > ASTCEncode->m_ewp.lowest_correlation_cutoff)
                continue;


            // each entry packs the partition index in the low PARTITION_BITS bits
            // and the color component of the second plane in the bits above.
            compress_symbolic_block_fixed_partition_2_planes(
                mode_cutoff,
                ASTCEncode->m_ewp.max_refinement_iters,
                partition_count,
                (partition_indices_2planes[i] & (PARTITION_COUNT - 1)),
                (partition_indices_2planes[i] >> PARTITION_BITS),
                blk,
                &ewb,
                tempblocks,
                &ei1,
                &ei2,
                eix1,
                eix2,
                decimated_weights,
                u8_quantized_decimated_quantized_weights,
                decimated_quantized_weights,
                flt_quantized_decimated_quantized_weights,
                ASTCEncode
            );


            best_errorval_in_mode = FLOAT_30;
            for (j = 0; j < 4; j++) {
                if (tempblocks[j].error_block)
                    continue;
                decompress_symbolic_block(tempblocks + j, &temp, ASTCEncode);

                float errorval = compute_imageblock_difference(
                                     blk, &temp, &ewb, ASTCEncode);

                if (errorval < best_errorval_in_mode)
                    best_errorval_in_mode = errorval;

                if (errorval < error_of_best_block) {
                    error_of_best_block = errorval;
                    *scb = tempblocks[j];
                }
            }

            best_errorvals_in_modes[4 * (partition_count - 2) + 5 + 2 + i] = best_errorval_in_mode;

            if ((error_of_best_block / error_weight_sum) < ASTCEncode->m_ewp.texel_avg_error_limit) {
                // mean squared error per color component.
                return (error_of_best_block / ASTCEncode->m_texels_per_block);
            }
        }
    }

    // mean squared error per color component.
    return (error_of_best_block / ASTCEncode->m_texels_per_block);
}


//===================== SYMBOLIC TO PHYSICAL START =============================

// Stores the low `bitcount` bits of `value` into the byte buffer `ptr`,
// starting at absolute bit position `bitoffset` (bit 0 is the least
// significant bit of ptr[0]). Bits outside the written field are preserved.
//
// The field may straddle one byte boundary: it is fully written as long as
// bitcount + (bitoffset & 7) does not exceed 16. The byte following the first
// touched byte is always read and written back, so it must be addressable
// even when no bit of the field lands in it.
//static inline
void write_bits(int value, int bitcount, int bitoffset, uint8_t * ptr) {
    uint8_t *dst = ptr + (bitoffset >> 3);    // first byte touched
    int shift = bitoffset & 7;                // bit position inside that byte

    int field = (1 << bitcount) - 1;          // bitcount ones, right aligned
    int payload = (value & field) << shift;   // field contents, moved into place
    int keep = ~(field << shift);             // ones everywhere outside the field

    dst[0] = (uint8_t)((dst[0] & keep) | payload);
    dst[1] = (uint8_t)((dst[1] & (keep >> 8)) | (payload >> 8));
}

// Returns `p` with the order of its eight bits reversed
// (bit 0 becomes bit 7, bit 1 becomes bit 6, ...).
uint8_t bitrev8(uint8_t p) {
    unsigned int v = p;
    v = ((v >> 1) & 0x55u) | ((v & 0x55u) << 1);    // swap neighbouring bits
    v = ((v >> 2) & 0x33u) | ((v & 0x33u) << 2);    // swap neighbouring bit pairs
    v = ((v >> 4) & 0x0Fu) | ((v & 0x0Fu) << 4);    // swap the two nibbles
    return (uint8_t)v;
}

// Reports how a value quantized at `quantization_level` is split for encoding:
// *bits receives the number of plain bits, and *trits / *quints are set to 1
// when the level additionally uses a trit or a quint, 0 otherwise.
// All three outputs are 0 for a level that is not one of the QUANT_* cases below.
void find_number_of_bits_trits_quints(int quantization_level, int *bits, int *trits, int *quints) {
    *bits = 0;
    *trits = 0;
    *quints = 0;

    // Number of plain bits (QUANT_3 and QUANT_5 carry none and keep the 0).
    switch (quantization_level) {
    case QUANT_2:
    case QUANT_6:
    case QUANT_10:
        *bits = 1;
        break;
    case QUANT_4:
    case QUANT_12:
    case QUANT_20:
        *bits = 2;
        break;
    case QUANT_8:
    case QUANT_24:
    case QUANT_40:
        *bits = 3;
        break;
    case QUANT_16:
    case QUANT_48:
    case QUANT_80:
        *bits = 4;
        break;
    case QUANT_32:
    case QUANT_96:
    case QUANT_160:
        *bits = 5;
        break;
    case QUANT_64:
    case QUANT_192:
        *bits = 6;
        break;
    case QUANT_128:
        *bits = 7;
        break;
    case QUANT_256:
        *bits = 8;
        break;
    default:
        break;
    }

    // Extra trit or quint on top of the plain bits.
    switch (quantization_level) {
    case QUANT_3:
    case QUANT_6:
    case QUANT_12:
    case QUANT_24:
    case QUANT_48:
    case QUANT_96:
    case QUANT_192:
        *trits = 1;
        break;
    case QUANT_5:
    case QUANT_10:
    case QUANT_20:
    case QUANT_40:
    case QUANT_80:
    case QUANT_160:
        *quints = 1;
        break;
    default:
        break;
    }
}

// Encodes `elements` values from `input_data` as an integer sequence and
// writes the result into `output_data`, starting at bit position `bit_offset`.
//
// Each value is split into `bits` plain low bits and a remaining high part.
// When the quantization level uses trits (or quints), the high parts of five
// (or three) consecutive values are combined into one packed block through the
// integer_of_trits (or integer_of_quints) table, and slices of that block are
// interleaved with the plain bits in the output stream.
//
//   quantization_level : QUANT_* level, selects bits/trits/quints per value
//   elements           : number of values to encode (the local buffers hold at most 64)
//   input_data         : the values to encode
//   output_data        : destination bytes, updated in place via write_bits
//   bit_offset         : first bit position to write
void encode_ise(int quantization_level, int elements, uint8_t * input_data, uint8_t * output_data, int bit_offset) {
    uint8_t low_bits[64];
    uint8_t high_digits[69];      // 64 elements + 5 elements for padding
    uint8_t packed_digits[22];    // trit-blocks or quint-blocks

    int bits, trits, quints;
    find_number_of_bits_trits_quints(quantization_level, &bits, &trits, &quints);

    // split every input value into its plain bits and its high part
    int low_mask = (1 << bits) - 1;
    int n;
    for (n = 0; n < elements; n++) {
        low_bits[n] = input_data[n] & low_mask;
        high_digits[n] = input_data[n] >> bits;
    }

    // zero padding so that the last, possibly incomplete, block can be looked up
    for (n = 0; n < 5; n++)
        high_digits[elements + n] = 0;

    // combine the high parts into trit-blocks (5 values each) ...
    if (trits) {
        int block_total = (elements + 4) / 5;
        for (n = 0; n < block_total; n++) {
            int first = 5 * n;
            packed_digits[n] = integer_of_trits[high_digits[first + 4]][high_digits[first + 3]][high_digits[first + 2]][high_digits[first + 1]][high_digits[first]];
        }
    }
    // ... or into quint-blocks (3 values each)
    if (quints) {
        int block_total = (elements + 2) / 3;
        for (n = 0; n < block_total; n++) {
            int first = 3 * n;
            packed_digits[n] = integer_of_quints[high_digits[first + 2]][high_digits[first + 1]][high_digits[first]];
        }
    }

    // emit the stream: for each value its plain bits, followed by the next
    // slice of the packed block it belongs to
    int step = 0;            // index into the per-step slice tables
    int block_index = 0;     // packed block currently being emitted
    for (n = 0; n < elements; n++) {
        write_bits(low_bits[n], bits, bit_offset, output_data);
        bit_offset += bits;

        if (trits) {
            int slice_width = bits_to_write5[step];
            write_bits(packed_digits[block_index] >> block_shift5[step], slice_width, bit_offset, output_data);
            bit_offset += slice_width;
            block_index += hcounter_incr5[step];
            step = next_lcounter5[step];
        }

        if (quints) {
            int slice_width = bits_to_write3[step];
            write_bits(packed_digits[block_index] >> block_shift3[step], slice_width, bit_offset, output_data);
            bit_offset += slice_width;
            block_index += hcounter_incr3[step];
            step = next_lcounter3[step];
        }
    }
}

// Converts a symbolic block description into the 16-byte physical block.
//
//   sc         : symbolic block; block_mode -2 / -1 select the UNORM16 / FP16
//                constant-color encodings, any other value indexes
//                ASTCEncode->bsd.block_modes
//   ASTCEncode : encoder context providing the block-mode and decimation tables
//
// Returns the packed block by value.
physical_compressed_block symbolic_to_physical(symbolic_compressed_block * sc, __global ASTC_Encode *  ASTCEncode) {
    int p, k;
    physical_compressed_block res;

    // Constant-color blocks. Each one is encoded separately; there is currently
    // no attempt to coalesce them into larger void-extents.
    if (sc->block_mode == -2 || sc->block_mode == -1) {
        // fixed 8-byte header: cbytes1 for UNORM16, cbytes2 for FP16
        for (k = 0; k < 8; k++) {
            if (sc->block_mode == -2)
                res.data[k] = cbytes1[k];
            else
                res.data[k] = cbytes2[k];
        }

        // four 16-bit color values, low byte first
        for (k = 0; k < 4; k++) {
            res.data[2 * k + 8] = sc->constant_color[k] & 0xFF;
            res.data[2 * k + 9] = (sc->constant_color[k] >> 8) & 0xFF;
        }
        return res;
    }

    int partition_count = sc->partition_count;

    // Weights: encoded as an ordinary integer sequence into a scratch buffer,
    // then copied into the block byte- and bit-reversed.
    uint8_t weightbuf[16];
    for (k = 0; k < 16; k++)
        weightbuf[k] = 0;

    int weight_count = ASTCEncode->bsd.decimation_tables[ASTCEncode->bsd.block_modes[sc->block_mode].decimation_mode].num_weights;
    int weight_quantization_method = ASTCEncode->bsd.block_modes[sc->block_mode].quantization_mode;
    int is_dual_plane = ASTCEncode->bsd.block_modes[sc->block_mode].is_dual_plane;

    int real_weight_count = weight_count;
    if (is_dual_plane)
        real_weight_count = 2 * weight_count;

    int bits_for_weights = compute_ise_bitcount(real_weight_count,
                           (quantization_method)weight_quantization_method);

    if (!is_dual_plane) {
        encode_ise(weight_quantization_method, weight_count, sc->plane1_weights, weightbuf, 0);
    } else {
        // the two planes are interleaved weight by weight
        uint8_t interleaved[64];
        for (k = 0; k < weight_count; k++) {
            interleaved[2 * k] = sc->plane1_weights[k];
            interleaved[2 * k + 1] = sc->plane2_weights[k];
        }
        encode_ise(weight_quantization_method, real_weight_count, interleaved, weightbuf, 0);
    }

    for (k = 0; k < 16; k++)
        res.data[15 - k] = bitrev8(weightbuf[k]);

    // Header: 11 bits of block mode followed by 2 bits of (partition count - 1).
    write_bits(sc->block_mode, 11, 0, res.data);
    write_bits(partition_count - 1, 2, 11, res.data);

    // first bit position occupied by weight data (or by the extra
    // endpoint-type bits placed directly below it)
    int below_weights_pos = 128 - bits_for_weights;

    if (partition_count <= 1) {
        // single partition: only the endpoint format follows the header
        write_bits(sc->color_formats[0], 4, 13, res.data);
    } else {
        // partition index, written in two pieces
        write_bits(sc->partition_index, 6, 13, res.data);
        write_bits(sc->partition_index >> 6, PARTITION_BITS - 6, 19, res.data);

        if (sc->color_formats_matched) {
            write_bits(sc->color_formats[0] << 2, 6, 13 + PARTITION_BITS, res.data);
        } else {
            // find the lowest endpoint-type class used by any partition
            int low_class = 4;
            for (p = 0; p < partition_count; p++) {
                int format_class = sc->color_formats[p] >> 2;
                if (format_class < low_class)
                    low_class = format_class;
            }
            if (low_class == 3)
                low_class = 2;

            // layout: 2 bits base class + 1, then one class bit per partition,
            // then the two low format bits of every partition
            int encoded_type = low_class + 1;
            int bitpos = 2;
            for (p = 0; p < partition_count; p++, bitpos++)
                encoded_type |= ((sc->color_formats[p] >> 2) - low_class) << bitpos;
            for (p = 0; p < partition_count; p++, bitpos += 2)
                encoded_type |= (sc->color_formats[p] & 3) << bitpos;

            // the low 6 bits sit after the partition index; the remainder is
            // stored immediately below the weight data
            int highpart_size = (3 * partition_count) - 4;
            write_bits(encoded_type & 0x3F, 6, 13 + PARTITION_BITS, res.data);

            below_weights_pos -= highpart_size;
            write_bits(encoded_type >> 6, highpart_size, below_weights_pos, res.data);
        }
    }

    // in dual-plane mode, encode the color component of the second plane of weights
    if (is_dual_plane)
        write_bits(sc->plane2_color_component, 2, below_weights_pos - 2, res.data);

    // Color endpoints: gather the values of all partitions, then encode them
    // as one integer sequence.
    uint8_t values_to_encode[32];
    int valuecount_to_encode = 0;
    for (p = 0; p < sc->partition_count; p++) {
        int vals = 2 * (sc->color_formats[p] >> 2) + 2;
        for (k = 0; k < vals; k++)
            values_to_encode[valuecount_to_encode + k] = (uint8_t)sc->color_values[p][k];
        valuecount_to_encode += vals;
    }

    int color_bit_offset = (sc->partition_count == 1) ? 17 : 19 + PARTITION_BITS;
    encode_ise(sc->color_quantization_level, valuecount_to_encode, values_to_encode, res.data, color_bit_offset);

    return res;
}

//===================== SYMBOLIC TO PHYSICAL END =============================

//---------------------
// ASTC use with OpenCL
//---------------------
#if defined(__OPENCL_VERSION__)
// OpenCL entry point: each work-item compresses one ASTC block.
//
//   p_source_pixels  : source image, read as 4 consecutive bytes per pixel,
//                      rows m_src_width * BYTEPP bytes apart
//   p_encoded_blocks : destination buffer; blocks are stored row by row,
//                      BYTES_PER_DESTINATION_BLOCK bytes apart
//   SourceInfo       : not used by the ASPM_GPU path
//   ASTCEncode       : encoder settings and tables shared by all work-items
__kernel void CMP_GPUEncoder(
    __global unsigned char      *p_source_pixels,
    __global unsigned char      *p_encoded_blocks,
    __global Source_Info        *SourceInfo,
    __global ASTC_Encode        *ASTCEncode
) {
    //=================================
    // Get the Thread workspace
    //=================================
    // NOTE(review): pixel_block_x / pixel_block_y are only declared when
    // ASPM_GPU is defined; the #else branch does not provide them.
#ifdef ASPM_GPU
    int pixel_block_x = get_global_id(0);
    int pixel_block_y = get_global_id(1);
#else
    (groupOffset);
    (SourceInfo);
#endif

    //=================================
    // Check scope of work is in range
    //=================================
    if (pixel_block_x >= ASTCEncode->m_width_in_blocks) return;
    if (pixel_block_y >= ASTCEncode->m_height_in_blocks) return;

    //==================================
    // Set Destination block index
    //==================================

    // A block covers m_xdim pixels horizontally, so a row of the image holds
    // m_src_width / m_xdim blocks (dividing by m_ydim is only right for square blocks).
    CGU_UINT dest_block_index = (pixel_block_y * (ASTCEncode->m_src_width / ASTCEncode->m_xdim) + pixel_block_x)*BYTES_PER_DESTINATION_BLOCK;

    //====================
    // Source Pixel block
    //====================
    CGU_UINT stride    = ASTCEncode->m_src_width * BYTEPP;
    CGU_UINT srcOffset = (pixel_block_x*ASTCEncode->m_xdim*BYTEPP) + (pixel_block_y*stride*ASTCEncode->m_ydim);

    //=================================
    // Load the pixels for this thread.
    //=================================
    astc_codec_image input_image;

    input_image.xsize   = ASTCEncode->m_xdim;
    input_image.ysize   = ASTCEncode->m_ydim;
#ifdef ASTC_ENABLE_3D_SUPPORT
    input_image.zsize   = ASTCEncode->m_zdim;
#else
    input_image.zsize   = 1;
#endif
    input_image.padding = 0;

    // Copy the m_xdim x m_ydim footprint of this block, row by row, into the
    // private image; the four bytes of a pixel land in .x, .y, .z and .w.
    CGU_UINT dest_index = 0;
    CGU_UINT srcidx;
    for (unsigned int i = 0; i < ASTCEncode->m_ydim; i++) {
        srcidx = (i*stride) + srcOffset;
        for (unsigned int j = 0; j < ASTCEncode->m_xdim; j++) {
            input_image.pixels[dest_index].x = p_source_pixels[srcidx++];
            input_image.pixels[dest_index].y = p_source_pixels[srcidx++];
            input_image.pixels[dest_index].z = p_source_pixels[srcidx++];
            input_image.pixels[dest_index].w = p_source_pixels[srcidx++];
            dest_index++;
        }
    }

    CGU_UINT pixelcount = dest_index;

    // Data is in this form AABBGGRR
    //printf("(%d %d) R = %x G = %x B = %x\n", pixel_block_x, pixel_block_y, input_image.pixels[0].x, input_image.pixels[0].y, input_image.pixels[0].z);

    //==================================================
    // fetch an imageblock from the input source
    //==================================================
    imageblock pb;
    symbolic_compressed_block  scb;

    //if ((pixel_block_x == 0) && (pixel_block_y == 0))
    //{
    //  printf("ASTCEncode partition[0][0]partition_of_texel[0]        = %3.3f\n", ASTCEncode->bsd.decimation_tables[0].texel_weights_float[0][0]);
    //  printf("ASTCEncode m_Quality          = %f", ASTCEncode->m_Quality);
    //  printf("ASTCEncode m_texels_per_block = %d", ASTCEncode->m_texels_per_block);
    //}

    fetch_imageblock(&input_image, &pb,pixelcount,ASTCEncode);

    //printf("(%d %d) orig data %f %f %f\n", pixel_block_x, pixel_block_y, pb.orig_data[0], pb.orig_data[1], pb.orig_data[2]);
    //printf("(%d %d) work data %f %f %f\n", pixel_block_x, pixel_block_y, pb.work_data[0], pb.work_data[1], pb.work_data[2]);
    //printf("(%d %d) alpha_max %.3f alpha_min %.3f\n", pixel_block_x, pixel_block_y, pb.alpha_max, pb.alpha_max);

    compress_symbolic_block(&pb,&scb,ASTCEncode );

    // Pack the symbolic block and copy its 16 bytes to the destination
    physical_compressed_block   pcb;
    pcb = symbolic_to_physical(&scb, ASTCEncode);

    p_encoded_blocks = p_encoded_blocks + dest_block_index;
    int j;
    for (j = 0; j < 16; j++) {
        p_encoded_blocks[j] = pcb.data[j];
    }

    // ("Test %x %x %x %x", pcb.data[0], pcb.data[1], pcb.data[2], pcb.data[3]);
}
#endif


} // namespace ASTC_Encoder