TexConv/CMP_CompressonatorLib/ASTC/ASTC_Host.cpp

/*----------------------------------------------------------------------------*/
/**
*    This confidential and proprietary software may be used only as
*    authorised by a licensing agreement from ARM Limited
*    (C) COPYRIGHT 2011-2012 ARM Limited
*    ALL RIGHTS RESERVED
*
*    The entire notice above must be reproduced on all authorised
*    copies and copies may only be made to the extent permitted
*    by a licensing agreement from ARM Limited.
*
*/
/*----------------------------------------------------------------------------*/
//=====================================================================
// Copyright (c) 2017  Advanced Micro Devices, Inc. All rights reserved
//=====================================================================

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>

#include "astc_host.h"
#include "astc_encode_kernel.h"
#include "compressonator.h"


//================================= ASTC CPU HOST CODE  ===========================================

namespace ASTC_Encoder {

//# This is added just in case the definition is missed when only REPLACE_CPU_CODE is defined
#ifdef REPLACE_CPU_CODE
#ifndef USE_HOST_CALLS
#define USE_HOST_CALLS
#endif
#endif


// Returns the number of bits needed to store `items` values at quantization
// level `quant`.
//
// Every supported level is evaluated as (scale * items + divisor - 1) / divisor,
// i.e. scale/divisor bits per item, rounded up for non-negative `items`.
// Levels with divisor 1 cost a whole number of bits per item; the remaining
// levels use a divisor of 5 or 3.  A level that is not listed below yields the
// sentinel value 100000.
int compute_ise_bitcount2(int items, quantization_method quant) {
    int scale = 0;
    int divisor = 1;

    switch (quant) {
    // whole number of bits per item
    case QUANT_2:   scale = 1; break;
    case QUANT_4:   scale = 2; break;
    case QUANT_8:   scale = 3; break;
    case QUANT_16:  scale = 4; break;
    case QUANT_32:  scale = 5; break;
    case QUANT_64:  scale = 6; break;
    case QUANT_128: scale = 7; break;
    case QUANT_256: scale = 8; break;

    // fractional cost, rounded up over fifths
    case QUANT_3:   scale = 8;  divisor = 5; break;
    case QUANT_6:   scale = 13; divisor = 5; break;
    case QUANT_12:  scale = 18; divisor = 5; break;
    case QUANT_24:  scale = 23; divisor = 5; break;
    case QUANT_48:  scale = 28; divisor = 5; break;
    case QUANT_96:  scale = 33; divisor = 5; break;
    case QUANT_192: scale = 38; divisor = 5; break;

    // fractional cost, rounded up over thirds
    case QUANT_5:   scale = 7;  divisor = 3; break;
    case QUANT_10:  scale = 10; divisor = 3; break;
    case QUANT_20:  scale = 13; divisor = 3; break;
    case QUANT_40:  scale = 16; divisor = 3; break;
    case QUANT_80:  scale = 19; divisor = 3; break;
    case QUANT_160: scale = 22; divisor = 3; break;

    default:
        return 100000;
    }

    return (scale * items + (divisor - 1)) / divisor;
}

/*
float dot(float2 p, float2 q)
{
    return p.x * q.x + p.y * q.y;
}

float dot(float3 p, float3 q)
{
    return p.x * q.x + p.y * q.y + p.z * q.z;
}

float dot(float4 p, float4 q)
{
    return p.x * q.x + p.y * q.y + p.z * q.z + p.w * q.w;
}

float3 cross(float3 p, float3 q)
{
    return p.yzx * q.zxy - p.zxy * q.yzx;
}

float length(float2 p)
{
    return (float)(sqrt(dot(p, p)));
}

float length(float3 p)
{
    return (float)(sqrt(dot(p, p)));
}

float length(float4 p)
{
    return (float)(sqrt(dot(p, p)));
}

float2 normalize(float2 p)
{
    return p / length(p);
}

float3 normalize(float3 p)
{
    return p / length(p);
}

float4 normalize(float4 p)
{
    return p / length(p);
}
*/

#ifdef ASTC_ENABLE_3D_SUPPORT
// These functions use new () and should either be in CPU or changed to share a pre allocated pointer
// Builds the decimation table that relates the texels of an
// xdim x ydim x zdim block to the points of an
// x_weights x y_weights x z_weights weight grid, and stores it in *dt.
//
// For every texel this records up to four contributing grid points together
// with their integer weights (the four weights of a texel always sum to 16;
// zero weights are dropped), and for every grid point the inverse list of
// texels it contributes to.
//
// NOTE(review): the coordinate scaling divides by (dim - 1), so a block
// dimension of 1 would divide by zero -- confirm callers never pass one.
void initialize_decimation_table_3d(
    // dimensions of the block
    int xdim, int ydim, int zdim,
    // number of grid points in 3d weight grid
    int x_weights, int y_weights, int z_weights, decimation_table * dt) {
    int i, j;
    int x, y, z;

    int texels_per_block = xdim * ydim * zdim;
    int weights_per_block = x_weights * y_weights * z_weights;

    // Scratch data, filled by the texel loop below and copied into *dt at the end.
    // Per texel: number of contributing grid points, their indices, their weights.
    int weightcount_of_texel[MAX_TEXELS_PER_BLOCK];
    int grid_weights_of_texel[MAX_TEXELS_PER_BLOCK][4];
    int weights_of_texel[MAX_TEXELS_PER_BLOCK][4];

    // Per grid point: number of texels it touches, their indices, their weights.
    int texelcount_of_weight[MAX_WEIGHTS_PER_BLOCK];
    int texels_of_weight[MAX_WEIGHTS_PER_BLOCK][MAX_TEXELS_PER_BLOCK];
    int texelweights_of_weight[MAX_WEIGHTS_PER_BLOCK][MAX_TEXELS_PER_BLOCK];

    // Only the counters need clearing; list entries are written before they are read.
    for (i = 0; i < weights_per_block; i++)
        texelcount_of_weight[i] = 0;
    for (i = 0; i < texels_per_block; i++)
        weightcount_of_texel[i] = 0;

    for (z = 0; z < zdim; z++)
        for (y = 0; y < ydim; y++)
            for (x = 0; x < xdim; x++) {
                // linear texel index, x varying fastest
                int texel = (z * ydim + y) * xdim + x;

                // Texel position along each axis of the weight grid, used below
                // as a fixed-point value with 4 fractional bits.
                int x_weight = (((1024 + xdim / 2) / (xdim - 1)) * x * (x_weights - 1) + 32) >> 6;
                int y_weight = (((1024 + ydim / 2) / (ydim - 1)) * y * (y_weights - 1) + 32) >> 6;
                int z_weight = (((1024 + zdim / 2) / (zdim - 1)) * z * (z_weights - 1) + 32) >> 6;

                // split into fractional part (0..15) and integer grid coordinate
                int x_weight_frac = x_weight & 0xF;
                int y_weight_frac = y_weight & 0xF;
                int z_weight_frac = z_weight & 0xF;
                int x_weight_int = x_weight >> 4;
                int y_weight_int = y_weight >> 4;
                int z_weight_int = z_weight >> 4;
                int qweight[4];
                int weight[4];
                // qweight[0] is the grid point at the integer coordinate, qweight[3]
                // the one offset by +1 along all three axes; [1] and [2] are chosen
                // after the switch below.
                qweight[0] = (z_weight_int * y_weights + y_weight_int) * x_weights + x_weight_int;
                qweight[3] = ((z_weight_int + 1) * y_weights + (y_weight_int + 1)) * x_weights + (x_weight_int + 1);

                // simplex interpolation
                int fs = x_weight_frac;
                int ft = y_weight_frac;
                int fp = z_weight_frac;

                // 3-bit code for the pairwise ordering of the three fractions;
                // it selects the step order (s1, s2) and the four weights.
                int cas = ((fs > ft) << 2) + ((ft > fp) << 1) + ((fs > fp));
                // index stride of one step along y (N) and along z (NM); a step along x is 1
                int N = x_weights;
                int NM = x_weights * y_weights;

                int s1, s2, w0, w1, w2, w3;
                switch (cas) {
                case 7:
                    s1 = 1;
                    s2 = N;
                    w0 = 16 - fs;
                    w1 = fs - ft;
                    w2 = ft - fp;
                    w3 = fp;
                    break;
                case 3:
                    s1 = N;
                    s2 = 1;
                    w0 = 16 - ft;
                    w1 = ft - fs;
                    w2 = fs - fp;
                    w3 = fp;
                    break;
                case 5:
                    s1 = 1;
                    s2 = NM;
                    w0 = 16 - fs;
                    w1 = fs - fp;
                    w2 = fp - ft;
                    w3 = ft;
                    break;
                case 4:
                    s1 = NM;
                    s2 = 1;
                    w0 = 16 - fp;
                    w1 = fp - fs;
                    w2 = fs - ft;
                    w3 = ft;
                    break;
                case 2:
                    s1 = N;
                    s2 = NM;
                    w0 = 16 - ft;
                    w1 = ft - fp;
                    w2 = fp - fs;
                    w3 = fs;
                    break;
                case 0:
                    s1 = NM;
                    s2 = N;
                    w0 = 16 - fp;
                    w1 = fp - ft;
                    w2 = ft - fs;
                    w3 = fs;
                    break;

                // Codes 1 and 6 would require contradictory comparisons, so this
                // branch is only a fallback; it repeats the values of case 0.
                default:
                    s1 = NM;
                    s2 = N;
                    w0 = 16 - fp;
                    w1 = fp - ft;
                    w2 = ft - fs;
                    w3 = fs;
                    break;
                }

                // intermediate grid points reached by taking the two steps in order
                qweight[1] = qweight[0] + s1;
                qweight[2] = qweight[1] + s2;
                weight[0] = w0;
                weight[1] = w1;
                weight[2] = w2;
                weight[3] = w3;

                /*
                for(i=0;i<4;i++) weight[i] <<= 4; */

                // Record only non-zero contributions, both in the per-texel list
                // and in the inverse per-grid-point list.
                for (i = 0; i < 4; i++)
                    if (weight[i] != 0) {
                        grid_weights_of_texel[texel][weightcount_of_texel[texel]] = qweight[i];
                        weights_of_texel[texel][weightcount_of_texel[texel]] = weight[i];
                        weightcount_of_texel[texel]++;
                        texels_of_weight[qweight[i]][texelcount_of_weight[qweight[i]]] = texel;
                        texelweights_of_weight[qweight[i]][texelcount_of_weight[qweight[i]]] = weight[i];
                        texelcount_of_weight[qweight[i]]++;
                    }
            }

    // Copy the per-texel lists into *dt.
    for (i = 0; i < texels_per_block; i++) {
        dt->texel_num_weights[i] = (ASTC_Encoder::uint8_t)weightcount_of_texel[i];

        // ensure that all 4 entries are actually initialized.
        // This allows a branch-free implementation of compute_value_of_texel_flt()
        for (j = 0; j < 4; j++) {
            dt->texel_weights_int[i][j] = 0;
            dt->texel_weights_float[i][j] = 0.0f;
            dt->texel_weights[i][j] = 0;
        }

        for (j = 0; j < weightcount_of_texel[i]; j++) {
            dt->texel_weights_int[i][j] = (ASTC_Encoder::uint8_t)weights_of_texel[i][j];
            // float copy of the weight, scaled by 1 / TEXEL_WEIGHT_SUM
            dt->texel_weights_float[i][j] = weights_of_texel[i][j] * (1.0f / TEXEL_WEIGHT_SUM);
            dt->texel_weights[i][j] = (ASTC_Encoder::uint8_t)grid_weights_of_texel[i][j];
        }
    }

    // Copy the inverse (per-grid-point) lists into *dt; only the used entries are written.
    for (i = 0; i < weights_per_block; i++) {
        dt->weight_num_texels[i] = (ASTC_Encoder::uint8_t)texelcount_of_weight[i];
        for (j = 0; j < texelcount_of_weight[i]; j++) {
            dt->weight_texel[i][j] = (ASTC_Encoder::uint8_t)texels_of_weight[i][j];
            dt->weights_int[i][j] = (ASTC_Encoder::uint8_t)texelweights_of_weight[i][j];
            dt->weights_flt[i][j] = (float)texelweights_of_weight[i][j];
        }
    }

    dt->num_texels = texels_per_block;
    dt->num_weights = weights_per_block;
}

int decode_block_mode_3d(int blockmode, int *Nval, int *Mval, int *Qval, int *dual_weight_plane, int *quant_mode) {
    int base_quant_mode = (blockmode >> 4) & 1;
    int H = (blockmode >> 9) & 1;
    int D = (blockmode >> 10) & 1;

    int A = (blockmode >> 5) & 0x3;

    int N = 0, M = 0, Q = 0;

    if ((blockmode & 3) != 0) {
        base_quant_mode |= (blockmode & 3) << 1;
        int B = (blockmode >> 7) & 3;
        int C = (blockmode >> 2) & 0x3;
        N = A + 2;
        M = B + 2;
        Q = C + 2;
    } else {
        base_quant_mode |= ((blockmode >> 2) & 3) << 1;
        if (((blockmode >> 2) & 3) == 0)
            return 0;
        int B = (blockmode >> 9) & 3;
        if (((blockmode >> 7) & 3) != 3) {
            D = 0;
            H = 0;
        }
        switch ((blockmode >> 7) & 3) {
        case 0:
            N = 6;
            M = B + 2;
            Q = A + 2;
            break;
        case 1:
            N = A + 2;
            M = 6;
            Q = B + 2;
            break;
        case 2:
            N = A + 2;
            M = B + 2;
            Q = 6;
            break;
        case 3:
            N = 2;
            M = 2;
            Q = 2;
            switch ((blockmode >> 5) & 3) {
            case 0:
                N = 6;
                break;
            case 1:
                M = 6;
                break;
            case 2:
                Q = 6;
                break;
            case 3:
                return 0;
            }
            break;
        }
    }

    int weight_count = N * M * Q * (D + 1);
    int qmode = (base_quant_mode - 2) + 6 * H;

    int weightbits = compute_ise_bitcount(weight_count, (quantization_method)qmode);
    if (weight_count > MAX_WEIGHTS_PER_BLOCK || weightbits < MIN_WEIGHT_BITS_PER_BLOCK || weightbits > MAX_WEIGHT_BITS_PER_BLOCK)
        return 0;

    *Nval = N;
    *Mval = M;
    *Qval = Q;
    *dual_weight_plane = D;
    *quant_mode = qmode;
    return 1;
}

// stubbed for the time being.
// Percentile-table lookup for 3D block sizes.  Currently a placeholder: the
// block dimensions are ignored and the same table, dummy_percentile_table_3d,
// is returned for every size.
const float *get_3d_percentile_table_host(int blockdim_x, int blockdim_y, int blockdim_z) {
    const float *shared_table = dummy_percentile_table_3d;

    // The arguments are intentionally unused.
    IGNOREPARAM(blockdim_x);
    IGNOREPARAM(blockdim_y);
    IGNOREPARAM(blockdim_z);

    return shared_table;
}

// Fills *bsd for a 3D block of xdim x ydim x zdim texels.
//
// Three things are set up, in this order:
//   1. one decimation table (plus bookkeeping) for every weight grid of
//      2..6 points per axis that has at most MAX_WEIGHTS_PER_BLOCK points;
//   2. the 2048 block-mode entries, each decoded with decode_block_mode_3d();
//   3. the list of texels used for bitmap partitioning (all texels when the
//      block has at most 64, otherwise 64 texels picked with rand()).
void construct_block_size_descriptor_3d_host(int xdim, int ydim, int zdim, block_size_descriptor * bsd) {
    int decimation_mode_index[512];    // for each of the 512 entries in the decim_table_array, its index
    int decimation_mode_count = 0;

    int i;
    int x_weights;
    int y_weights;
    int z_weights;

    // -1 marks grid sizes that have no decimation mode
    for (i = 0; i < 512; i++) {
        decimation_mode_index[i] = -1;
    }

    // gather all the infill-modes that can be used with the current block size
    for (x_weights = 2; x_weights <= 6; x_weights++)
        for (y_weights = 2; y_weights <= 6; y_weights++)
            for (z_weights = 2; z_weights <= 6; z_weights++) {
                if ((x_weights * y_weights * z_weights) > MAX_WEIGHTS_PER_BLOCK)
                    continue;
                decimation_table dt;
                // remember which mode slot this grid size got; looked up again per block mode below
                decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights] = decimation_mode_count;
                initialize_decimation_table_3d(xdim, ydim, zdim, x_weights, y_weights, z_weights, &dt);

                int weight_count = x_weights * y_weights * z_weights;

                // Highest quantization index in 0..11 whose bit count lies within the
                // allowed range, for one and for two weight planes; -1 if none does.
                int maxprec_1plane = -1;
                int maxprec_2planes = -1;
                for (i = 0; i < 12; i++) {
                    int bits_1plane = compute_ise_bitcount(weight_count, (quantization_method)i);
                    int bits_2planes = compute_ise_bitcount(2 * weight_count, (quantization_method)i);
                    if (bits_1plane >= MIN_WEIGHT_BITS_PER_BLOCK && bits_1plane <= MAX_WEIGHT_BITS_PER_BLOCK)
                        maxprec_1plane = i;
                    if (bits_2planes >= MIN_WEIGHT_BITS_PER_BLOCK && bits_2planes <= MAX_WEIGHT_BITS_PER_BLOCK)
                        maxprec_2planes = i;
                }
                // encoding is permitted only when the grid is no larger than the block on any axis
                bsd->permit_encode[decimation_mode_count] = (x_weights <= xdim && y_weights <= ydim && z_weights <= zdim);

                bsd->decimation_mode_samples[decimation_mode_count] = weight_count;
                bsd->decimation_mode_maxprec_1plane[decimation_mode_count] = maxprec_1plane;
                bsd->decimation_mode_maxprec_2planes[decimation_mode_count] = maxprec_2planes;
                // NOTE(review): copies the whole table by value on every iteration
                bsd->decimation_tables[decimation_mode_count] = dt;

                decimation_mode_count++;
            }

    // start every mode at 1.0; lowered below to the smallest percentile of its block modes
    for (i = 0; i < MAX_DECIMATION_MODES; i++) {
        bsd->decimation_mode_percentile[i] = 1.0f;
    }

    // mark the unused mode slots as unavailable
    for (i = decimation_mode_count; i < MAX_DECIMATION_MODES; i++) {
        bsd->permit_encode[i] = 0;
        bsd->decimation_mode_samples[i] = 0;
        bsd->decimation_mode_maxprec_1plane[i] = -1;
        bsd->decimation_mode_maxprec_2planes[i] = -1;
    }

    bsd->decimation_mode_count = decimation_mode_count;

    const float *percentiles = get_3d_percentile_table_host(xdim, ydim, zdim);

    // then construct the list of block formats
    for (i = 0; i < 2048; i++) {
        int is_dual_plane;
        int quantization_mode;
        int fail = 0;
        int permit_encode = 1;

        if (decode_block_mode_3d(i, &x_weights, &y_weights, &z_weights, &is_dual_plane, &quantization_mode)) {
            // valid mode, but unusable for this block if the grid exceeds the block on any axis
            if (x_weights > xdim || y_weights > ydim || z_weights > zdim)
                permit_encode = 0;
        } else {
            fail = 1;
            permit_encode = 0;
        }
        if (fail) {
            // invalid mode: store -1 markers and forbid both encode and decode
            bsd->block_modes[i].decimation_mode = -1;
            bsd->block_modes[i].quantization_mode = -1;
            bsd->block_modes[i].is_dual_plane = -1;
            bsd->block_modes[i].permit_encode = 0;
            bsd->block_modes[i].permit_decode = 0;
            bsd->block_modes[i].percentile = 1.0f;
        } else {
            int decimation_mode = decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights];
            bsd->block_modes[i].decimation_mode = (ASTC_Encoder::uint8_t)decimation_mode;
            bsd->block_modes[i].quantization_mode = (ASTC_Encoder::uint8_t)quantization_mode;
            bsd->block_modes[i].is_dual_plane = (ASTC_Encoder::uint8_t)is_dual_plane;
            bsd->block_modes[i].permit_encode = (ASTC_Encoder::uint8_t)permit_encode;
            bsd->block_modes[i].permit_decode = (ASTC_Encoder::uint8_t)permit_encode;
            bsd->block_modes[i].percentile = percentiles[i];

            // keep the smallest percentile seen among the block modes of this decimation mode
            if (bsd->decimation_mode_percentile[decimation_mode] > percentiles[i])
                bsd->decimation_mode_percentile[decimation_mode] = percentiles[i];
        }

    }

    if (xdim * ydim * zdim <= 64) {
        // small block: every texel takes part in bitmap partitioning
        bsd->texelcount_for_bitmap_partitioning = xdim * ydim * zdim;
        for (i = 0; i < xdim * ydim * zdim; i++)
            bsd->texels_for_bitmap_partitioning[i] = i;
    } else {
        // pick 64 random texels for use with bitmap partitioning.
        int arr[MAX_TEXELS_PER_BLOCK];
        for (i = 0; i < xdim * ydim * zdim; i++)
            arr[i] = 0;
        // flag texels until 64 distinct ones are set; the choice depends on the state of rand()
        int arr_elements_set = 0;
        while (arr_elements_set < 64) {
            int idx = rand() % (xdim * ydim * zdim);
            if (arr[idx] == 0) {
                arr_elements_set++;
                arr[idx] = 1;
            }
        }
        // emit the flagged texels in ascending index order
        int texel_weights_written = 0;
        int idx = 0;
        while (texel_weights_written < 64) {
            if (arr[idx])
                bsd->texels_for_bitmap_partitioning[texel_weights_written++] = idx;
            idx++;
        }
        bsd->texelcount_for_bitmap_partitioning = 64;
    }
}
#endif

// return 0 on invalid mode, 1 on valid mode.
int decode_block_mode_2d(int blockmode, int *Nval, int *Mval, int *dual_weight_plane, int *quant_mode) {
    int base_quant_mode = (blockmode >> 4) & 1;
    int H = (blockmode >> 9) & 1;
    int D = (blockmode >> 10) & 1;

    int A = (blockmode >> 5) & 0x3;

    int N = 0, M = 0;

    if ((blockmode & 3) != 0) {
        base_quant_mode |= (blockmode & 3) << 1;
        int B = (blockmode >> 7) & 3;
        switch ((blockmode >> 2) & 3) {
        case 0:
            N = B + 4;
            M = A + 2;
            break;
        case 1:
            N = B + 8;
            M = A + 2;
            break;
        case 2:
            N = A + 2;
            M = B + 8;
            break;
        case 3:
            B &= 1;
            if (blockmode & 0x100) {
                N = B + 2;
                M = A + 2;
            } else {
                N = A + 2;
                M = B + 6;
            }
            break;
        }
    } else {
        base_quant_mode |= ((blockmode >> 2) & 3) << 1;
        if (((blockmode >> 2) & 3) == 0)
            return 0;
        int B = (blockmode >> 9) & 3;
        switch ((blockmode >> 7) & 3) {
        case 0:
            N = 12;
            M = A + 2;
            break;
        case 1:
            N = A + 2;
            M = 12;
            break;
        case 2:
            N = A + 6;
            M = B + 6;
            D = 0;
            H = 0;
            break;
        case 3:
            switch ((blockmode >> 5) & 3) {
            case 0:
                N = 6;
                M = 10;
                break;
            case 1:
                N = 10;
                M = 6;
                break;
            case 2:
            case 3:
                return 0;
            }
            break;
        }
    }

    int weight_count = N * M * (D + 1);
    int qmode = (base_quant_mode - 2) + 6 * H;

    int weightbits = compute_ise_bitcount2(weight_count, (quantization_method)qmode);
    if (weight_count > MAX_WEIGHTS_PER_BLOCK || weightbits < MIN_WEIGHT_BITS_PER_BLOCK || weightbits > MAX_WEIGHT_BITS_PER_BLOCK)
        return 0;

    *Nval = N;
    *Mval = M;
    *dual_weight_plane = D;
    *quant_mode = qmode;
    return 1;
}

// Returns the percentile table for a 2D block of blockdim_x x blockdim_y
// texels.  Tables exist for every combination of the dimensions
// 4, 5, 6, 8, 10 and 12; any other width or height yields NULL.
const float *get_2d_percentile_table_host(int blockdim_x, int blockdim_y) {
    // Dimensions that have a table, in the order used to index `tables`.
    static const int known_dims[6] = { 4, 5, 6, 8, 10, 12 };

    // tables[row][col]: row is selected by blockdim_x, col by blockdim_y.
    static const float *const tables[6][6] = {
        { percentile_table_4x4,  percentile_table_4x5,  percentile_table_4x6,
          percentile_table_4x8,  percentile_table_4x10, percentile_table_4x12 },
        { percentile_table_5x4,  percentile_table_5x5,  percentile_table_5x6,
          percentile_table_5x8,  percentile_table_5x10, percentile_table_5x12 },
        { percentile_table_6x4,  percentile_table_6x5,  percentile_table_6x6,
          percentile_table_6x8,  percentile_table_6x10, percentile_table_6x12 },
        { percentile_table_8x4,  percentile_table_8x5,  percentile_table_8x6,
          percentile_table_8x8,  percentile_table_8x10, percentile_table_8x12 },
        { percentile_table_10x4, percentile_table_10x5, percentile_table_10x6,
          percentile_table_10x8, percentile_table_10x10, percentile_table_10x12 },
        { percentile_table_12x4, percentile_table_12x5, percentile_table_12x6,
          percentile_table_12x8, percentile_table_12x10, percentile_table_12x12 },
    };

    int row = -1;
    int col = -1;
    for (int k = 0; k < 6; k++) {
        if (known_dims[k] == blockdim_x)
            row = k;
        if (known_dims[k] == blockdim_y)
            col = k;
    }

    if (row < 0 || col < 0)
        return NULL;                // should never happen.

    return tables[row][col];
}

// Builds the decimation table that relates the texels of an xdim x ydim block
// to the points of an x_weights x y_weights weight grid, and stores it in *dt.
//
// For every texel this records up to four contributing grid points together
// with their integer weights (the four weights of a texel always sum to 16;
// zero weights are dropped), and for every grid point the inverse list of
// texels it contributes to.
//
// NOTE(review): the coordinate scaling divides by (dim - 1), so a block
// dimension of 1 would divide by zero -- confirm callers never pass one.
void initialize_decimation_table_2d(
    // dimensions of the block
    int xdim, int ydim,
    // number of grid points in 2d weight grid
    int x_weights, int y_weights, decimation_table * dt) {
    int i, j;
    int x, y;

    int texels_per_block = xdim * ydim;
    int weights_per_block = x_weights * y_weights;

    // Scratch data, filled by the texel loop below and copied into *dt at the end.
    // Per texel: number of contributing grid points, their indices, their weights.
    int weightcount_of_texel[MAX_TEXELS_PER_BLOCK];
    int grid_weights_of_texel[MAX_TEXELS_PER_BLOCK][4];
    int weights_of_texel[MAX_TEXELS_PER_BLOCK][4];

    // Per grid point: number of texels it touches, their indices, their weights.
    int texelcount_of_weight[MAX_WEIGHTS_PER_BLOCK];
    int texels_of_weight[MAX_WEIGHTS_PER_BLOCK][MAX_TEXELS_PER_BLOCK];
    int texelweights_of_weight[MAX_WEIGHTS_PER_BLOCK][MAX_TEXELS_PER_BLOCK];

    // Only the counters need clearing; list entries are written before they are read.
    for (i = 0; i < weights_per_block; i++)
        texelcount_of_weight[i] = 0;
    for (i = 0; i < texels_per_block; i++)
        weightcount_of_texel[i] = 0;

    for (y = 0; y < ydim; y++)
        for (x = 0; x < xdim; x++) {
            // linear texel index, x varying fastest
            int texel = y * xdim + x;

            // Texel position along each axis of the weight grid, used below
            // as a fixed-point value with 4 fractional bits.
            int x_weight = (((1024 + xdim / 2) / (xdim - 1)) * x * (x_weights - 1) + 32) >> 6;
            int y_weight = (((1024 + ydim / 2) / (ydim - 1)) * y * (y_weights - 1) + 32) >> 6;

            // split into fractional part (0..15) and integer grid coordinate
            int x_weight_frac = x_weight & 0xF;
            int y_weight_frac = y_weight & 0xF;
            int x_weight_int = x_weight >> 4;
            int y_weight_int = y_weight >> 4;
            int qweight[4];
            int weight[4];
            // the four grid points of the cell containing the texel:
            // [0] base point, [1] +1 in x, [2] +1 in y, [3] +1 in both
            qweight[0] = x_weight_int + y_weight_int * x_weights;
            qweight[1] = qweight[0] + 1;
            qweight[2] = qweight[0] + x_weights;
            qweight[3] = qweight[2] + 1;

            // truncated-precision bilinear interpolation.
            int prod = x_weight_frac * y_weight_frac;

            // weight[3] is the rounded product of the fractions scaled back to 0..16;
            // the other three are derived from it so that all four sum to 16
            weight[3] = (prod + 8) >> 4;
            weight[1] = x_weight_frac - weight[3];
            weight[2] = y_weight_frac - weight[3];
            weight[0] = 16 - x_weight_frac - y_weight_frac + weight[3];

            // Record only non-zero contributions, both in the per-texel list
            // and in the inverse per-grid-point list.
            for (i = 0; i < 4; i++)
                if (weight[i] != 0) {
                    grid_weights_of_texel[texel][weightcount_of_texel[texel]] = qweight[i];
                    weights_of_texel[texel][weightcount_of_texel[texel]] = weight[i];
                    weightcount_of_texel[texel]++;
                    texels_of_weight[qweight[i]][texelcount_of_weight[qweight[i]]] = texel;
                    texelweights_of_weight[qweight[i]][texelcount_of_weight[qweight[i]]] = weight[i];
                    texelcount_of_weight[qweight[i]]++;
                }
        }

    // Copy the per-texel lists into *dt.
    for (i = 0; i < texels_per_block; i++) {
        dt->texel_num_weights[i] = (ASTC_Encoder::uint8_t)weightcount_of_texel[i];

        // ensure that all 4 entries are actually initialized.
        // This allows a branch-free implementation of compute_value_of_texel_flt()
        for (j = 0; j < 4; j++) {
            dt->texel_weights_int[i][j] = 0;
            dt->texel_weights_float[i][j] = 0.0f;
            dt->texel_weights[i][j] = 0;
        }

        for (j = 0; j < weightcount_of_texel[i]; j++) {
            dt->texel_weights_int[i][j] = (ASTC_Encoder::uint8_t)weights_of_texel[i][j];
            // float copy of the weight, scaled by 1 / TEXEL_WEIGHT_SUM
            dt->texel_weights_float[i][j] = (weights_of_texel[i][j]) * (1.0f / TEXEL_WEIGHT_SUM);
            dt->texel_weights[i][j] = (ASTC_Encoder::uint8_t)grid_weights_of_texel[i][j];
        }
    }

    // Copy the inverse (per-grid-point) lists into *dt; only the used entries are written.
    for (i = 0; i < weights_per_block; i++) {
        dt->weight_num_texels[i] = (ASTC_Encoder::uint8_t)texelcount_of_weight[i];


        for (j = 0; j < texelcount_of_weight[i]; j++) {
            dt->weight_texel[i][j] = (ASTC_Encoder::uint8_t)texels_of_weight[i][j];
            dt->weights_int[i][j] = (ASTC_Encoder::uint8_t)texelweights_of_weight[i][j];
            dt->weights_flt[i][j] = (float)texelweights_of_weight[i][j];
        }
    }

    dt->num_texels = texels_per_block;
    dt->num_weights = weights_per_block;


}

// Fills *bsd for a 2D block of xdim x ydim texels.
//
// Three things are set up, in this order:
//   1. one decimation table (plus bookkeeping) for every weight grid of
//      2..12 points per axis that has at most MAX_WEIGHTS_PER_BLOCK points;
//   2. the 2048 block-mode entries, each decoded with decode_block_mode_2d();
//   3. the list of texels used for bitmap partitioning (all texels when the
//      block has at most 64, otherwise 64 texels picked with rand()).
//
// NOTE(review): get_2d_percentile_table_host() returns NULL for block
// dimensions it has no table for, and `percentiles` is indexed below without
// a check -- confirm callers only pass supported block sizes.
void construct_block_size_descriptor_2d_host(int xdim, int ydim, block_size_descriptor * bsd) {
    int decimation_mode_index[256];    // for each of the 256 entries in the decim_table_array, its index
    int decimation_mode_count = 0;

    int i;
    int x_weights;
    int y_weights;

    // -1 marks grid sizes that have no decimation mode
    for (i = 0; i < 256; i++) {
        decimation_mode_index[i] = -1;
    }

    // gather all the infill-modes that can be used with the current block size
    for (x_weights = 2; x_weights <= 12; x_weights++)
        for (y_weights = 2; y_weights <= 12; y_weights++) {
            if (x_weights * y_weights > MAX_WEIGHTS_PER_BLOCK)
                continue;
            decimation_table dt;
            // remember which mode slot this grid size got; looked up again per block mode below
            decimation_mode_index[y_weights * 16 + x_weights] = decimation_mode_count;
            initialize_decimation_table_2d(xdim, ydim, x_weights, y_weights, &dt);

            int weight_count = x_weights * y_weights;

            // Highest quantization index in 0..11 whose bit count lies within the
            // allowed range, for one and for two weight planes; -1 if none does.
            int maxprec_1plane = -1;
            int maxprec_2planes = -1;
            for (i = 0; i < 12; i++) {
                int bits_1plane = compute_ise_bitcount2(weight_count, (quantization_method)i);
                int bits_2planes = compute_ise_bitcount2(2 * weight_count, (quantization_method)i);
                if (bits_1plane >= MIN_WEIGHT_BITS_PER_BLOCK && bits_1plane <= MAX_WEIGHT_BITS_PER_BLOCK)
                    maxprec_1plane = i;
                if (bits_2planes >= MIN_WEIGHT_BITS_PER_BLOCK && bits_2planes <= MAX_WEIGHT_BITS_PER_BLOCK)
                    maxprec_2planes = i;
            }

            // encoding is permitted only when the grid is no larger than the block on either axis
            bsd->permit_encode[decimation_mode_count] = (x_weights <= xdim && y_weights <= ydim);

            bsd->decimation_mode_samples[decimation_mode_count] = weight_count;
            bsd->decimation_mode_maxprec_1plane[decimation_mode_count] = maxprec_1plane;
            bsd->decimation_mode_maxprec_2planes[decimation_mode_count] = maxprec_2planes;
            // NOTE(review): copies the whole table by value on every iteration
            bsd->decimation_tables[decimation_mode_count] = dt;

            decimation_mode_count++;
        }

    // start every mode at 1.0; lowered below to the smallest percentile of its block modes
    for (i = 0; i < MAX_DECIMATION_MODES; i++) {
        bsd->decimation_mode_percentile[i] = 1.0f;
    }

    // mark the unused mode slots as unavailable
    for (i = decimation_mode_count; i < MAX_DECIMATION_MODES; i++) {
        bsd->permit_encode[i] = 0;
        bsd->decimation_mode_samples[i] = 0;
        bsd->decimation_mode_maxprec_1plane[i] = -1;
        bsd->decimation_mode_maxprec_2planes[i] = -1;
    }

    bsd->decimation_mode_count = decimation_mode_count;

    const float *percentiles = get_2d_percentile_table_host(xdim, ydim);

    // then construct the list of block formats
    for (i = 0; i < 2048; i++) {
        int is_dual_plane;
        int quantization_mode;
        int fail = 0;
        int permit_encode = 1;

        if (decode_block_mode_2d(i, &x_weights, &y_weights, &is_dual_plane, &quantization_mode)) {
            // valid mode, but unusable for this block if the grid exceeds the block on either axis
            if (x_weights > xdim || y_weights > ydim)
                permit_encode = 0;
        } else {
            fail = 1;
            permit_encode = 0;
        }

        if (fail) {
            // invalid mode: store -1 markers and forbid both encode and decode
            bsd->block_modes[i].decimation_mode = -1;
            bsd->block_modes[i].quantization_mode = -1;
            bsd->block_modes[i].is_dual_plane = -1;
            bsd->block_modes[i].permit_encode = 0;
            bsd->block_modes[i].permit_decode = 0;
            bsd->block_modes[i].percentile = 1.0f;
        } else {
            int decimation_mode = decimation_mode_index[y_weights * 16 + x_weights];
            bsd->block_modes[i].decimation_mode = (ASTC_Encoder::uint8_t)decimation_mode;
            bsd->block_modes[i].quantization_mode = (ASTC_Encoder::uint8_t)quantization_mode;
            bsd->block_modes[i].is_dual_plane = (ASTC_Encoder::uint8_t)is_dual_plane;
            bsd->block_modes[i].permit_encode = (ASTC_Encoder::uint8_t)permit_encode;
            bsd->block_modes[i].permit_decode = (ASTC_Encoder::uint8_t)permit_encode;    // disallow decode of grid size larger than block size.
            bsd->block_modes[i].percentile = percentiles[i];

            // keep the smallest percentile seen among the block modes of this decimation mode
            if (bsd->decimation_mode_percentile[decimation_mode] > percentiles[i])
                bsd->decimation_mode_percentile[decimation_mode] = percentiles[i];
        }

    }

    if (xdim * ydim <= 64) {
        // small block: every texel takes part in bitmap partitioning
        bsd->texelcount_for_bitmap_partitioning = xdim * ydim;
        for (i = 0; i < xdim * ydim; i++)
            bsd->texels_for_bitmap_partitioning[i] = i;
    }

    else {
        // pick 64 random texels for use with bitmap partitioning.
        int arr[MAX_TEXELS_PER_BLOCK];
        for (i = 0; i < xdim * ydim; i++)
            arr[i] = 0;
        // flag texels until 64 distinct ones are set; the choice depends on the state of rand()
        int arr_elements_set = 0;
        while (arr_elements_set < 64) {
            int idx = rand() % (xdim * ydim);
            if (arr[idx] == 0) {
                arr_elements_set++;
                arr[idx] = 1;
            }
        }
        // emit the flagged texels in ascending index order
        int texel_weights_written = 0;
        int idx = 0;
        while (texel_weights_written < 64) {
            if (arr[idx])
                bsd->texels_for_bitmap_partitioning[texel_weights_written++] = idx;
            idx++;
        }
        bsd->texelcount_for_bitmap_partitioning = 64;

    }
}

// Compares two canonicalized partition tables of seven words each.
// Returns 1 when all seven words are equal and 0 at the first difference.
int compare_canonicalized_partition_tables(const uint64_cl part1[7], const uint64_cl part2[7]) {
    for (int word = 0; word < 7; ++word) {
        if (part1[word] != part2[word])
            return 0;
    }
    return 1;
}

// Writes a canonical form of `partition_table` into `canonicalized`.
//
// Partition labels are renumbered in order of first appearance (the first
// label seen becomes 0, the next new one 1, and so on), so two tables that
// differ only in how their partitions are numbered produce the same output.
// Each of the first `texel_count` texels contributes a 2-bit field, 32 texels
// per output word; all seven words are cleared first.
void gen_canonicalized_partition_table(int texel_count, const ASTC_Encoder::uint8_t * partition_table, uint64_cl canonicalized[7]) {
    for (int word = 0; word < 7; ++word)
        canonicalized[word] = 0;

    // remap[label] is the canonical number given to `label`, or -1 if not seen yet.
    int remap[4] = { -1, -1, -1, -1 };
    int next_label = 0;

    for (int texel = 0; texel < texel_count; ++texel) {
        const int raw_label = partition_table[texel];
        if (remap[raw_label] == -1) {
            remap[raw_label] = next_label;
            ++next_label;
        }

        const uint64_cl canonical_label = remap[raw_label];
        const int word = texel >> 5;                // 32 texels per word
        const int shift = 2 * (texel & 0x1F);       // 2 bits per texel
        canonicalized[word] |= canonical_label << shift;
    }
}

// Disables duplicate partition tables.
//
// Every one of the PARTITION_COUNT tables in `pi` is reduced to its canonical
// form; any table whose canonical form equals that of an earlier table gets its
// partition_count set to 0. The earliest table of each group is left untouched.
//   xdim, ydim, zdim  block dimensions; their product is the texel count
//   pi                array of PARTITION_COUNT partition_info entries
void partition_table_zap_equal_elements(int xdim, int ydim, int zdim, partition_info * pi) {
    const int texel_count = xdim * ydim * zdim;

    int i, j;
    // 7 words of canonical form per table, stored back to back.
    uint64_cl *canonicalizeds = new uint64_cl[PARTITION_COUNT * 7];

    for (i = 0; i < PARTITION_COUNT; i++) {
        gen_canonicalized_partition_table(texel_count, pi[i].partition_of_texel, canonicalizeds + i * 7);
    }

    for (i = 0; i < PARTITION_COUNT; i++) {
        for (j = 0; j < i; j++) {
            if (compare_canonicalized_partition_tables(canonicalizeds + 7 * i, canonicalizeds + 7 * j)) {
                // Table i repeats the earlier table j: mark it as unusable.
                pi[i].partition_count = 0;
                break;
            }
        }
    }
    delete[] canonicalizeds;
}

// Integer mixing function: scrambles `inp` through a fixed sequence of
// xor-shift, multiply and add-shift steps and returns the result.
// The sequence and constants must stay exactly as they are, since the
// partition selection below is derived from the returned bits.
ASTC_Encoder::uint32_t hash52_host(ASTC_Encoder::uint32_t inp) {
    inp = inp ^ (inp >> 15);

    inp = inp * 0xEEDE0891;            // (2^4+1)*(2^7+1)*(2^17-1)
    inp = inp ^ (inp >> 5);
    inp = inp + (inp << 16);
    inp = inp ^ (inp >> 7);
    inp = inp ^ (inp >> 3);
    inp = inp ^ (inp << 6);
    inp = inp ^ (inp >> 17);
    return inp;
}

// Selects the partition (0..3) that the texel at (x, y, z) belongs to for a
// given partition pattern.
//   seed            pattern index; offset below by (partitioncount - 1) * 1024
//   x, y, z         texel coordinates inside the block
//   partitioncount  number of partitions in the pattern; components for unused
//                   partitions are forced to zero before the final comparison
//   small_block     when nonzero, the coordinates are doubled first
// Returns the index of the largest of four pseudo-random linear functions of
// the coordinates; ties go to the lowest index.
// NOTE(review): this looks like the ASTC specification's select_partition
// routine and must presumably stay bit-exact with the decoder - confirm before
// changing any statement.
int select_partition_host(int seed, int x, int y, int z, int partitioncount, int small_block) {
    if (small_block) {
        x <<= 1;
        y <<= 1;
        z <<= 1;
    }

    seed += (partitioncount - 1) * 1024;

    ASTC_Encoder::uint32_t rnum = hash52_host(seed);

    // Carve twelve 4-bit values out of the hash. seed9..seed12 reuse bits that
    // overlap the earlier fields; seed12 joins the top two and bottom two bits.
    ASTC_Encoder::uint8_t seed1 = rnum & 0xF;
    ASTC_Encoder::uint8_t seed2 = (rnum >> 4) & 0xF;
    ASTC_Encoder::uint8_t seed3 = (rnum >> 8) & 0xF;
    ASTC_Encoder::uint8_t seed4 = (rnum >> 12) & 0xF;
    ASTC_Encoder::uint8_t seed5 = (rnum >> 16) & 0xF;
    ASTC_Encoder::uint8_t seed6 = (rnum >> 20) & 0xF;
    ASTC_Encoder::uint8_t seed7 = (rnum >> 24) & 0xF;
    ASTC_Encoder::uint8_t seed8 = (rnum >> 28) & 0xF;
    ASTC_Encoder::uint8_t seed9 = (rnum >> 18) & 0xF;
    ASTC_Encoder::uint8_t seed10 = (rnum >> 22) & 0xF;
    ASTC_Encoder::uint8_t seed11 = (rnum >> 26) & 0xF;
    ASTC_Encoder::uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF;

    // squaring all the seeds in order to bias their distribution
    // towards lower values.
    // (each seed is at most 15 here, so the square is at most 225)
    seed1 *= seed1;
    seed2 *= seed2;
    seed3 *= seed3;
    seed4 *= seed4;
    seed5 *= seed5;
    seed6 *= seed6;
    seed7 *= seed7;
    seed8 *= seed8;
    seed9 *= seed9;
    seed10 *= seed10;
    seed11 *= seed11;
    seed12 *= seed12;


    // Pick the right-shift amounts from the low seed bits and the partition
    // count; bit 0 of the seed decides which of sh1/sh2 gets which rule.
    int sh1, sh2, sh3;
    if (seed & 1) {
        sh1 = (seed & 2 ? 4 : 5);
        sh2 = (partitioncount == 3 ? 6 : 5);
    } else {
        sh1 = (partitioncount == 3 ? 6 : 5);
        sh2 = (seed & 2 ? 4 : 5);
    }
    sh3 = (seed & 0x10) ? sh1 : sh2;

    // Odd-numbered seeds (x coefficients) use sh1, even-numbered ones
    // (y coefficients) use sh2.
    seed1 >>= sh1;
    seed2 >>= sh2;
    seed3 >>= sh1;
    seed4 >>= sh2;
    seed5 >>= sh1;
    seed6 >>= sh2;
    seed7 >>= sh1;
    seed8 >>= sh2;

    // The z coefficients all use sh3.
    seed9 >>= sh3;
    seed10 >>= sh3;
    seed11 >>= sh3;
    seed12 >>= sh3;


    // One linear function of (x, y, z) per candidate partition, each offset
    // by a different slice of the hash.
    int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
    int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
    int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
    int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);


    // apply the saw
    // (only the low 6 bits of each value are kept)
    a &= 0x3F;
    b &= 0x3F;
    c &= 0x3F;
    d &= 0x3F;

    // remove some of the components if we are to output < 4 partitions.
    if (partitioncount <= 3)
        d = 0;
    if (partitioncount <= 2)
        c = 0;
    if (partitioncount <= 1)
        b = 0;

    // The largest component wins; >= makes ties resolve to the lower index.
    int partition;
    if (a >= b && a >= c && a >= d)
        partition = 0;
    else if (b >= c && b >= d)
        partition = 1;
    else if (c >= d)
        partition = 2;
    else
        partition = 3;
    return partition;
}

void generate_one_partition_table(int xdim, int ydim, int zdim, int partition_count, int partition_index, partition_info * pt, __global ASTC_Encode *ASTCEncode) {
    int small_block = (xdim * ydim * zdim) < 32;

    ASTC_Encoder::uint8_t *partition_of_texel = pt->partition_of_texel;
    int x, y, z, i;


    for (z = 0; z < zdim; z++)
        for (y = 0; y < ydim; y++)
            for (x = 0; x < xdim; x++) {
                ASTC_Encoder::uint8_t part = (ASTC_Encoder::uint8_t)select_partition_host(partition_index, x, y, z, partition_count, small_block);
                *partition_of_texel++ = part;
            }


    int texels_per_block = xdim * ydim * zdim;

    int counts[4];
    for (i = 0; i < 4; i++)
        counts[i] = 0;

    for (i = 0; i < texels_per_block; i++) {
        int partition = pt->partition_of_texel[i];
        pt->texels_of_partition[partition][counts[partition]++] = (ASTC_Encoder::uint8_t)i;
    }

    for (i = 0; i < 4; i++)
        pt->texels_per_partition[i] = (ASTC_Encoder::uint8_t)counts[i];

    if (counts[0] == 0)
        pt->partition_count = 0;
    else if (counts[1] == 0)
        pt->partition_count = 1;
    else if (counts[2] == 0)
        pt->partition_count = 2;
    else if (counts[3] == 0)
        pt->partition_count = 3;
    else
        pt->partition_count = 4;


    for (i = 0; i < 4; i++)
        pt->coverage_bitmaps[i] = 0;

    int texels_to_process = ASTCEncode->bsd.texelcount_for_bitmap_partitioning;

    //# was 64 bits changed to 32 bit
    //# this will effect results and need to be fixed for GPU use
    if (texels_to_process > COVERAGE_BITMAPS_MAX)
        texels_to_process = COVERAGE_BITMAPS_MAX;

    uint64_cl shiftbit = 1;

    for (i = 0; i < texels_to_process; i++) {
        pt->coverage_bitmaps[pt->partition_of_texel[i]] |= shiftbit << i;
    }
}

// Builds all partition tables for the given block size: the single table for
// one partition, then PARTITION_COUNT tables each for 2, 3 and 4 partitions,
// and finally disables duplicate tables within each of those three sets.
void generate_partition_tables(int xdim, int ydim, int zdim, __global ASTC_Encode *ASTCEncode) {
    generate_one_partition_table(xdim, ydim, zdim, 1, 0, &ASTCEncode->partition_tables[1][0], ASTCEncode);

    for (int index = 0; index < PARTITION_COUNT; index++) {
        for (int parts = 2; parts <= 4; parts++) {
            generate_one_partition_table(xdim, ydim, zdim, parts, index, &ASTCEncode->partition_tables[parts][index], ASTCEncode);
        }
    }

    for (int parts = 2; parts <= 4; parts++) {
        partition_table_zap_equal_elements(xdim, ydim, zdim, &ASTCEncode->partition_tables[parts][0]);
    }
}

// Precomputes the per-angular-step lookup tables stored in ASTCEncode:
// step sizes and their squares, sine/cosine tables, and, for each of the 13
// quantization levels, the maximum number of angular steps needed.
void prepare_angular_tables(__global ASTC_Encode *ASTCEncode) {
    // Scratch map from a quantization step count to an angular step limit.
    // Only the slots hit by floor(angular_steppings[..]) + 1 are written; the
    // array is otherwise left uninitialized.
    int steps_needed_for_quant_steps[40];

    for (int step = 0; step < ANGULAR_STEPS; step++) {
        ASTCEncode->stepsizes[step] = 1.0f / angular_steppings[step];
        ASTCEncode->stepsizes_sqr[step] = ASTCEncode->stepsizes[step] * ASTCEncode->stepsizes[step];

        for (int sample = 0; sample < SINCOS_STEPS; sample++) {
            ASTCEncode->sin_table[sample][step] = static_cast < float >(sin((2.0f * M_PI / (SINCOS_STEPS - 1.0f)) * angular_steppings[step] * sample));
            ASTCEncode->cos_table[sample][step] = static_cast < float >(cos((2.0f * M_PI / (SINCOS_STEPS - 1.0f)) * angular_steppings[step] * sample));
        }

        const int quant_steps = static_cast < int >(floor(angular_steppings[step])) + 1;
        steps_needed_for_quant_steps[quant_steps] = MIN(step + 1, ANGULAR_STEPS - 1);
    }

    for (int level = 0; level < 13; level++) {
        ASTCEncode->max_angular_steps_needed_for_quant_level[level] = steps_needed_for_quant_steps[steps_of_level[level]];
    }
}

// Builds ASTCEncode->quantization_mode_table[count][bits].
//
// For each count 1..16 and each of the 21 quantization methods, the bit cost of
// encoding 2 * count items is computed; the method is recorded at that bit
// count when it is below 128. Each row is then swept left to right so that
// every slot holds the largest method value seen at or before it (-1 if none).
void build_quantization_mode_table(__global ASTC_Encode *ASTCEncode) {
    // Start with every slot marked as "no method".
    for (int count = 0; count <= 16; count++) {
        for (int bits = 0; bits < 128; bits++) {
            ASTCEncode->quantization_mode_table[count][bits] = -1;
        }
    }

    for (int method = 0; method < 21; method++) {
        for (int count = 1; count <= 16; count++) {
            const int bits_needed = compute_ise_bitcount2(2 * count, (quantization_method)method);
            if (bits_needed >= 128)
                continue;
            ASTCEncode->quantization_mode_table[count][bits_needed] = method;
        }
    }

    // Running maximum along each row.
    for (int count = 0; count <= 16; count++) {
        int best = -1;
        for (int bits = 0; bits < 128; bits++) {
            if (ASTCEncode->quantization_mode_table[count][bits] <= best)
                ASTCEncode->quantization_mode_table[count][bits] = best;
            else
                best = ASTCEncode->quantization_mode_table[count][bits];
        }
    }
}

// Expands ewp->block_artifact_suppression into a per-texel table.
//
// For every texel (x fastest, then y, then z) a distance from the block centre
// is computed from the coordinate offsets normalized by the block dimensions,
// plus a fixed 0.36 term; that distance raised to the power
// ewp->block_artifact_suppression is stored in
// ewp->block_artifact_suppression_expanded. At most MAX_TEXELS_PER_BLOCK
// entries are written.
void expand_block_artifact_suppression_host(int xdim, int ydim, int zdim, error_weighting_params * ewp) {
    const float center_x = (xdim - 1) * 0.5f;
    const float center_y = (ydim - 1) * 0.5f;
    const float center_z = (zdim - 1) * 0.5f;
    const float wdif = 0.36f;
    int written = 0;

    for (int z = 0; z < zdim; z++) {
        const float zdif = (z - center_z) / zdim;
        for (int y = 0; y < ydim; y++) {
            const float ydif = (y - center_y) / ydim;
            for (int x = 0; x < xdim; x++) {
                // Table is full: nothing more can be stored.
                if (written >= MAX_TEXELS_PER_BLOCK)
                    continue;

                const float xdif = (x - center_x) / xdim;
                float dist = sqrt(xdif * xdif + ydif * ydif + zdif * zdif + wdif * wdif);
                ewp->block_artifact_suppression_expanded[written] = pow(dist, ewp->block_artifact_suppression);
                written++;
            }
        }
    }
}

// Fills ASTCEncode->bsd for the given block dimensions.
// With ASTC_ENABLE_3D_SUPPORT defined, blocks with zdim > 1 use the 3D
// constructor; every other case uses the 2D constructor and ignores zdim.
void set_block_size_descriptor(int xdim, int ydim, int zdim, __global ASTC_Encode *ASTCEncode) {
#ifdef ASTC_ENABLE_3D_SUPPORT
    if (zdim > 1) {
        construct_block_size_descriptor_3d_host(xdim, ydim, zdim, &ASTCEncode->bsd);
        return;
    }
#else
    IGNOREPARAM(zdim);
#endif
    construct_block_size_descriptor_2d_host(xdim, ydim, &ASTCEncode->bsd);
}

//-----------------------------------------------------
#ifdef ASTC_ENABLE_3D_SUPPORT
// Builds the decimation table `dt` for a 3D block of xdim*ydim*zdim texels
// whose weights live on an x_weights*y_weights*z_weights grid.
//
// For every texel the routine finds up to four weight-grid points that
// contribute to it and their integer contributions (which sum to 16), and it
// also records the inverse relation: for every grid weight, the texels it
// affects and with which contribution.
// NOTE(review): the coordinate scaling divides by (dim - 1), so every block
// dimension is presumably greater than 1 on this path - confirm with callers.
static void initialize_decimation_table_3d(
    // dimensions of the block
    int xdim, int ydim, int zdim,
    // number of grid points in 3d weight grid
    int x_weights, int y_weights, int z_weights, decimation_table * dt) {
    int i, j;
    int x, y, z;

    int texels_per_block = xdim * ydim * zdim;
    int weights_per_block = x_weights * y_weights * z_weights;

    // Per-texel view: how many grid weights touch the texel, which ones, and
    // with what contribution.
    int weightcount_of_texel[MAX_TEXELS_PER_BLOCK];
    int grid_weights_of_texel[MAX_TEXELS_PER_BLOCK][4];
    int weights_of_texel[MAX_TEXELS_PER_BLOCK][4];

    // Per-weight view: how many texels the weight touches, which ones, and
    // with what contribution.
    int texelcount_of_weight[MAX_WEIGHTS_PER_BLOCK];
    int texels_of_weight[MAX_WEIGHTS_PER_BLOCK][MAX_TEXELS_PER_BLOCK];
    int texelweights_of_weight[MAX_WEIGHTS_PER_BLOCK][MAX_TEXELS_PER_BLOCK];

    for (i = 0; i < weights_per_block; i++)
        texelcount_of_weight[i] = 0;
    for (i = 0; i < texels_per_block; i++)
        weightcount_of_texel[i] = 0;

    for (z = 0; z < zdim; z++)
        for (y = 0; y < ydim; y++)
            for (x = 0; x < xdim; x++) {
                int texel = (z * ydim + y) * xdim + x;

                // Texel position on the weight grid in fixed point: the low
                // 4 bits are the fraction, the remaining bits the grid index.
                int x_weight = (((1024 + xdim / 2) / (xdim - 1)) * x * (x_weights - 1) + 32) >> 6;
                int y_weight = (((1024 + ydim / 2) / (ydim - 1)) * y * (y_weights - 1) + 32) >> 6;
                int z_weight = (((1024 + zdim / 2) / (zdim - 1)) * z * (z_weights - 1) + 32) >> 6;

                int x_weight_frac = x_weight & 0xF;
                int y_weight_frac = y_weight & 0xF;
                int z_weight_frac = z_weight & 0xF;
                int x_weight_int = x_weight >> 4;
                int y_weight_int = y_weight >> 4;
                int z_weight_int = z_weight >> 4;
                int qweight[4];
                int weight[4];
                // qweight[0] is the base grid point, qweight[3] the point one
                // step further along all three axes; [1] and [2] are filled in
                // after the switch below.
                qweight[0] = (z_weight_int * y_weights + y_weight_int) * x_weights + x_weight_int;
                qweight[3] = ((z_weight_int + 1) * y_weights + (y_weight_int + 1)) * x_weights + (x_weight_int + 1);

                // simplex interpolation
                int fs = x_weight_frac;
                int ft = y_weight_frac;
                int fp = z_weight_frac;

                // 3-bit code describing the ordering of the three fractions:
                // bit 2 = fs > ft, bit 1 = ft > fp, bit 0 = fs > fp.
                int cas = ((fs > ft) << 2) + ((ft > fp) << 1) + ((fs > fp));
                // Index strides for one step along y (N) and along z (NM).
                int N = x_weights;
                int NM = x_weights * y_weights;

                // s1/s2 are the index steps from qweight[0] to qweight[1] and
                // from qweight[1] to qweight[2]; w0..w3 are the four
                // contributions, which add up to 16 in every case.
                int s1, s2, w0, w1, w2, w3;
                switch (cas) {
                case 7:
                    s1 = 1;
                    s2 = N;
                    w0 = 16 - fs;
                    w1 = fs - ft;
                    w2 = ft - fp;
                    w3 = fp;
                    break;
                case 3:
                    s1 = N;
                    s2 = 1;
                    w0 = 16 - ft;
                    w1 = ft - fs;
                    w2 = fs - fp;
                    w3 = fp;
                    break;
                case 5:
                    s1 = 1;
                    s2 = NM;
                    w0 = 16 - fs;
                    w1 = fs - fp;
                    w2 = fp - ft;
                    w3 = ft;
                    break;
                case 4:
                    s1 = NM;
                    s2 = 1;
                    w0 = 16 - fp;
                    w1 = fp - fs;
                    w2 = fs - ft;
                    w3 = ft;
                    break;
                case 2:
                    s1 = N;
                    s2 = NM;
                    w0 = 16 - ft;
                    w1 = ft - fp;
                    w2 = fp - fs;
                    w3 = fs;
                    break;
                case 0:
                    s1 = NM;
                    s2 = N;
                    w0 = 16 - fp;
                    w1 = fp - ft;
                    w2 = ft - fs;
                    w3 = fs;
                    break;

                // Codes 1 and 6 would require contradictory orderings of the
                // three fractions, so this branch only guards against
                // uninitialized values; it mirrors case 0.
                default:
                    s1 = NM;
                    s2 = N;
                    w0 = 16 - fp;
                    w1 = fp - ft;
                    w2 = ft - fs;
                    w3 = fs;
                    break;
                }

                qweight[1] = qweight[0] + s1;
                qweight[2] = qweight[1] + s2;
                weight[0] = w0;
                weight[1] = w1;
                weight[2] = w2;
                weight[3] = w3;

                /*
                for(i=0;i<4;i++) weight[i] <<= 4; */

                // Record only non-zero contributions, in both the per-texel
                // and the per-weight lists.
                for (i = 0; i < 4; i++)
                    if (weight[i] != 0) {
                        grid_weights_of_texel[texel][weightcount_of_texel[texel]] = qweight[i];
                        weights_of_texel[texel][weightcount_of_texel[texel]] = weight[i];
                        weightcount_of_texel[texel]++;
                        texels_of_weight[qweight[i]][texelcount_of_weight[qweight[i]]] = texel;
                        texelweights_of_weight[qweight[i]][texelcount_of_weight[qweight[i]]] = weight[i];
                        texelcount_of_weight[qweight[i]]++;
                    }
            }

    // Copy the per-texel lists into the decimation table.
    for (i = 0; i < texels_per_block; i++) {
        dt->texel_num_weights[i] = (uint8_t)weightcount_of_texel[i];

        // ensure that all 4 entries are actually initialized.
        // This allows a branch-free implementation of compute_value_of_texel_flt()
        for (j = 0; j < 4; j++) {
            dt->texel_weights_int[i][j] = 0;
            dt->texel_weights_float[i][j] = 0.0f;
            dt->texel_weights[i][j] = 0;
        }

        for (j = 0; j < weightcount_of_texel[i]; j++) {
            dt->texel_weights_int[i][j] = (uint8_t)weights_of_texel[i][j];
            // Float copy of the contribution, scaled by 1 / TEXEL_WEIGHT_SUM.
            dt->texel_weights_float[i][j] = static_cast < float >(weights_of_texel[i][j]) * (1.0f / TEXEL_WEIGHT_SUM);
            dt->texel_weights[i][j] = (uint8_t)grid_weights_of_texel[i][j];
        }
    }

    // Copy the per-weight lists into the decimation table.
    for (i = 0; i < weights_per_block; i++) {
        dt->weight_num_texels[i] = (uint8_t)texelcount_of_weight[i];
        for (j = 0; j < texelcount_of_weight[i]; j++) {
            dt->weight_texel[i][j] = (uint8_t)texels_of_weight[i][j];
            dt->weights_int[i][j] = (uint8_t)texelweights_of_weight[i][j];
            dt->weights_flt[i][j] = static_cast < float >(texelweights_of_weight[i][j]);
        }
    }

    dt->num_texels = texels_per_block;
    dt->num_weights = weights_per_block;
}
#endif

// routine to write up to 8 bits
//   value      bits to store; only the low `bitcount` bits are used
//   bitcount   number of bits to write (0..8)
//   bitoffset  position of the field's lowest bit, counted from bit 0 of ptr[0]
//   ptr        destination buffer; bits outside the field are preserved
// The byte after the one containing `bitoffset` is only accessed when the
// field actually spills into it, so a field that ends exactly at the end of
// the buffer never touches memory past the buffer.
static inline void write_bits(int value, int bitcount, int bitoffset, uint8_t * ptr) {
    int mask = (1 << bitcount) - 1;
    value &= mask;
    ptr += bitoffset >> 3;
    bitoffset &= 7;
    value <<= bitoffset;
    mask <<= bitoffset;

    // Low byte: clear the field's bits, then merge in the new value.
    ptr[0] = static_cast<uint8_t>((ptr[0] & ~mask) | value);

    // High byte: needed only when the field crosses the byte boundary.
    if (bitoffset + bitcount > 8) {
        ptr[1] = static_cast<uint8_t>((ptr[1] & ~(mask >> 8)) | (value >> 8));
    }
}


// routine to read up to 8 bits
//   bitcount   number of bits to read (0..8)
//   bitoffset  position of the field's lowest bit, counted from bit 0 of ptr[0]
//   ptr        source buffer
// Returns the field as a non-negative integer. The byte after the one
// containing `bitoffset` is only read when the field actually extends into
// it, so reading the final bits of a buffer does not touch memory past its end.
static inline int read_bits(int bitcount, int bitoffset, const uint8_t * ptr) {
    int mask = (1 << bitcount) - 1;
    ptr += bitoffset >> 3;
    bitoffset &= 7;

    int value = ptr[0];
    if (bitoffset + bitcount > 8)
        value |= ptr[1] << 8;

    value >>= bitoffset;
    value &= mask;
    return value;
}

// unpacked trit quintuplets <low,_,_,_,high> for each packed trit-block value
// Indexed by the 8-bit packed block (0..255). Each row holds the five trit
// values (0..2) of that block; element 0 belongs to the first of the five
// encoded integers.
static const uint8_t trits_of_integer[256][5] = {
    { 0, 0, 0, 0, 0 },{ 1, 0, 0, 0, 0 },{ 2, 0, 0, 0, 0 },{ 0, 0, 2, 0, 0 },
    { 0, 1, 0, 0, 0 },{ 1, 1, 0, 0, 0 },{ 2, 1, 0, 0, 0 },{ 1, 0, 2, 0, 0 },
    { 0, 2, 0, 0, 0 },{ 1, 2, 0, 0, 0 },{ 2, 2, 0, 0, 0 },{ 2, 0, 2, 0, 0 },
    { 0, 2, 2, 0, 0 },{ 1, 2, 2, 0, 0 },{ 2, 2, 2, 0, 0 },{ 2, 0, 2, 0, 0 },
    { 0, 0, 1, 0, 0 },{ 1, 0, 1, 0, 0 },{ 2, 0, 1, 0, 0 },{ 0, 1, 2, 0, 0 },
    { 0, 1, 1, 0, 0 },{ 1, 1, 1, 0, 0 },{ 2, 1, 1, 0, 0 },{ 1, 1, 2, 0, 0 },
    { 0, 2, 1, 0, 0 },{ 1, 2, 1, 0, 0 },{ 2, 2, 1, 0, 0 },{ 2, 1, 2, 0, 0 },
    { 0, 0, 0, 2, 2 },{ 1, 0, 0, 2, 2 },{ 2, 0, 0, 2, 2 },{ 0, 0, 2, 2, 2 },
    { 0, 0, 0, 1, 0 },{ 1, 0, 0, 1, 0 },{ 2, 0, 0, 1, 0 },{ 0, 0, 2, 1, 0 },
    { 0, 1, 0, 1, 0 },{ 1, 1, 0, 1, 0 },{ 2, 1, 0, 1, 0 },{ 1, 0, 2, 1, 0 },
    { 0, 2, 0, 1, 0 },{ 1, 2, 0, 1, 0 },{ 2, 2, 0, 1, 0 },{ 2, 0, 2, 1, 0 },
    { 0, 2, 2, 1, 0 },{ 1, 2, 2, 1, 0 },{ 2, 2, 2, 1, 0 },{ 2, 0, 2, 1, 0 },
    { 0, 0, 1, 1, 0 },{ 1, 0, 1, 1, 0 },{ 2, 0, 1, 1, 0 },{ 0, 1, 2, 1, 0 },
    { 0, 1, 1, 1, 0 },{ 1, 1, 1, 1, 0 },{ 2, 1, 1, 1, 0 },{ 1, 1, 2, 1, 0 },
    { 0, 2, 1, 1, 0 },{ 1, 2, 1, 1, 0 },{ 2, 2, 1, 1, 0 },{ 2, 1, 2, 1, 0 },
    { 0, 1, 0, 2, 2 },{ 1, 1, 0, 2, 2 },{ 2, 1, 0, 2, 2 },{ 1, 0, 2, 2, 2 },
    { 0, 0, 0, 2, 0 },{ 1, 0, 0, 2, 0 },{ 2, 0, 0, 2, 0 },{ 0, 0, 2, 2, 0 },
    { 0, 1, 0, 2, 0 },{ 1, 1, 0, 2, 0 },{ 2, 1, 0, 2, 0 },{ 1, 0, 2, 2, 0 },
    { 0, 2, 0, 2, 0 },{ 1, 2, 0, 2, 0 },{ 2, 2, 0, 2, 0 },{ 2, 0, 2, 2, 0 },
    { 0, 2, 2, 2, 0 },{ 1, 2, 2, 2, 0 },{ 2, 2, 2, 2, 0 },{ 2, 0, 2, 2, 0 },
    { 0, 0, 1, 2, 0 },{ 1, 0, 1, 2, 0 },{ 2, 0, 1, 2, 0 },{ 0, 1, 2, 2, 0 },
    { 0, 1, 1, 2, 0 },{ 1, 1, 1, 2, 0 },{ 2, 1, 1, 2, 0 },{ 1, 1, 2, 2, 0 },
    { 0, 2, 1, 2, 0 },{ 1, 2, 1, 2, 0 },{ 2, 2, 1, 2, 0 },{ 2, 1, 2, 2, 0 },
    { 0, 2, 0, 2, 2 },{ 1, 2, 0, 2, 2 },{ 2, 2, 0, 2, 2 },{ 2, 0, 2, 2, 2 },
    { 0, 0, 0, 0, 2 },{ 1, 0, 0, 0, 2 },{ 2, 0, 0, 0, 2 },{ 0, 0, 2, 0, 2 },
    { 0, 1, 0, 0, 2 },{ 1, 1, 0, 0, 2 },{ 2, 1, 0, 0, 2 },{ 1, 0, 2, 0, 2 },
    { 0, 2, 0, 0, 2 },{ 1, 2, 0, 0, 2 },{ 2, 2, 0, 0, 2 },{ 2, 0, 2, 0, 2 },
    { 0, 2, 2, 0, 2 },{ 1, 2, 2, 0, 2 },{ 2, 2, 2, 0, 2 },{ 2, 0, 2, 0, 2 },
    { 0, 0, 1, 0, 2 },{ 1, 0, 1, 0, 2 },{ 2, 0, 1, 0, 2 },{ 0, 1, 2, 0, 2 },
    { 0, 1, 1, 0, 2 },{ 1, 1, 1, 0, 2 },{ 2, 1, 1, 0, 2 },{ 1, 1, 2, 0, 2 },
    { 0, 2, 1, 0, 2 },{ 1, 2, 1, 0, 2 },{ 2, 2, 1, 0, 2 },{ 2, 1, 2, 0, 2 },
    { 0, 2, 2, 2, 2 },{ 1, 2, 2, 2, 2 },{ 2, 2, 2, 2, 2 },{ 2, 0, 2, 2, 2 },
    { 0, 0, 0, 0, 1 },{ 1, 0, 0, 0, 1 },{ 2, 0, 0, 0, 1 },{ 0, 0, 2, 0, 1 },
    { 0, 1, 0, 0, 1 },{ 1, 1, 0, 0, 1 },{ 2, 1, 0, 0, 1 },{ 1, 0, 2, 0, 1 },
    { 0, 2, 0, 0, 1 },{ 1, 2, 0, 0, 1 },{ 2, 2, 0, 0, 1 },{ 2, 0, 2, 0, 1 },
    { 0, 2, 2, 0, 1 },{ 1, 2, 2, 0, 1 },{ 2, 2, 2, 0, 1 },{ 2, 0, 2, 0, 1 },
    { 0, 0, 1, 0, 1 },{ 1, 0, 1, 0, 1 },{ 2, 0, 1, 0, 1 },{ 0, 1, 2, 0, 1 },
    { 0, 1, 1, 0, 1 },{ 1, 1, 1, 0, 1 },{ 2, 1, 1, 0, 1 },{ 1, 1, 2, 0, 1 },
    { 0, 2, 1, 0, 1 },{ 1, 2, 1, 0, 1 },{ 2, 2, 1, 0, 1 },{ 2, 1, 2, 0, 1 },
    { 0, 0, 1, 2, 2 },{ 1, 0, 1, 2, 2 },{ 2, 0, 1, 2, 2 },{ 0, 1, 2, 2, 2 },
    { 0, 0, 0, 1, 1 },{ 1, 0, 0, 1, 1 },{ 2, 0, 0, 1, 1 },{ 0, 0, 2, 1, 1 },
    { 0, 1, 0, 1, 1 },{ 1, 1, 0, 1, 1 },{ 2, 1, 0, 1, 1 },{ 1, 0, 2, 1, 1 },
    { 0, 2, 0, 1, 1 },{ 1, 2, 0, 1, 1 },{ 2, 2, 0, 1, 1 },{ 2, 0, 2, 1, 1 },
    { 0, 2, 2, 1, 1 },{ 1, 2, 2, 1, 1 },{ 2, 2, 2, 1, 1 },{ 2, 0, 2, 1, 1 },
    { 0, 0, 1, 1, 1 },{ 1, 0, 1, 1, 1 },{ 2, 0, 1, 1, 1 },{ 0, 1, 2, 1, 1 },
    { 0, 1, 1, 1, 1 },{ 1, 1, 1, 1, 1 },{ 2, 1, 1, 1, 1 },{ 1, 1, 2, 1, 1 },
    { 0, 2, 1, 1, 1 },{ 1, 2, 1, 1, 1 },{ 2, 2, 1, 1, 1 },{ 2, 1, 2, 1, 1 },
    { 0, 1, 1, 2, 2 },{ 1, 1, 1, 2, 2 },{ 2, 1, 1, 2, 2 },{ 1, 1, 2, 2, 2 },
    { 0, 0, 0, 2, 1 },{ 1, 0, 0, 2, 1 },{ 2, 0, 0, 2, 1 },{ 0, 0, 2, 2, 1 },
    { 0, 1, 0, 2, 1 },{ 1, 1, 0, 2, 1 },{ 2, 1, 0, 2, 1 },{ 1, 0, 2, 2, 1 },
    { 0, 2, 0, 2, 1 },{ 1, 2, 0, 2, 1 },{ 2, 2, 0, 2, 1 },{ 2, 0, 2, 2, 1 },
    { 0, 2, 2, 2, 1 },{ 1, 2, 2, 2, 1 },{ 2, 2, 2, 2, 1 },{ 2, 0, 2, 2, 1 },
    { 0, 0, 1, 2, 1 },{ 1, 0, 1, 2, 1 },{ 2, 0, 1, 2, 1 },{ 0, 1, 2, 2, 1 },
    { 0, 1, 1, 2, 1 },{ 1, 1, 1, 2, 1 },{ 2, 1, 1, 2, 1 },{ 1, 1, 2, 2, 1 },
    { 0, 2, 1, 2, 1 },{ 1, 2, 1, 2, 1 },{ 2, 2, 1, 2, 1 },{ 2, 1, 2, 2, 1 },
    { 0, 2, 1, 2, 2 },{ 1, 2, 1, 2, 2 },{ 2, 2, 1, 2, 2 },{ 2, 1, 2, 2, 2 },
    { 0, 0, 0, 1, 2 },{ 1, 0, 0, 1, 2 },{ 2, 0, 0, 1, 2 },{ 0, 0, 2, 1, 2 },
    { 0, 1, 0, 1, 2 },{ 1, 1, 0, 1, 2 },{ 2, 1, 0, 1, 2 },{ 1, 0, 2, 1, 2 },
    { 0, 2, 0, 1, 2 },{ 1, 2, 0, 1, 2 },{ 2, 2, 0, 1, 2 },{ 2, 0, 2, 1, 2 },
    { 0, 2, 2, 1, 2 },{ 1, 2, 2, 1, 2 },{ 2, 2, 2, 1, 2 },{ 2, 0, 2, 1, 2 },
    { 0, 0, 1, 1, 2 },{ 1, 0, 1, 1, 2 },{ 2, 0, 1, 1, 2 },{ 0, 1, 2, 1, 2 },
    { 0, 1, 1, 1, 2 },{ 1, 1, 1, 1, 2 },{ 2, 1, 1, 1, 2 },{ 1, 1, 2, 1, 2 },
    { 0, 2, 1, 1, 2 },{ 1, 2, 1, 1, 2 },{ 2, 2, 1, 1, 2 },{ 2, 1, 2, 1, 2 },
    { 0, 2, 2, 2, 2 },{ 1, 2, 2, 2, 2 },{ 2, 2, 2, 2, 2 },{ 2, 1, 2, 2, 2 },
};

// unpacked quint triplets <low,middle,high> for each packed-quint value
// Indexed by the 7-bit packed block (0..127). Each row holds the three quint
// values (0..4) of that block; element 0 belongs to the first of the three
// encoded integers.
static const uint8_t quints_of_integer[128][3] = {
    { 0, 0, 0 },{ 1, 0, 0 },{ 2, 0, 0 },{ 3, 0, 0 },
    { 4, 0, 0 },{ 0, 4, 0 },{ 4, 4, 0 },{ 4, 4, 4 },
    { 0, 1, 0 },{ 1, 1, 0 },{ 2, 1, 0 },{ 3, 1, 0 },
    { 4, 1, 0 },{ 1, 4, 0 },{ 4, 4, 1 },{ 4, 4, 4 },
    { 0, 2, 0 },{ 1, 2, 0 },{ 2, 2, 0 },{ 3, 2, 0 },
    { 4, 2, 0 },{ 2, 4, 0 },{ 4, 4, 2 },{ 4, 4, 4 },
    { 0, 3, 0 },{ 1, 3, 0 },{ 2, 3, 0 },{ 3, 3, 0 },
    { 4, 3, 0 },{ 3, 4, 0 },{ 4, 4, 3 },{ 4, 4, 4 },
    { 0, 0, 1 },{ 1, 0, 1 },{ 2, 0, 1 },{ 3, 0, 1 },
    { 4, 0, 1 },{ 0, 4, 1 },{ 4, 0, 4 },{ 0, 4, 4 },
    { 0, 1, 1 },{ 1, 1, 1 },{ 2, 1, 1 },{ 3, 1, 1 },
    { 4, 1, 1 },{ 1, 4, 1 },{ 4, 1, 4 },{ 1, 4, 4 },
    { 0, 2, 1 },{ 1, 2, 1 },{ 2, 2, 1 },{ 3, 2, 1 },
    { 4, 2, 1 },{ 2, 4, 1 },{ 4, 2, 4 },{ 2, 4, 4 },
    { 0, 3, 1 },{ 1, 3, 1 },{ 2, 3, 1 },{ 3, 3, 1 },
    { 4, 3, 1 },{ 3, 4, 1 },{ 4, 3, 4 },{ 3, 4, 4 },
    { 0, 0, 2 },{ 1, 0, 2 },{ 2, 0, 2 },{ 3, 0, 2 },
    { 4, 0, 2 },{ 0, 4, 2 },{ 2, 0, 4 },{ 3, 0, 4 },
    { 0, 1, 2 },{ 1, 1, 2 },{ 2, 1, 2 },{ 3, 1, 2 },
    { 4, 1, 2 },{ 1, 4, 2 },{ 2, 1, 4 },{ 3, 1, 4 },
    { 0, 2, 2 },{ 1, 2, 2 },{ 2, 2, 2 },{ 3, 2, 2 },
    { 4, 2, 2 },{ 2, 4, 2 },{ 2, 2, 4 },{ 3, 2, 4 },
    { 0, 3, 2 },{ 1, 3, 2 },{ 2, 3, 2 },{ 3, 3, 2 },
    { 4, 3, 2 },{ 3, 4, 2 },{ 2, 3, 4 },{ 3, 3, 4 },
    { 0, 0, 3 },{ 1, 0, 3 },{ 2, 0, 3 },{ 3, 0, 3 },
    { 4, 0, 3 },{ 0, 4, 3 },{ 0, 0, 4 },{ 1, 0, 4 },
    { 0, 1, 3 },{ 1, 1, 3 },{ 2, 1, 3 },{ 3, 1, 3 },
    { 4, 1, 3 },{ 1, 4, 3 },{ 0, 1, 4 },{ 1, 1, 4 },
    { 0, 2, 3 },{ 1, 2, 3 },{ 2, 2, 3 },{ 3, 2, 3 },
    { 4, 2, 3 },{ 2, 4, 3 },{ 0, 2, 4 },{ 1, 2, 4 },
    { 0, 3, 3 },{ 1, 3, 3 },{ 2, 3, 3 },{ 3, 3, 3 },
    { 4, 3, 3 },{ 3, 4, 3 },{ 0, 3, 4 },{ 1, 3, 4 },
};

// Reverses the order of the low 8 bits of `p` (bit 0 <-> bit 7, bit 1 <-> bit 6,
// ...). Any higher bits of the input are discarded; the result is in 0..255.
int bitrev8(int p) {
    // Swap in three rounds: the two nibbles, then bit pairs, then single bits.
    const int nibbles_swapped = ((p >> 4) & 0xF) | ((p & 0xF) << 4);
    const int pairs_swapped = ((nibbles_swapped >> 2) & 0x33) | ((nibbles_swapped & 0x33) << 2);
    return ((pairs_swapped >> 1) & 0x55) | ((pairs_swapped & 0x55) << 1);
}

// Decodes a packed integer sequence.
//   quantization_level  selects how many plain bits, and whether a trit or a
//                       quint, make up each element (looked up through
//                       find_number_of_bits_trits_quints)
//   elements            number of values to decode
//   input_data          packed source bytes
//   output_data         receives `elements` decoded values
//   bit_offset          bit position in input_data where the sequence starts
// Each element's plain bits are read first; the interleaved trit/quint bits are
// gathered into per-block bytes, expanded through the lookup tables and merged
// in above the plain bits.
void decode_ise(int quantization_level, int elements, const uint8_t * input_data, uint8_t * output_data, int bit_offset) {
    int i;
    // note: due to how the trit/quint-block unpacking is done in this function,
    // we may write more temporary results than the number of outputs.
    // The maximum actual number of results is 64, but we keep 4 additional elements
    // of padding. The array is zero-initialized because the unpacking below
    // ORs into whole blocks, including padding entries past `elements`.
    uint8_t results[68] = { 0 };
    // trit-blocks or quint-blocks; must start zeroed because the collection
    // loop below ORs bits into them.
    uint8_t tq_blocks[22] = { 0 };

    int bits, trits, quints;
    find_number_of_bits_trits_quints(quantization_level, &bits, &trits, &quints);

    int lcounter = 0;   // position of the current element inside its block
    int hcounter = 0;   // index of the block currently being collected

    // collect bits for each element, as well as bits for any trit-blocks and quint-blocks.
    for (i = 0; i < elements; i++) {
        results[i] = (uint8_t)read_bits(bits, bit_offset, input_data);
        bit_offset += bits;
        if (trits) {
            // Five elements share one 8-bit trit block, delivered as 2+2+1+2+1 bits.
            static const int bits_to_read[5] = { 2, 2, 1, 2, 1 };
            static const int block_shift[5] = { 0, 2, 4, 5, 7 };
            static const int next_lcounter[5] = { 1, 2, 3, 4, 0 };
            static const int hcounter_incr[5] = { 0, 0, 0, 0, 1 };
            int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
            bit_offset += bits_to_read[lcounter];
            tq_blocks[hcounter] |= tdata << block_shift[lcounter];
            hcounter += hcounter_incr[lcounter];
            lcounter = next_lcounter[lcounter];
        }
        if (quints) {
            // Three elements share one 7-bit quint block, delivered as 3+2+2 bits.
            static const int bits_to_read[3] = { 3, 2, 2 };
            static const int block_shift[3] = { 0, 3, 5 };
            static const int next_lcounter[3] = { 1, 2, 0 };
            static const int hcounter_incr[3] = { 0, 0, 1 };
            int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
            bit_offset += bits_to_read[lcounter];
            tq_blocks[hcounter] |= tdata << block_shift[lcounter];
            hcounter += hcounter_incr[lcounter];
            lcounter = next_lcounter[lcounter];
        }
    }


    // unpack trit-blocks or quint-blocks as needed
    if (trits) {
        int trit_blocks = (elements + 4) / 5;
        for (i = 0; i < trit_blocks; i++) {
            const uint8_t *tritptr = trits_of_integer[tq_blocks[i]];
            results[5 * i] |= tritptr[0] << bits;
            results[5 * i + 1] |= tritptr[1] << bits;
            results[5 * i + 2] |= tritptr[2] << bits;
            results[5 * i + 3] |= tritptr[3] << bits;
            results[5 * i + 4] |= tritptr[4] << bits;
        }
    }

    if (quints) {
        int quint_blocks = (elements + 2) / 3;
        for (i = 0; i < quint_blocks; i++) {
            const uint8_t *quintptr = quints_of_integer[tq_blocks[i]];
            results[3 * i] |= quintptr[0] << bits;
            results[3 * i + 1] |= quintptr[1] << bits;
            results[3 * i + 2] |= quintptr[2] << bits;
        }
    }

    // Only the requested number of values is handed back; padding is dropped.
    for (i = 0; i < elements; i++)
        output_data[i] = results[i];
}

void InitializeASTCSettingsForSetBlockSize(__global ASTC_Encode *ASTCEncode) {
    ASTCEncode->m_target_bitrate = 0;
    int xdim_2d = ASTCEncode->m_xdim;
    int ydim_2d = ASTCEncode->m_ydim;
    float log10_texels_2d = log((float)(xdim_2d * ydim_2d)) / log(10.0f);

#ifdef ASTC_ENABLE_3D_SUPPORT
    int xdim_3d = ASTCEncode->m_xdim;
    int ydim_3d = ASTCEncode->m_ydim;
    int zdim_3d = ASTCEncode->m_zdim;
    float log10_texels_3d = 0.0f;
    log10_texels_3d = log((float)(xdim_3d * ydim_3d * zdim_3d)) / log(10.0f);
    float dblimit_autoset_3d = 0.0;
#endif

    int     plimit_autoset      = -1;
    float   dblimit_autoset_2d  = 0.0;
    float   oplimit_autoset     = 0.0;
    float   mincorrel_autoset   = 0.0;
    float   bmc_autoset         = 0.0;
    int     maxiters_autoset    = 0;


    /**********************************************************************************
    ASTC settings to review for quality & performance; these are the settings found in
    astc_main for the astcenc sample application command line tool
    fast
        plimit_autoset = 4;
        oplimit_autoset = 1.0;
        mincorrel_autoset = 0.5;
        dblimit_autoset_2d = MAX(85 - 35 * log10_texels_2d, 63 - 19 * log10_texels_2d);
        dblimit_autoset_3d = MAX(85 - 35 * log10_texels_3d, 63 - 19 * log10_texels_3d);
        bmc_autoset = 50;
        maxiters_autoset = 1;
    medium
        plimit_autoset = 25;
        oplimit_autoset = 1.2f;
        mincorrel_autoset = 0.75f;
        dblimit_autoset_2d = MAX(95 - 35 * log10_texels_2d, 70 - 19 * log10_texels_2d);
        dblimit_autoset_3d = MAX(95 - 35 * log10_texels_3d, 70 - 19 * log10_texels_3d);
        bmc_autoset = 75;
        maxiters_autoset = 2;
    thorough
        plimit_autoset = 100;
        oplimit_autoset = 2.5f;
        mincorrel_autoset = 0.95f;
        dblimit_autoset_2d = MAX(105 - 35 * log10_texels_2d, 77 - 19 * log10_texels_2d);
        dblimit_autoset_3d = MAX(105 - 35 * log10_texels_3d, 77 - 19 * log10_texels_3d);
        bmc_autoset = 95;
        maxiters_autoset = 4;
    exhaustive
        #define PARTITION_BITS 10
        #define PARTITION_COUNT (1 << PARTITION_BITS)
        plimit_autoset = PARTITION_COUNT;
        oplimit_autoset = 1000.0f;
        mincorrel_autoset = 0.99f;
        dblimit_autoset_2d = 999.0f;
        dblimit_autoset_3d = 999.0f;
        bmc_autoset = 100;
        maxiters_autoset = 4;
    ***************************************************************************************************/

    // Codec Speed Setting Defaults based on Quality Settings
    float QualityScale; // Set quality normalized per process setting with a range of 0.0 to 1.0f
    if (ASTCEncode->m_Quality < 0.02f) {
        // Very Fast
        oplimit_autoset     = 1.0;
        mincorrel_autoset   = 0.5;
        plimit_autoset      = 1;
        bmc_autoset         = 5.0f;
        maxiters_autoset    = 1;
        dblimit_autoset_2d  = MAX(70 - 35 * log10_texels_2d, 53 - 19 * log10_texels_2d);
    } else if (ASTCEncode->m_Quality < 0.05f) {
        // Fast:
        QualityScale = ASTCEncode->m_Quality/0.05f;
        oplimit_autoset     = 1.0;
        mincorrel_autoset   = 0.5;
        plimit_autoset      = 4;
        bmc_autoset         = 5.0f+(45.0f*QualityScale);  // max 50
        maxiters_autoset    = 1;
        dblimit_autoset_2d  = MAX(85 - 35 * log10_texels_2d, 63 - 19 * log10_texels_2d);
    } else if (ASTCEncode->m_Quality <= 0.20f) {
        // Medium  set to match near Compressonator BC7 Default Quality 0.05f setting
        QualityScale = ASTCEncode->m_Quality/0.20f;
        oplimit_autoset     = 1.2f;
        mincorrel_autoset   = 0.75f;
        plimit_autoset      = 15+(int)round(10.0f*QualityScale); // max around 25;
        bmc_autoset         = 57.0f+(18.0f*QualityScale);  // max 75;
        maxiters_autoset    = 2;
        dblimit_autoset_2d  = MAX(95 - 35 * log10_texels_2d, 70 - 19 * log10_texels_2d);
    } else if (ASTCEncode->m_Quality <= 0.60f) {
        // Thorough
        QualityScale = ASTCEncode->m_Quality/0.60f;
        oplimit_autoset     = 1.2f + (1.3f*QualityScale); // max 2.5f;
        mincorrel_autoset   = 0.95f;
        plimit_autoset      = 25+(int)round(75.0f*QualityScale); // max around 100
        bmc_autoset         = 75.0f+(25.0f*QualityScale);  // max 95;
        maxiters_autoset    = 4;
        dblimit_autoset_2d  = MAX(105 - 35 * log10_texels_2d, 77 - 19 * log10_texels_2d);
    } else {
        // Exhaustive
        QualityScale =  ASTCEncode->m_Quality;
        oplimit_autoset     = 2.5f+ (997.5f*QualityScale); // max 1000.0f;
        mincorrel_autoset   = 0.99f;
        plimit_autoset      = 100 + (int)round(923.0f * QualityScale);  // max 1024
        bmc_autoset         = 95.0f+(5.0f*QualityScale);  // max 100;
        maxiters_autoset    = 4;
        dblimit_autoset_2d  = 999.0f;
    }

    int partitions_to_test = plimit_autoset;
    float dblimit_2d = dblimit_autoset_2d;
    float oplimit = oplimit_autoset;
    float mincorrel = mincorrel_autoset;

#ifdef ASTC_ENABLE_3D_SUPPORT
    float dblimit_3d = dblimit_set_by_user ? dblimit_user_specified : dblimit_autoset_3d;
#endif

    ASTCEncode->m_ewp.rgb_power = 1.0f;
    ASTCEncode->m_ewp.alpha_power = 1.0f;
    ASTCEncode->m_ewp.rgb_base_weight = 1.0f;
    ASTCEncode->m_ewp.alpha_base_weight = 1.0f;
    ASTCEncode->m_ewp.rgb_mean_weight = 0.0f;
    ASTCEncode->m_ewp.rgb_stdev_weight = 0.0f;
    ASTCEncode->m_ewp.alpha_mean_weight = 0.0f;
    ASTCEncode->m_ewp.alpha_stdev_weight = 0.0f;

    ASTCEncode->m_ewp.rgb_mean_and_stdev_mixing = 0.0f;
    ASTCEncode->m_ewp.mean_stdev_radius = 0;
    ASTCEncode->m_ewp.enable_rgb_scale_with_alpha = 0;
    ASTCEncode->m_ewp.alpha_radius = 0;

    ASTCEncode->m_ewp.block_artifact_suppression = 0.0f;
    ASTCEncode->m_ewp.rgba_weights[0] = 1.0f;
    ASTCEncode->m_ewp.rgba_weights[1] = 1.0f;
    ASTCEncode->m_ewp.rgba_weights[2] = 1.0f;
    ASTCEncode->m_ewp.rgba_weights[3] = 1.0f;
    ASTCEncode->m_ewp.ra_normal_angular_scale = 0;
    ASTCEncode->m_ewp.max_refinement_iters = maxiters_autoset;

    ASTCEncode->m_ewp.block_mode_cutoff = bmc_autoset / 100.0f;

    float texel_avg_error_limit_2d;
    float texel_avg_error_limit_3d;

    if (ASTCEncode->m_rgb_force_use_of_hdr == 0) {
        texel_avg_error_limit_2d = pow(0.1f, dblimit_2d * 0.1f) * 65535.0f * 65535.0f;
#ifdef ASTC_ENABLE_3D_SUPPORT
        texel_avg_error_limit_3d = pow(0.1f, dblimit_3d * 0.1f) * 65535.0f * 65535.0f;
#endif
    } else {
        texel_avg_error_limit_2d = 0.0f;
        texel_avg_error_limit_3d = 0.0f;
    }
    ASTCEncode->m_ewp.partition_1_to_2_limit = oplimit;
    ASTCEncode->m_ewp.lowest_correlation_cutoff = mincorrel;

    if (partitions_to_test < 1)
        partitions_to_test = 1;
    else if (partitions_to_test > PARTITION_COUNT)
        partitions_to_test = PARTITION_COUNT;
    ASTCEncode->m_ewp.partition_search_limit = partitions_to_test;

    // Specifying the error weight of a color component as 0 is not allowed.
    // If weights are 0, then they are instead set to a small positive value.

    float max_color_component_weight = MAX(MAX(ASTCEncode->m_ewp.rgba_weights[0], ASTCEncode->m_ewp.rgba_weights[1]),
                                           MAX(ASTCEncode->m_ewp.rgba_weights[2], ASTCEncode->m_ewp.rgba_weights[3]));
    ASTCEncode->m_ewp.rgba_weights[0] = MAX(ASTCEncode->m_ewp.rgba_weights[0], max_color_component_weight / 1000.0f);
    ASTCEncode->m_ewp.rgba_weights[1] = MAX(ASTCEncode->m_ewp.rgba_weights[1], max_color_component_weight / 1000.0f);
    ASTCEncode->m_ewp.rgba_weights[2] = MAX(ASTCEncode->m_ewp.rgba_weights[2], max_color_component_weight / 1000.0f);
    ASTCEncode->m_ewp.rgba_weights[3] = MAX(ASTCEncode->m_ewp.rgba_weights[3], max_color_component_weight / 1000.0f);

    // Allocate arrays for image data and load results.
    ASTCEncode->m_ewp.texel_avg_error_limit = texel_avg_error_limit_2d;

    expand_block_artifact_suppression_host(ASTCEncode->m_xdim, ASTCEncode->m_ydim, ASTCEncode->m_zdim, &ASTCEncode->m_ewp);
}

// One-time encoder setup: runs the table/setting initialisers in their
// original order, then derives the per-block texel count and the partition
// table index from the block dimensions stored in ASTCEncode.
// Always returns true.
bool init_ASTC(__global ASTC_Encode *ASTCEncode) {
    prepare_angular_tables(ASTCEncode);
    build_quantization_mode_table(ASTCEncode);
    InitializeASTCSettingsForSetBlockSize(ASTCEncode);
    set_block_size_descriptor(ASTCEncode->m_xdim, ASTCEncode->m_ydim, ASTCEncode->m_zdim, ASTCEncode);

    // Block footprint, read once now that the setup calls above have run.
    const int block_x = ASTCEncode->m_xdim;
    const int block_y = ASTCEncode->m_ydim;
    const int block_z = ASTCEncode->m_zdim;

    // The z extent only contributes to the texel count when 3D support is built in.
    int texel_total = block_x * block_y;
#ifdef ASTC_ENABLE_3D_SUPPORT
    texel_total *= block_z;
#endif
    ASTCEncode->m_texels_per_block = texel_total;

    // Index packing x, y and z with strides of 1, 16 and 256.
    ASTCEncode->m_ptindex = block_x + 16 * block_y + 256 * block_z;

    generate_partition_tables(block_x, block_y, block_z, ASTCEncode);
    return true;
}

}

//=====================================================================================================================================
// CPU Based Decoder code

extern ASTC_Encoder::ASTC_Encode g_ASTCEncode;

void initialize_decimation_table_2d_cpu(
    // dimensions of the block
    int xdim, int ydim,
    // number of grid points in 2d weight grid
    int x_weights, int y_weights, decimation_table_cpu * dt)

{
    int i, j;
    int x, y;

    int texels_per_block = xdim * ydim;
    int weights_per_block = x_weights * y_weights;

    int weightcount_of_texel[MAX_TEXELS_PER_BLOCK];
    int grid_weights_of_texel[MAX_TEXELS_PER_BLOCK][4];
    int weights_of_texel[MAX_TEXELS_PER_BLOCK][4];

    int texelcount_of_weight[MAX_WEIGHTS_PER_BLOCK];
    int texels_of_weight[MAX_WEIGHTS_PER_BLOCK][MAX_TEXELS_PER_BLOCK];
    int texelweights_of_weight[MAX_WEIGHTS_PER_BLOCK][MAX_TEXELS_PER_BLOCK];

    for (i = 0; i < weights_per_block; i++)
        texelcount_of_weight[i] = 0;
    for (i = 0; i < texels_per_block; i++)
        weightcount_of_texel[i] = 0;

    for (y = 0; y < ydim; y++)
        for (x = 0; x < xdim; x++) {
            int texel = y * xdim + x;

            int x_weight = (((1024 + xdim / 2) / (xdim - 1)) * x * (x_weights - 1) + 32) >> 6;
            int y_weight = (((1024 + ydim / 2) / (ydim - 1)) * y * (y_weights - 1) + 32) >> 6;

            int x_weight_frac = x_weight & 0xF;
            int y_weight_frac = y_weight & 0xF;
            int x_weight_int = x_weight >> 4;
            int y_weight_int = y_weight >> 4;
            int qweight[4];
            int weight[4];
            qweight[0] = x_weight_int + y_weight_int * x_weights;
            qweight[1] = qweight[0] + 1;
            qweight[2] = qweight[0] + x_weights;
            qweight[3] = qweight[2] + 1;

            // truncated-precision bilinear interpolation.
            int prod = x_weight_frac * y_weight_frac;

            weight[3] = (prod + 8) >> 4;
            weight[1] = x_weight_frac - weight[3];
            weight[2] = y_weight_frac - weight[3];
            weight[0] = 16 - x_weight_frac - y_weight_frac + weight[3];

            for (i = 0; i < 4; i++)
                if (weight[i] != 0) {
                    grid_weights_of_texel[texel][weightcount_of_texel[texel]] = qweight[i];
                    weights_of_texel[texel][weightcount_of_texel[texel]] = weight[i];
                    weightcount_of_texel[texel]++;
                    texels_of_weight[qweight[i]][texelcount_of_weight[qweight[i]]] = texel;
                    texelweights_of_weight[qweight[i]][texelcount_of_weight[qweight[i]]] = weight[i];
                    texelcount_of_weight[qweight[i]]++;
                }
        }

    for (i = 0; i < texels_per_block; i++) {
        dt->texel_num_weights[i] = (ASTC_Encoder::uint8_t)weightcount_of_texel[i];

        // ensure that all 4 entries are actually initialized.
        // This allows a branch-free implemntation of compute_value_of_texel_flt()
        for (j = 0; j < 4; j++) {
            dt->texel_weights_int[i][j] = 0;
            dt->texel_weights_float[i][j] = 0.0f;
            dt->texel_weights[i][j] = 0;
        }

        for (j = 0; j < weightcount_of_texel[i]; j++) {
            dt->texel_weights_int[i][j] = (ASTC_Encoder::uint8_t)weights_of_texel[i][j];
            dt->texel_weights_float[i][j] = (weights_of_texel[i][j]) * (1.0f / TEXEL_WEIGHT_SUM);
            dt->texel_weights[i][j] = (ASTC_Encoder::uint8_t)grid_weights_of_texel[i][j];
        }
    }

    for (i = 0; i < weights_per_block; i++) {
        dt->weight_num_texels[i] = (ASTC_Encoder::uint8_t)texelcount_of_weight[i];


        for (j = 0; j < texelcount_of_weight[i]; j++) {
            dt->weight_texel[i][j] = (ASTC_Encoder::uint8_t)texels_of_weight[i][j];
            dt->weights_int[i][j] = (ASTC_Encoder::uint8_t)texelweights_of_weight[i][j];
            dt->weights_flt[i][j] = (float)texelweights_of_weight[i][j];
        }
    }

    dt->num_texels = texels_per_block;
    dt->num_weights = weights_per_block;


}

// Populates `bsd` for a 2D block of xdim x ydim texels.
//
// Three things are built:
//   1. one decimation table per candidate weight grid (2..12 points per axis,
//      skipping grids with more than MAX_WEIGHTS_PER_BLOCK points);
//   2. the table of all 2048 block modes, each either marked as failed or
//      linked to its decimation mode and quantization mode;
//   3. the list of texels used for bitmap partitioning.
//
// Each decimation table is allocated with `new` and its pointer is stored in
// bsd->decimation_tables; nothing in this function releases it.
void construct_block_size_descriptor_2d_cpu(int xdim, int ydim, block_size_descriptor_cpu * bsd) {
    int decimation_mode_index[256];    // for each of the 256 entries in the decim_table_array, its index
    int decimation_mode_count = 0;

    int i;
    int x_weights;
    int y_weights;

    // -1 marks a (y_weights, x_weights) combination that has no decimation table.
    for (i = 0; i < 256; i++) {
        decimation_mode_index[i] = -1;
    }

    // gather all the infill-modes that can be used with the current block size
    for (x_weights = 2; x_weights <= 12; x_weights++)
        for (y_weights = 2; y_weights <= 12; y_weights++) {
            if (x_weights * y_weights > MAX_WEIGHTS_PER_BLOCK)
                continue;
            decimation_table_cpu *dt = new decimation_table_cpu;
            // the lookup key packs the grid size as y_weights * 16 + x_weights
            decimation_mode_index[y_weights * 16 + x_weights] = decimation_mode_count;
            initialize_decimation_table_2d_cpu(xdim, ydim, x_weights, y_weights, dt);

            int weight_count = x_weights * y_weights;

            // Highest quantization index (0..11) whose ISE bit count lies within
            // [MIN_WEIGHT_BITS_PER_BLOCK, MAX_WEIGHT_BITS_PER_BLOCK], for one and
            // for two weight planes; stays -1 when no index qualifies.
            int maxprec_1plane = -1;
            int maxprec_2planes = -1;
            for (i = 0; i < 12; i++) {
                int bits_1plane  = ASTC_Encoder::compute_ise_bitcount2(weight_count, (ASTC_Encoder::quantization_method)i);
                int bits_2planes = ASTC_Encoder::compute_ise_bitcount2(2 * weight_count, (ASTC_Encoder::quantization_method)i);
                if (bits_1plane >= MIN_WEIGHT_BITS_PER_BLOCK && bits_1plane <= MAX_WEIGHT_BITS_PER_BLOCK)
                    maxprec_1plane = i;
                if (bits_2planes >= MIN_WEIGHT_BITS_PER_BLOCK && bits_2planes <= MAX_WEIGHT_BITS_PER_BLOCK)
                    maxprec_2planes = i;
            }

            // a weight grid may only be used for encoding if it is no larger than the block
            bsd->permit_encode[decimation_mode_count] = (x_weights <= xdim && y_weights <= ydim);

            bsd->decimation_mode_samples[decimation_mode_count] = weight_count;
            bsd->decimation_mode_maxprec_1plane[decimation_mode_count] = maxprec_1plane;
            bsd->decimation_mode_maxprec_2planes[decimation_mode_count] = maxprec_2planes;
            bsd->decimation_tables[decimation_mode_count] = dt;

            decimation_mode_count++;
        }

    // start every percentile at 1.0; the block-mode loop below lowers it to the
    // smallest percentile of any block mode that uses the decimation mode.
    for (i = 0; i < MAX_DECIMATION_MODES; i++) {
        bsd->decimation_mode_percentile[i] = 1.0f;
    }

    // mark the unused tail of the decimation-mode arrays as not available
    for (i = decimation_mode_count; i < MAX_DECIMATION_MODES; i++) {
        bsd->permit_encode[i] = 0;
        bsd->decimation_mode_samples[i] = 0;
        bsd->decimation_mode_maxprec_1plane[i] = -1;
        bsd->decimation_mode_maxprec_2planes[i] = -1;
    }

    bsd->decimation_mode_count = decimation_mode_count;

    // indexed by block mode below
    const float *percentiles = ASTC_Encoder::get_2d_percentile_table_host(xdim, ydim);

    // then construct the list of block formats
    for (i = 0; i < 2048; i++) {
        int is_dual_plane;
        int quantization_mode;
        int fail = 0;
        int permit_encode = 1;

        // a block mode that decodes successfully but needs a weight grid larger
        // than the block is kept in the table with encode/decode disabled.
        if (ASTC_Encoder::decode_block_mode_2d(i, &x_weights, &y_weights, &is_dual_plane, &quantization_mode)) {
            if (x_weights > xdim || y_weights > ydim)
                permit_encode = 0;
        } else {
            fail = 1;
            permit_encode = 0;
        }

        if (fail) {
            bsd->block_modes[i].decimation_mode = -1;
            bsd->block_modes[i].quantization_mode = -1;
            bsd->block_modes[i].is_dual_plane = -1;
            bsd->block_modes[i].permit_encode = 0;
            bsd->block_modes[i].permit_decode = 0;
            bsd->block_modes[i].percentile = 1.0f;
        } else {
            // NOTE(review): assumes decode_block_mode_2d only succeeds for grid sizes
            // that received a decimation table above (index != -1) - confirm.
            int decimation_mode = decimation_mode_index[y_weights * 16 + x_weights];
            bsd->block_modes[i].decimation_mode = (uint8_t)decimation_mode;
            bsd->block_modes[i].quantization_mode = (uint8_t)quantization_mode;
            bsd->block_modes[i].is_dual_plane = (uint8_t)is_dual_plane;
            bsd->block_modes[i].permit_encode = (uint8_t)permit_encode;
            bsd->block_modes[i].permit_decode = (uint8_t)permit_encode;    // disallow decode of grid size larger than block size.
            bsd->block_modes[i].percentile = percentiles[i];

            if (bsd->decimation_mode_percentile[decimation_mode] > percentiles[i])
                bsd->decimation_mode_percentile[decimation_mode] = percentiles[i];
        }

    }

    // blocks with at most 64 texels use every texel for bitmap partitioning
    if (xdim * ydim <= 64) {
        bsd->texelcount_for_bitmap_partitioning = xdim * ydim;
        for (i = 0; i < xdim * ydim; i++)
            bsd->texels_for_bitmap_partitioning[i] = i;
    }

    else {
        // pick 64 random texels for use with bitmap partitioning.
        int arr[MAX_TEXELS_PER_BLOCK];
        for (i = 0; i < xdim * ydim; i++)
            arr[i] = 0;
        // keep drawing with rand() until 64 distinct texels have been flagged
        int arr_elements_set = 0;
        while (arr_elements_set < 64) {
            int idx = rand() % (xdim * ydim);
            if (arr[idx] == 0) {
                arr_elements_set++;
                arr[idx] = 1;
            }
        }
        // emit the flagged texels in ascending index order
        int texel_weights_written = 0;
        int idx = 0;
        while (texel_weights_written < 64) {
            if (arr[idx])
                bsd->texels_for_bitmap_partitioning[texel_weights_written++] = idx;
            idx++;
        }
        bsd->texelcount_for_bitmap_partitioning = 64;

    }
}

#ifdef ASTC_ENABLE_3D_SUPPORT
void construct_block_size_descriptor_3d(int xdim, int ydim, int zdim, block_size_descriptor * bsd) {
    int decimation_mode_index[512];    // for each of the 512 entries in the decim_table_array, its index
    int decimation_mode_count = 0;

    int i;
    int x_weights;
    int y_weights;
    int z_weights;

    for (i = 0; i < 512; i++) {
        decimation_mode_index[i] = -1;
    }

    // gather all the infill-modes that can be used with the current block size
    for (x_weights = 2; x_weights <= 6; x_weights++)
        for (y_weights = 2; y_weights <= 6; y_weights++)
            for (z_weights = 2; z_weights <= 6; z_weights++) {
                if ((x_weights * y_weights * z_weights) > MAX_WEIGHTS_PER_BLOCK)
                    continue;
                decimation_table *dt = new decimation_table;
                decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights] = decimation_mode_count;
                initialize_decimation_table_3d(xdim, ydim, zdim, x_weights, y_weights, z_weights, dt);

                int weight_count = x_weights * y_weights * z_weights;

                int maxprec_1plane = -1;
                int maxprec_2planes = -1;
                for (i = 0; i < 12; i++) {
                    int bits_1plane = compute_ise_bitcount(weight_count, (quantization_method)i);
                    int bits_2planes = compute_ise_bitcount(2 * weight_count, (quantization_method)i);
                    if (bits_1plane >= MIN_WEIGHT_BITS_PER_BLOCK && bits_1plane <= MAX_WEIGHT_BITS_PER_BLOCK)
                        maxprec_1plane = i;
                    if (bits_2planes >= MIN_WEIGHT_BITS_PER_BLOCK && bits_2planes <= MAX_WEIGHT_BITS_PER_BLOCK)
                        maxprec_2planes = i;
                }
                bsd->permit_encode[decimation_mode_count] = (x_weights <= xdim && y_weights <= ydim && z_weights <= zdim);

                bsd->decimation_mode_samples[decimation_mode_count] = weight_count;
                bsd->decimation_mode_maxprec_1plane[decimation_mode_count] = maxprec_1plane;
                bsd->decimation_mode_maxprec_2planes[decimation_mode_count] = maxprec_2planes;
                bsd->decimation_tables[decimation_mode_count] = *dt;    // NP code change!

                decimation_mode_count++;
            }

    for (i = 0; i < MAX_DECIMATION_MODES; i++) {
        bsd->decimation_mode_percentile[i] = 1.0f;
    }

    for (i = decimation_mode_count; i < MAX_DECIMATION_MODES; i++) {
        bsd->permit_encode[i] = 0;
        bsd->decimation_mode_samples[i] = 0;
        bsd->decimation_mode_maxprec_1plane[i] = -1;
        bsd->decimation_mode_maxprec_2planes[i] = -1;
    }

    bsd->decimation_mode_count = decimation_mode_count;

    const float *percentiles = get_3d_percentile_table(xdim, ydim, zdim);

    // then construct the list of block formats
    for (i = 0; i < 2048; i++) {
        int is_dual_plane;
        int quantization_mode;
        int fail = 0;
        int permit_encode = 1;

        if (decode_block_mode_3d(i, &x_weights, &y_weights, &z_weights, &is_dual_plane, &quantization_mode)) {
            if (x_weights > xdim || y_weights > ydim || z_weights > zdim)
                permit_encode = 0;
        } else

        {
            fail = 1;
            permit_encode = 0;
        }
        if (fail) {
            bsd->block_modes[i].decimation_mode = -1;
            bsd->block_modes[i].quantization_mode = -1;
            bsd->block_modes[i].is_dual_plane = -1;
            bsd->block_modes[i].permit_encode = 0;
            bsd->block_modes[i].permit_decode = 0;
            bsd->block_modes[i].percentile = 1.0f;
        } else {
            int decimation_mode = decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights];
            bsd->block_modes[i].decimation_mode = (uint8_t)decimation_mode;
            bsd->block_modes[i].quantization_mode = (uint8_t)quantization_mode;
            bsd->block_modes[i].is_dual_plane = (uint8_t)is_dual_plane;
            bsd->block_modes[i].permit_encode = (uint8_t)permit_encode;
            bsd->block_modes[i].permit_decode = (uint8_t)permit_encode;
            bsd->block_modes[i].percentile = percentiles[i];

            if (bsd->decimation_mode_percentile[decimation_mode] > percentiles[i])
                bsd->decimation_mode_percentile[decimation_mode] = percentiles[i];
        }

    }

    if (xdim * ydim * zdim <= 64) {
        bsd->texelcount_for_bitmap_partitioning = xdim * ydim * zdim;
        for (i = 0; i < xdim * ydim * zdim; i++)
            bsd->texels_for_bitmap_partitioning[i] = i;
    }

    else {
        // pick 64 random texels for use with bitmap partitioning.
        int arr[MAX_TEXELS_PER_BLOCK];
        for (i = 0; i < xdim * ydim * zdim; i++)
            arr[i] = 0;
        int arr_elements_set = 0;
        while (arr_elements_set < 64) {
            int idx = rand() % (xdim * ydim * zdim);
            if (arr[idx] == 0) {
                arr_elements_set++;
                arr[idx] = 1;
            }
        }
        int texel_weights_written = 0;
        int idx = 0;
        while (texel_weights_written < 64) {
            if (arr[idx])
                bsd->texels_for_bitmap_partitioning[texel_weights_written++] = idx;
            idx++;
        }
        bsd->texelcount_for_bitmap_partitioning = 64;
    }
}
#endif

// Cache of block size descriptors, indexed by xdim + (ydim << 4) + (zdim << 8).
static block_size_descriptor_cpu *bsd_pointers[4096];

// Returns the block size descriptor for the given block dimensions, building
// and caching it on first use. The cache is a plain static array with no
// locking, so this should not be called from within multithreaded code.
block_size_descriptor_cpu *get_block_size_descriptor_cpu(int xdim, int ydim, int zdim) {
    const int slot = xdim + (ydim << 4) + (zdim << 8);

    // Fast path: descriptor was already built for these dimensions.
    block_size_descriptor_cpu *cached = bsd_pointers[slot];
    if (cached != NULL)
        return cached;

    block_size_descriptor_cpu *fresh = new block_size_descriptor_cpu;
#ifdef ASTC_ENABLE_3D_SUPPORT
    if (zdim > 1)
        construct_block_size_descriptor_3d(xdim, ydim, zdim, fresh);
    else
#endif
        construct_block_size_descriptor_2d_cpu(xdim, ydim, fresh);

    bsd_pointers[slot] = fresh;
    return fresh;
}

// Unpacks one 16-byte physical ASTC block `pb` into its symbolic form `res`
// for a block footprint of xdim x ydim x zdim.
//
// res->error_block is cleared on entry and set to 1 whenever a validity check
// fails. Only the void-extent path and the "block mode not decodable" check
// return early; the remaining checks flag the error and keep decoding.
void physical_to_symbolic_cpu(int xdim, int ydim, int zdim, physical_compressed_block_cpu pb, symbolic_compressed_block_cpu * res) {
    uint8_t bswapped[16];
    int i, j;

    res->error_block = 0;

    // get hold of the block-size descriptor and the decimation tables.
    const block_size_descriptor_cpu *bsd = get_block_size_descriptor_cpu(xdim, ydim, zdim);
    const decimation_table_cpu *const *ixtab2 = bsd->decimation_tables;

    // extract header fields
    // (the block mode is the lowest 11 bits of the block)
    int block_mode = ASTC_Encoder::read_bits(11, 0, pb.data);


    if ((block_mode & 0x1FF) == 0x1FC) {
        // void-extent block!

        // check what format the data has
        if (block_mode & 0x200)
            res->block_mode = -1;    // floating-point
        else
            res->block_mode = -2;    // unorm16.

        // the constant color is four little-endian 16-bit values in bytes 8..15
        res->partition_count = 0;
        for (i = 0; i < 4; i++) {
            res->constant_color[i] = pb.data[2 * i + 8] | (pb.data[2 * i + 9] << 8);
        }

        // additionally, check that the void-extent coordinates are valid:
        // each low bound must be below its high bound, unless every field is all-ones.
        if (zdim == 1) {
            // 2D void-extent
            // bits 10..11 are expected to be all ones
            int rsvbits = ASTC_Encoder::read_bits(2, 10, pb.data);
            if (rsvbits != 3)
                res->error_block = 1;

            // four 13-bit fields, each assembled from an 8-bit and a 5-bit read
            int vx_low_s = ASTC_Encoder::read_bits(8, 12, pb.data) | (ASTC_Encoder::read_bits(5, 12 + 8, pb.data) << 8);
            int vx_high_s = ASTC_Encoder::read_bits(8, 25, pb.data) | (ASTC_Encoder::read_bits(5, 25 + 8, pb.data) << 8);
            int vx_low_t = ASTC_Encoder::read_bits(8, 38, pb.data) | (ASTC_Encoder::read_bits(5, 38 + 8, pb.data) << 8);
            int vx_high_t = ASTC_Encoder::read_bits(8, 51, pb.data) | (ASTC_Encoder::read_bits(5, 51 + 8, pb.data) << 8);

            int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;

            if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones)
                res->error_block = 1;
        } else {
            // 3D void-extent
            // six 9-bit fields starting at bit 10
            int vx_low_s = ASTC_Encoder::read_bits(9, 10, pb.data);
            int vx_high_s = ASTC_Encoder::read_bits(9, 19, pb.data);
            int vx_low_t = ASTC_Encoder::read_bits(9, 28, pb.data);
            int vx_high_t = ASTC_Encoder::read_bits(9, 37, pb.data);
            int vx_low_p = ASTC_Encoder::read_bits(9, 46, pb.data);
            int vx_high_p = ASTC_Encoder::read_bits(9, 55, pb.data);

            int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF;

            if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_p >= vx_high_p) && !all_ones)
                res->error_block = 1;
        }

        return;
    }

    // reject block modes the descriptor marks as not decodable for this block size
    if (bsd->block_modes[block_mode].permit_decode == 0) {
        res->error_block = 1;
        return;
    }

    int weight_count = ixtab2[bsd->block_modes[block_mode].decimation_mode]->num_weights;
    int weight_quantization_method = bsd->block_modes[block_mode].quantization_mode;
    int is_dual_plane = bsd->block_modes[block_mode].is_dual_plane;

    // with two weight planes, twice as many weight values are stored
    int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;

    int partition_count = ASTC_Encoder::read_bits(2, 11, pb.data) + 1;

    res->block_mode = block_mode;
    res->partition_count = partition_count;

    // build a copy of the block with the byte order reversed and the bits of
    // every byte reversed; the weights are decoded from bit 0 of this copy.
    for (i = 0; i < 16; i++)
        bswapped[i] = (uint8_t)ASTC_Encoder::bitrev8(pb.data[15 - i]);

    int bits_for_weights = ASTC_Encoder::compute_ise_bitcount2(real_weight_count,(ASTC_Encoder::quantization_method)weight_quantization_method);

    // bit position (in the un-reversed block) just below the weight data
    int below_weights_pos = 128 - bits_for_weights;

    if (is_dual_plane) {
        // the two planes are interleaved: even entries go to plane 1, odd to plane 2
        uint8_t indices[64];
        ASTC_Encoder::decode_ise(weight_quantization_method, real_weight_count, bswapped, indices, 0);
        for (i = 0; i < weight_count; i++) {
            res->plane1_weights[i] = indices[2 * i];
            res->plane2_weights[i] = indices[2 * i + 1];
        }
    } else {
        ASTC_Encoder::decode_ise(weight_quantization_method, weight_count, bswapped, res->plane1_weights, 0);
    }

    // dual-plane combined with 4 partitions is flagged as an error
    if (is_dual_plane && partition_count == 4)
        res->error_block = 1;


    res->color_formats_matched = 0;

    // then, determine the format of each endpoint pair
    int color_formats[4];
    int encoded_type_highpart_size = 0;
    if (partition_count == 1) {
        color_formats[0] = ASTC_Encoder::read_bits(4, 13, pb.data);
        res->partition_index = 0;
    } else {
        // the encoded type is split: 6 low bits after the partition index, and
        // (3 * partition_count - 4) high bits directly below the weight data.
        encoded_type_highpart_size = (3 * partition_count) - 4;
        below_weights_pos -= encoded_type_highpart_size;
        int encoded_type = ASTC_Encoder::read_bits(6, 13 + PARTITION_BITS, pb.data) | (ASTC_Encoder::read_bits(encoded_type_highpart_size, below_weights_pos, pb.data) << 6);
        int baseclass = encoded_type & 0x3;
        if (baseclass == 0) {
            // all partitions share one format; the high part is not used, so
            // the bit position and size adjustments made above are undone.
            for (i = 0; i < partition_count; i++) {
                color_formats[i] = (encoded_type >> 2) & 0xF;
            }
            below_weights_pos += encoded_type_highpart_size;
            res->color_formats_matched = 1;
            encoded_type_highpart_size = 0;
        } else {
            // per partition: one bit selecting the class relative to the base
            // class, followed (after all class bits) by two low format bits each.
            int bitpos = 2;
            baseclass--;
            for (i = 0; i < partition_count; i++) {
                color_formats[i] = (((encoded_type >> bitpos) & 1) + baseclass) << 2;
                bitpos++;
            }
            for (i = 0; i < partition_count; i++) {
                color_formats[i] |= (encoded_type >> bitpos) & 3;
                bitpos += 2;
            }
        }
        res->partition_index = ASTC_Encoder::read_bits(6, 13, pb.data) | (ASTC_Encoder::read_bits(PARTITION_BITS - 6, 19, pb.data) << 6);

    }
    for (i = 0; i < partition_count; i++)
        res->color_formats[i] = color_formats[i];


    // then, determine the number of integers we need to unpack for the endpoint pairs
    // (the endpoint class is the format's upper bits; each class adds two integers)
    int color_integer_count = 0;
    for (i = 0; i < partition_count; i++) {
        int endpoint_class = color_formats[i] >> 2;
        color_integer_count += (endpoint_class + 1) * 2;
    }

    if (color_integer_count > 18)
        res->error_block = 1;

    // then, determine the color endpoint format to use for these integers
    // (bits left for color data = per-partition-count budget minus the weight bits,
    //  the high part of the encoded type and, for dual plane, 2 more bits)
    static const int color_bits_arr[5] = { -1, 115 - 4, 113 - 4 - PARTITION_BITS, 113 - 4 - PARTITION_BITS, 113 - 4 - PARTITION_BITS };
    int color_bits = color_bits_arr[partition_count] - bits_for_weights - encoded_type_highpart_size;
    if (is_dual_plane)
        color_bits -= 2;
    if (color_bits < 0)
        color_bits = 0;

    // table lookup keyed by (number of endpoint pairs' integers / 2, available bits)
    int color_quantization_level = g_ASTCEncode.quantization_mode_table[color_integer_count >> 1][color_bits];
    res->color_quantization_level = color_quantization_level;
    if (color_quantization_level < 4)
        res->error_block = 1;


    // then unpack the integer-bits
    // (color data starts at bit 17 for one partition, at bit 19 + PARTITION_BITS otherwise)
    uint8_t values_to_decode[32];
    ASTC_Encoder::decode_ise(color_quantization_level, color_integer_count, pb.data, values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_BITS));

    // and distribute them over the endpoint types
    int valuecount_to_decode = 0;

    for (i = 0; i < partition_count; i++) {
        int vals = 2 * (color_formats[i] >> 2) + 2;
        for (j = 0; j < vals; j++)
            res->color_values[i][j] = values_to_decode[j + valuecount_to_decode];
        valuecount_to_decode += vals;
    }

    // get hold of color component for second-plane in the case of dual plane of weights.
    // (stored in the 2 bits directly below the weights / encoded-type high part)
    if (is_dual_plane)
        res->plane2_color_component = ASTC_Encoder::read_bits(2, below_weights_pos - 2, pb.data);

}

void imageblock_initialize_deriv_from_work_and_orig_cpu(imageblock_cpu * pb, int pixelcount) {
    int i;

    const float *fptr = pb->orig_data;
    const float *wptr = pb->work_data;
    float *dptr = pb->deriv_data;

    for (i = 0; i < pixelcount; i++) {

        // compute derivatives for RGB first
        if (pb->rgb_lns[i]) {
            float r = MAX(fptr[0], 6e-5f);
            float g = MAX(fptr[1], 6e-5f);
            float b = MAX(fptr[2], 6e-5f);

            float rderiv = (ASTC_Encoder::float_to_lns(r * 1.05f) - ASTC_Encoder::float_to_lns(r)) / (r * 0.05f);
            float gderiv = (ASTC_Encoder::float_to_lns(g * 1.05f) - ASTC_Encoder::float_to_lns(g)) / (g * 0.05f);
            float bderiv = (ASTC_Encoder::float_to_lns(b * 1.05f) - ASTC_Encoder::float_to_lns(b)) / (b * 0.05f);

            // the derivative may not actually take values smaller than 1/32 or larger than 2^25;
            // if it does, we clamp it.
            if (rderiv < (1.0f / 32.0f))
                rderiv = (1.0f / 32.0f);
            else if (rderiv > 33554432.0f)
                rderiv = 33554432.0f;

            if (gderiv < (1.0f / 32.0f))
                gderiv = (1.0f / 32.0f);
            else if (gderiv > 33554432.0f)
                gderiv = 33554432.0f;

            if (bderiv < (1.0f / 32.0f))
                bderiv = (1.0f / 32.0f);
            else if (bderiv > 33554432.0f)
                bderiv = 33554432.0f;

            dptr[0] = rderiv;
            dptr[1] = gderiv;
            dptr[2] = bderiv;
        } else {
            dptr[0] = 65535.0f;
            dptr[1] = 65535.0f;
            dptr[2] = 65535.0f;
        }


        // then compute derivatives for Alpha
        if (pb->alpha_lns[i]) {
            float a = MAX(fptr[3], 6e-5f);
            float aderiv = (ASTC_Encoder::float_to_lns(a * 1.05f) - ASTC_Encoder::float_to_lns(a)) / (a * 0.05f);
            // the derivative may not actually take values smaller than 1/32 or larger than 2^25;
            // if it does, we clamp it.
            if (aderiv < (1.0f / 32.0f))
                aderiv = (1.0f / 32.0f);
            else if (aderiv > 33554432.0f)
                aderiv = 33554432.0f;

            dptr[3] = aderiv;
        } else {
            dptr[3] = 65535.0f;
        }

        fptr += 4;
        wptr += 4;
        dptr += 4;
    }
}

// Derives pb->work_data from pb->orig_data for the first `pixelcount` texels
// (4 floats per texel), then refreshes the derivative data.
//
// Channels flagged as LNS (pb->rgb_lns for RGB, pb->alpha_lns for alpha) are
// converted with float_to_lns(); all other channels are scaled by 65535.
void imageblock_initialize_work_from_orig_cpu(imageblock_cpu * pb, int pixelcount) {
    for (int texel = 0; texel < pixelcount; texel++) {
        const int base = 4 * texel;

        // colour channels share a single LNS flag per texel
        if (pb->rgb_lns[texel]) {
            for (int c = 0; c < 3; c++)
                pb->work_data[base + c] = ASTC_Encoder::float_to_lns(pb->orig_data[base + c]);
        } else {
            for (int c = 0; c < 3; c++)
                pb->work_data[base + c] = pb->orig_data[base + c] * 65535.0f;
        }

        // alpha has its own flag
        if (pb->alpha_lns[texel])
            pb->work_data[base + 3] = ASTC_Encoder::float_to_lns(pb->orig_data[base + 3]);
        else
            pb->work_data[base + 3] = pb->orig_data[base + 3] * 65535.0f;
    }

    imageblock_initialize_deriv_from_work_and_orig_cpu(pb, pixelcount);
}

// Scans pb->work_data over the xdim * ydim * zdim texels of the block and
// stores the per-channel minimum and maximum in `pb`, together with a
// grayscale flag that is 1 only if red == green == blue for every texel.
void update_imageblock_flags_cpu(imageblock_cpu * pb, int xdim, int ydim, int zdim) {
    // channel order: 0 = red, 1 = green, 2 = blue, 3 = alpha
    float lowest[4] = { 1e38f, 1e38f, 1e38f, 1e38f };
    float highest[4] = { -1e38f, -1e38f, -1e38f, -1e38f };
    int all_gray = 1;

    const int texel_total = xdim * ydim * zdim;

    for (int texel = 0; texel < texel_total; texel++) {
        const float *px = &pb->work_data[4 * texel];

        for (int c = 0; c < 4; c++) {
            if (px[c] < lowest[c])
                lowest[c] = px[c];
            if (px[c] > highest[c])
                highest[c] = px[c];
        }

        // one texel with differing colour channels clears the flag for the block
        if (!(px[0] == px[1] && px[0] == px[2]))
            all_gray = 0;
    }

    pb->red_min = lowest[0];
    pb->red_max = highest[0];
    pb->green_min = lowest[1];
    pb->green_max = highest[1];
    pb->blue_min = lowest[2];
    pb->blue_max = highest[2];
    pb->alpha_min = lowest[3];
    pb->alpha_max = highest[3];
    pb->grayscale = all_gray;
}

// Fetches one imageblock from an image.
//
// Copies the xdim * ydim * zdim texels whose origin is (xpos, ypos, zpos), given
// in unpadded image coordinates, into pb->orig_data as floats (8-bit value / 255).
// Texel coordinates that fall outside the padded image are clamped to its edge.
// Afterwards every texel gets its LNS flags from g_ASTCEncode, its NaN flag is
// cleared, and the work data and block flags are derived from the fetched data.
//
// Only 8-bit images are read; when img->imagedata8 is null pb->orig_data is not
// written by this function before the derived data is computed.
void fetch_imageblock_cpu(
    const astc_codec_image_cpu * img,
    imageblock_cpu * pb,    // picture-block to initialize with image data
    // block dimensions
    int xdim, int ydim, int zdim,
    // position in texture.
    int xpos, int ypos, int zpos
) {
    // Extents of the stored image including its border. The Z axis only
    // carries a border when the image is a volume (zsize != 1).
    const int pad = img->padding;
    const int stored_w = img->xsize + 2 * pad;
    const int stored_h = img->ysize + 2 * pad;
    const int stored_d = (img->zsize == 1) ? 1 : img->zsize + 2 * pad;

    // The block records its position in unpadded coordinates.
    pb->xpos = xpos;
    pb->ypos = ypos;
    pb->zpos = zpos;

    // Origin of the block inside the padded storage.
    const int origin_x = xpos + pad;
    const int origin_y = ypos + pad;
    const int origin_z = (img->zsize > 1) ? zpos + pad : zpos;

    // clamp a coordinate to the picture.
    const auto clamp_index = [](int v, int extent) {
        if (v < 0)
            v = 0;
        if (v >= extent)
            v = extent - 1;
        return v;
    };

    if (img->imagedata8) {
        float *dst = pb->orig_data;
        for (int dz = 0; dz < zdim; dz++) {
            const int zi = clamp_index(origin_z + dz, stored_d);
            for (int dy = 0; dy < ydim; dy++) {
                const int yi = clamp_index(origin_y + dy, stored_h);
                for (int dx = 0; dx < xdim; dx++) {
                    const int xi = clamp_index(origin_x + dx, stored_w);
                    const uint8_t *src = img->imagedata8[zi][yi] + 4 * xi;

                    // 8-bit RGBA -> float
                    dst[0] = src[0] / 255.0f;
                    dst[1] = src[1] / 255.0f;
                    dst[2] = src[2] / 255.0f;
                    dst[3] = src[3] / 255.0f;
                    dst += 4;
                }
            }
        }
    }

    //------------------------------------------
    // HDR currently not supported in code
    /*
    else if (img->imagedata16)
    {
        float *dst = pb->orig_data;
        for (int dz = 0; dz < zdim; dz++)
        for (int dy = 0; dy < ydim; dy++)
        for (int dx = 0; dx < xdim; dx++)
        {
            // clamp XYZ coordinates to the picture.
            // (the disabled original compared zi against ysize here)
            const int xi = clamp_index(origin_x + dx, stored_w);
            const int yi = clamp_index(origin_y + dy, stored_h);
            const int zi = clamp_index(origin_z + dz, stored_d);

            int r = img->imagedata16[zi][yi][4 * xi];
            int g = img->imagedata16[zi][yi][4 * xi + 1];
            int b = img->imagedata16[zi][yi][4 * xi + 2];
            int a = img->imagedata16[zi][yi][4 * xi + 3];

            float rf = sf16_to_float((sf16)r);
            float gf = sf16_to_float((sf16)g);
            float bf = sf16_to_float((sf16)b);
            float af = sf16_to_float((sf16)a);

            // equalize the color components somewhat, and get rid of negative values.
            rf = MAX(rf, 1e-8f);
            gf = MAX(gf, 1e-8f);
            bf = MAX(bf, 1e-8f);
            af = MAX(af, 1e-8f);

            dst[0] = rf;
            dst[1] = gf;
            dst[2] = bf;
            dst[3] = af;
            dst += 4;
        }
    }
    */

    const int pixelcount = xdim * ydim * zdim;

    // impose the encoder-wide HDR choice on every pixel when encoding.
    const uint8_t rgb_flag   = (uint8_t)g_ASTCEncode.m_rgb_force_use_of_hdr;
    const uint8_t alpha_flag = (uint8_t)g_ASTCEncode.m_alpha_force_use_of_hdr;
    for (int p = 0; p < pixelcount; p++) {
        pb->rgb_lns[p]   = rgb_flag;
        pb->alpha_lns[p] = alpha_flag;
        pb->nan_texel[p] = 0;
    }

    imageblock_initialize_work_from_orig_cpu(pb, pixelcount);
    update_imageblock_flags_cpu(pb, xdim, ydim, zdim);
}

// Releases an image together with its texel storage. A null img is ignored.
//
// Each non-null plane set consists of three arrays, all of which are freed:
// the texel storage ([0][0]), the table of row pointers ([0]) and the table
// of slice pointers.
void destroy_image_cpu(astc_codec_image_cpu * img) {
    if (img != NULL) {
        if (uint8_t ***planes8 = img->imagedata8) {
            delete[] planes8[0][0];
            delete[] planes8[0];
            delete[] planes8;
        }

        if (uint16_t ***planes16 = img->imagedata16) {
            delete[] planes16[0][0];
            delete[] planes16[0];
            delete[] planes16;
        }

        delete img;
    }
}

// Allocates the three-level [z][y][4 * x] pointer structure used for image texels.
//
// Three arrays are created: the texel storage (4 channels per texel), a table
// with one row pointer per (z, y) pair, and a table with one pointer per slice
// into the row table. planes[0][0] is the start of the texel storage and
// planes[0] the start of the row table, which is how destroy_image_cpu() finds
// them again. All sizes are computed in size_t so that large images do not
// overflow int arithmetic.
template <typename Texel>
static Texel ***allocate_image_planes_cpu(int exsize, int eysize, int ezsize) {
    const size_t row_len = 4 * static_cast<size_t>(exsize);
    const size_t rows    = static_cast<size_t>(eysize);
    const size_t slices  = static_cast<size_t>(ezsize);

    Texel ***slice_table = new Texel **[slices];
    Texel **row_table    = new Texel *[slices * rows];
    Texel *texels        = new Texel[slices * rows * row_len];

    for (size_t z = 0; z < slices; z++) {
        slice_table[z] = row_table + z * rows;
        for (size_t y = 0; y < rows; y++)
            slice_table[z][y] = texels + (z * rows + y) * row_len;
    }
    return slice_table;
}

// Creates an image of xsize * ysize * zsize RGBA texels with a border of
// `padding` texels on each side in X and Y, and also in Z unless zsize == 1.
//
// bitness selects the texel storage: 8 fills imagedata8 (imagedata16 is null),
// 16 fills imagedata16 (imagedata8 is null). Any other value yields nullptr.
// The texel contents are not initialized here. The result is meant to be
// released with destroy_image_cpu().
astc_codec_image_cpu *allocate_image_cpu(int bitness, int xsize, int ysize, int zsize, int padding) {
    // Reject unsupported depths before anything is allocated; the previous
    // order leaked the image header on this path.
    if (bitness != 8 && bitness != 16)
        return nullptr;

    astc_codec_image_cpu *img = new astc_codec_image_cpu;
    img->xsize = xsize;
    img->ysize = ysize;
    img->zsize = zsize;
    img->padding = padding;
    img->imagedata8 = NULL;
    img->imagedata16 = NULL;

    // Extents including the border.
    const int exsize = xsize + 2 * padding;
    const int eysize = ysize + 2 * padding;
    const int ezsize = (zsize == 1) ? 1 : zsize + 2 * padding;

    if (bitness == 8)
        img->imagedata8 = allocate_image_planes_cpu<uint8_t>(exsize, eysize, ezsize);
    else
        img->imagedata16 = allocate_image_planes_cpu<uint16_t>(exsize, eysize, ezsize);

    return img;
}

// Sets every texel of an image, border included, to RGB = 0 with the alpha
// channel set to 0xFF for 8-bit storage or 0x3C00 for 16-bit storage
// (0x3C00 is 1.0 when the value is read as a half float).
// If both storages are present only the 8-bit one is written.
void initialize_image_cpu(astc_codec_image_cpu * img) {
    // Extents including the border; a 2D image (zsize == 1) has no Z border.
    const int border = 2 * img->padding;
    const int width  = img->xsize + border;
    const int height = img->ysize + border;
    const int depth  = (img->zsize == 1) ? 1 : img->zsize + border;

    if (img->imagedata8) {
        for (int z = 0; z < depth; z++) {
            for (int y = 0; y < height; y++) {
                uint8_t *texel = img->imagedata8[z][y];
                for (int x = 0; x < width; x++, texel += 4) {
                    texel[0] = 0;
                    texel[1] = 0;
                    texel[2] = 0;
                    texel[3] = 0xFF;
                }
            }
        }
        return;
    }

    if (img->imagedata16) {
        for (int z = 0; z < depth; z++) {
            for (int y = 0; y < height; y++) {
                uint16_t *texel = img->imagedata16[z][y];
                for (int x = 0; x < width; x++, texel += 4) {
                    texel[0] = 0;
                    texel[1] = 0;
                    texel[2] = 0;
                    texel[3] = 0x3C00;
                }
            }
        }
    }
}


// Fills the swizzle source table of write_imageblock_cpu() for one texel.
//
//   data[0..2]  RGB taken from fptr; when USE_PERFORMM_SRGB_TRANSFORM is
//               compiled in and g_ASTCEncode.m_perform_srgb_transform is set,
//               values <= 1 are passed through the sRGB encoding curve first
//   data[3]     alpha taken from fptr
//   data[6]     sqrt(max(0, 1 - x*x - y*y)) * 0.5 + 0.5, where x and y are
//               data[0] and data[3] remapped from [0,1] to [-1,1]
//
// data[4] and data[5] are not touched.
static void write_imageblock_prepare_texel_cpu(const float *fptr, float *data) {
#ifdef USE_PERFORMM_SRGB_TRANSFORM
    if (g_ASTCEncode.m_perform_srgb_transform) {
        for (int c = 0; c < 3; c++) {
            float v = fptr[c];
            if (v <= 0.0031308f)
                v = v * 12.92f;
            else if (v <= 1)
                v = (float)(1.055f * pow(v, (1.0f / 2.4f)) - 0.055f);
            data[c] = v;
        }
    } else
#endif
    {
        data[0] = fptr[0];
        data[1] = fptr[1];
        data[2] = fptr[2];
    }
    data[3] = fptr[3];

    const float nx = (data[0] * 2.0f) - 1.0f;
    const float ny = (data[3] * 2.0f) - 1.0f;
    float nz_squared = 1.0f - nx * nx - ny * ny;
    if (nz_squared < 0.0f)
        nz_squared = 0.0f;
    data[6] = (float)((sqrt(nz_squared) * 0.5f) + 0.5f);
}

// Writes the texels of an imageblock into an image.
//
// The block covers xdim * ydim * zdim texels starting at (xpos, ypos, zpos);
// these coordinates index the image storage directly and texels outside
// [0, xsize) x [0, ysize) x [0, zsize) are skipped. Each output channel is
// picked through swz from a seven-entry table: 0-3 are R, G, B, A of the
// texel, 4 is the constant 0, 5 is the constant 1 and 6 is the reconstructed
// Z value (see write_imageblock_prepare_texel_cpu).
//
// 8-bit images: NaN texels are written as (0xFF, 0x00, 0xFF, 0xFF); other
// texels have R, G, B, A limited to at most 1 and are stored as
// floor(v * 255 + 0.5). 16-bit images: NaN texels are written as 0xFFFF in
// every channel; other texels are stored via float_to_sf16. If both storages
// exist only the 8-bit one is written.
void write_imageblock_cpu(astc_codec_image_cpu * img, const imageblock_cpu * pb,
                          int xdim, int ydim, int zdim,
                          int xpos, int ypos, int zpos, swizzlepattern_cpu swz) {
    if (!img->imagedata8 && !img->imagedata16)
        return;
    const bool eight_bit = img->imagedata8 ? true : false;

    const int xsize = img->xsize;
    const int ysize = img->ysize;
    const int zsize = img->zsize;

    float data[7];
    data[4] = 0.0f;
    data[5] = 1.0f;

    // Both cursors advance once per block texel, whether or not it is written.
    const float *texel = pb->orig_data;
    const uint8_t *is_nan = pb->nan_texel;

    for (int dz = 0; dz < zdim; dz++) {
        for (int dy = 0; dy < ydim; dy++) {
            for (int dx = 0; dx < xdim; dx++, texel += 4, is_nan++) {
                const int xi = xpos + dx;
                const int yi = ypos + dy;
                const int zi = zpos + dz;

                const bool inside = xi >= 0 && yi >= 0 && zi >= 0 && xi < xsize && yi < ysize && zi < zsize;
                if (!inside)
                    continue;

                if (eight_bit) {
                    uint8_t *out = &img->imagedata8[zi][yi][4 * xi];

                    if (*is_nan) {
                        // NaN-pixel, but we can't display it. Display purple instead.
                        out[0] = 0xFF;
                        out[1] = 0x00;
                        out[2] = 0xFF;
                        out[3] = 0xFF;
                        continue;
                    }

                    write_imageblock_prepare_texel_cpu(texel, data);

                    // limit R, G, B, A to at most 1 (applied after data[6] was derived)
                    for (int c = 0; c < 4; c++) {
                        if (data[c] > 1.0f)
                            data[c] = 1.0f;
                    }

                    // apply swizzle and pack the data
                    const int ri = static_cast < int >(floor(data[swz.r] * 255.0f + 0.5f));
                    const int gi = static_cast < int >(floor(data[swz.g] * 255.0f + 0.5f));
                    const int bi = static_cast < int >(floor(data[swz.b] * 255.0f + 0.5f));
                    const int ai = static_cast < int >(floor(data[swz.a] * 255.0f + 0.5f));

                    out[0] = (uint8_t)ri;
                    out[1] = (uint8_t)gi;
                    out[2] = (uint8_t)bi;
                    out[3] = (uint8_t)ai;
                } else {
                    uint16_t *out = &img->imagedata16[zi][yi][4 * xi];

                    if (*is_nan) {
                        out[0] = 0xFFFF;
                        out[1] = 0xFFFF;
                        out[2] = 0xFFFF;
                        out[3] = 0xFFFF;
                        continue;
                    }

                    write_imageblock_prepare_texel_cpu(texel, data);

                    // apply swizzle and convert to 16-bit storage
                    const int r = ASTC_Encoder::float_to_sf16(data[swz.r],  ASTC_Encoder::SF_NEARESTEVEN);
                    const int g = ASTC_Encoder::float_to_sf16(data[swz.g],  ASTC_Encoder::SF_NEARESTEVEN);
                    const int b = ASTC_Encoder::float_to_sf16(data[swz.b],  ASTC_Encoder::SF_NEARESTEVEN);
                    const int a = ASTC_Encoder::float_to_sf16(data[swz.a],  ASTC_Encoder::SF_NEARESTEVEN);

                    out[0] = (uint16_t)r;
                    out[1] = (uint16_t)g;
                    out[2] = (uint16_t)b;
                    out[3] = (uint16_t)a;
                }
            }
        }
    }
}

// Scrambles a 32-bit value with a fixed sequence of shifts, xors, one add and
// one multiply; all arithmetic wraps modulo 2^32. An input of 0 maps to 0.
uint32_t hash52_cpu(uint32_t inp) {
    uint32_t mixed = inp ^ (inp >> 15);

    // 0xEEDE0891 equals -(2^4+1)*(2^7+1)*(2^17-1) modulo 2^32
    mixed = mixed * 0xEEDE0891u;
    mixed = mixed ^ (mixed >> 5);
    mixed = mixed + (mixed << 16);
    mixed = mixed ^ (mixed >> 7);
    mixed = mixed ^ (mixed >> 3);
    mixed = mixed ^ (mixed << 6);
    mixed = mixed ^ (mixed >> 17);
    return mixed;
}

// Returns the partition (0..3) that texel (x, y, z) belongs to for the given
// partition seed and partition count.
//
// The seed is offset by (partitioncount - 1) * 1024 and hashed with
// hash52_cpu(). Twelve 4-bit fields of the hash are squared, shifted down and
// used as coefficients of four linear functions of x, y and z. Each function
// is reduced to 6 bits, functions beyond partitioncount are forced to 0, and
// the index of the largest one is returned (the lowest index wins ties).
// When small_block is non-zero the coordinates are doubled first.
int select_partition_cpu(int seed, int x, int y, int z, int partitioncount, int small_block) {
    if (small_block) {
        x <<= 1;
        y <<= 1;
        z <<= 1;
    }

    seed += (partitioncount - 1) * 1024;

    const uint32_t rnum = hash52_cpu(seed);

    // coef[k] plays the role of "seed(k+1)": the first eleven are plain
    // nibbles of the hash, the twelfth wraps around the top of the word.
    static const int nibble_shift[11] = { 0, 4, 8, 12, 16, 20, 24, 28, 18, 22, 26 };
    uint8_t coef[12];
    for (int k = 0; k < 11; k++)
        coef[k] = (rnum >> nibble_shift[k]) & 0xF;
    coef[11] = ((rnum >> 30) | (rnum << 2)) & 0xF;

    // squaring all the seeds in order to bias their distribution
    // towards lower values.
    for (int k = 0; k < 12; k++)
        coef[k] *= coef[k];

    // Shift amounts depend on the low seed bits and on the partition count.
    const int seed_shift  = (seed & 2 ? 4 : 5);
    const int count_shift = (partitioncount == 3 ? 6 : 5);
    int sh1, sh2;
    if (seed & 1) {
        sh1 = seed_shift;
        sh2 = count_shift;
    } else {
        sh1 = count_shift;
        sh2 = seed_shift;
    }
    const int sh3 = (seed & 0x10) ? sh1 : sh2;

    // x coefficients (even slots) use sh1, y coefficients (odd slots) use sh2,
    // z coefficients (last four slots) use sh3.
    for (int k = 0; k < 8; k++)
        coef[k] >>= (k & 1) ? sh2 : sh1;
    for (int k = 8; k < 12; k++)
        coef[k] >>= sh3;

    // Four linear functions with the saw applied: only the low 6 bits are kept.
    int level[4];
    level[0] = (coef[0] * x + coef[1] * y + coef[10] * z + (rnum >> 14)) & 0x3F;
    level[1] = (coef[2] * x + coef[3] * y + coef[11] * z + (rnum >> 10)) & 0x3F;
    level[2] = (coef[4] * x + coef[5] * y + coef[8] * z + (rnum >> 6)) & 0x3F;
    level[3] = (coef[6] * x + coef[7] * y + coef[9] * z + (rnum >> 2)) & 0x3F;

    // remove some of the components if we are to output < 4 partitions.
    for (int k = 3; k >= 1; k--) {
        if (partitioncount <= k)
            level[k] = 0;
    }

    // The first function that is >= every later one selects the partition.
    for (int k = 0; k < 3; k++) {
        bool dominates = true;
        for (int j = k + 1; j < 4; j++) {
            if (level[k] < level[j])
                dominates = false;
        }
        if (dominates)
            return k;
    }
    return 3;
}

// Builds one partition table entry for a block size, partition count and
// partition index, writing into *pt:
//   partition_of_texel    partition of every texel, x fastest, then y, then z
//   texels_of_partition   texel indices of each partition, in ascending order
//   texels_per_partition  number of texels in each of the four partitions
//   partition_count       index of the first empty partition, or 4 if none is empty
//   coverage_bitmaps      per partition, bit i set when texel i belongs to it,
//                         for the first texelcount_for_bitmap_partitioning texels
// Blocks with fewer than 32 texels are passed to select_partition_cpu() as
// "small" blocks.
void generate_one_partition_table_cpu(int xdim, int ydim, int zdim, int partition_count, int partition_index, partition_info_cpu * pt) {
    const int small_block = (xdim * ydim * zdim) < 32;

    // Assign a partition to every texel.
    int cursor = 0;
    for (int z = 0; z < zdim; z++) {
        for (int y = 0; y < ydim; y++) {
            for (int x = 0; x < xdim; x++) {
                pt->partition_of_texel[cursor++] =
                    (uint8_t)select_partition_cpu(partition_index, x, y, z, partition_count, small_block);
            }
        }
    }

    const int texels_per_block = xdim * ydim * zdim;

    // Bucket the texel indices by partition.
    int counts[4] = { 0, 0, 0, 0 };
    for (int t = 0; t < texels_per_block; t++) {
        const int part = pt->partition_of_texel[t];
        pt->texels_of_partition[part][counts[part]] = (uint8_t)t;
        counts[part]++;
    }

    for (int p = 0; p < 4; p++)
        pt->texels_per_partition[p] = (uint8_t)counts[p];

    // The effective partition count stops at the first empty partition.
    if (counts[0] == 0)
        pt->partition_count = 0;
    else if (counts[1] == 0)
        pt->partition_count = 1;
    else if (counts[2] == 0)
        pt->partition_count = 2;
    else if (counts[3] == 0)
        pt->partition_count = 3;
    else
        pt->partition_count = 4;

    for (int p = 0; p < 4; p++)
        pt->coverage_bitmaps[p] = 0ULL;

    const block_size_descriptor_cpu *bsd = get_block_size_descriptor_cpu(xdim, ydim, zdim);
    const int bitmap_texels = bsd->texelcount_for_bitmap_partitioning;
    for (int t = 0; t < bitmap_texels; t++) {
        const int part = pt->partition_of_texel[t];
        pt->coverage_bitmaps[part] |= 1ULL << t;
    }
}

// Rebuilds pb->orig_data from pb->work_data for the first pixelcount texels.
//
// Every work value is truncated to 16 bits, converted to a half float with
// lns_to_sf16 (when the texel's rgb_lns / alpha_lns flag is set) or with
// unorm16_to_sf16 (otherwise), and widened to float. The derivative data of
// the block is recomputed afterwards.
void imageblock_initialize_orig_from_work_cpu(imageblock_cpu * pb, int pixelcount) {
    for (int p = 0; p < pixelcount; p++) {
        const float *work = pb->work_data + 4 * p;
        float *orig = pb->orig_data + 4 * p;

        // colour channels share one flag
        for (int c = 0; c < 3; c++) {
            const uint16_t raw = (uint16_t)work[c];
            if (pb->rgb_lns[p])
                orig[c] = ASTC_Encoder::sf16_to_float(ASTC_Encoder::lns_to_sf16(raw));
            else
                orig[c] = ASTC_Encoder::sf16_to_float(ASTC_Encoder::unorm16_to_sf16(raw));
        }

        // alpha has its own flag
        const uint16_t raw_alpha = (uint16_t)work[3];
        if (pb->alpha_lns[p])
            orig[3] = ASTC_Encoder::sf16_to_float(ASTC_Encoder::lns_to_sf16(raw_alpha));
        else
            orig[3] = ASTC_Encoder::sf16_to_float(ASTC_Encoder::unorm16_to_sf16(raw_alpha));
    }

    imageblock_initialize_deriv_from_work_and_orig_cpu(pb, pixelcount);
}

// Unpacks one pair of colour endpoints and adapts it to the decode mode.
//
//   decode_mode         how the result is going to be decoded (LDR sRGB, LDR, HDR)
//   format              endpoint format (ASTC_Encoder::FMT_*)
//   quantization_level  quantization of the values in input
//   input               quantized endpoint values
//   rgb_hdr             out: 1 when the colour channels are HDR, else 0
//   alpha_hdr           out: 1 when the alpha channel is HDR, else 0
//   nan_endpoint        out: 1 when the endpoints must decode to NaN texels, else 0
//   output0, output1    out: the two endpoint colours
//
// Formats that report alpha_hdr == -1 get a fixed alpha: 0x7800 (flagged HDR)
// when g_ASTCEncode.m_alpha_force_use_of_hdr is set, 0x00FF (flagged LDR)
// otherwise. LDR channels are then multiplied by 257, which maps 0..255 onto
// 0..65535. In the two LDR decode modes HDR endpoints are replaced by a fixed
// error colour and both flags are reported as 0.
//
// An unknown format yields all-zero LDR endpoints instead of leaving the
// outputs and flags unset.
void unpack_color_endpoints_cpu(ASTC_Encoder::astc_decode_mode decode_mode, int format, int quantization_level,  int *input, int *rgb_hdr, int *alpha_hdr, int *nan_endpoint, ASTC_Encoder::ushort4 * output0, ASTC_Encoder::ushort4 * output1) {
    *nan_endpoint = 0;

    // LDR colour with LDR alpha is the common case; the HDR formats override it.
    *rgb_hdr = 0;
    *alpha_hdr = 0;

    switch (format) {
    case ASTC_Encoder::FMT_LUMINANCE:
        ASTC_Encoder::luminance_unpack(input, quantization_level, output0, output1);
        break;

    case ASTC_Encoder::FMT_LUMINANCE_DELTA:
        ASTC_Encoder::luminance_delta_unpack(input, quantization_level, output0, output1);
        break;

    case ASTC_Encoder::FMT_HDR_LUMINANCE_SMALL_RANGE:
        *rgb_hdr = 1;
        *alpha_hdr = -1;
        ASTC_Encoder::hdr_luminance_small_range_unpack(input, quantization_level, output0, output1);
        break;

    case ASTC_Encoder::FMT_HDR_LUMINANCE_LARGE_RANGE:
        *rgb_hdr = 1;
        *alpha_hdr = -1;
        ASTC_Encoder::hdr_luminance_large_range_unpack(input, quantization_level, output0, output1);
        break;

    case ASTC_Encoder::FMT_LUMINANCE_ALPHA:
        ASTC_Encoder::luminance_alpha_unpack(input, quantization_level, output0, output1);
        break;

    case ASTC_Encoder::FMT_LUMINANCE_ALPHA_DELTA:
        ASTC_Encoder::luminance_alpha_delta_unpack(input, quantization_level, output0, output1);
        break;

    case ASTC_Encoder::FMT_RGB_SCALE:
        ASTC_Encoder::rgb_scale_unpack(input, quantization_level, output0, output1);
        break;

    case ASTC_Encoder::FMT_RGB_SCALE_ALPHA:
        ASTC_Encoder::rgb_scale_alpha_unpack(input, quantization_level, output0, output1);
        break;

    case ASTC_Encoder::FMT_HDR_RGB_SCALE:
        *rgb_hdr = 1;
        *alpha_hdr = -1;
        ASTC_Encoder::hdr_rgbo_unpack3(input, quantization_level, output0, output1);
        break;

    case ASTC_Encoder::FMT_RGB:
        ASTC_Encoder::rgb_unpack(input, quantization_level, output0, output1);
        break;

    case ASTC_Encoder::FMT_RGB_DELTA:
        ASTC_Encoder::rgb_delta_unpack(input, quantization_level, output0, output1);
        break;

    case ASTC_Encoder::FMT_HDR_RGB:
        *rgb_hdr = 1;
        *alpha_hdr = -1;
        ASTC_Encoder::hdr_rgb_unpack3(input, quantization_level, output0, output1);
        break;

    case ASTC_Encoder::FMT_RGBA:
        ASTC_Encoder::rgba_unpack(input, quantization_level, output0, output1);
        break;

    case ASTC_Encoder::FMT_RGBA_DELTA:
        ASTC_Encoder::rgba_delta_unpack(input, quantization_level, output0, output1);
        break;

    case ASTC_Encoder::FMT_HDR_RGB_LDR_ALPHA:
        *rgb_hdr = 1;
        ASTC_Encoder::hdr_rgb_ldr_alpha_unpack3(input, quantization_level, output0, output1);
        break;

    case ASTC_Encoder::FMT_HDR_RGBA:
        *rgb_hdr = 1;
        *alpha_hdr = 1;
        ASTC_Encoder::hdr_rgb_hdr_alpha_unpack3(input, quantization_level, output0, output1);
        break;

    default:
        // Unknown format: nothing was unpacked, so give the endpoints a
        // defined value before they are scaled below.
        output0->x = 0;
        output0->y = 0;
        output0->z = 0;
        output0->w = 0;
        output1->x = 0;
        output1->y = 0;
        output1->z = 0;
        output1->w = 0;
        break;
    }

    // Formats that flagged alpha with -1 receive a fixed alpha value.
    if (*alpha_hdr == -1) {
        const bool hdr_alpha = g_ASTCEncode.m_alpha_force_use_of_hdr ? true : false;
        output0->w = hdr_alpha ? 0x7800 : 0x00FF;
        output1->w = hdr_alpha ? 0x7800 : 0x00FF;
        *alpha_hdr = hdr_alpha ? 1 : 0;
    }

    // Small helpers for the decode-mode fix-ups below.
    const auto widen_rgb = [&]() {
        output0->x *= 257;
        output0->y *= 257;
        output0->z *= 257;
        output1->x *= 257;
        output1->y *= 257;
        output1->z *= 257;
    };
    const auto widen_alpha = [&]() {
        output0->w *= 257;
        output1->w *= 257;
    };
    const auto set_both = [&](ASTC_Encoder::ushort x, ASTC_Encoder::ushort y, ASTC_Encoder::ushort z, ASTC_Encoder::ushort w) {
        output0->x = x;
        output0->y = y;
        output0->z = z;
        output0->w = w;
        output1->x = x;
        output1->y = y;
        output1->z = z;
        output1->w = w;
    };

    switch (decode_mode) {
    case ASTC_Encoder::DECODE_LDR_SRGB:
        if (*rgb_hdr == 1) {
            // HDR endpoints cannot be decoded in this mode: fixed error colour.
            set_both(0xFF00, 0x0000, 0xFF00, 0xFF00);
        } else {
            widen_rgb();
            widen_alpha();
        }
        *rgb_hdr = 0;
        *alpha_hdr = 0;
        break;

    case ASTC_Encoder::DECODE_LDR:
        if (*rgb_hdr == 1) {
            // HDR endpoints cannot be decoded in this mode: report NaN.
            set_both(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF);
            *nan_endpoint = 1;
        } else {
            widen_rgb();
            widen_alpha();
        }
        *rgb_hdr = 0;
        *alpha_hdr = 0;
        break;

    case ASTC_Encoder::DECODE_HDR:
        // Only the LDR parts need widening; the flags are left as unpacked.
        if (*rgb_hdr == 0)
            widen_rgb();
        if (*alpha_hdr == 0)
            widen_alpha();
        break;
    }
}


// Blends two endpoint colours with integer weights out of 64.
//
// Every channel uses `weight`, except the channel named by
// plane2_color_component (0 = x, 1 = y, 2 = z, 3 = w), which uses
// plane2_weight; any other component value leaves all channels on `weight`.
// The result is (color0 * (64 - w) + color1 * w + 32) >> 6. In
// DECODE_LDR_SRGB mode the endpoints are reduced to their top 8 bits before
// blending and the blended value v is returned as v | (v << 8).
ASTC_Encoder::ushort4 lerp_color_int(ASTC_Encoder::astc_decode_mode decode_mode, ASTC_Encoder::ushort4 color0, ASTC_Encoder::ushort4 color1, int weight, int plane2_weight, int plane2_color_component    // -1 in 1-plane mode
                                    ) {
    const bool srgb = (decode_mode == ASTC_Encoder::DECODE_LDR_SRGB);

    ASTC_Encoder::int4 end0 = ASTC_Encoder::int4(color0.x, color0.y, color0.z, color0.w);
    ASTC_Encoder::int4 end1 = ASTC_Encoder::int4(color1.x, color1.y, color1.z, color1.w);

    // Weight of color1 per channel; the second plane overrides one channel.
    ASTC_Encoder::int4 w1 = ASTC_Encoder::int4(weight, weight, weight, weight);
    if (plane2_color_component == 0)
        w1.x = plane2_weight;
    else if (plane2_color_component == 1)
        w1.y = plane2_weight;
    else if (plane2_color_component == 2)
        w1.z = plane2_weight;
    else if (plane2_color_component == 3)
        w1.w = plane2_weight;

    // Weight of color0 is the complement to 64.
    ASTC_Encoder::int4 w0 = ASTC_Encoder::int4(64, 64, 64, 64) - w1;

    if (srgb) {
        end0 = end0 >> 8;
        end1 = end1 >> 8;
    }

    // +32 rounds the division by 64 to nearest.
    ASTC_Encoder::int4 blended = (end0 * w0) + (end1 * w1) + ASTC_Encoder::int4(32, 32, 32, 32);
    blended = blended >> 6;
    if (srgb)
        blended = blended | (blended << 8);

    return ASTC_Encoder::ushort4((ASTC_Encoder::ushort)blended.x,
                                 (ASTC_Encoder::ushort)blended.y,
                                 (ASTC_Encoder::ushort)blended.z,
                                 (ASTC_Encoder::ushort)blended.w);
}

// Computes the weight of one texel from the stored weight grid.
//
// Sums weights[index] * factor over the texel_num_weights[texel_to_get]
// (index, factor) pairs the decimation table lists for the texel, starting
// from 8, and returns the sum shifted right by 4.
int compute_value_of_texel_int_cpu(int texel_to_get, const decimation_table_cpu * it, const int *weights) {
    const int contributions = it->texel_num_weights[texel_to_get];

    int total = 8;    // half of 16, so that the final >> 4 rounds to nearest
    for (int k = 0; k < contributions; k++) {
        const int source = it->texel_weights[texel_to_get][k];
        const int factor = it->texel_weights_int[texel_to_get][k];
        total += weights[source] * factor;
    }
    return total >> 4;
}

// Decodes a symbolic compressed block into an imageblock.
//
//   decode_mode       LDR sRGB, LDR or HDR decoding
//   xdim, ydim, zdim  dimensions of block
//   xpos, ypos, zpos  position of block, stored in blk
//   scb               the block to decode
//   blk               receives the decoded texels, LNS/NaN flags and block flags
//
// Three kinds of block are handled:
//   * error blocks (scb->error_block) decode to (1, 0, 1, 1) in LDR sRGB mode
//     and to NaN texels otherwise;
//   * constant-colour blocks (scb->block_mode < 0): mode -2 holds UNORM16
//     values; any other negative mode is decoded as FP16 in HDR mode and as
//     the error colour / NaN in the LDR modes;
//   * regular blocks: endpoints are unpacked per partition, the weights are
//     unquantized and expanded to one per texel, and every texel is
//     interpolated between the endpoints of its partition.
//
// A regular block whose partition count is outside 1..4 or whose partition
// index is outside 0..1023 is rejected: the function returns after storing
// only the block position.
void decompress_symbolic_block_cpu(ASTC_Encoder::astc_decode_mode decode_mode,
                                   int xdim, int ydim, int zdim,   // dimensions of block
                                   int xpos, int ypos, int zpos,   // position of block
                                   symbolic_compressed_block_cpu * scb,
                                   imageblock_cpu * blk) {
    blk->xpos = xpos;
    blk->ypos = ypos;
    blk->zpos = zpos;

    const int texels_per_block = xdim * ydim * zdim;

    // Paints every texel with one colour and one set of flags, then derives
    // the work data and the block flags from it.
    const auto emit_uniform_block = [&](float red, float green, float blue, float alpha, int use_lns, int use_nan) {
        for (int t = 0; t < texels_per_block; t++) {
            blk->orig_data[4 * t] = red;
            blk->orig_data[4 * t + 1] = green;
            blk->orig_data[4 * t + 2] = blue;
            blk->orig_data[4 * t + 3] = alpha;
            blk->rgb_lns[t] = (uint8_t)use_lns;
            blk->alpha_lns[t] = (uint8_t)use_lns;
            blk->nan_texel[t] = (uint8_t)use_nan;
        }
        imageblock_initialize_work_from_orig_cpu(blk, texels_per_block);
        update_imageblock_flags_cpu(blk, xdim, ydim, zdim);
    };

    // if we detected an error-block, blow up immediately.
    if (scb->error_block) {
        if (decode_mode == ASTC_Encoder::DECODE_LDR_SRGB)
            emit_uniform_block(1.0f, 0.0f, 1.0f, 1.0f, 0, 0);
        else
            emit_uniform_block(0.0f, 0.0f, 0.0f, 0.0f, 0, 1);
        return;
    }

    // constant-colour block
    if (scb->block_mode < 0) {
        float red = 0, green = 0, blue = 0, alpha = 0;
        int use_lns = 0;
        int use_nan = 0;

        if (scb->block_mode == -2) {
            // For sRGB decoding, we should return only the top 8 bits.
            int mask = (decode_mode == ASTC_Encoder::DECODE_LDR_SRGB) ? 0xFF00 : 0xFFFF;

            red     = ASTC_Encoder::sf16_to_float(ASTC_Encoder::unorm16_to_sf16((uint16_t)scb->constant_color[0] & mask));
            green   = ASTC_Encoder::sf16_to_float(ASTC_Encoder::unorm16_to_sf16((uint16_t)scb->constant_color[1] & mask));
            blue    = ASTC_Encoder::sf16_to_float(ASTC_Encoder::unorm16_to_sf16((uint16_t)scb->constant_color[2] & mask));
            alpha   = ASTC_Encoder::sf16_to_float(ASTC_Encoder::unorm16_to_sf16((uint16_t)scb->constant_color[3] & mask));
        } else {
            switch (decode_mode) {
            case ASTC_Encoder::DECODE_LDR_SRGB:
                red = 1.0f;
                green = 0.0f;
                blue = 1.0f;
                alpha = 1.0f;
                break;
            case ASTC_Encoder::DECODE_LDR:
                use_nan = 1;
                break;
            case ASTC_Encoder::DECODE_HDR:
                // constant-color block; unpack from FP16 to FP32.
                red     = ASTC_Encoder::sf16_to_float((sf16)scb->constant_color[0]);
                green   = ASTC_Encoder::sf16_to_float((sf16)scb->constant_color[1]);
                blue    = ASTC_Encoder::sf16_to_float((sf16)scb->constant_color[2]);
                alpha   = ASTC_Encoder::sf16_to_float((sf16)scb->constant_color[3]);
                use_lns = 1;
                break;
            }
        }

        emit_uniform_block(red, green, blue, alpha, use_lns, use_nan);
        return;
    }

    // Validate the partition selection before it is used as an array index.
    // The endpoint arrays below hold four partitions, so a count above 4 (the
    // old check allowed 5) would overrun them.
    // NOTE(review): the index bound assumes 1024 partition tables per count
    // (10-bit partition index) - confirm against the partition_tables declaration.
    const int partition_count = scb->partition_count;
    if (partition_count < 1 || partition_count > 4)
        return;
    if (scb->partition_index < 0 || scb->partition_index >= 1024)
        return;

    // get the appropriate block descriptor
    block_size_descriptor_cpu *bsd = get_block_size_descriptor_cpu(xdim, ydim, zdim);
    const decimation_table_cpu *it = bsd->decimation_tables[bsd->block_modes[scb->block_mode].decimation_mode];
    const int is_dual_plane = bsd->block_modes[scb->block_mode].is_dual_plane;
    const int weight_quantization_level = bsd->block_modes[scb->block_mode].quantization_mode;

    // decode the color endpoints
    ASTC_Encoder::ushort4 color_endpoint0[4];
    ASTC_Encoder::ushort4 color_endpoint1[4];
    int rgb_hdr_endpoint[4];
    int alpha_hdr_endpoint[4];
    int nan_endpoint[4];

    for (int p = 0; p < partition_count; p++)
        unpack_color_endpoints_cpu(
            decode_mode,
            scb->color_formats[p],
            scb->color_quantization_level,
            scb->color_values[p],
            &(rgb_hdr_endpoint[p]),
            &(alpha_hdr_endpoint[p]),
            &(nan_endpoint[p]),
            &(color_endpoint0[p]),
            &(color_endpoint1[p]));

    // first unquantize the weights; the second plane only exists in dual-plane mode
    int uq_plane1_weights[MAX_WEIGHTS_PER_BLOCK];
    int uq_plane2_weights[MAX_WEIGHTS_PER_BLOCK];
    const int weight_count = it->num_weights;

    const ASTC_Encoder::quantization_and_transfer_table *qat = &(ASTC_Encoder::quant_and_xfer_tables[weight_quantization_level]);

    for (int w = 0; w < weight_count; w++)
        uq_plane1_weights[w] = qat->unquantized_value[scb->plane1_weights[w]];
    if (is_dual_plane) {
        for (int w = 0; w < weight_count; w++)
            uq_plane2_weights[w] = qat->unquantized_value[scb->plane2_weights[w]];
    }

    const int plane2_color_component = is_dual_plane ? scb->plane2_color_component : -1;

    // now that we have endpoint colors and weights, we can un-decimate the
    // weights and unpack the actual color of each texel.
    for (int t = 0; t < texels_per_block; t++) {
        const ASTC_Encoder::uint8_t partition = g_ASTCEncode.partition_tables[partition_count][scb->partition_index].partition_of_texel[t];

        const int plane1_weight = compute_value_of_texel_int_cpu(t, it, uq_plane1_weights);
        // In single-plane mode no second weight exists; pass a defined 0,
        // which lerp_color_int ignores because the component is -1.
        const int plane2_weight = is_dual_plane ? compute_value_of_texel_int_cpu(t, it, uq_plane2_weights) : 0;

        ASTC_Encoder::ushort4 color = lerp_color_int(decode_mode,
                                      color_endpoint0[partition],
                                      color_endpoint1[partition],
                                      plane1_weight,
                                      plane2_weight,
                                      plane2_color_component);

        blk->rgb_lns[t] = (uint8_t)rgb_hdr_endpoint[partition];
        blk->alpha_lns[t] = (uint8_t)alpha_hdr_endpoint[partition];
        blk->nan_texel[t] = (uint8_t)nan_endpoint[partition];

        blk->work_data[4 * t] = color.x;
        blk->work_data[4 * t + 1] = color.y;
        blk->work_data[4 * t + 2] = color.z;
        blk->work_data[4 * t + 3] = color.w;
    }

    imageblock_initialize_orig_from_work_cpu(blk, texels_per_block);
    update_imageblock_flags_cpu(blk, xdim, ydim, zdim);
}

// End CPU Decoder Code
//-----------------------------------------------