// src/compiler/glsl/bc1.glsl - platform/external/mesa3d - Git at Google

 /*
  * Copyright 2020-2022 Matias N. Goldberg
  * Copyright 2022 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in
  * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  */

 #version 310 es

 #if defined(GL_ES) && GL_ES == 1
 	// Desktop GLSL allows the const keyword for either compile-time or
 	// run-time constants. GLSL ES only allows the keyword for compile-time
 	// constants. Since we use const on run-time constants, define it to
 	// nothing.
 	#define const
 #endif

 // NOTE(review): "%s" looks like a printf-style placeholder that the host code
 // replaces with the contents of CrossPlatformSettings_piece_all.glsl before the
 // shader is compiled - confirm against the code that loads this file. Names used
 // below but not declared here (float2/float3/float4, uint2/uint3/uint4, int2, max3,
 // OGRE_Load2D) presumably come from that include.
 %s // include "CrossPlatformSettings_piece_all.glsl"

 // Largest finite 32-bit float; used as the initial min/max sentinel in
 // OptimizeColorsBlock.
 #define FLT_MAX 340282346638528859811704183484516925440.0f

 // Upper bound on the number of RefineBlock passes main() runs per block.
 layout( location = 0 ) uniform uint p_numRefinements;

 // Source image; main() loads one 4x4 texel block per invocation from mip level 0.
 uniform sampler2D srcTex;

 // Destination image; main() stores one texel per encoded block, holding the two
 // endpoints and the two 16-bit halves of the index mask.
 layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture;

 // Lookup tables indexed by an 8-bit channel value. Component [0] is read for the
 // max endpoint and component [1] for the min endpoint; c_oMatch5 is used for
 // red/blue and c_oMatch6 for green.
 // NOTE(review): presumably stb_dxt's single-colour "OMatch5"/"OMatch6" tables; the
 // contents are provided by the host and are not visible here.
 layout( std430, binding = 1 ) readonly restrict buffer globalBuffer
 {
 	float2 c_oMatch5[256];
 	float2 c_oMatch6[256];
 };

 // 8x8 invocations per work group; each invocation encodes one 4x4 block (see main).
 layout( local_size_x = 8,  //
 		local_size_y = 8,  //
 		local_size_z = 1 ) in;

 /// Expands a packed RGB565 value (held in a float) to 8 bits per channel.
 ///
 /// The three bit fields are separated with floor/mod arithmetic and then widened the
 /// way stb_dxt does it: red/blue are scaled by 8.25 (= 256/32 + 8/32), green by
 /// 4.0625 (= 256/64 + 4/64), and the result is floored. That matches replicating
 /// the top bits of each field into the new low bits.
 ///
 /// The exactly rounded conversion would instead be
 ///		floor( c * ( 255 / float3( 31, 63, 31 ) ) + 0.5 )
 /// which differs by +/-1 for a handful of values. stb_dxt's variant is kept on
 /// purpose; see http://www.ludicon.com/castano/blog/2009/03/gpu-dxt-decompression/
 /// for a possible reason why decoders behave this way.
 float3 rgb565to888( float rgb565 )
 {
 	const float red = floor( rgb565 / 2048.0f );
 	const float green = floor( mod( rgb565, 2048.0f ) / 32.0f );
 	const float blue = floor( mod( rgb565, 32.0f ) );

 	const float3 expand = float3( 8.25f, 4.0625f, 8.25f );
 	return floor( float3( red, green, blue ) * expand );
 }

 /// Quantizes an 8-bit-per-channel colour (components expected in [0, 255]) to
 /// RGB565, rounding every channel to nearest, and returns the packed value as a
 /// float: red * 2048 + green * 32 + blue.
 float rgb888to565( float3 rgbValue )
 {
 	const float3 channelMax = float3( 31.0f, 63.0f, 31.0f );
 	const float3 q = floor( rgbValue * channelMax / 255.0f + 0.5f );

 	return q.r * 2048.0f + q.g * 32.0f + q.b;
 }

 // Returns the colour one third of the way from a to b. The rounding mode is chosen
 // at compile time through STB_DXT_USE_ROUNDING_BIAS.
 float3 lerp13( float3 a, float3 b )
 {
 #ifndef STB_DXT_USE_ROUNDING_BIAS
 	// default: plain floor of the weighted sum
 	const float3 weighted = 2.0f * a + b;
 	return floor( weighted / 3.0f );
 #else
 	// biased: round the step from a towards b to nearest
 	const float3 delta = b - a;
 	return a + floor( delta * ( 1.0f / 3.0f ) + 0.5f );
 #endif
 }

 /// Builds the 4-entry palette of a block from its two RGB565 endpoints:
 /// entries 0 and 1 are the decoded endpoints c0 and c1, entry 2 lies one third of
 /// the way from c0 to c1 and entry 3 one third of the way from c1 to c0.
 void EvalColors( out float3 colours[4], float c0, float c1 )
 {
 	const float3 first = rgb565to888( c0 );
 	const float3 second = rgb565to888( c1 );

 	colours[0] = first;
 	colours[1] = second;
 	colours[2] = lerp13( first, second );
 	colours[3] = lerp13( second, first );
 }

 /** Chooses the two endpoint colours of a block. (Clever code, part 1)
 	Builds the covariance matrix of the block's colours around their rounded mean,
 	runs a few power iterations starting from the bounding-box diagonal to estimate
 	the principal axis, then returns the pixels with the smallest and the largest
 	projection onto that axis.
 @param srcPixelsBlock
 	The 16 pixels of the block, each packed with packUnorm4x8
 @param outMinEndp16 [out]
 	Pixel with the smallest projection, in RGB565
 @param outMaxEndp16 [out]
 	Pixel with the largest projection, in RGB565
 */
 void OptimizeColorsBlock( const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16 )
 {
 	// Channel sums and bounding box, accumulated in [0, 1] units
 	const float3 firstUnorm = unpackUnorm4x8( srcPixelsBlock[0] ).xyz;
 	float3 sumUnorm = firstUnorm;
 	float3 lowUnorm = firstUnorm;
 	float3 highUnorm = firstUnorm;
 	for( int px = 1; px < 16; ++px )
 	{
 		const float3 texel = unpackUnorm4x8( srcPixelsBlock[px] ).xyz;
 		sumUnorm += texel;
 		lowUnorm = min( lowUnorm, texel );
 		highUnorm = max( highUnorm, texel );
 	}

 	// Switch to [0, 255] units from here on
 	const float3 mean = round( sumUnorm * 255.0f / 16.0f );
 	const float3 boxMax = highUnorm * 255.0f;
 	const float3 boxMin = lowUnorm * 255.0f;

 	// Upper triangle of the symmetric 3x3 covariance matrix
 	float covRR = 0.0f;
 	float covRG = 0.0f;
 	float covRB = 0.0f;
 	float covGG = 0.0f;
 	float covGB = 0.0f;
 	float covBB = 0.0f;
 	for( int px = 0; px < 16; ++px )
 	{
 		const float3 texel = unpackUnorm4x8( srcPixelsBlock[px] ).xyz * 255.0f;
 		float3 d = texel - mean;

 		covRR += d.r * d.r;
 		covRG += d.r * d.g;
 		covRB += d.r * d.b;
 		covGG += d.g * d.g;
 		covGB += d.g * d.b;
 		covBB += d.b * d.b;
 	}

 	// Scale the matrix down before the power iteration
 	covRR /= 255.0f;
 	covRG /= 255.0f;
 	covRB /= 255.0f;
 	covGG /= 255.0f;
 	covGB /= 255.0f;
 	covBB /= 255.0f;

 	// Four power-iteration steps, seeded with the bounding-box diagonal
 	float3 axis = boxMax - boxMin;
 	for( int pass = 0; pass < 4; ++pass )
 	{
 		const float nextR = axis.r * covRR + axis.g * covRG + axis.b * covRB;
 		const float nextG = axis.r * covRG + axis.g * covGG + axis.b * covGB;
 		const float nextB = axis.r * covRB + axis.g * covGB + axis.b * covBB;

 		axis = float3( nextR, nextG, nextB );
 	}

 	const float largest = max3( abs( axis.r ), abs( axis.g ), abs( axis.b ) );
 	float3 projAxis;

 	if( largest < 4.0f )
 	{
 		// Axis too short to be meaningful: fall back to luminance
 		// (JPEG YCbCr luma coefficients, scaled by 1000)
 		projAxis = float3( 299.0f, 587.0f, 114.0f );
 	}
 	else
 	{
 		// Normalise so that the largest component becomes +/-512
 		projAxis = trunc( axis * ( 512.0f / largest ) );
 	}

 	// The pixels projecting lowest and highest on the axis become the endpoints
 	float3 lowPixel, highPixel;
 	float lowProj = FLT_MAX;
 	float highProj = -FLT_MAX;
 	for( int px = 0; px < 16; ++px )
 	{
 		const float3 texel = unpackUnorm4x8( srcPixelsBlock[px] ).xyz * 255.0f;
 		const float proj = dot( texel, projAxis );

 		if( proj < lowProj )
 		{
 			lowProj = proj;
 			lowPixel = texel;
 		}

 		if( proj > highProj )
 		{
 			highProj = proj;
 			highPixel = texel;
 		}
 	}

 	outMinEndp16 = rgb888to565( lowPixel );
 	outMaxEndp16 = rgb888to565( highPixel );
 }

 // The color matching function.
 // Projects every pixel onto the line through the two endpoint colours and picks the
 // palette entry whose interval the projection falls in.
 //
 // srcPixelsBlock: the 16 pixels of the block, packed with packUnorm4x8
 // colour:         the 4-entry palette (callers in this file fill it with EvalColors)
 // Returns sixteen 2-bit indices into colour[]; pixel 0 occupies the two lowest bits.
 uint MatchColorsBlock( const uint srcPixelsBlock[16], float3 colour[4] )
 {
 	uint mask = 0u;
 	float3 dir = colour[0] - colour[1];
 	float stops[4];

 	// Position of each palette entry along 'dir'
 	for( int i = 0; i < 4; ++i )
 		stops[i] = dot( colour[i], dir );

 	// think of the colors as arranged on a line; project point onto that line, then choose
 	// next color out of available ones. we compute the crossover points for "best color in top
 	// half"/"best in bottom half" and then the same inside that subinterval.
 	//
 	// relying on this 1d approximation isn't always optimal in terms of euclidean distance,
 	// but it's very close and a lot faster.
 	// http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html

 	// Decision thresholds, each halfway between two palette entries:
 	//	below c0Point                  -> index 1
 	//	from c0Point up to halfPoint   -> index 3
 	//	from halfPoint up to c3Point   -> index 2
 	//	at or above c3Point            -> index 0
 	float c0Point = trunc( ( stops[1] + stops[3] ) * 0.5f );
 	float halfPoint = trunc( ( stops[3] + stops[2] ) * 0.5f );
 	float c3Point = trunc( ( stops[2] + stops[0] ) * 0.5f );

 #ifndef BC1_DITHER
 	// the version without dithering is straightforward
 	// (walk from pixel 15 down to 0 so that pixel 0 ends up in the lowest bits)
 	for( uint i = 16u; i-- > 0u; )
 	{
 		const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;

 		const float dotValue = dot( currColour, dir );
 		mask <<= 2u;

 		if( dotValue < halfPoint )
 			mask |= ( ( dotValue < c0Point ) ? 1u : 3u );
 		else
 			mask |= ( ( dotValue < c3Point ) ? 2u : 0u );
 	}
 #else
 	// with floyd-steinberg dithering
 	// ep1 holds the projection errors of the row being processed, ep2 those of the
 	// previous row. The 7/3/5/1 weights sum to 16, so instead of dividing the error
 	// by 16 the projection and the thresholds are multiplied by 16.
 	float4 ep1 = float4( 0, 0, 0, 0 );
 	float4 ep2 = float4( 0, 0, 0, 0 );

 	c0Point *= 16.0f;
 	halfPoint *= 16.0f;
 	c3Point *= 16.0f;

 	for( uint y = 0u; y < 4u; ++y )
 	{
 		float ditherDot;
 		uint lmask, step;

 		float3 currColour;
 		float dotValue;

 		// column 0: no left neighbour
 		currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 0u] ).xyz * 255.0f;
 		dotValue = dot( currColour, dir );

 		ditherDot = ( dotValue * 16.0f ) + ( 3.0f * ep2[1] + 5.0f * ep2[0] );
 		if( ditherDot < halfPoint )
 			step = ( ditherDot < c0Point ) ? 1u : 3u;
 		else
 			step = ( ditherDot < c3Point ) ? 2u : 0u;
 		ep1[0] = dotValue - stops[step];
 		lmask = step;

 		// column 1
 		currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 1u] ).xyz * 255.0f;
 		dotValue = dot( currColour, dir );

 		ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[0] + 3.0f * ep2[2] + 5.0f * ep2[1] + ep2[0] );
 		if( ditherDot < halfPoint )
 			step = ( ditherDot < c0Point ) ? 1u : 3u;
 		else
 			step = ( ditherDot < c3Point ) ? 2u : 0u;
 		ep1[1] = dotValue - stops[step];
 		lmask |= step << 2u;

 		// column 2
 		currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 2u] ).xyz * 255.0f;
 		dotValue = dot( currColour, dir );

 		ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1] );
 		if( ditherDot < halfPoint )
 			step = ( ditherDot < c0Point ) ? 1u : 3u;
 		else
 			step = ( ditherDot < c3Point ) ? 2u : 0u;
 		ep1[2] = dotValue - stops[step];
 		lmask |= step << 4u;

 		// column 3: no right neighbour in the row above.
 		// This must read the row's fourth pixel (offset 3); reading offset 2 again
 		// would index the last column from the third column's colour.
 		currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 3u] ).xyz * 255.0f;
 		dotValue = dot( currColour, dir );

 		ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2] );
 		if( ditherDot < halfPoint )
 			step = ( ditherDot < c0Point ) ? 1u : 3u;
 		else
 			step = ( ditherDot < c3Point ) ? 2u : 0u;
 		ep1[3] = dotValue - stops[step];
 		lmask |= step << 6u;

 		// one row is 4 pixels * 2 bits = 8 bits of the mask
 		mask |= lmask << ( y * 8u );
 		{
 			float4 tmp = ep1;
 			ep1 = ep2;
 			ep2 = tmp;
 		}  // swap
 	}
 #endif

 	return mask;
 }

 // The refinement function. (Clever code, part 2)
 // Tries to optimize colors to suit block contents better.
 // (By solving a least squares system via normal equations+Cramer's rule)
 //
 // srcPixelsBlock: the 16 pixels of the block, packed with packUnorm4x8
 // mask:           current 2-bit-per-pixel palette indices (pixel 0 in the lowest bits)
 // inOutMinEndp16 / inOutMaxEndp16:
 //                 current endpoints in RGB565; always overwritten with the refined ones
 // Returns true when at least one endpoint differs from its previous value.
 bool RefineBlock( const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16,
 				  inout float inOutMaxEndp16 )
 {
 	float newMin16, newMax16;
 	const float oldMin = inOutMinEndp16;
 	const float oldMax = inOutMaxEndp16;

 	if( ( mask ^ ( mask << 2u ) ) < 4u )  // all pixels have the same index?
 	{
 		// yes, linear system would be singular; solve using optimal
 		// single-color match on average color
 		// (starting the sum at 8/255 makes the floor below round the mean to nearest)
 		float3 rgbVal = float3( 8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f );
 		for( int i = 0; i < 16; ++i )
 			rgbVal += unpackUnorm4x8( srcPixelsBlock[i] ).xyz;

 		rgbVal = floor( rgbVal * ( 255.0f / 16.0f ) );

 		newMax16 = c_oMatch5[uint( rgbVal.r )][0] * 2048.0f +  //
 				   c_oMatch6[uint( rgbVal.g )][0] * 32.0f +    //
 				   c_oMatch5[uint( rgbVal.b )][0];
 		newMin16 = c_oMatch5[uint( rgbVal.r )][1] * 2048.0f +  //
 				   c_oMatch6[uint( rgbVal.g )][1] * 32.0f +    //
 				   c_oMatch5[uint( rgbVal.b )][1];
 	}
 	else
 	{
 		// Per index: weight w1 of the max endpoint (the min endpoint gets 3 - w1)
 		const float w1Tab[4] = float[4]( 3.0f, 0.0f, 2.0f, 1.0f );
 		// Per index: w1*w1 * 65536 + w2*w2 * 256 + w1*w2, i.e. three sums packed
 		// into one number (0x090000, 0x000900, 0x040102, 0x010402)
 		const float prods[4] = float[4]( 589824.0f, 2304.0f, 262402.0f, 66562.0f );
 		// ^some magic to save a lot of multiplies in the accumulating loop...
 		// (precomputed products of weights for least squares system, accumulated inside one 32-bit
 		// register)

 		float akku = 0.0f;
 		uint cm = mask;
 		float3 at1 = float3( 0, 0, 0 );
 		float3 at2 = float3( 0, 0, 0 );
 		for( int i = 0; i < 16; ++i, cm >>= 2u )
 		{
 			const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;

 			const uint step = cm & 3u;
 			const float w1 = w1Tab[step];
 			akku += prods[step];
 			at1 += currColour * w1;
 			at2 += currColour;
 		}

 		// sum( colour * w2 ) with w2 = 3 - w1
 		at2 = 3.0f * at2 - at1;

 		// extract solutions and decide solvability
 		// The fields are 2^16 and 2^8 apart, so split with 65536 (not 65535); akku
 		// stays below 2^24, hence every value involved is exactly representable.
 		const float xx = floor( akku / 65536.0f );
 		const float yy = floor( mod( akku, 65536.0f ) / 256.0f );
 		const float xy = mod( akku, 256.0f );

 		// 1 / determinant, pre-multiplied by the 888 -> 565 scale of each channel
 		float2 f_rb_g;
 		f_rb_g.x = 3.0f * 31.0f / 255.0f / ( xx * yy - xy * xy );
 		f_rb_g.y = f_rb_g.x * 63.0f / 31.0f;

 		// solve.
 		const float3 newMaxVal = clamp( floor( ( at1 * yy - at2 * xy ) * f_rb_g.xyx + 0.5f ),
 										float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) );
 		newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z;

 		const float3 newMinVal = clamp( floor( ( at2 * xx - at1 * xy ) * f_rb_g.xyx + 0.5f ),
 										float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) );
 		newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z;
 	}

 	inOutMinEndp16 = newMin16;
 	inOutMaxEndp16 = newMax16;

 	return oldMin != newMin16 || oldMax != newMax16;
 }

 #ifdef BC1_DITHER
 /// Snaps an 8-bit-per-channel colour to a value representable in RGB565: the input
 /// is clamped to [0, 255], rounded to 5/6/5 bits, then widened back to 8 bits with
 /// the 8.25 / 4.0625 / 8.25 scaling and floored.
 float3 quant( float3 srcValue )
 {
 	const float3 to565 = float3( 31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f );
 	const float3 to888 = float3( 8.25f, 4.0625f, 8.25f );

 	const float3 clamped = clamp( srcValue, 0.0f, 255.0f );
 	const float3 quantized = floor( clamped * to565 + 0.5f );

 	return floor( quantized * to888 );
 }

 /// Writes a Floyd-Steinberg dithered copy of the block in which every pixel has
 /// been snapped to an RGB565-representable colour with quant().
 ///
 /// srcPixBlck: the 16 source pixels, packed with packUnorm4x8
 /// dthPixBlck: receives the 16 dithered pixels, packed with packUnorm4x8 (w = 1)
 ///
 /// Each pixel receives 7/16 of the error of its left neighbour and 3/16, 5/16 and
 /// 1/16 of the errors of the pixels above-right, above and above-left. Errors never
 /// cross the block border.
 void DitherBlock( const uint srcPixBlck[16], out uint dthPixBlck[16] )
 {
 	const float3 noError = float3( 0, 0, 0 );
 	// rowErr: errors of the row being processed, aboveErr: errors of the previous row
 	float3 rowErr[4] = float3[4]( noError, noError, noError, noError );
 	float3 aboveErr[4] = float3[4]( noError, noError, noError, noError );

 	for( uint row = 0u; row < 4u; ++row )
 	{
 		const uint first = row * 4u;
 		float3 wanted, carried, chosen;

 		// column 0: no left neighbour
 		wanted = unpackUnorm4x8( srcPixBlck[first + 0u] ).xyz * 255.0f;
 		carried = trunc( ( 3.0f * aboveErr[1] + 5.0f * aboveErr[0] ) * ( 1.0f / 16.0f ) );
 		chosen = quant( wanted + carried );
 		rowErr[0] = wanted - chosen;
 		dthPixBlck[first + 0u] = packUnorm4x8( float4( chosen * ( 1.0f / 255.0f ), 1.0f ) );

 		// column 1
 		wanted = unpackUnorm4x8( srcPixBlck[first + 1u] ).xyz * 255.0f;
 		carried = trunc( ( 7.0f * rowErr[0] + 3.0f * aboveErr[2] + 5.0f * aboveErr[1] + aboveErr[0] ) *
 						 ( 1.0f / 16.0f ) );
 		chosen = quant( wanted + carried );
 		rowErr[1] = wanted - chosen;
 		dthPixBlck[first + 1u] = packUnorm4x8( float4( chosen * ( 1.0f / 255.0f ), 1.0f ) );

 		// column 2
 		wanted = unpackUnorm4x8( srcPixBlck[first + 2u] ).xyz * 255.0f;
 		carried = trunc( ( 7.0f * rowErr[1] + 3.0f * aboveErr[3] + 5.0f * aboveErr[2] + aboveErr[1] ) *
 						 ( 1.0f / 16.0f ) );
 		chosen = quant( wanted + carried );
 		rowErr[2] = wanted - chosen;
 		dthPixBlck[first + 2u] = packUnorm4x8( float4( chosen * ( 1.0f / 255.0f ), 1.0f ) );

 		// column 3: no above-right neighbour
 		wanted = unpackUnorm4x8( srcPixBlck[first + 3u] ).xyz * 255.0f;
 		carried = trunc( ( 7.0f * rowErr[2] + 5.0f * aboveErr[3] + aboveErr[2] ) * ( 1.0f / 16.0f ) );
 		chosen = quant( wanted + carried );
 		rowErr[3] = wanted - chosen;
 		dthPixBlck[first + 3u] = packUnorm4x8( float4( chosen * ( 1.0f / 255.0f ), 1.0f ) );

 		// The finished row becomes "the row above" for the next iteration
 		float3 held[4] = rowErr;
 		rowErr = aboveErr;
 		aboveErr = held;
 	}
 }
 #endif

 // Encodes one 4x4 block of srcTex per invocation and stores the result in the texel
 // of dstTexture addressed by gl_GlobalInvocationID.xy.
 //
 // Output texel layout: x = larger endpoint, y = smaller endpoint (both RGB565),
 // z = low 16 bits of the index mask, w = high 16 bits of the index mask.
 void main()
 {
 	uint srcPixelsBlock[16];

 	bool bAllColoursEqual = true;

 	// Load the whole 4x4 block
 	const uint2 pixelsToLoadBase = gl_GlobalInvocationID.xy << 2u;
 	for( uint i = 0u; i < 16u; ++i )
 	{
 		const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i & 0x03u, i >> 2u );
 		const float3 srcPixels0 = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyz;
 		srcPixelsBlock[i] = packUnorm4x8( float4( srcPixels0, 1.0f ) );
 		bAllColoursEqual = bAllColoursEqual && srcPixelsBlock[0] == srcPixelsBlock[i];
 	}

 	float maxEndp16, minEndp16;
 	uint mask = 0u;

 	if( bAllColoursEqual )
 	{
 		// Single-colour block: take both endpoints from the lookup tables and point
 		// every pixel at palette index 2.
 		// Round to nearest before indexing: unpackUnorm4x8() followed by * 255 is not
 		// guaranteed to reproduce the stored integer exactly, and a plain truncation
 		// would then select the table entry one below the intended one.
 		const uint3 rgbVal = uint3( unpackUnorm4x8( srcPixelsBlock[0] ).xyz * 255.0f + 0.5f );
 		mask = 0xAAAAAAAAu;
 		maxEndp16 =
 			c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0];
 		minEndp16 =
 			c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1];
 	}
 	else
 	{
 #ifdef BC1_DITHER
 		uint ditherPixelsBlock[16];
 		// first step: compute dithered version for PCA if desired
 		DitherBlock( srcPixelsBlock, ditherPixelsBlock );
 #else
 		// without dithering the endpoint search works on the source pixels directly
 #	define ditherPixelsBlock srcPixelsBlock
 #endif

 		// second step: pca+map along principal axis
 		OptimizeColorsBlock( ditherPixelsBlock, minEndp16, maxEndp16 );
 		if( minEndp16 != maxEndp16 )
 		{
 			float3 colours[4];
 			EvalColors( colours, maxEndp16, minEndp16 );  // Note min/max are inverted
 			mask = MatchColorsBlock( srcPixelsBlock, colours );
 		}

 		// third step: refine (multiple times if requested)
 		// Stops early once the index mask no longer changes or the endpoints collapse.
 		bool bStopRefinement = false;
 		for( uint i = 0u; i < p_numRefinements && !bStopRefinement; ++i )
 		{
 			const uint lastMask = mask;

 			if( RefineBlock( ditherPixelsBlock, mask, minEndp16, maxEndp16 ) )
 			{
 				if( minEndp16 != maxEndp16 )
 				{
 					float3 colours[4];
 					EvalColors( colours, maxEndp16, minEndp16 );  // Note min/max are inverted
 					mask = MatchColorsBlock( srcPixelsBlock, colours );
 				}
 				else
 				{
 					mask = 0u;
 					bStopRefinement = true;
 				}
 			}

 			bStopRefinement = mask == lastMask || bStopRefinement;
 		}
 	}

 	// write the color block
 	// Keep the larger endpoint first; swapping the endpoints swaps indices 0<->1 and
 	// 2<->3, which is what flipping the low bit of every 2-bit index does.
 	if( maxEndp16 < minEndp16 )
 	{
 		const float tmpValue = minEndp16;
 		minEndp16 = maxEndp16;
 		maxEndp16 = tmpValue;
 		mask ^= 0x55555555u;
 	}

 	uint4 outputBytes;
 	outputBytes.x = uint( maxEndp16 );
 	outputBytes.y = uint( minEndp16 );
 	outputBytes.z = mask & 0xFFFFu;
 	outputBytes.w = mask >> 16u;

 	uint2 dstUV = gl_GlobalInvocationID.xy;
 	imageStore( dstTexture, int2( dstUV ), outputBytes );
 }
	/*
	* Copyright 2020-2022 Matias N. Goldberg
	* Copyright 2022 Intel Corporation
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
	* DEALINGS IN THE SOFTWARE.
	*/

	#version 310 es

	#if defined(GL_ES) && GL_ES == 1
	// Desktop GLSL allows the const keyword for either compile-time or
	// run-time constants. GLSL ES only allows the keyword for compile-time
	// constants. Since we use const on run-time constants, define it to
	// nothing.
	#define const
	#endif

	// NOTE(review): "%s" looks like a printf-style placeholder that the host code
	// replaces with the contents of CrossPlatformSettings_piece_all.glsl before the
	// shader is compiled - confirm against the code that loads this file. Names used
	// below but not declared here (float2/float3/float4, uint2/uint3/uint4, int2, max3,
	// OGRE_Load2D) presumably come from that include.
	%s // include "CrossPlatformSettings_piece_all.glsl"

	// Largest finite 32-bit float; used as the initial min/max sentinel in
	// OptimizeColorsBlock.
	#define FLT_MAX 340282346638528859811704183484516925440.0f

	// Upper bound on the number of RefineBlock passes main() runs per block.
	layout( location = 0 ) uniform uint p_numRefinements;

	// Source image; main() loads one 4x4 texel block per invocation from mip level 0.
	uniform sampler2D srcTex;

	// Destination image; main() stores one texel per encoded block, holding the two
	// endpoints and the two 16-bit halves of the index mask.
	layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture;

	// Lookup tables indexed by an 8-bit channel value. Component [0] is read for the
	// max endpoint and component [1] for the min endpoint; c_oMatch5 is used for
	// red/blue and c_oMatch6 for green.
	// NOTE(review): presumably stb_dxt's single-colour "OMatch5"/"OMatch6" tables; the
	// contents are provided by the host and are not visible here.
	layout( std430, binding = 1 ) readonly restrict buffer globalBuffer
	{
	float2 c_oMatch5[256];
	float2 c_oMatch6[256];
	};

	// 8x8 invocations per work group; each invocation encodes one 4x4 block (see main).
	layout( local_size_x = 8, //
	local_size_y = 8, //
	local_size_z = 1 ) in;

	/// Expands a packed RGB565 value (held in a float) to 8 bits per channel.
	///
	/// The three bit fields are separated with floor/mod arithmetic and then widened the
	/// way stb_dxt does it: red/blue are scaled by 8.25 (= 256/32 + 8/32), green by
	/// 4.0625 (= 256/64 + 4/64), and the result is floored. That matches replicating
	/// the top bits of each field into the new low bits.
	///
	/// The exactly rounded conversion would instead be
	///		floor( c * ( 255 / float3( 31, 63, 31 ) ) + 0.5 )
	/// which differs by +/-1 for a handful of values. stb_dxt's variant is kept on
	/// purpose; see http://www.ludicon.com/castano/blog/2009/03/gpu-dxt-decompression/
	/// for a possible reason why decoders behave this way.
	float3 rgb565to888( float rgb565 )
	{
		const float red = floor( rgb565 / 2048.0f );
		const float green = floor( mod( rgb565, 2048.0f ) / 32.0f );
		const float blue = floor( mod( rgb565, 32.0f ) );

		const float3 expand = float3( 8.25f, 4.0625f, 8.25f );
		return floor( float3( red, green, blue ) * expand );
	}

	/// Quantizes an 8-bit-per-channel colour (components expected in [0, 255]) to
	/// RGB565, rounding every channel to nearest, and returns the packed value as a
	/// float: red * 2048 + green * 32 + blue.
	float rgb888to565( float3 rgbValue )
	{
		const float3 channelMax = float3( 31.0f, 63.0f, 31.0f );
		const float3 q = floor( rgbValue * channelMax / 255.0f + 0.5f );

		return q.r * 2048.0f + q.g * 32.0f + q.b;
	}

	// Returns the colour one third of the way from a to b. The rounding mode is chosen
	// at compile time through STB_DXT_USE_ROUNDING_BIAS.
	float3 lerp13( float3 a, float3 b )
	{
	#ifndef STB_DXT_USE_ROUNDING_BIAS
		// default: plain floor of the weighted sum
		const float3 weighted = 2.0f * a + b;
		return floor( weighted / 3.0f );
	#else
		// biased: round the step from a towards b to nearest
		const float3 delta = b - a;
		return a + floor( delta * ( 1.0f / 3.0f ) + 0.5f );
	#endif
	}

	/// Builds the 4-entry palette of a block from its two RGB565 endpoints:
	/// entries 0 and 1 are the decoded endpoints c0 and c1, entry 2 lies one third of
	/// the way from c0 to c1 and entry 3 one third of the way from c1 to c0.
	void EvalColors( out float3 colours[4], float c0, float c1 )
	{
		const float3 first = rgb565to888( c0 );
		const float3 second = rgb565to888( c1 );

		colours[0] = first;
		colours[1] = second;
		colours[2] = lerp13( first, second );
		colours[3] = lerp13( second, first );
	}

	/** Chooses the two endpoint colours of a block. (Clever code, part 1)
		Builds the covariance matrix of the block's colours around their rounded mean,
		runs a few power iterations starting from the bounding-box diagonal to estimate
		the principal axis, then returns the pixels with the smallest and the largest
		projection onto that axis.
	@param srcPixelsBlock
		The 16 pixels of the block, each packed with packUnorm4x8
	@param outMinEndp16 [out]
		Pixel with the smallest projection, in RGB565
	@param outMaxEndp16 [out]
		Pixel with the largest projection, in RGB565
	*/
	void OptimizeColorsBlock( const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16 )
	{
		// Channel sums and bounding box, accumulated in [0, 1] units
		const float3 firstUnorm = unpackUnorm4x8( srcPixelsBlock[0] ).xyz;
		float3 sumUnorm = firstUnorm;
		float3 lowUnorm = firstUnorm;
		float3 highUnorm = firstUnorm;
		for( int px = 1; px < 16; ++px )
		{
			const float3 texel = unpackUnorm4x8( srcPixelsBlock[px] ).xyz;
			sumUnorm += texel;
			lowUnorm = min( lowUnorm, texel );
			highUnorm = max( highUnorm, texel );
		}

		// Switch to [0, 255] units from here on
		const float3 mean = round( sumUnorm * 255.0f / 16.0f );
		const float3 boxMax = highUnorm * 255.0f;
		const float3 boxMin = lowUnorm * 255.0f;

		// Upper triangle of the symmetric 3x3 covariance matrix
		float covRR = 0.0f;
		float covRG = 0.0f;
		float covRB = 0.0f;
		float covGG = 0.0f;
		float covGB = 0.0f;
		float covBB = 0.0f;
		for( int px = 0; px < 16; ++px )
		{
			const float3 texel = unpackUnorm4x8( srcPixelsBlock[px] ).xyz * 255.0f;
			float3 d = texel - mean;

			covRR += d.r * d.r;
			covRG += d.r * d.g;
			covRB += d.r * d.b;
			covGG += d.g * d.g;
			covGB += d.g * d.b;
			covBB += d.b * d.b;
		}

		// Scale the matrix down before the power iteration
		covRR /= 255.0f;
		covRG /= 255.0f;
		covRB /= 255.0f;
		covGG /= 255.0f;
		covGB /= 255.0f;
		covBB /= 255.0f;

		// Four power-iteration steps, seeded with the bounding-box diagonal
		float3 axis = boxMax - boxMin;
		for( int pass = 0; pass < 4; ++pass )
		{
			const float nextR = axis.r * covRR + axis.g * covRG + axis.b * covRB;
			const float nextG = axis.r * covRG + axis.g * covGG + axis.b * covGB;
			const float nextB = axis.r * covRB + axis.g * covGB + axis.b * covBB;

			axis = float3( nextR, nextG, nextB );
		}

		const float largest = max3( abs( axis.r ), abs( axis.g ), abs( axis.b ) );
		float3 projAxis;

		if( largest < 4.0f )
		{
			// Axis too short to be meaningful: fall back to luminance
			// (JPEG YCbCr luma coefficients, scaled by 1000)
			projAxis = float3( 299.0f, 587.0f, 114.0f );
		}
		else
		{
			// Normalise so that the largest component becomes +/-512
			projAxis = trunc( axis * ( 512.0f / largest ) );
		}

		// The pixels projecting lowest and highest on the axis become the endpoints
		float3 lowPixel, highPixel;
		float lowProj = FLT_MAX;
		float highProj = -FLT_MAX;
		for( int px = 0; px < 16; ++px )
		{
			const float3 texel = unpackUnorm4x8( srcPixelsBlock[px] ).xyz * 255.0f;
			const float proj = dot( texel, projAxis );

			if( proj < lowProj )
			{
				lowProj = proj;
				lowPixel = texel;
			}

			if( proj > highProj )
			{
				highProj = proj;
				highPixel = texel;
			}
		}

		outMinEndp16 = rgb888to565( lowPixel );
		outMaxEndp16 = rgb888to565( highPixel );
	}

	// The color matching function.
	// Projects every pixel onto the line through the two endpoint colours and picks the
	// palette entry whose interval the projection falls in.
	//
	// srcPixelsBlock: the 16 pixels of the block, packed with packUnorm4x8
	// colour:         the 4-entry palette (callers in this file fill it with EvalColors)
	// Returns sixteen 2-bit indices into colour[]; pixel 0 occupies the two lowest bits.
	uint MatchColorsBlock( const uint srcPixelsBlock[16], float3 colour[4] )
	{
		uint mask = 0u;
		float3 dir = colour[0] - colour[1];
		float stops[4];

		// Position of each palette entry along 'dir'
		for( int i = 0; i < 4; ++i )
			stops[i] = dot( colour[i], dir );

		// think of the colors as arranged on a line; project point onto that line, then choose
		// next color out of available ones. we compute the crossover points for "best color in top
		// half"/"best in bottom half" and then the same inside that subinterval.
		//
		// relying on this 1d approximation isn't always optimal in terms of euclidean distance,
		// but it's very close and a lot faster.
		// http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html

		// Decision thresholds, each halfway between two palette entries:
		//	below c0Point                  -> index 1
		//	from c0Point up to halfPoint   -> index 3
		//	from halfPoint up to c3Point   -> index 2
		//	at or above c3Point            -> index 0
		float c0Point = trunc( ( stops[1] + stops[3] ) * 0.5f );
		float halfPoint = trunc( ( stops[3] + stops[2] ) * 0.5f );
		float c3Point = trunc( ( stops[2] + stops[0] ) * 0.5f );

	#ifndef BC1_DITHER
		// the version without dithering is straightforward
		// (walk from pixel 15 down to 0 so that pixel 0 ends up in the lowest bits)
		for( uint i = 16u; i-- > 0u; )
		{
			const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;

			const float dotValue = dot( currColour, dir );
			mask <<= 2u;

			if( dotValue < halfPoint )
				mask |= ( ( dotValue < c0Point ) ? 1u : 3u );
			else
				mask |= ( ( dotValue < c3Point ) ? 2u : 0u );
		}
	#else
		// with floyd-steinberg dithering
		// ep1 holds the projection errors of the row being processed, ep2 those of the
		// previous row. The 7/3/5/1 weights sum to 16, so instead of dividing the error
		// by 16 the projection and the thresholds are multiplied by 16.
		float4 ep1 = float4( 0, 0, 0, 0 );
		float4 ep2 = float4( 0, 0, 0, 0 );

		c0Point *= 16.0f;
		halfPoint *= 16.0f;
		c3Point *= 16.0f;

		for( uint y = 0u; y < 4u; ++y )
		{
			float ditherDot;
			uint lmask, step;

			float3 currColour;
			float dotValue;

			// column 0: no left neighbour
			currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 0u] ).xyz * 255.0f;
			dotValue = dot( currColour, dir );

			ditherDot = ( dotValue * 16.0f ) + ( 3.0f * ep2[1] + 5.0f * ep2[0] );
			if( ditherDot < halfPoint )
				step = ( ditherDot < c0Point ) ? 1u : 3u;
			else
				step = ( ditherDot < c3Point ) ? 2u : 0u;
			ep1[0] = dotValue - stops[step];
			lmask = step;

			// column 1
			currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 1u] ).xyz * 255.0f;
			dotValue = dot( currColour, dir );

			ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[0] + 3.0f * ep2[2] + 5.0f * ep2[1] + ep2[0] );
			if( ditherDot < halfPoint )
				step = ( ditherDot < c0Point ) ? 1u : 3u;
			else
				step = ( ditherDot < c3Point ) ? 2u : 0u;
			ep1[1] = dotValue - stops[step];
			lmask |= step << 2u;

			// column 2
			currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 2u] ).xyz * 255.0f;
			dotValue = dot( currColour, dir );

			ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1] );
			if( ditherDot < halfPoint )
				step = ( ditherDot < c0Point ) ? 1u : 3u;
			else
				step = ( ditherDot < c3Point ) ? 2u : 0u;
			ep1[2] = dotValue - stops[step];
			lmask |= step << 4u;

			// column 3: no right neighbour in the row above.
			// This must read the row's fourth pixel (offset 3); reading offset 2 again
			// would index the last column from the third column's colour.
			currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 3u] ).xyz * 255.0f;
			dotValue = dot( currColour, dir );

			ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2] );
			if( ditherDot < halfPoint )
				step = ( ditherDot < c0Point ) ? 1u : 3u;
			else
				step = ( ditherDot < c3Point ) ? 2u : 0u;
			ep1[3] = dotValue - stops[step];
			lmask |= step << 6u;

			// one row is 4 pixels * 2 bits = 8 bits of the mask
			mask |= lmask << ( y * 8u );
			{
				float4 tmp = ep1;
				ep1 = ep2;
				ep2 = tmp;
			}  // swap
		}
	#endif

		return mask;
	}

	// The refinement function. (Clever code, part 2)
	// Tries to optimize colors to suit block contents better.
	// (By solving a least squares system via normal equations+Cramer's rule)
	//
	// srcPixelsBlock: the 16 pixels of the block, packed with packUnorm4x8
	// mask:           current 2-bit-per-pixel palette indices (pixel 0 in the lowest bits)
	// inOutMinEndp16 / inOutMaxEndp16:
	//                 current endpoints in RGB565; always overwritten with the refined ones
	// Returns true when at least one endpoint differs from its previous value.
	bool RefineBlock( const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16,
					  inout float inOutMaxEndp16 )
	{
		float newMin16, newMax16;
		const float oldMin = inOutMinEndp16;
		const float oldMax = inOutMaxEndp16;

		if( ( mask ^ ( mask << 2u ) ) < 4u )  // all pixels have the same index?
		{
			// yes, linear system would be singular; solve using optimal
			// single-color match on average color
			// (starting the sum at 8/255 makes the floor below round the mean to nearest)
			float3 rgbVal = float3( 8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f );
			for( int i = 0; i < 16; ++i )
				rgbVal += unpackUnorm4x8( srcPixelsBlock[i] ).xyz;

			rgbVal = floor( rgbVal * ( 255.0f / 16.0f ) );

			newMax16 = c_oMatch5[uint( rgbVal.r )][0] * 2048.0f +  //
					   c_oMatch6[uint( rgbVal.g )][0] * 32.0f +    //
					   c_oMatch5[uint( rgbVal.b )][0];
			newMin16 = c_oMatch5[uint( rgbVal.r )][1] * 2048.0f +  //
					   c_oMatch6[uint( rgbVal.g )][1] * 32.0f +    //
					   c_oMatch5[uint( rgbVal.b )][1];
		}
		else
		{
			// Per index: weight w1 of the max endpoint (the min endpoint gets 3 - w1)
			const float w1Tab[4] = float[4]( 3.0f, 0.0f, 2.0f, 1.0f );
			// Per index: w1*w1 * 65536 + w2*w2 * 256 + w1*w2, i.e. three sums packed
			// into one number (0x090000, 0x000900, 0x040102, 0x010402)
			const float prods[4] = float[4]( 589824.0f, 2304.0f, 262402.0f, 66562.0f );
			// ^some magic to save a lot of multiplies in the accumulating loop...
			// (precomputed products of weights for least squares system, accumulated inside one 32-bit
			// register)

			float akku = 0.0f;
			uint cm = mask;
			float3 at1 = float3( 0, 0, 0 );
			float3 at2 = float3( 0, 0, 0 );
			for( int i = 0; i < 16; ++i, cm >>= 2u )
			{
				const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;

				const uint step = cm & 3u;
				const float w1 = w1Tab[step];
				akku += prods[step];
				at1 += currColour * w1;
				at2 += currColour;
			}

			// sum( colour * w2 ) with w2 = 3 - w1
			at2 = 3.0f * at2 - at1;

			// extract solutions and decide solvability
			// The fields are 2^16 and 2^8 apart, so split with 65536 (not 65535); akku
			// stays below 2^24, hence every value involved is exactly representable.
			const float xx = floor( akku / 65536.0f );
			const float yy = floor( mod( akku, 65536.0f ) / 256.0f );
			const float xy = mod( akku, 256.0f );

			// 1 / determinant, pre-multiplied by the 888 -> 565 scale of each channel
			float2 f_rb_g;
			f_rb_g.x = 3.0f * 31.0f / 255.0f / ( xx * yy - xy * xy );
			f_rb_g.y = f_rb_g.x * 63.0f / 31.0f;

			// solve.
			const float3 newMaxVal = clamp( floor( ( at1 * yy - at2 * xy ) * f_rb_g.xyx + 0.5f ),
											float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) );
			newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z;

			const float3 newMinVal = clamp( floor( ( at2 * xx - at1 * xy ) * f_rb_g.xyx + 0.5f ),
											float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) );
			newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z;
		}

		inOutMinEndp16 = newMin16;
		inOutMaxEndp16 = newMax16;

		return oldMin != newMin16 || oldMax != newMax16;
	}

	#ifdef BC1_DITHER
	/// Snaps an 8-bit-per-channel colour to a value representable in RGB565: the input
	/// is clamped to [0, 255], rounded to 5/6/5 bits, then widened back to 8 bits with
	/// the 8.25 / 4.0625 / 8.25 scaling and floored.
	float3 quant( float3 srcValue )
	{
		const float3 to565 = float3( 31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f );
		const float3 to888 = float3( 8.25f, 4.0625f, 8.25f );

		const float3 clamped = clamp( srcValue, 0.0f, 255.0f );
		const float3 quantized = floor( clamped * to565 + 0.5f );

		return floor( quantized * to888 );
	}

	/// Writes a Floyd-Steinberg dithered copy of the block in which every pixel has
	/// been snapped to an RGB565-representable colour with quant().
	///
	/// srcPixBlck: the 16 source pixels, packed with packUnorm4x8
	/// dthPixBlck: receives the 16 dithered pixels, packed with packUnorm4x8 (w = 1)
	///
	/// Each pixel receives 7/16 of the error of its left neighbour and 3/16, 5/16 and
	/// 1/16 of the errors of the pixels above-right, above and above-left. Errors never
	/// cross the block border.
	void DitherBlock( const uint srcPixBlck[16], out uint dthPixBlck[16] )
	{
		const float3 noError = float3( 0, 0, 0 );
		// rowErr: errors of the row being processed, aboveErr: errors of the previous row
		float3 rowErr[4] = float3[4]( noError, noError, noError, noError );
		float3 aboveErr[4] = float3[4]( noError, noError, noError, noError );

		for( uint row = 0u; row < 4u; ++row )
		{
			const uint first = row * 4u;
			float3 wanted, carried, chosen;

			// column 0: no left neighbour
			wanted = unpackUnorm4x8( srcPixBlck[first + 0u] ).xyz * 255.0f;
			carried = trunc( ( 3.0f * aboveErr[1] + 5.0f * aboveErr[0] ) * ( 1.0f / 16.0f ) );
			chosen = quant( wanted + carried );
			rowErr[0] = wanted - chosen;
			dthPixBlck[first + 0u] = packUnorm4x8( float4( chosen * ( 1.0f / 255.0f ), 1.0f ) );

			// column 1
			wanted = unpackUnorm4x8( srcPixBlck[first + 1u] ).xyz * 255.0f;
			carried = trunc( ( 7.0f * rowErr[0] + 3.0f * aboveErr[2] + 5.0f * aboveErr[1] + aboveErr[0] ) *
							 ( 1.0f / 16.0f ) );
			chosen = quant( wanted + carried );
			rowErr[1] = wanted - chosen;
			dthPixBlck[first + 1u] = packUnorm4x8( float4( chosen * ( 1.0f / 255.0f ), 1.0f ) );

			// column 2
			wanted = unpackUnorm4x8( srcPixBlck[first + 2u] ).xyz * 255.0f;
			carried = trunc( ( 7.0f * rowErr[1] + 3.0f * aboveErr[3] + 5.0f * aboveErr[2] + aboveErr[1] ) *
							 ( 1.0f / 16.0f ) );
			chosen = quant( wanted + carried );
			rowErr[2] = wanted - chosen;
			dthPixBlck[first + 2u] = packUnorm4x8( float4( chosen * ( 1.0f / 255.0f ), 1.0f ) );

			// column 3: no above-right neighbour
			wanted = unpackUnorm4x8( srcPixBlck[first + 3u] ).xyz * 255.0f;
			carried = trunc( ( 7.0f * rowErr[2] + 5.0f * aboveErr[3] + aboveErr[2] ) * ( 1.0f / 16.0f ) );
			chosen = quant( wanted + carried );
			rowErr[3] = wanted - chosen;
			dthPixBlck[first + 3u] = packUnorm4x8( float4( chosen * ( 1.0f / 255.0f ), 1.0f ) );

			// The finished row becomes "the row above" for the next iteration
			float3 held[4] = rowErr;
			rowErr = aboveErr;
			aboveErr = held;
		}
	}
	#endif

	// Encodes one 4x4 block of srcTex per invocation and stores the result in the texel
	// of dstTexture addressed by gl_GlobalInvocationID.xy.
	//
	// Output texel layout: x = larger endpoint, y = smaller endpoint (both RGB565),
	// z = low 16 bits of the index mask, w = high 16 bits of the index mask.
	void main()
	{
		uint srcPixelsBlock[16];

		bool bAllColoursEqual = true;

		// Load the whole 4x4 block
		const uint2 pixelsToLoadBase = gl_GlobalInvocationID.xy << 2u;
		for( uint i = 0u; i < 16u; ++i )
		{
			const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i & 0x03u, i >> 2u );
			const float3 srcPixels0 = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyz;
			srcPixelsBlock[i] = packUnorm4x8( float4( srcPixels0, 1.0f ) );
			bAllColoursEqual = bAllColoursEqual && srcPixelsBlock[0] == srcPixelsBlock[i];
		}

		float maxEndp16, minEndp16;
		uint mask = 0u;

		if( bAllColoursEqual )
		{
			// Single-colour block: take both endpoints from the lookup tables and point
			// every pixel at palette index 2.
			// Round to nearest before indexing: unpackUnorm4x8() followed by * 255 is not
			// guaranteed to reproduce the stored integer exactly, and a plain truncation
			// would then select the table entry one below the intended one.
			const uint3 rgbVal = uint3( unpackUnorm4x8( srcPixelsBlock[0] ).xyz * 255.0f + 0.5f );
			mask = 0xAAAAAAAAu;
			maxEndp16 =
				c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0];
			minEndp16 =
				c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1];
		}
		else
		{
	#ifdef BC1_DITHER
			uint ditherPixelsBlock[16];
			// first step: compute dithered version for PCA if desired
			DitherBlock( srcPixelsBlock, ditherPixelsBlock );
	#else
			// without dithering the endpoint search works on the source pixels directly
	#	define ditherPixelsBlock srcPixelsBlock
	#endif

			// second step: pca+map along principal axis
			OptimizeColorsBlock( ditherPixelsBlock, minEndp16, maxEndp16 );
			if( minEndp16 != maxEndp16 )
			{
				float3 colours[4];
				EvalColors( colours, maxEndp16, minEndp16 );  // Note min/max are inverted
				mask = MatchColorsBlock( srcPixelsBlock, colours );
			}

			// third step: refine (multiple times if requested)
			// Stops early once the index mask no longer changes or the endpoints collapse.
			bool bStopRefinement = false;
			for( uint i = 0u; i < p_numRefinements && !bStopRefinement; ++i )
			{
				const uint lastMask = mask;

				if( RefineBlock( ditherPixelsBlock, mask, minEndp16, maxEndp16 ) )
				{
					if( minEndp16 != maxEndp16 )
					{
						float3 colours[4];
						EvalColors( colours, maxEndp16, minEndp16 );  // Note min/max are inverted
						mask = MatchColorsBlock( srcPixelsBlock, colours );
					}
					else
					{
						mask = 0u;
						bStopRefinement = true;
					}
				}

				bStopRefinement = mask == lastMask || bStopRefinement;
			}
		}

		// write the color block
		// Keep the larger endpoint first; swapping the endpoints swaps indices 0<->1 and
		// 2<->3, which is what flipping the low bit of every 2-bit index does.
		if( maxEndp16 < minEndp16 )
		{
			const float tmpValue = minEndp16;
			minEndp16 = maxEndp16;
			maxEndp16 = tmpValue;
			mask ^= 0x55555555u;
		}

		uint4 outputBytes;
		outputBytes.x = uint( maxEndp16 );
		outputBytes.y = uint( minEndp16 );
		outputBytes.z = mask & 0xFFFFu;
		outputBytes.w = mask >> 16u;

		uint2 dstUV = gl_GlobalInvocationID.xy;
		imageStore( dstTexture, int2( dstUV ), outputBytes );
	}