src/compiler/glsl/bc4.glsl - platform/external/mesa3d - Git at Google

 /*
  * Copyright 2020-2022 Matias N. Goldberg
  * Copyright 2022 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in
  * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  */

 #version 310 es

 #if defined(GL_ES) && GL_ES == 1
 	// Desktop GLSL allows the const keyword for either compile-time or
 	// run-time constants. GLSL ES only allows the keyword for compile-time
 	// constants. Since we use const on run-time constants, define it to
 	// nothing.
 	#define const
 #endif

 #define __sharedOnlyBarrier memoryBarrierShared();barrier();

 %s // include "CrossPlatformSettings_piece_all.glsl"

 shared float2 g_minMaxValues[4u * 4u * 4u];
 shared uint2 g_mask[4u * 4u];

 layout( location = 0 ) uniform uint2 params;

 #define p_channelIdx params.x
 #define p_useSNorm params.y

 uniform sampler2D srcTex;

 layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture;

 layout( local_size_x = 4,  //
 		local_size_y = 4,  //
 		local_size_z = 4 ) in;

 /// Each block is 16 pixels
 /// Each thread works on 4 pixels
 /// Therefore each block needs 4 threads, generating 8 masks
 /// At the end these 8 masks get merged into 2 and results written to output
 ///
 /// **Q: Why 4 pixels per thread? Why not 1 pixel per thread? Why not 2? Why not 16?**
 ///
 /// A: It's a sweetspot.
 ///  - Very short threads cannot fill expensive GPUs with enough work (dispatch bound)
 ///  - Lots of threads means lots of synchronization (e.g. evaluating min/max, merging masks)
 ///    overhead, and also more LDS usage which reduces occupancy.
 ///  - Long threads (e.g. 1 thread per block) misses parallelism opportunities
 void main()
 {
 	float minVal, maxVal;
 	float4 srcPixel;

 	const uint blockThreadId = gl_LocalInvocationID.x;

 	const uint2 pixelsToLoadBase = gl_GlobalInvocationID.yz << 2u;

 	for( uint i = 0u; i < 4u; ++i )
 	{
 		const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i, blockThreadId );

 		const float4 value = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyzw;
 		srcPixel[i] = p_channelIdx == 0u ? value.x : ( p_channelIdx == 1u ? value.y : value.w );
 		srcPixel[i] *= 255.0f;
 	}

 	minVal = min3( srcPixel.x, srcPixel.y, srcPixel.z );
 	maxVal = max3( srcPixel.x, srcPixel.y, srcPixel.z );
 	minVal = min( minVal, srcPixel.w );
 	maxVal = max( maxVal, srcPixel.w );

 	const uint minMaxIdxBase = ( gl_LocalInvocationID.z << 4u ) + ( gl_LocalInvocationID.y << 2u );
 	const uint maskIdxBase = ( gl_LocalInvocationID.z << 2u ) + gl_LocalInvocationID.y;

 	g_minMaxValues[minMaxIdxBase + blockThreadId] = float2( minVal, maxVal );
 	g_mask[maskIdxBase] = uint2( 0u, 0u );

 	__sharedOnlyBarrier;

 	// Have all 4 threads in the block grab the min/max value by comparing what all 4 threads uploaded
 	for( uint i = 0u; i < 4u; ++i )
 	{
 		minVal = min( g_minMaxValues[minMaxIdxBase + i].x, minVal );
 		maxVal = max( g_minMaxValues[minMaxIdxBase + i].y, maxVal );
 	}

 	// determine bias and emit color indices
 	// given the choice of maxVal/minVal, these indices are optimal:
 	// http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
 	float dist = maxVal - minVal;
 	float dist4 = dist * 4.0f;
 	float dist2 = dist * 2.0f;
 	float bias = ( dist < 8.0f ) ? ( dist - 1.0f ) : ( trunc( dist * 0.5f ) + 2.0f );
 	bias -= minVal * 7.0f;

 	uint mask0 = 0u, mask1 = 0u;

 	for( uint i = 0u; i < 4u; ++i )
 	{
 		float a = srcPixel[i] * 7.0f + bias;

 		int ind = 0;

 		// select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
 		if( a >= dist4 )
 		{
 			ind = 4;
 			a -= dist4;
 		}

 		if( a >= dist2 )
 		{
 			ind += 2;
 			a -= dist2;
 		}

 		if( a >= dist )
 			ind += 1;

 		// turn linear scale into DXT index (0/1 are extremal pts)
 		ind = -ind & 7;
 		ind ^= ( 2 > ind ) ? 1 : 0;

 		// write index
 		const uint bits = 16u + ( ( blockThreadId << 2u ) + i ) * 3u;
 		if( bits < 32u )
 		{
 			mask0 |= uint( ind ) << bits;
 			if( bits + 3u > 32u )
 			{
 				mask1 |= uint( ind ) >> ( 32u - bits );
 			}
 		}
 		else
 		{
 			mask1 |= uint( ind ) << ( bits - 32u );
 		}
 	}

 	if( mask0 != 0u )
 		atomicOr( g_mask[maskIdxBase].x, mask0 );
 	if( mask1 != 0u )
 		atomicOr( g_mask[maskIdxBase].y, mask1 );

 	__sharedOnlyBarrier;

 	if( blockThreadId == 0u )
 	{
 		// Save data
 		uint4 outputBytes;

 		if( p_useSNorm != 0u )
 		{
 			outputBytes.x =
 				packSnorm4x8( float4( maxVal * ( 1.0f / 255.0f ) * 2.0f - 1.0f,
 									  minVal * ( 1.0f / 255.0f ) * 2.0f - 1.0f, 0.0f, 0.0f ) );
 		}
 		else
 		{
 			outputBytes.x = packUnorm4x8(
 				float4( maxVal * ( 1.0f / 255.0f ), minVal * ( 1.0f / 255.0f ), 0.0f, 0.0f ) );
 		}
 		outputBytes.y = g_mask[maskIdxBase].x >> 16u;
 		outputBytes.z = g_mask[maskIdxBase].y & 0xFFFFu;
 		outputBytes.w = g_mask[maskIdxBase].y >> 16u;

 		uint2 dstUV = gl_GlobalInvocationID.yz;
 		imageStore( dstTexture, int2( dstUV ), outputBytes );
 	}
 }
	/*
	* Copyright 2020-2022 Matias N. Goldberg
	* Copyright 2022 Intel Corporation
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
	* DEALINGS IN THE SOFTWARE.
	*/

	#version 310 es

	#if defined(GL_ES) && GL_ES == 1
	// Desktop GLSL allows the const keyword for either compile-time or
	// run-time constants. GLSL ES only allows the keyword for compile-time
	// constants. Since we use const on run-time constants, define it to
	// nothing.
	#define const
	#endif

	#define __sharedOnlyBarrier memoryBarrierShared();barrier();

	%s // include "CrossPlatformSettings_piece_all.glsl"

	shared float2 g_minMaxValues[4u * 4u * 4u];
	shared uint2 g_mask[4u * 4u];

	layout( location = 0 ) uniform uint2 params;

	#define p_channelIdx params.x
	#define p_useSNorm params.y

	uniform sampler2D srcTex;

	layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture;

	layout( local_size_x = 4, //
	local_size_y = 4, //
	local_size_z = 4 ) in;

	/// Each block is 16 pixels
	/// Each thread works on 4 pixels
	/// Therefore each block needs 4 threads, generating 8 masks
	/// At the end these 8 masks get merged into 2 and results written to output
	///
	/// Q: Why 4 pixels per thread? Why not 1 pixel per thread? Why not 2? Why not 16?
	///
	/// A: It's a sweetspot.
	/// - Very short threads cannot fill expensive GPUs with enough work (dispatch bound)
	/// - Lots of threads means lots of synchronization (e.g. evaluating min/max, merging masks)
	/// overhead, and also more LDS usage which reduces occupancy.
	/// - Long threads (e.g. 1 thread per block) misses parallelism opportunities
	void main()
	{
	float minVal, maxVal;
	float4 srcPixel;

	const uint blockThreadId = gl_LocalInvocationID.x;

	const uint2 pixelsToLoadBase = gl_GlobalInvocationID.yz << 2u;

	for( uint i = 0u; i < 4u; ++i )
	{
	const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i, blockThreadId );

	const float4 value = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyzw;
	srcPixel[i] = p_channelIdx == 0u ? value.x : ( p_channelIdx == 1u ? value.y : value.w );
	srcPixel[i] *= 255.0f;
	}

	minVal = min3( srcPixel.x, srcPixel.y, srcPixel.z );
	maxVal = max3( srcPixel.x, srcPixel.y, srcPixel.z );
	minVal = min( minVal, srcPixel.w );
	maxVal = max( maxVal, srcPixel.w );

	const uint minMaxIdxBase = ( gl_LocalInvocationID.z << 4u ) + ( gl_LocalInvocationID.y << 2u );
	const uint maskIdxBase = ( gl_LocalInvocationID.z << 2u ) + gl_LocalInvocationID.y;

	g_minMaxValues[minMaxIdxBase + blockThreadId] = float2( minVal, maxVal );
	g_mask[maskIdxBase] = uint2( 0u, 0u );

	__sharedOnlyBarrier;

	// Have all 4 threads in the block grab the min/max value by comparing what all 4 threads uploaded
	for( uint i = 0u; i < 4u; ++i )
	{
	minVal = min( g_minMaxValues[minMaxIdxBase + i].x, minVal );
	maxVal = max( g_minMaxValues[minMaxIdxBase + i].y, maxVal );
	}

	// determine bias and emit color indices
	// given the choice of maxVal/minVal, these indices are optimal:
	// http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
	float dist = maxVal - minVal;
	float dist4 = dist * 4.0f;
	float dist2 = dist * 2.0f;
	float bias = ( dist < 8.0f ) ? ( dist - 1.0f ) : ( trunc( dist * 0.5f ) + 2.0f );
	bias -= minVal * 7.0f;

	uint mask0 = 0u, mask1 = 0u;

	for( uint i = 0u; i < 4u; ++i )
	{
	float a = srcPixel[i] * 7.0f + bias;

	int ind = 0;

	// select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
	if( a >= dist4 )
	{
	ind = 4;
	a -= dist4;
	}

	if( a >= dist2 )
	{
	ind += 2;
	a -= dist2;
	}

	if( a >= dist )
	ind += 1;

	// turn linear scale into DXT index (0/1 are extremal pts)
	ind = -ind & 7;
	ind ^= ( 2 > ind ) ? 1 : 0;

	// write index
	const uint bits = 16u + ( ( blockThreadId << 2u ) + i ) * 3u;
	if( bits < 32u )
	{
	mask0 \|= uint( ind ) << bits;
	if( bits + 3u > 32u )
	{
	mask1 \|= uint( ind ) >> ( 32u - bits );
	}
	}
	else
	{
	mask1 \|= uint( ind ) << ( bits - 32u );
	}
	}

	if( mask0 != 0u )
	atomicOr( g_mask[maskIdxBase].x, mask0 );
	if( mask1 != 0u )
	atomicOr( g_mask[maskIdxBase].y, mask1 );

	__sharedOnlyBarrier;

	if( blockThreadId == 0u )
	{
	// Save data
	uint4 outputBytes;

	if( p_useSNorm != 0u )
	{
	outputBytes.x =
	packSnorm4x8( float4( maxVal * ( 1.0f / 255.0f ) * 2.0f - 1.0f,
	minVal * ( 1.0f / 255.0f ) * 2.0f - 1.0f, 0.0f, 0.0f ) );
	}
	else
	{
	outputBytes.x = packUnorm4x8(
	float4( maxVal * ( 1.0f / 255.0f ), minVal * ( 1.0f / 255.0f ), 0.0f, 0.0f ) );
	}
	outputBytes.y = g_mask[maskIdxBase].x >> 16u;
	outputBytes.z = g_mask[maskIdxBase].y & 0xFFFFu;
	outputBytes.w = g_mask[maskIdxBase].y >> 16u;

	uint2 dstUV = gl_GlobalInvocationID.yz;
	imageStore( dstTexture, int2( dstUV ), outputBytes );
	}
	}